def process_text_on_page(index, pagetitle, text):
  """Strip a redundant 1= from {{en-noun}} when it merely spells out the
  default plural (pagetitle + "s", or the bare "s" shorthand)."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  if "en-noun" not in text:
    return
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    if tname(t) != "en-noun":
      continue
    origt = unicode(t)
    # Only touch templates whose sole parameter (if any) is 1=.
    unexpected = None
    for param in t.params:
      if pname(param) != "1":
        unexpected = pname(param)
        break
    if unexpected is not None:
      pagemsg("Template has %s=, not touching: %s" % (unexpected, origt))
      continue
    plural = getparam(t, "1")
    if plural in [pagetitle + "s", "s"]:
      rmparam(t, "1")
      notes.append("remove redundant 1=%s from {{en-noun}}" % plural)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_adj_headt(t):
  """Convert a {{be-adj}} headword template to the new param convention:
  the head (falling back to the page title) moves to 1=, tr= is kept.

  Returns True if the template was processed (whether or not it changed),
  False if it was left untouched (already has 1=, or has params we don't
  recognize).  Relies on pagemsg/notes/pagetitle/belib from the enclosing
  scope.
  """
  origt = unicode(t)
  def getp(param):
    return getparam(t, param)
  tr = getp("tr")
  head = getp("head")
  if getp("1"):
    pagemsg("WARNING: Has 1=%s: %s" % (getp("1"), origt))
    # Fix: was a bare `return` (None); return False explicitly for
    # consistency with the unrecognized-param bail-out below (both are
    # falsy, so callers behave identically).
    return False
  must_continue = False
  for param in t.params:
    pn = pname(param)
    if pn not in ["head", "tr"]:
      pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" % (pn, unicode(param.value), origt))
      must_continue = True
      break
  if must_continue:
    return False
  del t.params[:]
  if not head:
    head = pagetitle
  if belib.needs_accents(head):
    pagemsg("WARNING: Head %s missing accents: %s" % (head, origt))
  t.add("1", head)
  if tr:
    t.add("tr", tr)
  if origt != unicode(t):
    notes.append("fix up {{be-adj}} to use new param convention")
    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return True
def process_page(page, index, parsed):
  """Convert {{uk-conj-manual}} to {{uk-conj-table}}: move 1= to aspect=,
  rename *_futr_* params to *_fut_*, and fold numbered variant params
  (foo2=, foo3=) into the base param foo= as comma-separated values."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  pagemsg("Processing")
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn == "uk-conj-manual":
      # 1= held the verb aspect; move it to a named aspect= param,
      # inserted before 1= so it keeps roughly the same position.
      aspect = getparam(t, "1")
      t.add("aspect", aspect, before="1", preserve_spacing=False)
      rmparam(t, "1")
      # Normalize future-tense param names to the new spelling.
      for param in t.params:
        pn = pname(param)
        if "_futr_" in pn:
          param.name = pn.replace("_futr_", "_fut_")
      # Gather variant params: all *2 params first, then all *3 params,
      # so merged values end up in variant order on the base param.
      to_fix = []
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value)
        if pn.endswith("2"):
          to_fix.append((pn, pv))
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value)
        if pn.endswith("3"):
          to_fix.append((pn, pv))
      for pn, pv in to_fix:
        # Placeholder values ("-" or em-dash) and blanks are simply dropped.
        if pv.strip() and pv.strip() not in ["-", u"—"]:
          existing = getparam(t, pn[:-1])
          if not existing:
            existing = pv
          else:
            # Append ", value" before any trailing whitespace so the
            # template's spacing/formatting is preserved.
            existing = re.sub(r"(\s*)$", r", %s\1" % pv.strip(), existing)
          t.add(pn[:-1], existing, preserve_spacing=False)
        # NOTE(review): the variant param is removed unconditionally, even
        # when its value was empty/placeholder — confirm against original.
        rmparam(t, pn)
      blib.set_template_name(t, "uk-conj-table")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{%s}} to {{uk-conj-table}}" % tn)
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Convert {{vi-hantu}} to {{vi-readings}}, moving the reading in 1=
  to reading= (or nom= when chu=Nom)."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  global args
  notes = []
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    if tname(t) != "vi-hantu":
      continue
    # Guard clauses: bail out of this template on anything unexpected.
    if not one_char(pagetitle):
      pagemsg("WARNING: Length of page title is %s > 1, skipping" % len(pagetitle))
      continue
    if getparam(t, "pos"):
      pagemsg("WARNING: Saw pos=, skipping: %s" % unicode(t))
      continue
    chu = getparam(t, "chu")
    if chu and chu != "Nom":
      pagemsg("WARNING: Saw chu=%s not 'Nom', skipping: %s" % (chu, unicode(t)))
      continue
    newparam = "nom" if chu == "Nom" else "reading"
    reading = blib.remove_links(getparam(t, "1"))
    if not reading:
      pagemsg("WARNING: Empty reading, skipping: %s" % unicode(t))
      continue
    leftovers = [p for p in t.params if pname(p) not in ["1", "rs", "chu"]]
    if leftovers:
      bad = leftovers[0]
      pagemsg(
        "WARNING: Unrecognized parameter %s=%s, skipping: %s"
        % (pname(bad), unicode(bad.value), unicode(t)))
      continue
    t.add(newparam, reading, before="1")
    rmparam(t, "1")
    blib.set_template_name(t, "vi-readings")
    notes.append("{{vi-hantu}} -> {{vi-readings}}")
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
  return unicode(parsed), notes
def insert_into_existing_pron_section(k):
  """Insert a new {{pl-p}} template at the top of the Pronunciation section
  in subsections[k], removing any existing rhymes/hyphenation/pl-IPA lines
  and folding a single existing {{audio|pl}} line into the new template.

  Uses subsections/notes/pagemsg/pronun_templates/construct_new_pron_template
  from the enclosing scope.  Returns True unless we bailed out on multiple
  or malformed {{audio}} templates (implicit None in those cases).
  """
  parsed = blib.parse_text(subsections[k])
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in pronun_templates:
      pagemsg("Already saw pronunciation template: %s" % unicode(t))
      break
  else: # no break
    new_pron_template, pron_prefix = construct_new_pron_template()
    # Remove existing rhymes/hyphenation/pl-IPA lines; {{pl-p}} subsumes them.
    for template in ["rhyme|pl", "rhymes|pl", "pl-IPA", "hyph|pl", "hyphenation|pl"]:
      re_template = template.replace("|", r"\|")
      regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
      m = re.search(regex, subsections[k], re.M)
      if m:
        pagemsg("Removed existing %s" % m.group(1).strip())
        notes.append("remove existing {{%s}}" % template)
        subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
    for template in ["audio|pl"]:
      re_template = template.replace("|", r"\|")
      regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
      all_audios = re.findall(regex, subsections[k], re.M)
      if len(all_audios) > 1:
        # Bug fix: was `all_audios()` — calling the list returned by
        # re.findall(), a guaranteed TypeError on this path.
        pagemsg("WARNING: Saw multiple {{audio}} templates, skipping: %s" %
          ",".join(x.strip() for x in all_audios))
        return
      if len(all_audios) == 1:
        # Bug fix: `audio_line` was referenced in the warnings below but
        # never defined (NameError); define it as the stripped audio line.
        audio_line = all_audios[0].strip()
        audiot = list(blib.parse_text(audio_line).filter_templates())[0]
        assert tname(audiot) == "audio"
        if getparam(audiot, "1") != "pl":
          pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % audio_line)
          return
        audiofile = getparam(audiot, "2")
        audiogloss = getparam(audiot, "3")
        for param in audiot.params:
          pn = pname(param)
          pv = unicode(param.value)
          if pn not in ["1", "2", "3"]:
            pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (
              pn, pv, audio_line))
            return
        # A gloss that is just "Audio"/"audio" carries no information.
        if audiogloss in ["Audio", "audio"]:
          audiogloss = ""
        params = "|a=%s" % audiofile
        if audiogloss:
          params += "|ac=%s" % audiogloss
        # Splice the audio params in just before the closing "}}".
        new_pron_template = new_pron_template[:-2] + params + new_pron_template[-2:]
        pagemsg("Removed existing %s in order to incorporate into {{pl-p}}" % audio_line)
        notes.append("incorporate existing {{%s}} into {{pl-p}}" % template)
        subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
    subsections[k] = pron_prefix + new_pron_template + "\n" + subsections[k]
    notes.append("insert %s into existing Pronunciation section" % new_pron_template)
  return True
def process_page(page, index, parsed):
  """Migrate Old English adjective headwords: {{head|ang|adjective}} becomes
  {{ang-adj}}, and existing {{ang-adj}} templates get their positional
  params shuffled to the new convention (1= cleared, old 1= -> 2=,
  old 4= -> 3=)."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  notes = []
  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "head" and getparam(t, "1") == "ang" and getparam(
        t, "2") in ["adjective", "adjectives"]:
      # Only convert when the template carries nothing but 1=, 2=, head=.
      for param in t.params:
        pn = pname(param)
        if pn not in ["1", "2", "head"]:
          pagemsg(
            "WARNING: head|ang|adjective with extra params: %s" % unicode(t))
          break
      else: # no break
        blib.set_template_name(t, "ang-adj")
        rmparam(t, "1")
        rmparam(t, "2")
        notes.append("convert {{head|ang|adjective}} into {{ang-adj}}")
    elif tn == "ang-adj":
      if getparam(t, "2"):
        # NOTE(review): blanks 1= rather than deleting it when 2= is
        # present — presumably so positional numbering survives; confirm.
        t.add("1", "")
        notes.append("remove unneeded 1= from {{ang-adj}}")
      else:
        # Old convention had the value in 1=; new convention wants it in 2=.
        param1 = getparam(t, "1")
        if param1:
          t.add("1", "")
          t.add("2", param1)
          notes.append("move 1= to 2= in {{ang-adj}}")
        param4 = getparam(t, "4")
        if param4:
          rmparam(t, "4")
          # Pad 1= and 2= with empty values so 3= lands in the right slot.
          if not getparam(t, "1"):
            t.add("1", "")
          if not getparam(t, "2"):
            t.add("2", "")
          t.add("3", param4)
          notes.append("move 4= to 3= in {{ang-adj}}")
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return parsed, notes
def process_page(page, index, parsed):
  """Migrate Old English verb headwords: {{head|ang|verb}} becomes
  {{ang-verb}} (head= moving to 1=), and existing {{ang-verb}} templates
  get head=/head2=/head3= re-added so the primary head sits in 1=."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  notes = []
  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "head" and getparam(t, "1") == "ang" and getparam(
        t, "2") in ["verb", "verbs"]:
      # Only convert when the template carries nothing but 1=, 2=, head=.
      for param in t.params:
        pn = pname(param)
        if pn not in ["1", "2", "head"]:
          pagemsg("WARNING: head|ang|verb with extra params: %s" % unicode(t))
          break
      else: # no break
        blib.set_template_name(t, "ang-verb")
        rmparam(t, "1")
        rmparam(t, "2")
        notes.append("convert {{head|ang|verb}} into {{ang-verb}}")
        head = getparam(t, "head")
        if head:
          t.add("1", head)
          rmparam(t, "head")
    elif tn == "ang-verb":
      # Remove then conditionally re-add the heads so 1=/head2=/head3=
      # end up in canonical order.
      head = getparam(t, "head")
      head2 = getparam(t, "head2")
      head3 = getparam(t, "head3")
      rmparam(t, "head")
      rmparam(t, "head2")
      rmparam(t, "head3")
      if head:
        t.add("1", head)
      if head2:
        t.add("head2", head2)
      if head3:
        t.add("head3", head3)
      # NOTE(review): this note is appended even when no head= was present
      # (template unchanged); confirm placement against the original.
      notes.append("move head= to 1= in {{ang-verb}}")
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return parsed, notes
def etym_section_is_movable(sectext, header):
  """Return True if an Arabic etymology section contains only content we
  know how to move: at least one {{inflection of|ar|...}} whose tag set is
  in split_recognized_tag_sets, no unrecognized tag sets mixed in, and no
  templates other than a small whitelist plus {{ar-verb-form}} forms ending
  in waw.  Uses pagemsg/split_recognized_tag_sets from the enclosing scope.
  """
  parsed = blib.parse_text(sectext)
  inflection_of_templates_with_unrecognized_tags = []
  saw_inflection_of_with_recognized_tag = False
  # Pass 1: classify every {{inflection of}} by whether its tag set is known.
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "inflection of":
      # Old-style templates use lang= with tags from 3=; new-style put the
      # language in 1= with tags from 4=.
      if getparam(t, "lang"):
        lang = getparam(t, "lang")
        first_tag_param = 3
      else:
        lang = getparam(t, "1")
        first_tag_param = 4
      if lang != "ar":
        pagemsg("WARNING: Non-Arabic language in Arabic {{inflection of}} in %s, skipping: %s" % (header, unicode(t)))
        return False
      tags = []
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value).strip()
        if re.search("^[0-9]+$", pn) and int(pn) >= first_tag_param:
          tags.append(pv)
      if tags not in split_recognized_tag_sets:
        inflection_of_templates_with_unrecognized_tags.append(unicode(t))
      else:
        saw_inflection_of_with_recognized_tag = True
  if not saw_inflection_of_with_recognized_tag:
    return False
  if inflection_of_templates_with_unrecognized_tags:
    pagemsg("WARNING: Unrecognized {{inflection of}} tag set mixed with recognized ones in %s, skipping: %s" % (header, " / ".join(inflection_of_templates_with_unrecognized_tags)))
    return False
  # Pass 2: make sure nothing else unexpected lives in the section.
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["also", "ar-root", "nonlemma", "ar-IPA"]:
      continue
    if tn == "ar-verb-form":
      form = getparam(t, "1")
      # Bug fix: the original condition
      #   not form.endswith(u"و") and form.endswith(u"وْ")
      # fired exactly when the form DID end in waw+sukun (such a form can
      # never end in bare waw), contradicting the warning text.  We want to
      # bail out when the form ends in neither bare waw nor waw+sukun.
      if not form.endswith((u"و", u"وْ")):
        pagemsg("WARNING: ar-verb-form form doesn't end with waw in %s with recognized {{inflection of}} tags, skipping: %s" % (header, unicode(t)))
        return False
      continue
    if tn != "inflection of":
      pagemsg("WARNING: Unrecognized template in %s with recognized {{inflection of}} tags, skipping: %s" % (header, unicode(t)))
      return False
  return True
def process_verb_headt(t):
  """Convert a {{be-verb}} headword template to the new param convention:
  head in 1=, aspect in 2=, keeping tr= and the pf=/impf= chains.
  Returns True on success, False (after logging) if the template carries
  params we don't recognize.  Uses pagemsg/notes/pagetitle/belib from the
  enclosing scope."""
  origt = unicode(t)
  def getp(param):
    return getparam(t, param)
  tr = getp("tr")
  # Two legacy layouts: head and aspect in 1=/2=, or head= with the
  # aspect in 1= (or a=).
  if getp("2"):
    head = getp("1")
    g = getp("2")
  else:
    head = getp("head")
    g = getp("1") or getp("a")
  pf = blib.fetch_param_chain(t, "pf", "pf")
  impf = blib.fetch_param_chain(t, "impf", "impf")
  must_continue = False
  for param in t.params:
    pn = pname(param)
    if pn not in ["head", "tr", "1", "a", "2", "pf", "pf2", "pf3", "impf", "impf2", "impf3"]:
      pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" % (pn, unicode(param.value), origt))
      must_continue = True
      break
  if must_continue:
    return False
  # Rebuild the template from scratch in canonical order.
  del t.params[:]
  if not head:
    head = pagetitle
  if belib.needs_accents(head):
    pagemsg("WARNING: Head %s missing accents: %s" % (head, origt))
  if not g:
    pagemsg("WARNING: No aspect in verb headword: %s" % origt)
    g = "?"
  t.add("1", head)
  if tr:
    t.add("tr", tr)
  t.add("2", g)
  blib.set_param_chain(t, pf, "pf", "pf")
  blib.set_param_chain(t, impf, "impf", "impf")
  if origt != unicode(t):
    notes.append("fix up {{be-verb}} to use new param convention")
    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return True
def process_text_on_page(index, pagetitle, text):
  """Scan Latin non-lemma form headword templates and warn about missing
  1= or extraneous params.  Read-only: always returns (None, None)."""
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  if not args.stdin:
    pagemsg("Processing")
  # Greatly speed things up when --stdin by ignoring non-Latin pages
  if "==Latin==" not in text:
    return None, None
  if not re.search("la-(noun|proper noun|pronoun|verb|adj|num|suffix)-form", text):
    return None, None
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  form_templates = [
    "la-noun-form", "la-proper noun-form", "la-pronoun-form", "la-verb-form",
    "la-adj-form", "la-num-form", "la-suffix-form"
  ]
  allowed = ["1", "g", "g2", "g3", "g4"]
  for t in blib.parse_text(secbody).filter_templates():
    if tname(t) not in form_templates:
      continue
    if not getparam(t, "1"):
      pagemsg("WARNING: Missing 1=: %s" % unicode(t))
    for param in t.params:
      pn = pname(param)
      if pn not in allowed:
        pagemsg("WARNING: Extraneous param %s=: %s" % (pn, unicode(t)))
  return None, None
def process_text_on_page(index, pagetitle, text):
  """Convert {{PIE root}} to {{root}}: insert ine-pro as 2=, shift each
  numbered root param up by one, and normalize each root to the *...-
  (leading asterisk, trailing hyphen) spelling."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn == "PIE root":
      if not getparam(t, "2"):
        pagemsg("WARNING: Something wrong, no 2=: %s" % unicode(t))
        continue
      blib.set_template_name(t, "root")
      # Rebuild the param list: roots (numeric params >= 2) get bumped up
      # one slot to make room for the ine-pro language code in 2=.
      newparams = []
      for param in t.params:
        pn = pname(param)
        if re.search("^[0-9]+$", pn) and int(pn) >= 2:
          if pn == "2":
            # The language code slips in just before the first root.
            newparams.append(("2", "ine-pro"))
          pv = unicode(param.value)
          # Normalize root spelling: *root-
          if not pv.startswith("*"):
            pv = "*" + pv
          if not pv.endswith("-"):
            pv = pv + "-"
          newparams.append((unicode(int(pn) + 1), pv))
        else:
          # Non-root params (1=, named params) pass through unchanged.
          newparams.append(
            (unicode(param.name), unicode(param.value)))
      del t.params[:]
      for name, value in newparams:
        t.add(name, value, preserve_spacing=False)
      notes.append("convert {{%s}} to {{root|...|ine-pro}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
  return unicode(parsed), notes
def replace_name_translit(m):
  """re.sub callback: rewrite a 'SOURCELANG TYPE {{...}}' match into a
  {{name translit}} template.  Returns the original matched text untouched
  whenever any sanity check fails (unknown source language, language-code
  mismatch, alt text, or unhandled params).  Uses pagemsg/blib/thislangcode
  from the enclosing scope.
  """
  origline = m.group(0)
  source_lang, name_type, template, period = m.groups()
  if source_lang not in blib.languages_byCanonicalName:
    pagemsg(
      "WARNING: Unrecognized source lang %s, can't parse: <from> %s <to> %s <end>"
      % (source_lang, origline, origline))
    return origline
  source_lang_code = blib.languages_byCanonicalName[source_lang][
    "code"]
  parsed = blib.parse_text(template)
  t = list(parsed.filter_templates())[0]
  lang = getparam(t, "1")
  name = getparam(t, "2")
  alt = getparam(t, "3")
  eq = blib.remove_links(getparam(t, "4"))
  if source_lang_code != lang:
    pagemsg(
      "WARNING: Source lang code %s for %s != template lang code %s, can't parse: <from> %s <to> %s <end>"
      % (source_lang_code, source_lang, lang, origline, origline))
    return origline
  if alt:
    pagemsg(
      "WARNING: Can't handle alt=%s in %s: <from> %s <to> %s <end>"
      % (alt, unicode(t), origline, origline))
    return origline
  for param in t.params:
    pn = pname(param)
    if pn not in ["1", "2", "3", "4", "sc"]:
      # Bug fix: the format string has five %s placeholders but only four
      # args were supplied, raising TypeError whenever this warning fired.
      # Supply unicode(t) for the "in %s" slot, matching the alt= warning.
      pagemsg(
        "WARNING: Can't handle %s=%s in %s: <from> %s <to> %s <end>"
        % (pn, unicode(param.value), unicode(t), origline, origline))
      return origline
  return "{{name translit|%s|%s|%s|type=%s%s}}%s" % (
    thislangcode, source_lang_code, name, name_type,
    "|eq=%s" % eq if eq else "", period)
def process_page(page, index, parsed):
  """Modernize Bulgarian noun-form pages in three passes:

  1. Replace {{bg-noun-form}} with {{head|bg|noun form}}, adding head=
     (with accents) and g= where possible.
  2. Convert {{bg-noun form of}} and language-specific 'definite singular
     of'-style templates to {{inflection of|bg|...}}, looking up the
     accented lemma/forms via snarf_noun_accents_and_forms() and copying
     the matching accented form into the preceding head template.
  3. Convert the remaining templates listed in template_to_infl_codes to
     {{inflection of}}.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  pagemsg("Processing")
  # --- Pass 1: {{bg-noun-form}} -> {{head|bg|noun form}} ---
  for t in parsed.filter_templates():
    if tname(t) == "bg-noun-form":
      origt = unicode(t)
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "head"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (
            pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      rmparam(t, "1")
      rmparam(t, "2")
      head = getparam(t, "head")
      rmparam(t, "head")
      g = getparam(t, "3")
      rmparam(t, "3")
      blib.set_template_name(t, "head")
      t.add("1", "bg")
      t.add("2", "noun form")
      if head:
        t.add("head", head)
      else:
        # Only default head= to the page title when it needs no accents
        # (i.e. is monosyllabic); otherwise we can't know the stress.
        if bglib.needs_accents(pagetitle):
          pagemsg("WARNING: Can't add head= to {{bg-noun-form}} missing it because pagetitle is multisyllabic: %s" % unicode(t))
        else:
          t.add("head", pagetitle)
      if g:
        t.add("g", g)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("replace {{bg-noun-form}} with {{head|bg|noun form}}")
  # --- Pass 2: inflection templates -> {{inflection of}}, and copy the
  # accented form back into the preceding head template. ---
  headt = None
  saw_infl_after_head = False
  saw_headt = False
  saw_inflt = False
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    saw_infl = False
    already_fetched_forms = False
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") == "noun form":
      saw_headt = True
      if headt and not saw_infl_after_head:
        pagemsg("WARNING: Saw two head templates %s and %s without intervening inflection" % (
          unicode(headt), origt))
      saw_infl_after_head = False
      headt = t
    if tn == "bg-noun form of":
      saw_inflt = True
      if not headt:
        pagemsg("WARNING: Saw {{bg-noun form of}} without head template: %s" % origt)
        continue
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "noun"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (
            pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      saw_infl_after_head = True
      noun = getparam(t, "noun")
      if not noun:
        pagemsg("WARNING: Didn't see noun=: %s" % origt)
        continue
      # Translate the old positional params into {{inflection of}} tags:
      # 2= definiteness, 3= subject/object, 1= number.
      infls = []
      param2 = getparam(t, "2")
      if param2 == "indefinite":
        infls.append("indef")
      elif param2 == "definite":
        infls.append("def")
      elif param2 == "vocative":
        infls.append("voc")
      elif param2:
        pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
        continue
      param3 = getparam(t, "3")
      if param3 == "subject":
        infls.append("sbjv")
      elif param3 == "object":
        infls.append("objv")
      elif param3:
        pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
        continue
      param1 = getparam(t, "1")
      if param1 == "singular":
        infls.append("s")
      elif param1 == "plural":
        infls.append("p")
      elif param1 == "count":
        infls.extend(["count", "form"])
      elif param1 == "vocative":
        infls.extend(["voc", "s"])
      else:
        pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
        continue
      blib.set_template_name(t, "inflection of")
      del t.params[:]
      t.add("1", "bg")
      # Fetch the accented lemma and its full form table.
      lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
      if not lemma:
        pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
        t.add("2", noun)
      else:
        t.add("2", lemma)
      t.add("3", "")
      for i, infl in enumerate(infls):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{bg-noun form of}} to {{inflection of}}")
      tn = tname(t)  # refresh: template was renamed above
      # NOTE(review): this assigns `saw_infls`, but the form-copying block
      # below tests `saw_infl` — so despite already_fetched_forms being set
      # here, that block never runs for this branch as written.  Looks
      # suspicious; confirm against the original script.
      saw_infls = infls_to_slot(infls)
      already_fetched_forms = True
      if not saw_infls:
        pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "inflection of" and getparam(t, "1") == "bg":
      saw_inflt = True
      # Collect the tag params (4=, 5=, ...) until the first gap.
      infls = []
      i = 4
      while True:
        infl = getparam(t, str(i))
        if not infl:
          break
        infls.append(infl)
        i += 1
      saw_infls = infls_to_slot(infls)
      if not saw_infls:
        if "vnoun" in infls:
          pagemsg("Skipping verbal noun inflection %s: %s" % ("|".join(infls), origt))
        elif "part" in infls:
          pagemsg("Skipping participle inflection %s: %s" % ("|".join(infls), origt))
        else:
          pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "definite singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_sg"
    elif tn == "indefinite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "ind_pl"
    elif tn == "definite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_pl"
    elif tn == "vocative singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "voc_sg"
    if saw_infl:
      # Accent the lemma in 2= and copy the matching accented form into
      # the preceding head template.
      if not already_fetched_forms:
        noun = getparam(t, "2")
        lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
        if not lemma:
          pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
          continue
        t.add("2", lemma)
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("replace lemma with accented %s in {{%s}}" % (lemma, tn))
      if saw_infl == "def_sg":
        # def_sg is only unambiguous when subject and object forms agree.
        def_sub_sg = forms.get("def_sub_sg", None)
        def_obj_sg = forms.get("def_obj_sg", None)
        if def_sub_sg != def_obj_sg:
          pagemsg("WARNING: Inflection is def_sg but def_sub_sg %s != def_obj_sg %s" % (
            def_sub_sg, def_obj_sg))
          continue
        form = def_sub_sg
      else:
        form = forms.get(saw_infl, None)
      if not form:
        pagemsg("WARNING: Inflection is %s but couldn't find form among forms: %s" % (saw_infl, format_forms(forms)))
        continue
      # Keep only form variants that match the page title once de-accented.
      form = form.split(",")
      filtered_form = [f for f in form if bglib.remove_accents(f) == pagetitle]
      if not filtered_form:
        pagemsg("WARNING: No forms among %s=%s match page title" % (saw_infl, ",".join(form)))
        continue
      form = filtered_form
      existing_form = blib.fetch_param_chain(headt, "head", "head")
      if existing_form:
        must_continue = False
        for f in existing_form:
          if bglib.remove_accents(f) != pagetitle:
            pagemsg("WARNING: Existing head %s doesn't match page title: %s" % (
              f, unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue
        needs_accents = [bglib.needs_accents(f) for f in existing_form]
        if any(needs_accents) and not all(needs_accents):
          pagemsg("WARNING: Some but not all existing heads missing accents: %s" %
            unicode(headt))
          continue
        if not any(needs_accents):
          # Heads are already fully accented; nothing to update, but warn
          # if they disagree with the newly computed forms.
          if existing_form != form:
            pagemsg("WARNING: For inflection %s, existing form(s) %s != new form(s) %s" % (
              saw_infl, ",".join(existing_form), ",".join(form)))
          continue
      origheadt = unicode(headt)
      blib.set_param_chain(headt, form, "head", "head")
      pagemsg("Replaced %s with %s" % (origheadt, unicode(headt)))
      notes.append("add accented form %s=%s to {{head|bg|noun form}}" % (saw_infl, ",".join(form)))
  if saw_headt and not saw_inflt:
    pagemsg("WARNING: Saw head template %s but no inflection template" % unicode(headt))
  # --- Pass 3: remaining single-purpose templates -> {{inflection of}} ---
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in template_to_infl_codes and getparam(t, "1") == "bg":
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (
            pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      infl_codes = template_to_infl_codes[tn]
      blib.set_template_name(t, "inflection of")
      t.add("3", "")
      for i, infl in enumerate(infl_codes):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{%s}} to {{inflection of}}" % tn)
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Clean up Spanish headword templates on one page.

  Depending on command-line flags:
  - args.remove_redundant_noun_args: in {{es-noun}}, replace explicitly
    written plurals / masculine/feminine equivalents (and their plurals)
    with '+' or '+<special>' indicators when they match what the default
    algorithm would generate.
  - args.make_multiword_plural_explicit: in {{es-noun}}, spell out the
    plural of multiword lemmas (and of m=/f= values containing spaces) by
    expanding the es-headword module.
  - Convert {{<old_adj_template>}} to the new {{es-adj}}/{{es-adj-inv}}
    format.

  Fixes applied relative to the original:
  - `f2 == "+"` was a no-op comparison; it is now the intended assignment
    `f2 = "+"` (replace a default feminine with '+'), parallel to the f
    handling just above.
  - The warning format string `"%pl=%s subset of default=%s"` used the
    invalid conversion `%p` (ValueError at runtime); it is now
    `"%spl=%s subset of default=%s"`, matching its three arguments.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  if old_adj_template not in text and "es-noun" not in text:
    return
  if ":" in pagetitle:
    pagemsg("Skipping non-mainspace title")
    return
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "es-noun" and args.remove_redundant_noun_args:
      origt = unicode(t)
      lemma = blib.remove_links(getparam(t, "head") or pagetitle)
      if not getparam(t, "2") and (getparam(t, "pl2") or getparam(t, "pl3")):
        pagemsg("WARNING: Saw pl2= or pl3= without 2=: %s" % unicode(t))
        continue
      g = getparam(t, "1")
      # Warn when a space in m=/f= would have triggered the old
      # (word-by-word) default pluralization algorithm.
      ms = blib.fetch_param_chain(t, "m", "m")
      space_in_m = False
      for m in ms:
        if " " in m:
          space_in_m = True
      mpls = blib.fetch_param_chain(t, "mpl", "mpl")
      if space_in_m and not mpls and not g.endswith("-p"):
        pagemsg("WARNING: Space in m=%s and old default noun algorithm applying" % ",".join(ms))
      fs = blib.fetch_param_chain(t, "f", "f")
      space_in_f = False
      for f in fs:
        if " " in f:
          space_in_f = True
      fpls = blib.fetch_param_chain(t, "fpl", "fpl")
      if space_in_f and not fpls and not g.endswith("-p"):
        pagemsg("WARNING: Space in f=%s and old default noun algorithm applying" % ",".join(fs))
      pls = blib.fetch_param_chain(t, "2", "pl")
      if not pls and not g.endswith("-p"):
        if " " in lemma:
          pagemsg("WARNING: Space in headword and old default noun algorithm applying")
        continue
      # Compute pls with default plurals collapsed to '+'.
      pls_with_def = []
      defpl = make_plural(lemma)
      if not defpl:
        continue
      if len(defpl) > 1:
        if set(pls) == set(defpl):
          pls_with_def = ["+"]
        elif set(pls) < set(defpl):
          pagemsg("WARNING: pls=%s subset of defpls=%s, replacing with default" % (
            ",".join(pls), ",".join(defpl)))
          pls_with_def = ["+"]
        else:
          pls_with_def = pls
      else:
        for pl in pls:
          if pl == defpl[0]:
            pls_with_def.append("+")
          else:
            pls_with_def.append(pl)
      # See whether the explicit plurals match one of the special
      # (multiword) pluralization modes.
      actual_special = None
      for special in all_specials:
        special_pl = make_plural(lemma, special)
        if special_pl is None:
          continue
        if len(special_pl) > 1 and set(pls) < set(special_pl):
          pagemsg("WARNING: for special=%s, pls=%s subset of special_pl=%s, allowing" % (
            special, ",".join(pls), ",".join(special_pl)))
          actual_special = special
          break
        if set(pls) == set(special_pl):
          pagemsg("Found special=%s with special_pl=%s" % (special, ",".join(special_pl)))
          actual_special = special
          break
      if pls_with_def == ["+"]:
        notes.append("remove redundant plural%s %s from {{es-noun}}" % (
          "s" if len(pls) > 1 else "", ",".join(pls)))
        blib.remove_param_chain(t, "2", "pl")
      elif actual_special:
        notes.append("replace plural%s %s with +%s in {{es-noun}}" % (
          "s" if len(pls) > 1 else "", ",".join(pls), actual_special))
        blib.set_param_chain(t, ["+" + actual_special], "2", "pl")
      elif pls_with_def != pls:
        notes.append("replace default plural %s with '+' in {{es-noun}}" % ",".join(defpl))
        blib.set_param_chain(t, pls_with_def, "2", "pl")

      def handle_mf(mf, mf_full, make_mf):
        # Collapse an explicit m=/f= chain (and its plurals) to '+' or
        # '+<special>' when they match what make_mf/make_plural generate.
        mfs = blib.fetch_param_chain(t, mf, mf)
        mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl")
        if mfs and not any(x.startswith("+") for x in mfs):
          defmf = make_mf(lemma)
          if set(mfs) == {defmf}:
            defpls = make_plural(defmf)
            ok = False
            if not mfpls or set(mfpls) == set(defpls):
              ok = True
            elif set(mfpls) < set(defpls):
              # Fix: was "%pl=%s ..." — '%p' is an invalid conversion and
              # raised ValueError whenever this warning fired.
              pagemsg("WARNING: %spl=%s subset of default=%s, allowing" % (
                mf, ",".join(mfpls), ",".join(defpls)))
              ok = True
            if ok:
              notes.append("replace %s=%s with '+' in {{es-noun}}" % (mf, ",".join(mfs)))
              blib.set_param_chain(t, ["+"], mf, mf)
              blib.remove_param_chain(t, mf + "pl", mf + "pl")
              return
          actual_special = None
          for special in all_specials:
            special_mf = make_mf(lemma, special)
            if special_mf is None:
              continue
            if mfs == [special_mf]:
              pagemsg("Found special=%s with special_mf=%s" % (special, special_mf))
              actual_special = special
              break
          if actual_special:
            if not mfpls:
              pagemsg("WARNING: Explicit %s=%s matches special=%s but no %s plural" % (
                mf, ",".join(mfs), actual_special, mf_full))
            else:
              special_mfpl = make_plural(special_mf, actual_special)
              if special_mfpl:
                if len(special_mfpl) > 1 and set(mfpls) < set(special_mfpl):
                  pagemsg("WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing" % (
                    mf, ",".join(mfs), actual_special, mf, ",".join(mfpls), mf, ",".join(special_mfpl)))
                elif set(mfpls) == set(special_mfpl):
                  pagemsg("Found %s=%s and special=%s, %spls=%s matches special_%spl" % (
                    mf, ",".join(mfs), actual_special, mf, ",".join(mfpls), mf))
                else:
                  pagemsg("WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s" % (
                    mf, ",".join(mfs), actual_special, mf, ",".join(mfpls), mf, ",".join(special_mfpl)))
                  actual_special = None
            if actual_special:
              notes.append("replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural" % (
                mf_full, ",".join(mfs), actual_special, mf_full))
              blib.set_param_chain(t, ["+%s" % actual_special], mf, mf)
              blib.remove_param_chain(t, mf + "pl", mf + "pl")
          if not actual_special:
            defmf = make_mf(lemma)
            mfs_with_def = ["+" if x == defmf else x for x in mfs]
            if mfs_with_def != mfs:
              notes.append("replace default %s %s with '+' in {{es-noun}}" % (mf_full, defmf))
              blib.set_param_chain(t, mfs_with_def, mf, mf)
            if mfpls:
              defpl = [x for y in mfs for x in (make_plural(y) or [])]
              ok = False
              if set(defpl) == set(mfpls):
                ok = True
              elif len(defpl) > 1 and set(mfpls) < set(defpl):
                pagemsg("WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing" % (
                  mf, ",".join(mfs), mf, ",".join(mfpls), ",".join(defpl)))
                ok = True
              if ok:
                pagemsg("Found %s=%s, %spl=%s matches default pl" % (
                  mf, ",".join(mfs), mf, ",".join(mfpls)))
                notes.append("remove redundant explicit %s plural %s in {{es-noun}}" % (
                  mf_full, ",".join(mfpls)))
                blib.remove_param_chain(t, mf + "pl", mf + "pl")
              else:
                for special in all_specials:
                  defpl = [x for y in mfs for x in (make_plural(y, special) or [])]
                  if set(defpl) == set(mfpls):
                    pagemsg("Found %s=%s, %spl=%s matches special=%s" % (
                      mf, ",".join(mfs), mf, ",".join(mfpls), special))
                    notes.append("replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}" % (
                      mf_full, ",".join(mfpls), special))
                    blib.set_param_chain(t, ["+%s" % special], mf + "pl", mf + "pl")

      handle_mf("f", "feminine", make_feminine)
      handle_mf("m", "masculine", make_masculine)
      if origt != unicode(t):
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      else:
        pagemsg("No changes to %s" % unicode(t))
    if tn == "es-noun" and args.make_multiword_plural_explicit:
      origt = unicode(t)
      lemma = blib.remove_links(getparam(t, "head") or pagetitle)
      def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
      # Spell out the plural of multiword lemmas by expanding the module.
      if " " in lemma and not getparam(t, "2"):
        g = getparam(t, "1")
        if not g.endswith("-p"):
          explicit_pl = expand_text(
            "{{#invoke:es-headword|make_plural_noun|%s|%s|true}}" % (lemma, g))
          if not explicit_pl:
            pagemsg("WARNING: Unable to add explicit plural to multiword noun, make_plural_noun returned an empty string")
            continue
          plurals = explicit_pl.split(",")
          blib.set_param_chain(t, plurals, "2", "pl")
          notes.append("add explicit plural to multiword noun")
      # Same for masculine equivalents containing spaces.
      ms = blib.fetch_param_chain(t, "m", "m")
      space_in_m = False
      for m in ms:
        if " " in m:
          space_in_m = True
      mpls = blib.fetch_param_chain(t, "mpl", "mpl")
      if space_in_m and not mpls:
        mpls = []
        for m in ms:
          explicit_pl = expand_text(
            "{{#invoke:es-headword|make_plural_noun|%s|m|true}}" % (blib.remove_links(m)))
          if not explicit_pl:
            pagemsg("WARNING: Unable to add explicit plural to m=%s, make_plural_noun returned an empty string" % m)
            continue
          this_mpls = explicit_pl.split(",")
          mpls.extend(this_mpls)
        blib.set_param_chain(t, mpls, "mpl", "mpl")
        notes.append("add explicit plural to m=%s" % ",".join(ms))
      # And for feminine equivalents containing spaces.
      fs = blib.fetch_param_chain(t, "f", "f")
      fpls = blib.fetch_param_chain(t, "fpl", "fpl")
      space_in_f = False
      for f in fs:
        if " " in f:
          space_in_f = True
      fpls = blib.fetch_param_chain(t, "fpl", "fpl")
      if space_in_f and not fpls:
        fpls = []
        for f in fs:
          explicit_pl = expand_text(
            "{{#invoke:es-headword|make_plural_noun|%s|f|true}}" % (blib.remove_links(f)))
          if not explicit_pl:
            pagemsg("WARNING: Unable to add explicit plural to f=%s, make_plural_noun returned an empty string" % f)
            continue
          this_fpls = explicit_pl.split(",")
          fpls.extend(this_fpls)
        blib.set_param_chain(t, fpls, "fpl", "fpl")
        notes.append("add explicit plural to f=%s" % ",".join(fs))
      if origt != unicode(t):
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    if tn == old_adj_template:
      origt = unicode(t)
      lemma = blib.remove_links(getparam(t, "head") or pagetitle)
      deff = make_feminine(pagetitle)
      defmpl = make_plural(pagetitle)
      # fs/mpls/fpls hold the values to write (defaults collapsed to '+',
      # lemma-equal values to '#'); full* keep the raw originals for the
      # special-mode and invariability checks below.
      fs = []
      fullfs = []
      f = getparam(t, "f") or pagetitle
      fullfs.append(f)
      if f == deff:
        f = "+"
      elif f == lemma:
        f = "#"
      fs.append(f)
      f2 = getparam(t, "f2")
      if f2:
        fullfs.append(f2)
        if f2 == deff:
          # Fix: was `f2 == "+"`, a no-op comparison; assign as intended,
          # mirroring the f handling above.
          f2 = "+"
        fs.append(f2)
      mpls = []
      mpl = getparam(t, "mpl") or getparam(t, "pl") or pagetitle + "s"
      mpls.append(mpl)
      mpl2 = getparam(t, "mpl2") or getparam(t, "pl2")
      if mpl2:
        mpls.append(mpl2)
      fullmpls = mpls
      # should really check for subsequence but it never occurs
      if set(mpls) == set(defmpl):
        mpls = ["+"]
      elif set(mpls) < set(defmpl):
        pagemsg("WARNING: mpls=%s subset of defmpl=%s, replacing with default" % (
          ",".join(mpls), ",".join(defmpl)))
        mpls = ["+"]
      mpls = ["#" if x == lemma else x for x in mpls]
      deffpl = [x for f in fullfs for x in make_plural(f)]
      fpls = []
      fpl = getparam(t, "fpl") or getparam(t, "pl") or (getparam(t, "f") or pagetitle) + "s"
      fpls.append(fpl)
      fpl2 = getparam(t, "fpl2") or getparam(t, "pl2")
      if fpl2:
        fpls.append(fpl2)
      fullfpls = fpls
      # should really check for subsequence but it never occurs
      if set(fpls) == set(deffpl):
        fpls = ["+"]
      elif set(fpls) < set(deffpl):
        pagemsg("WARNING: fpls=%s subset of deffpl=%s, replacing with default" % (
          ",".join(fpls), ",".join(deffpl)))
        fpls = ["+"]
      fpls = ["#" if x == lemma else x for x in fpls]
      # Do all feminine/plural forms match one special pluralization mode?
      actual_special = None
      for special in all_specials:
        deff = make_feminine(pagetitle, special)
        if deff is None:
          continue
        defmpl = make_plural(pagetitle, special)
        deffpl = make_plural(deff, special)
        deff = [deff]
        if fullfs == deff and fullmpls == defmpl and fullfpls == deffpl:
          actual_special = special
          break
      head = getparam(t, "head")
      must_continue = False
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value)
        if pn == "1" and pv in ["m", "mf"]:
          pagemsg("WARNING: Extraneous param %s=%s in %s, ignoring" % (pn, pv, unicode(t)))
          continue
        if pn not in ["head", "f", "f2", "pl", "pl2", "mpl", "mpl2", "fpl", "fpl2"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s in %s" % (pn, pv, unicode(t)))
          must_continue = True
          break
      if must_continue:
        continue
      del t.params[:]
      if head:
        t.add("head", head)
      if fullfs == [pagetitle] and fullmpls == [pagetitle] and fullfpls == [pagetitle]:
        # All forms identical to the lemma: the adjective is invariable.
        blib.set_template_name(t, "es-adj-inv")
      else:
        blib.set_template_name(t, "es-adj")
        if actual_special:
          t.add("sp", actual_special)
        else:
          if fs != ["+"]:
            blib.set_param_chain(t, fs, "f", "f")
          if mpls == fpls and ("+" not in mpls or defmpl == deffpl):
            # masc and fem pl the same
            if mpls != ["+"]:
              blib.set_param_chain(t, mpls, "pl", "pl")
          else:
            if mpls != ["+"]:
              blib.set_param_chain(t, mpls, "mpl", "mpl")
            if fpls != ["+"]:
              blib.set_param_chain(t, fpls, "fpl", "fpl")
      if origt != unicode(t):
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("convert {{%s}} to new {{%s}} format" % (old_adj_template, tname(t)))
      else:
        pagemsg("No changes to %s" % unicode(t))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Convert old-style {{it-verb}} parameters to the new packed-spec form.

  Rewrites {{it-verb|PRES|PAST|PP|aux=...|imperf=...|...}} into
  {{it-verb|AUX/PRES,PAST,PP.imperf:...}} with everything packed into 1=.
  Python 2 code (`unicode`).

  Args:
    index: numeric page index, used only for log prefixes.
    pagetitle: title of the page being processed.
    text: full wikitext of the page.

  Returns:
    (new_text, notes) where notes is a list of change descriptions, or
    None (early return) if the page contains no "it-verb" at all.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  # Cheap textual pre-filter before doing a full parse.
  if "it-verb" not in text:
    return
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    def getp(param):
      return getparam(t, param)
    if tn in ["it-verb"]:
      pagemsg("Saw %s" % unicode(t))
      # No 1= means the template is already in (or close to) the new format.
      if not getp("1"):
        continue
      parts = []
      # Auxiliary goes first, abbreviated: avere -> a, essere -> e.
      aux = getp("aux") or "avere"
      split_aux_with_footnotes = split_with_footnotes(aux)
      split_aux_with_footnotes = [re.sub("^avere", "a", x) for x in split_aux_with_footnotes]
      split_aux_with_footnotes = [re.sub("^essere", "e", x) for x in split_aux_with_footnotes]
      parts.append(":".join(split_aux_with_footnotes) + "/")
      # 1=, 2=, 3= are the principal parts, comma-separated in the new spec;
      # multiple values within one part are colon-separated.
      parts.append(":".join(split_with_footnotes(getp("1"))))
      arg2 = getp("2")
      arg3 = getp("3")
      if arg2 or arg3:
        # Emit the 2= slot (possibly empty) so that 3= lands in the right
        # comma-separated position.
        parts.append("," + ":".join(split_with_footnotes(arg2)))
        if arg3:
          parts.append("," + ":".join(split_with_footnotes(arg3)))
      # Irregular-stem params become dotted properties, e.g. ".fut:FOO".
      irregparams = ["imperf", "fut", "sub", "impsub", "imp"]
      for irregparam in irregparams:
        arg = getp(irregparam)
        if arg:
          parts.append("." + irregparam + ":" + ":".join(split_with_footnotes(arg)))
      # impers= maps to the .only3s indicator; only3sp= keeps its name.
      if getp("impers"):
        parts.append(".only3s")
      if getp("only3sp"):
        parts.append(".only3sp")
      # Any unhandled parameter: warn and leave this template untouched.
      must_continue = False
      for param in t.params:
        pn = pname(param)
        if pn not in ["1", "2", "3", "aux", "impers", "only3sp"] and pn not in irregparams:
          pagemsg("WARNING: Unrecognized param %s=%s" % (pn, unicode(param.value)))
          must_continue = True
          break
      if must_continue:
        continue
      # Replace all old params with the single packed spec.
      del t.params[:]
      t.add("1", "".join(parts))
      notes.append("convert {{it-verb}} params to new form")
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Merge Polish pronunciation-section templates into a single {{pl-p}}.

  Converts {{pl-IPA}}/{{pl-IPA-auto}} plus accompanying {{audio}},
  {{rhyme(s)}}, {{hyph(enation)}} and {{homophone(s)}}/{{hmp}} lines into
  one {{pl-p}} call, replacing the whole Pronunciation subsection body.
  Skips (with a warning) any section it cannot fully account for.
  Python 2 code (`unicode`, `xrange`).

  Args:
    index: numeric page index for log prefixes.
    pagetitle: page title.
    text: full page wikitext.

  Returns:
    (new_text, notes), or None (early return) when the page is skipped
    or no Polish section is found.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  def verify_template_is_full_line(tn, line):
    # Return the template object if `line` consists of exactly one template
    # whose name is in `tn` (a name or list of names) and nothing else;
    # otherwise warn and return None.
    line = line.strip()
    templates = list(blib.parse_text(line).filter_templates())
    if type(tn) is list:
      tns = tn
    else:
      tns = [tn]
    tntext = "/".join(tns)
    if len(templates) == 0:
      pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" % (tntext, line))
      return None
    t = templates[0]
    if tname(t) not in tns:
      pagemsg("WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s" % (tntext, tntext, line))
      return None
    if unicode(t) != line:
      pagemsg("WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s" % (tntext, tntext, line))
      return None
    return t
  notes = []
  # Single letters and prefixes get nonstandard pronunciation sections.
  if len(pagetitle) == 1 or pagetitle.endswith("-"):
    pagemsg("Page title is a single letter or a prefix, skipping")
    return
  retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "Polish", pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  # Split into alternating [before, header, body, header, body, ...];
  # odd indices are the ==...== headers.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  for k in xrange(1, len(subsections), 2):
    if re.search(r"==\s*Pronunciation\s*==", subsections[k]):
      secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation", subsections[k])
      if secheader != subsections[k]:
        subsections[k] = secheader
        notes.append("remove extraneous spaces in ==Pronunciation== header")
      extra_notes = []
      parsed = blib.parse_text(subsections[k + 1])
      num_pl_IPA = 0
      saw_pl_p = False
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["pl-p", "pl-pronunciation"]:
          saw_pl_p = True
          break
        if tn in ["pl-IPA", "pl-IPA-auto"]:
          num_pl_IPA += 1
      if saw_pl_p:
        # Section already converted; `t` here is the loop leak from above.
        pagemsg("Already saw {{pl-p}}, skipping: %s" % unicode(t))
        continue
      # Only handle the simple case of exactly one {{pl-IPA}}.
      if num_pl_IPA == 0:
        pagemsg("WARNING: Didn't see {{pl-IPA}} in Pronunciation section, skipping")
        continue
      if num_pl_IPA > 1:
        pagemsg("WARNING: Saw multiple {{pl-IPA}} in Pronunciation section, skipping")
        continue
      lines = subsections[k + 1].strip().split("\n")
      # Remove blank lines.
      lines = [line for line in lines if line]
      hyph_lines = []
      homophone_lines = []
      rhyme_lines = []
      audio_lines = []
      must_continue = False
      newtemp = None
      next_audio_param = 0
      has_respelling = False
      ipat = None
      # Pass 1: classify every line of the Pronunciation section. Anything
      # unrecognized aborts the conversion of this section.
      for line in lines:
        origline = line
        # In case of "* {{pl-IPA|...}}", chop off the "* ".
        line = re.sub(r"^\*\s*(\{\{pl-IPA)", r"\1", line)
        if line.startswith("{{pl-IPA"):
          if newtemp:
            pagemsg("WARNING: Something wrong, already saw {{pl-IPA}}?: %s" % origline)
            must_continue = True
            break
          ipat = verify_template_is_full_line(["pl-IPA", "pl-IPA-auto"], line)
          if ipat is None:
            must_continue = True
            break
          # Seed the replacement {{pl-p}} from the {{pl-IPA}} params:
          # numbered params (respellings) copy over; qualN= becomes qN=.
          newtemp_str = "{{pl-p}}"
          newtemp = list(blib.parse_text(newtemp_str).filter_templates())[0]
          for param in ipat.params:
            pn = pname(param)
            pv = unicode(param.value)
            if re.search("^[0-9]+$", pn):
              has_respelling = True
              newtemp.add(pn, pv, preserve_spacing=False)
            elif re.search("^qual[0-9]*$", pn):
              newtemp.add(pn.replace("qual", "q"), pv, preserve_spacing=False)
            else:
              pagemsg("WARNING: Unrecognized param %s=%s in {{pl-IPA}}, skipping: %s" % (pn, pv, origline))
              must_continue = True
              break
          if has_respelling:
            pagemsg("WARNING: {{pl-IPA}} has respelling: %s" % unicode(ipat))
          if must_continue:
            break
          continue
        if not line.startswith("* ") and not line.startswith("*{"):
          pagemsg("WARNING: Pronunciation section line doesn't start with '* ', skipping: %s" % origline)
          must_continue = True
          break
        if line.startswith("* "):
          line = line[2:]
        else:
          line = line[1:]
        if line.startswith("{{hyph"):
          hyph_lines.append(line)
        elif line.startswith("{{homophone") or line.startswith("{{hmp"):
          homophone_lines.append(line)
        elif line.startswith("{{audio"):
          audio_lines.append(line)
        elif line.startswith("{{rhyme"):
          rhyme_lines.append(line)
        else:
          pagemsg("WARNING: Unrecognized Pronunciation section line, skipping: %s" % origline)
          must_continue = True
          break
      if has_respelling and (rhyme_lines or hyph_lines):
        rhyme_hyph = []
        if rhyme_lines:
          rhyme_hyph.append("rhyme line(s) %s" % ",".join(rhyme_lines))
        if hyph_lines:
          rhyme_hyph.append("hyphenation line(s) %s" % ",".join(hyph_lines))
        # We formerly skipped these pages, but [[User:Vininn126]] requested
        # running the bot on them.
        pagemsg("WARNING: Has respelling %s along with %s" % (ipat and unicode(ipat) or "UNKNOWN", " and ".join(rhyme_hyph)))
        #continue
      if must_continue:
        continue
      # Pass 2: fold audio lines into the new {{pl-p}} as a=/a2=... and
      # ac=/ac2=... (caption) params.
      if audio_lines:
        must_continue = False
        for audio_line in audio_lines:
          audiot = verify_template_is_full_line("audio", audio_line)
          if audiot is None:
            must_continue = True
            break
          if getparam(audiot, "1") != "pl":
            pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % audio_line)
            must_continue = True
            break
          audiofile = getparam(audiot, "2")
          audiogloss = getparam(audiot, "3")
          for param in audiot.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pn not in ["1", "2", "3"]:
              pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (pn, pv, audio_line))
              must_continue = True
              break
          if must_continue:
            break
          # A generic "Audio" caption carries no information; drop it.
          if audiogloss in ["Audio", "audio"]:
            audiogloss = ""
          if not newtemp:
            pagemsg("WARNING: Saw %s without {{pl-IPA}}, skipping: %s" % (unicode(audiot), audio_line))
            must_continue = True
            break
          next_audio_param += 1
          if next_audio_param == 1:
            paramsuf = ""
          else:
            paramsuf = str(next_audio_param)
          newtemp.add("a%s" % paramsuf, audiofile, preserve_spacing=False)
          if audiogloss:
            newtemp.add("ac%s" % paramsuf, audiogloss, preserve_spacing=False)
          pagemsg("Replacing %s with %s" % (unicode(audiot), unicode(newtemp)))
          extra_notes.append("incorporate %s into {{pl-p}}" % unicode(audiot))
        if must_continue:
          continue
      # Rhyme and hyphenation templates are simply dropped ({{pl-p}}
      # auto-generates them), after sanity-checking the language code.
      if rhyme_lines:
        if len(rhyme_lines) > 1:
          pagemsg("WARNING: Multiple rhyme lines, not removing: %s" % ", ".join(rhyme_lines))
          continue
        rhyme_line = rhyme_lines[0]
        rhymet = verify_template_is_full_line(["rhyme", "rhymes"], rhyme_line)
        if not rhymet:
          continue
        if getparam(rhymet, "1") != "pl":
          pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(rhymet), rhyme_line))
          continue
        pagemsg("Ignoring rhyme line: %s" % rhyme_line)
        extra_notes.append("remove rhyme template %s" % unicode(rhymet))
      if hyph_lines:
        if len(hyph_lines) > 1:
          pagemsg("WARNING: Multiple hyphenation lines, not removing: %s" % ", ".join(hyph_lines))
          continue
        hyph_line = hyph_lines[0]
        hypht = verify_template_is_full_line(["hyph", "hyphenation"], hyph_line)
        if not hypht:
          continue
        if getparam(hypht, "1") != "pl":
          pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hypht), hyph_line))
          continue
        pagemsg("Ignoring hyphenation line: %s" % hyph_line)
        extra_notes.append("remove hyphenation template %s" % unicode(hypht))
      # Fold homophones into hh= (comma-separated) plus hhpN= qualifiers.
      if homophone_lines:
        next_homophone_param = 0
        must_continue = False
        for homophone_line in homophone_lines:
          homophones = {}
          homophone_qualifiers = {}
          hmpt = verify_template_is_full_line(["hmp", "homophone", "homophones"], homophone_line)
          if not hmpt:
            must_continue = True
            break
          if getparam(hmpt, "1") != "pl":
            pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hmpt), homophone_line))
            must_continue = True
            break
          for param in hmpt.params:
            pn = pname(param)
            pv = unicode(param.value)
            if not re.search("^q?[0-9]+$", pn):
              pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" % (pn, pv, tname(hmpt), homophone_line))
              must_continue = True
              break
            if pn.startswith("q"):
              homophone_qualifiers[int(pn[1:])] = pv
            elif int(pn) > 1:
              # 1= is the language code; 2=, 3=, ... are homophones,
              # re-indexed from 1.
              homophones[int(pn) - 1] = pv
          if must_continue:
            break
          if not newtemp:
            pagemsg("WARNING: Something wrong, saw %s without {{pl-IPA}}, skipping" % unicode(hmpt))
            must_continue = True
            break
          hhs = []
          hhp_args = []
          for pn, pv in sorted(homophones.items()):
            next_homophone_param += 1
            hmp_param = "" if next_homophone_param == 1 else str(next_homophone_param)
            hhs.append(pv)
            if pn in homophone_qualifiers:
              hhp_args.append(("hhp%s" % hmp_param, homophone_qualifiers[pn]))
          if hhs:
            newtemp.add("hh", ",".join(hhs))
            for pn, pv in hhp_args:
              newtemp.add(pn, pv, preserve_spacing=False)
            pagemsg("Replacing %s with %s" % (unicode(hmpt), unicode(newtemp)))
            extra_notes.append("incorporate homophones into {{pl-p}}")
        if must_continue:
          continue
      pagemsg("Replaced %s with %s" % (unicode(ipat), unicode(newtemp)))
      # The whole subsection body becomes the single {{pl-p}} line.
      all_lines = "\n".join([unicode(newtemp)])
      newsubsec = "%s\n\n" % all_lines
      if subsections[k + 1] != newsubsec:
        this_notes = ["convert {{pl-IPA}} to {{pl-p}}"] + extra_notes
        notes.extend(this_notes)
        subsections[k + 1] = newsubsec
  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
def process_text_on_page(index, pagetitle, text):
  """Convert {{autocat}} and {{ja,ryu-readingcat}} templates to {{auto cat}}.

  For reading-category templates, the language name, kanji and reading are
  re-derived from the category page title and cross-checked against the
  template's own arguments; on any mismatch the template is left untouched
  with a warning.  Python 2 code (`unicode`).

  Args:
    index: numeric page index, used only for log prefixes.
    pagetitle: page title (a "Category:..." title for reading categories).
    text: full page wikitext.

  Returns:
    (new_text, notes) where notes is a list of change descriptions.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  global args
  def expand_text(tempcall):
    # Unused here; kept for parity with sibling process_text_on_page defs.
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  notes = []
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn == "autocat":
      blib.set_template_name(t, "auto cat")
      notes.append("{{autocat}} -> {{auto cat}}")
    elif tn in ["ja-readingcat", "ryu-readingcat"]:
      m = re.search("^Category:(Japanese|Okinawan) terms spelled with (.*?) read as (.*)$", pagetitle)
      if not m:
        pagemsg("WARNING: Can't parse page title")
        continue
      langname, kanji, reading = m.groups()
      if langname == "Japanese":
        auto_lang = "ja"
      else:
        auto_lang = "ryu"
      # Template prefix (e.g. "ja" of "ja-readingcat") must agree with the
      # language named in the page title.
      t_lang = re.sub("-.*", "", tn)
      if t_lang != auto_lang:
        pagemsg("WARNING: Auto-determined lang code %s for language name %s != template specified %s: %s" % (auto_lang, langname, t_lang, unicode(t)))
        continue
      t_kanji = getparam(t, "1").strip()
      t_reading = getparam(t, "2").strip()
      if t_kanji != kanji:
        pagemsg("WARNING: Auto-determined kanji %s != template specified %s: %s" % (kanji, t_kanji, unicode(t)))
        continue
      if t_reading != reading:
        pagemsg("WARNING: Auto-determined reading %s != template specified %s: %s" % (reading, t_reading, unicode(t)))
        continue
      # Collect reading types (3=, 4=, ...); 1=/2= are kanji/reading and are
      # derivable from the page title, so they're dropped.
      numbered_params = []
      must_continue = False
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value)
        if pn in ["1", "2"]:
          pass
        elif re.search("^[0-9]+$", pn):
          numbered_params.append(pv)
        else:
          pagemsg("WARNING: Saw unknown non-numeric param %s=%s, skipping: %s" % (pn, pv, unicode(t)))
          must_continue = True
          break
      if must_continue:
        continue
      if len(numbered_params) == 0:
        pagemsg("WARNING: No reading types given, skipping: %s" % unicode(t))
        continue
      blib.set_template_name(t, "auto cat")
      del t.params[:]
      # BUG FIX: the loop variable was previously named `index`, shadowing
      # the function's `index` parameter that the pagemsg() closure reads,
      # so every log line emitted after this loop reported a bogus page
      # index.  Renamed to `i`.
      for i, numbered_param in enumerate(numbered_params):
        t.add(str(i + 1), numbered_param, preserve_spacing=False)
      notes.append("convert {{%s}} to {{auto cat}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Replace old German verb headword templates with new-style {{de-verb}}.

  Pairs each headword template ({{de-verb-old}}, {{de-verb-strong}},
  {{de-verb-weak}} or {{head|de|verb}}) with the following {{de-conj}},
  verifies that the old headword's principal parts / auxiliary / class
  agree with what the conjugation module generates, and if so rewrites the
  headword as {{de-verb}} (optionally copying {{de-conj}}'s 1= spec).
  Python 2 code (`unicode`).

  Args:
    index: numeric page index for log prefixes.
    pagetitle: page title.
    text: full page wikitext.

  Returns:
    (new_text, notes), or None (early return) on structural problems
    (headword without conj, unknown params, etc.).
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  global args
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  notes = []
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  headt = None
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in ["de-verb-old", "de-verb-strong", "de-verb-weak"] or tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") == "verb":
      if headt:
        pagemsg("WARNING: Encountered headword twice without declension: old %s, current %s" % (unicode(headt), unicode(t)))
        return
      # Remember the headword; it is processed when its {{de-conj}} arrives.
      headt = t
      headtn = tn
    if tn == "de-conj":
      if not headt:
        pagemsg("WARNING: Encountered conj without headword: %s" % unicode(t))
        return
      # 4= holding just the auxiliary (h/haben/s/sein) is tolerated even
      # when not in the allowed-params list for the headword type.
      param4_ignorable = False
      if getparam(headt, "4") in ["h", "haben", "s", "sein"]:
        param4_ignorable = True
      for param in headt.params:
        pn = pname(param)
        pv = unicode(param.value)
        if not pv:
          continue
        if headtn == "head":
          allowed_params = ["1", "2", "head"]
        elif headtn == "de-verb-weak":
          allowed_params = ["1", "2", "3", "auxiliary", "cat"]
        elif headtn == "de-verb-strong":
          allowed_params = ["1", "2", "3", "class", "class 2", "pres 2", "pres 2 qual", "past 2", "past 2 qual", "past participle 2", "past participle 2 qual", "past subjunctive", "past subjunctive 2", "past subjunctive 2 qual", "auxiliary", "cat"]
        else:
          # de-verb-old: only head= is recognized.
          allowed_params = ["head"]
        if param4_ignorable:
          allowed_params.append("4")
        if pn not in allowed_params:
          pagemsg("WARNING: Encountered unknown param %s=%s in %s" % (pn, pv, unicode(headt)))
          return
      def canonicalize_existing(forms):
        # Normalize "A or B" / comma-separated alternants into a flat list
        # of plain (delinked) forms, dropping empties.
        forms = [re.sub(" '*or'* ", ",", form) for form in forms]
        forms = [splitform for form in forms for splitform in form.split(",")]
        return [blib.remove_links(form) for form in forms if form]
      def compare(old, new, entities_compared):
        # True if old forms are empty (nothing to check) or set-equal to new.
        if not old:
          return True
        if set(old) != set(new):
          pagemsg("WARNING: Old %s %s disagree with new %s %s: head=%s, decl=%s" % (entities_compared, ",".join(old), entities_compared, ",".join(new), unicode(headt), unicode(t)))
          return False
        return True
      def fetch_aux():
        # Determine the old headword's auxiliary list from auxiliary= or,
        # failing that, from 4=; None signals an unrecognized value.
        aux = getparam(headt, "auxiliary")
        if aux in ["haben", "sein"]:
          aux = [aux]
        elif aux == "both":
          aux = ["haben", "sein"]
        elif not aux:
          aux = []
        else:
          pagemsg("WARNING: Unrecognized auxiliary=%s, skipping: %s" % (aux, unicode(headt)))
          return None
        if not aux:
          param4 = getparam(headt, "4")
          if param4 in ["h", "haben"]:
            aux = ["haben"]
          elif param4 in ["s", "sein"]:
            aux = ["sein"]
        return aux
      if headtn == "de-verb-weak":
        # Expand the conj template through the verifier module to get the
        # forms {{de-conj}} would generate.
        generate_template = re.sub(r"^\{\{de-conj(?=[|}])", "{{User:Benwing2/de-generate-verb-props", unicode(t))
        result = expand_text(generate_template)
        if not result:
          continue
        forms = blib.split_generate_args(result)
        pres_3s = canonicalize_existing([getparam(headt, "1")])
        past = canonicalize_existing([getparam(headt, "2")])
        pp = canonicalize_existing([getparam(headt, "3")])
        aux = fetch_aux()
        if aux is None:
          return
        if (not compare(pres_3s, forms.get("pres_3s", "-").split(","), "pres 3sgs")
            or not compare(past, forms.get("pret_3s", "-").split(","), "pasts")
            or not compare(pp, forms.get("perf_part", "-").split(","), "pp's")
            or not compare(aux, forms.get("aux", "-").split(","), "auxes")):
          # Mismatch: leave this headword alone and move on.
          headt = None
          continue
      if headtn == "de-verb-strong":
        generate_template = re.sub(r"^\{\{de-conj(?=[|}])", "{{User:Benwing2/de-generate-verb-props", unicode(t))
        result = expand_text(generate_template)
        if not result:
          continue
        forms = blib.split_generate_args(result)
        # Strong verbs may carry secondary variants ("... 2" params).
        pres_3s = canonicalize_existing([getparam(headt, "1"), getparam(headt, "pres 2")])
        past = canonicalize_existing([getparam(headt, "2"), getparam(headt, "past 2")])
        pp = canonicalize_existing([getparam(headt, "3"), getparam(headt, "past participle 2")])
        past_subj = canonicalize_existing([getparam(headt, "past subjunctive"), getparam(headt, "past subjunctive 2")])
        clazz = canonicalize_existing([getparam(headt, "class"), getparam(headt, "class 2")])
        aux = fetch_aux()
        if aux is None:
          return
        if (not compare(pres_3s, forms.get("pres_3s", "-").split(","), "pres 3sgs")
            or not compare(past, forms.get("pret_3s", "-").split(","), "pasts")
            or not compare(pp, forms.get("perf_part", "-").split(","), "pp's")
            or not compare(past_subj, forms.get("subii_3s", "-").split(","), "past subjs")
            or not compare(aux, forms.get("aux", "-").split(","), "auxes")
            or not compare(clazz, forms.get("class", "-").split(","), "classes")):
          headt = None
          continue
      # All checks passed: rewrite the headword as {{de-verb}} (with the
      # conj's 1= spec if present).
      del headt.params[:]
      blib.set_template_name(headt, "de-verb")
      arg1 = getparam(t, "1")
      if arg1:
        headt.add("1", arg1)
      notes.append("replace {{%s|...}} with new-style {{de-verb%s}}" % (headtn == "head" and "head|de|verb" or headtn, (arg1 and "|" + arg1 or "")))
      headt = None
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Convert {{autocat}} and {{*fix cat}} category templates to {{auto cat}}.

  For {{prefix cat}}/{{suffix cat}}/{{circumfix cat}}/{{infix cat}}/
  {{interfix cat}} on "Category:LANG POS AFFIXed with TERM" pages, every
  template argument (language code, term, alt form, pos, id, sort key) is
  re-derived from the page title and cross-checked; only arguments that
  carry extra information (alt, tr, sort, sc) are kept on the resulting
  {{auto cat}}.  Mismatches leave the template untouched with a warning.
  Python 2 code (`unicode`).

  Args:
    index: numeric page index for log prefixes.
    pagetitle: category page title.
    text: full page wikitext.

  Returns:
    (new_text, notes) where notes is a list of change descriptions.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  global args
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  notes = []
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn == "autocat":
      blib.set_template_name(t, "auto cat")
      notes.append("{{autocat}} -> {{auto cat}}")
    elif tn in ["prefix cat", "suffix cat", "circumfix cat", "infix cat", "interfix cat"]:
      m = re.search("^Category:(.*) ([a-z]+) ([a-z]+fix)ed with (.*)$", pagetitle)
      if not m:
        pagemsg("WARNING: Can't parse page title")
        continue
      langname, pos, affixtype, term_and_id = m.groups()
      # The title may end in "TERM (ID)" to disambiguate senses.
      m = re.search(r"^(.*?) \((.*)\)$", term_and_id)
      if m:
        term, id = m.groups()
      else:
        term, id = term_and_id, ""
      t_lang = getparam(t, "1")
      t_term = getparam(t, "2")
      t_alt = getparam(t, "3")
      t_pos = getparam(t, "pos")
      t_id = getparam(t, "id")
      t_tr = getparam(t, "tr")
      t_sort = getparam(t, "sort")
      t_sc = getparam(t, "sc")
      if langname not in blib.languages_byCanonicalName:
        pagemsg("WARNING: Unrecognized language name: %s" % langname)
        continue
      if blib.languages_byCanonicalName[langname]["code"] != t_lang:
        pagemsg("WARNING: Auto-determined code %s for language name %s != manually specified %s" % (blib.languages_byCanonicalName[langname]["code"], langname, t_lang))
        continue
      # tn[:-4] strips " cat", leaving the affix type to match the title's.
      if tn[:-4] != affixtype:
        pagemsg("WARNING: Auto-determined affix type %s != manually specified %s" % (affixtype, tn[:-4]))
        continue
      def add_missing_hyphens(alt):
        # Ensure `alt` carries the same leading/trailing hyphen(s) as the
        # title-derived `term` does for this affix type; a leading "*"
        # (reconstruction marker) is preserved in front.
        hyph_c = "([" + possible_hyphens + "])"
        m = re.search(r"^(\*)(.*)$", alt)
        if m:
          althyp, altbase = m.groups()
        else:
          althyp, altbase = "", alt
        m = re.search(r"^(\*)(.*)$", term)
        if m:
          termhyp, termbase = m.groups()
        else:
          termhyp, termbase = "", term
        if affixtype == "suffix":
          m = re.search("^" + hyph_c, termbase)
          if m:
            initial_hyphen = m.group(1)
            if not altbase.startswith(initial_hyphen):
              alt = althyp + initial_hyphen + altbase
        elif affixtype == "prefix":
          m = re.search(hyph_c + "$", termbase)
          if m:
            final_hyphen = m.group(1)
            if not altbase.endswith(final_hyphen):
              alt = althyp + altbase + final_hyphen
        elif affixtype in ["infix", "interfix"]:
          m = re.search("^" + hyph_c + ".*" + hyph_c + "$", termbase)
          if m:
            initial_hyphen, final_hyphen = m.groups()
            if not altbase.startswith(initial_hyphen):
              altbase = initial_hyphen + altbase
            if not altbase.endswith(final_hyphen):
              altbase = altbase + final_hyphen
            alt = althyp + altbase
        return alt
      orig_t_term = t_term
      t_term = add_missing_hyphens(t_term)
      already_checked_t_alt = False
      if t_term != term:
        # The template's term may differ only by diacritics; check that its
        # entry name (diacritics stripped by the language module) matches.
        manual_entry_name = expand_text("{{#invoke:languages/templates|makeEntryName|%s|%s}}" % (t_lang, t_term))
        if manual_entry_name != term:
          pagemsg("WARNING: Can't match manually specified term %s (originally %s, entry name %s) to auto-determined term %s" % (t_term, orig_t_term, manual_entry_name, term))
          continue
        if t_alt:
          pagemsg("WARNING: Manually specified term %s has extra diacritics and alt=%s also specified, skipping" % (t_term, t_alt))
          continue
        # Diacriticked term becomes the alt form on the new template.
        t_alt = t_term
        already_checked_t_alt = True
      if t_id != id:
        pagemsg("WARNING: Auto-determined ID %s != manually specified %s" % (id, t_id))
        continue
      # pos= may be singular, plural or (for "words") omitted entirely;
      # also tolerate -x/-xes pluralization (e.g. suffix/suffixes).
      if (pos == "words" and t_pos not in ["", "word", "words"] or pos != "words" and t_pos != pos and t_pos + "s" != pos and (not t_pos.endswith("x") or t_pos + "es" != pos)):
        pagemsg("WARNING: Auto-determined pos %s doesn't match manually specified %s" % (pos, t_pos))
        continue
      if t_alt and not already_checked_t_alt:
        orig_t_alt = t_alt
        t_alt = add_missing_hyphens(t_alt)
        manual_entry_name = expand_text("{{#invoke:languages/templates|makeEntryName|%s|%s}}" % (t_lang, t_alt))
        if manual_entry_name != term:
          pagemsg("WARNING: Can't match manually specified alt %s (originally %s, entry name %s) to auto-determined term %s" % (t_alt, orig_t_alt, manual_entry_name, term))
          continue
      if t_sort:
        # Discard sort= when the sort key computed from it equals the one
        # computed from the term itself (i.e. it adds no information).
        auto_entry_name = expand_text("{{#invoke:languages/templates|makeEntryName|%s|%s}}" % (t_lang, term))
        autosort = expand_text("{{#invoke:languages/templates|getByCode|%s|makeSortKey|%s}}" % (t_lang, auto_entry_name))
        manual_entry_name = expand_text("{{#invoke:languages/templates|makeEntryName|%s|%s}}" % (t_lang, add_missing_hyphens(t_sort)))
        manual_sort = expand_text("{{#invoke:languages/templates|getByCode|%s|makeSortKey|%s}}" % (t_lang, manual_entry_name))
        if manual_sort != autosort:
          pagemsg("Keeping sort key %s because canonicalized sort key %s based on it not same as canonicalized sort key %s based on term %s" % (t_sort, manual_sort, autosort, term))
        else:
          pagemsg("Discarding sort key %s because canonicalized sort key %s based on it same as canonicalized sort key based on term %s" % (t_sort, manual_sort, term))
          t_sort = ""
      must_continue = False
      # FIX: "tr" was previously listed twice in this list; the duplicate
      # was harmless (membership test + rmparam are idempotent) but wrong.
      all_existing_params = ["1", "2", "3", "tr", "pos", "id", "sc", "sort"]
      for param in t.params:
        pn = pname(param)
        if pn not in all_existing_params:
          pagemsg("WARNING: Unrecognized param %s=%s in affix cat: %s" % (pn, unicode(param.value), unicode(t)))
          must_continue = True
          break
      if must_continue:
        continue
      for param in all_existing_params:
        rmparam(t, param)
      blib.set_template_name(t, "auto cat")
      # Re-add only the params that carry information beyond the title.
      if t_alt:
        if t_alt == term:
          pagemsg("Not adding alt=%s because it's the same as the term" % t_alt)
        else:
          t.add("alt", t_alt)
      if t_tr:
        t.add("tr", t_tr)
      if t_sort:
        t.add("sort", t_sort)
      if t_sc:
        t.add("sc", t_sc)
      notes.append("convert {{%s}} to {{auto cat}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Convert raw Italian {{IPA|it}} pronunciations to {{it-pr}} respellings.

  Works through the Italian Pronunciation section(s):
  - reverse-engineers a respelling from each raw IPA pronunciation,
    sanity-checking it against the page title (stress marks, allowed
    characters, z vs. ts/dz correspondence, phonemic/phonetic pairing);
  - falls back to the manually curated `ipa_directives` table when the
    automatic respelling is rejected, else logs a "<respelling>" prompt
    for a human to fill that table;
  - similarly derives respellings from {{rhymes|it}} templates, applying
    `rhyme_directives` when present, or cross-checks them against the
    it-pr respellings if an {{it-pr}} already exists;
  - ensures a ===References=== section with <references /> exists when any
    respelling carries an n= (reference) parameter.
  Python 2 code (`unicode`, `xrange`).

  Args:
    index: numeric page index for log prefixes.
    pagetitle: page title.
    text: full page wikitext.

  Returns:
    (new_text, notes), or None (early return) when no Italian section
    is found.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "Italian", pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  # Alternating [before, header, body, ...]; odd indices are headers.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  need_ref_section = False
  for k in xrange(2, len(subsections), 2):
    if "==Pronunciation==" in subsections[k - 1]:
      parsed = blib.parse_text(subsections[k])
      all_pronun_templates = []
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "it-pr" or tn == "IPA" and getparam(t, "1") == "it":
          all_pronun_templates.append(t)
      saw_it_pr = False
      pronun_based_respellings = []
      for t in parsed.filter_templates():
        origt = unicode(t)
        def tmsg(txt):
          # Like pagemsg but also lists the other pronunciation templates
          # in this section, for context.
          other_templates = []
          for t in all_pronun_templates:
            thist = unicode(t)
            if thist != origt:
              other_templates.append(thist)
          pagemsg("%s: %s%s" % (txt, origt, ", other templates %s" % ", ".join(other_templates) if len(other_templates) > 0 else ""))
        tn = tname(t)
        if tn == "it-pr":
          saw_it_pr = True
          respellings = blib.fetch_param_chain(t, "1")
          # FIXME, need to split on comma
          pronun_based_respellings.extend(respellings)
          break
        if tn == "IPA" and getparam(t, "1") == "it":
          saw_it_pr = True
          pronuns = blib.fetch_param_chain(t, "2")
          this_phonemic_pronun = None
          this_phonemic_respelling = None
          this_phonetic_pronun = None
          this_phonetic_respelling = None
          respellings = []
          all_warnings = []
          hack_respelling_warnings = []
          main_warnings = []
          # One-element list so nested set_unable() can mutate it (Python 2
          # has no `nonlocal`).
          unable = [False]
          for pronun in pronuns:
            respelling = ipa_to_respelling(pronun)
            respelling, this_hack_respelling_warnings = hack_respelling(pagetitle, respelling)
            hack_respelling_warnings.extend(this_hack_respelling_warnings)
            def set_unable(msg):
              main_warnings.append(msg)
              unable[0] = True
            tmsg("For pronun %s, generated respelling %s" % (pronun, respelling))
            # Every word (except prefixes ending in "-") should carry an
            # acute/grave stress mark if it has more than one vowel.
            respelling_words = respelling.split(" ")
            for rw in respelling_words:
              if rw.endswith("-"): # prefix
                continue
              hacked_rw = re.sub(u".[\u0323\u0331]", "e", rw) # pretend vowels with secondary or no stress are 'e'
              if not re.search(u"[àèéìòóùÀÈÉÌÒÓÙ]", hacked_rw) and len(re.sub("[^aeiouAEIOU]", "", hacked_rw)) > 1:
                set_unable("WARNING: For respelling %s for pronun %s, word %s is missing stress" % (respelling, pronun, rw))
            if not re.search(u"^[a-zA-ZàèéìòóùÀÈÉÌÒÓÙ. ʒʃ\[\]-]+$", respelling):
              set_unable("WARNING: Strange char in respelling %s for pronun %s" % (respelling, pronun))
            else:
              # Strip respelling accents back to plain spelling and compare
              # word-by-word with the page title.
              putative_pagetitle = re.sub(u"([àèéìòóùÀÈÉÌÒÓÙ])([^ ])", lambda m: vowel_respelling_to_spelling[m.group(1)] + m.group(2), respelling)
              pagetitle_words = pagetitle.split(" ")
              putative_pagetitle_words = putative_pagetitle.split(" ")
              if len(pagetitle_words) != len(putative_pagetitle_words):
                set_unable("WARNING: Page title has %s words but putative page title %s has %s words" % (len(pagetitle_words), putative_pagetitle, len(putative_pagetitle_words)))
              else:
                # The respelling writes z as ts/dz; splice the title's
                # original z-runs back in so the comparison can succeed.
                hacked_putative_pagetitle_words = []
                for ptw, puptw in zip(pagetitle_words, putative_pagetitle_words):
                  split_ptw = re.split("([Zz]+)", ptw)
                  split_puptw = re.split("([Tt]?[Tt]s|[Dd]?[Dd]z)", puptw)
                  if len(split_ptw) != len(split_puptw):
                    set_unable("WARNING: Different # of z's in pagetitle word %s vs. (t)ts/(d)dz's in putative pagetitle word %s" % (ptw, puptw))
                    hacked_putative_pagetitle_words.append(puptw)
                  else:
                    parts = []
                    for i in xrange(len(split_puptw)):
                      if i % 2 == 0:
                        parts.append(split_puptw[i])
                      else:
                        parts.append(split_ptw[i])
                    hacked_putative_pagetitle_words.append("".join(parts))
                putative_pagetitle = " ".join(hacked_putative_pagetitle_words)
                if putative_pagetitle != pagetitle:
                  # If respelling already seen, we already warned about it.
                  if respelling in respellings:
                    assert unable[0]
                  else:
                    set_unable("WARNING: Respelling %s doesn't match page title (putative page title %s, pronun %s)" % (respelling, putative_pagetitle, pronun))
            def append_respelling(respelling):
              if respelling not in respellings:
                respellings.append(respelling)
            def append_warnings(warning):
              # Flush queued hack/main warnings into all_warnings, with
              # `warning` (if any) first.
              if warning:
                all_warnings.append(warning)
              for warning in hack_respelling_warnings:
                all_warnings.append(warning)
              del hack_respelling_warnings[:]
              for warning in main_warnings:
                all_warnings.append(warning)
              del main_warnings[:]
            append_respelling(respelling)
            # Track phonemic (/.../) / phonetic ([...]) pairing: each
            # phonemic pronun should be followed by a matching phonetic one.
            if pronun.startswith("/"):
              if this_phonemic_pronun is not None:
                append_warnings("WARNING: Saw two phonemic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonetic pronun" % (this_phonemic_pronun, this_phonemic_respelling, pronun, respelling))
              this_phonemic_pronun = pronun
              this_phonemic_respelling = respelling
              this_phonetic_pronun = None
              this_phonetic_respelling = None
            elif pronun.startswith("["):
              if this_phonemic_pronun is None:
                if this_phonetic_pronun is not None:
                  unable[0] = True
                  append_warnings("WARNING: Saw two phonetic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonemic pronun" % (this_phonetic_pronun, this_phonetic_respelling, pronun, respelling))
                else:
                  append_warnings("WARNING: Saw phonetic pronun %s (respelling %s) without preceding phonemic pronun" % (pronun, respelling))
                this_phonetic_pronun = pronun
                this_phonetic_respelling = respelling
              elif this_phonemic_respelling != respelling:
                unable[0] = True
                append_warnings("WARNING: Phonemic respelling %s (pronun %s) differs from phonetic respelling %s (pronun %s)" % (this_phonemic_respelling, this_phonemic_pronun, respelling, pronun))
              else:
                if unable[0] and len(main_warnings) > 0:
                  # `unable` could be set from a previous pronunciation but no main warnings this time around
                  # because the previously generated warnings have already been appended to all_warnings.
                  mesg = main_warnings[0]
                  del main_warnings[0]
                  append_warnings(mesg)
                else:
                  append_warnings(None)
                this_phonemic_pronun = None
                this_phonemic_respelling = None
            else:
              unable[0] = True
              append_warnings("WARNING: Pronun %s (respelling %s) not marked as phonemic or phonetic" % (pronun, respelling))
          if this_phonemic_pronun is not None:
            append_warnings("WARNING: Saw phonemic pronun %s (respelling %s) without corresponding phonetic pronun" % (this_phonemic_pronun, this_phonemic_respelling))
          if not unable[0]:
            # Only numbered params and nocount= can be carried over.
            for param in t.params:
              pn = pname(param)
              if not re.search("^[0-9]+$", pn) and pn != "nocount":
                unable[0] = True
                append_warnings("WARNING: Saw unrecognized param %s=%s" % (pn, unicode(param.value)))
          manual_assist = ""
          if unable[0]:
            if pagetitle in ipa_directives:
              # Human-curated respelling overrides the rejected automatic one.
              respellings = ipa_directives[pagetitle]
              unable[0] = False
              manual_assist = " (manually assisted)"
              tmsg("%sUsing manually-specified IPA-based respelling%s %s; original warnings follow: %s" % ("[MULTIPLE PRONUN TEMPLATES] " if len(all_pronun_templates) > 1 else "", "s" if len(respellings) > 1 else "", ",".join(respellings), " ||| ".join(all_warnings)))
            else:
              # Emit a machine-readable prompt for the human to fill
              # ipa_directives on a later run.
              tmsg("%s<respelling> %s <end> %s" % ("[MULTIPLE PRONUN TEMPLATES] " if len(all_pronun_templates) > 1 else "", " ".join(respellings), " ||| ".join(all_warnings)))
          if not unable[0]:
            # Rewrite the {{IPA|it}} in place as {{it-pr|...}}.
            del t.params[:]
            nextparam = 0
            for param in respellings:
              if "=" in param:
                paramname, paramval = param.split("=", 1)
              else:
                nextparam += 1
                paramname = str(nextparam)
                paramval = param
              if re.search("^n[0-9]*$", paramname):
                # n= params are references; they need a References section.
                need_ref_section = True
              t.add(paramname, paramval)
            blib.set_template_name(t, "it-pr")
            notes.append("replace raw {{IPA|it}} with {{it-pr|%s}}%s" % ("|".join(respellings), manual_assist))
            pronun_based_respellings.extend(respellings)
          if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      subsections[k] = unicode(parsed)
      # Second phase: derive respellings from {{rhymes|it}}.
      rhymes_template = None
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["rhyme", "rhymes"] and getparam(t, "1") == "it":
          if rhymes_template:
            pagemsg("WARNING: Saw two {{rhymes|it}} templates: %s and %s" % (unicode(rhymes_template), unicode(t)))
          rhymes_template = t
      if rhymes_template:
        rhyme_based_respellings = []
        all_warnings = []
        def append_respelling(respelling):
          if respelling not in rhyme_based_respellings:
            rhyme_based_respellings.append(respelling)
        def append_warnings(warning):
          all_warnings.append(warning)
        rhymes = blib.fetch_param_chain(rhymes_template, "2")
        unable = False
        for rhy in rhymes:
          spellings = rhyme_to_spelling(rhy)
          matched = False
          bad_rhyme_msgs = []
          for ending, ending_respelling in spellings:
            if pagetitle.endswith(ending):
              prevpart = pagetitle[:-len(ending)]
              respelling = prevpart + ending_respelling
              saw_oso_ese = False
              # -oso/-ese have both /s/ and /z/ variants; emit both, the
              # second as a "#"-prefixed alternative.
              if ending_respelling == u"óso":
                saw_oso_ese = True
                append_respelling(respelling)
                append_respelling("#" + prevpart + u"ó[s]o")
              elif ending_respelling == u"ése":
                saw_oso_ese = True
                append_respelling(respelling)
                append_respelling("#" + prevpart + u"é[s]e")
              else:
                if respelling.endswith(u"zióne"):
                  # -zione is reliably /tsjone/.
                  new_respelling = re.sub(u"zióne$", u"tsióne", respelling)
                  pagemsg("Replaced respelling '%s' with '%s'" % (respelling, new_respelling))
                  respelling = new_respelling
                  # NOTE(review): this prevpart recomputation looks odd
                  # (appends ending_respelling) — confirm against upstream.
                  prevpart = respelling[:-len(ending)] + ending_respelling
                append_respelling(respelling)
              # Reject cases where the pronunciation is not predictable
              # from spelling: intervocalic s/z, hiatus, falling -i diphthong.
              if (re.search(u"[aeiouàèéìòóù]s([aeiouàèéìòóù]|$)", prevpart.lower())
                  or not saw_oso_ese and re.search(u"[aeiouàèéìòóù][sz][aeiouàèéìòóù]", ending_respelling.lower())):
                append_warnings("WARNING: Unable to add pronunciation due to /s/ or /z/ between vowels: %s" % rhy)
                unable = True
                break
              if "z" in prevpart:
                append_warnings("WARNING: Unable to add pronunciation due to z in part before rhyme: %s" % rhy)
                unable = True
                break
              # Mask digraphs (qu/gu, gli, ci/gi) so the hiatus check only
              # sees真 vocalic i/u.
              hacked_prevpart = re.sub("([gq])u", r"\1w", prevpart)
              hacked_prevpart = hacked_prevpart.replace("gli", "gl")
              hacked_prevpart = re.sub("([cg])i", r"\1", hacked_prevpart)
              if re.search("[^aeiou][iu]([aeiou]|$)", hacked_prevpart.lower()):
                append_warnings("WARNING: Unable to add pronunciation due to hiatus in part before rhyme %s" % rhy)
                unable = True
                break
              if re.search(u"[aeiouàèéìòóù]i([^aeiouàèéìòóù]|$)", respelling.lower()):
                append_warnings("WARNING: Unable to add pronunciation due to falling diphthong in -i: %s" % rhy)
                unable = True
                break
              matched = True
              break
            else:
              bad_rhyme_msgs.append("WARNING: Unable to match rhyme %s, spelling %s, respelling %s" % (rhy, ending, ending_respelling))
          if not matched and not unable and bad_rhyme_msgs:
            for bad_rhyme_msg in bad_rhyme_msgs:
              pagemsg(bad_rhyme_msg)
        if rhyme_based_respellings:
          if not saw_it_pr:
            manual_assist = ""
            if pagetitle in rhyme_directives:
              # Insert a human-curated {{it-pr}} line at the top of the
              # Pronunciation section.
              rhyme_based_respellings = rhyme_directives[pagetitle]
              manual_assist = " (manually assisted)"
              pagemsg("Using manually-specified rhyme-based respelling%s %s; original warnings follow: %s: %s" % ("s" if len(rhyme_based_respellings) > 1 else "", ",".join(rhyme_based_respellings), " ||| ".join(all_warnings), unicode(rhymes_template)))
              subsections[k] = "* {{it-pr|%s}}\n" % ",".join(rhyme_based_respellings) + subsections[k]
              notes.append("add Italian rhyme-based respelling%s %s%s" % ("s" if len(rhyme_based_respellings) > 1 else "", ",".join(rhyme_based_respellings), manual_assist))
            else:
              # No directive yet: emit a "<respelling>" prompt, flagging
              # pages with multiple POS headers/etymologies (the respelling
              # may not apply to all of them).
              different_headers = []
              for pos in ["Noun", "Verb", "Adjective", "Adverb", "Participle"]:
                if "==%s==" % pos in secbody:
                  different_headers.append(pos)
              if len(different_headers) > 1:
                all_warnings[0:0] = ["WARNING: Multiple headers %s seen" % ",".join(different_headers)]
              if "Etymology 1" in secbody:
                all_warnings[0:0] = ["WARNING: Multiple etymologies seen"]
              pagemsg("<respelling> all: %s <end>%s: <from> %s <to> %s <end>" % (" ".join(rhyme_based_respellings), " " + " ||| ".join(all_warnings) if all_warnings else "", unicode(rhymes_template), unicode(rhymes_template)))
          else:
            # An {{it-pr}} already exists: just cross-check against it.
            for respelling in rhyme_based_respellings:
              if (not re.search("^qual[0-9]*=", respelling) and pronun_based_respellings and respelling not in pronun_based_respellings):
                pagemsg("WARNING: Rhyme-based respelling%s %s doesn't match it-pr respelling(s) %s%s" % (" (with problems)" if len(all_warnings) > 0 else "", respelling, ",".join(pronun_based_respellings), ": %s" % " ||| ".join(all_warnings) if len(all_warnings) > 0 else ""))
  if need_ref_section:
    # Ensure a <references /> exists: prefer an existing ===References===
    # section, else insert a new one before Anagrams/Further reading.
    for k in xrange(len(subsections) - 1, 2, -2):
      if re.search(r"^===\s*References\s*===$", subsections[k - 1].strip()):
        if not re.search(r"<references\s*/?\s*>", subsections[k]):
          subsections[k] = subsections[k].rstrip("\n") + "\n<references />\n\n"
          notes.append("add <references /> to existing ===References=== section for pronunciation refs")
        break
    else: # no break
      for k in xrange(len(subsections) - 1, 2, -2):
        if not re.search(r"==\s*(Anagrams|Further reading)\s*==", subsections[k - 1]):
          subsections[k + 1:k + 1] = ["===References===\n", "<references />\n\n"]
          notes.append("add new ===References=== section for pronunciation refs")
          break
      else: # no break
        pagemsg("WARNING: Something wrong, couldn't find location to insert ===References=== section")
  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
def do_headword_template(headt, declts, pagetitle, subsections, subsection_with_head, subsection_with_declts, pagemsg):
  """Convert an old-style German noun headword template `headt` (and any
  accompanying declension templates `declts`) to the new-format
  {{de-noun}}/{{de-proper noun}} and {{de-ndecl}} syntax, editing the relevant
  entries of `subsections` in place.

  Returns a list of change notes on success (empty when the headword is
  already new-style); returns None after logging a warning through pagemsg()
  whenever the templates can't be confidently converted.
  """
  notes = []
  # BUG FIX: initialize up front so the f=/m= handling near the bottom can't
  # raise NameError when no gender-based equivalent was computed (e.g. a
  # neuter-only noun with an explicit f= param); analyze_forms() is already
  # called elsewhere with None as the "no default" value.
  default_equiv = None

  def analyze_declts(declts, pagetitle, headword_gens, headword_pls):
    # Analyze one or more old-style declension templates, returning
    # (declspec, all_decl_genders, all_decl_gens, all_decl_pls) where declspec
    # is the new-style {{de-ndecl}} spec, or None (after warning) if the
    # templates can't be handled.
    decl_genders_gens_and_pls = []
    prev_is_weak = None
    prev_is_sg = None
    for declt in declts:
      def getp(param):
        return getparam(declt, param)
      tn = tname(declt)
      # Gender is encoded in the template-name suffix, e.g. de-decl-noun-m.
      gender = re.sub(".*-", "", tn)
      if gender == "pl":
        gender = "p"
      decl_gens = []
      decl_pls = []
      if gender != "p":
        is_weak = False
        is_sg = False
        # Overriding case forms aren't representable in the new spec yet.
        for param in ["head", "ns", "gs", "ds", "as", "bs", "vs", "np", "gp", "dp", "ap", "notes"]:
          if getp(param):
            pagemsg("WARNING: Saw %s=%s, can't handle yet: %s" % (param, getp(param), unicode(declt)))
            return None
        if gender in ["m", "n"]:
          arg1 = getp("1")
          if not arg1:
            gen = ""
          elif arg1 in ["n", "ns", "en", "ens"]:
            # Weak-noun genitive endings.
            is_weak = True
            gen = arg1
          elif arg1 in ["s", "es", "ses", "(e)s", "(s)", "'"]:
            gen = arg1
          else:
            pagemsg("WARNING: Unrecognized arg1=%s: %s" % (arg1, unicode(declt)))
            return None
          decl_gens = convert_gens(pagetitle, [gen], from_decl=True)
        num = getp("n")
        if num == "sg":
          is_sg = True
        elif num not in ["full", ""]:
          pagemsg("WARNING: Unrecognized n=%s: %s" % (num, unicode(declt)))
          return None
        if not is_sg:
          # Feminine templates put the plural suffix in 1=, m/n ones in 2=.
          if gender == "f":
            plsuffix = getp("1")
          else:
            plsuffix = getp("2")
          argpl = getp("pl")
          if argpl:
            pl = argpl
          else:
            pl = pagetitle + plsuffix
          if pl == "-":
            is_sg = True
          else:
            decl_pls = normalize_values([pl])
        # All declension templates on the page must agree on weakness and
        # singular-only-ness.
        if prev_is_weak is not None and prev_is_weak != is_weak:
          pagemsg("WARNING: Saw declension template with weak=%s different from previous weak=%s: %s"
            % (is_weak, prev_is_weak, declts_to_unicode(declts)))
          return None
        prev_is_weak = is_weak
        if prev_is_sg is not None and prev_is_sg != is_sg:
          pagemsg("WARNING: Saw declension template with sg=%s different from previous sg=%s: %s"
            % (is_sg, prev_is_sg, declts_to_unicode(declts)))
          return None
        prev_is_sg = is_sg
      decl_genders_gens_and_pls.append((gender, decl_gens, decl_pls))

    # Aggregate per-template genders/genitives/plurals, preserving order.
    all_decl_genders = []
    all_decl_gens = []
    all_decl_pls = []
    for decl_gender, decl_gens, decl_pls in decl_genders_gens_and_pls:
      if decl_gender not in all_decl_genders:
        all_decl_genders.append(decl_gender)
      for decl_gen in decl_gens:
        if decl_gen not in all_decl_gens:
          all_decl_gens.append(decl_gen)
      for decl_pl in decl_pls:
        if decl_pl not in all_decl_pls:
          all_decl_pls.append(decl_pl)
    first_gender, first_decl_gens, first_decl_pls = decl_genders_gens_and_pls[0]
    if len(all_decl_genders) > 1 and (
        len(all_decl_gens) != len(first_decl_gens) or len(all_decl_pls) != len(first_decl_pls)):
      pagemsg("WARNING: Multiple declension templates with different genders as well as different either genitives or plurals: %s"
        % declts_to_unicode(declts))
      return None
    if len(all_decl_gens) != len(first_decl_gens) and len(all_decl_pls) != len(first_decl_pls):
      pagemsg("WARNING: Multiple declension templates with different both genitives and plurals: %s"
        % declts_to_unicode(declts))
      return None
    is_weak = prev_is_weak
    is_sg = prev_is_sg
    declspec = ":".join(all_decl_genders)

    def compute_part(declspec, headword_parts, all_decl_parts, get_default_part, desc):
      # Append the genitive or plural portion of the spec: just "," when the
      # declension forms equal the defaults, otherwise ",<analyzed forms>",
      # preferring the headword's ordering when the two sets agree.
      defparts = []
      for gender in all_decl_genders:
        defpart = pagetitle + get_default_part(pagetitle, gender, is_weak)
        if defpart not in defparts:
          defparts.append(defpart)
      if all_decl_parts == defparts:
        declspec += ","
      else:
        all_decl_part_forms = analyze_forms(pagetitle, all_decl_parts, None)
        if set(headword_parts) == set(all_decl_parts):
          headword_part_forms = analyze_forms(pagetitle, headword_parts, None)
          if headword_part_forms != all_decl_part_forms:
            pagemsg("NOTE: Headword %s(s) %s same as all decl %s(s) %s but analyzed form(s) different (probably different ordering), preferring headword analyzed form(s) %s over decl analyzed form(s) %s: declts=%s"
              % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts), headword_part_forms,
                all_decl_part_forms, declts_to_unicode(declts)))
            all_decl_part_forms = headword_part_forms
        else:
          pagemsg("WARNING: Headword %s(s) %s not same as all decl %s(s) %s, continuing"
            % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts)))
        declspec += ",%s" % all_decl_part_forms
      return declspec

    if "m" in all_decl_genders or "n" in all_decl_genders:
      declspec = compute_part(declspec, headword_gens, all_decl_gens, get_default_gen, "genitive")
    if "p" not in all_decl_genders:
      declspec = compute_part(declspec, headword_pls, all_decl_pls, get_default_pl, "plural")
    # Trailing commas mean "all defaults" and can be dropped.
    declspec = re.sub(",*$", "", declspec)
    if is_weak:
      declspec += ".weak"
    if is_sg:
      declspec += ".sg"
    if ss:
      declspec += ".ss"
    return declspec, all_decl_genders, all_decl_gens, all_decl_pls

  # Only convert old-style headword templates; new-style ones are skipped.
  old_style_headt = False
  for param in ["old", "2", "3", "4", "g1", "g2", "g3", "gen1", "gen2", "gen3", "pl1", "pl2", "pl3"]:
    if getparam(headt, param):
      old_style_headt = True
      break
  if not old_style_headt:
    pagemsg("NOTE: Skipping new-style headt=%s%s" % (unicode(headt),
      declts and ", declts=%s" % declts_to_unicode(declts) or ""))
    return notes
  is_proper = tname(headt) == "de-proper noun"
  ss = False
  if declts:
    sses = [not not getparam(declt, "ss") for declt in declts]
    if len(set(sses)) > 1:
      pagemsg("WARNING: Saw inconsistent values for ss= in decl templates: %s" % declts_to_unicode(declts))
      return
    ss = list(set(sses)) == [True]
    if ss:
      if not pagetitle.endswith(u"ß"):
        pagemsg(u"WARNING: Bad ss=1 setting for pagetitle not ending in -ß: %s" % declts_to_unicode(declts))
        return
      # If ss specified, pretend pagetitle ends in -ss, as it does in post-1996 spelling. Later on we
      # add .ss to the headword and declension specs.
      pagetitle = re.sub(u"ß$", "ss", pagetitle)
  adjectival = any(tname(t).startswith("de-decl-adj+noun") for t in declts)
  genders = blib.fetch_param_chain(headt, "1", "g")
  headword_genders = genders
  gens = normalize_values(blib.fetch_param_chain(headt, "2", "gen", True))
  pls = normalize_values(blib.fetch_param_chain(headt, "3", "pl"))
  dims = normalize_values(blib.fetch_param_chain(headt, "4", "dim"))
  fems = normalize_values(blib.fetch_param_chain(headt, "f"))
  mascs = normalize_values(blib.fetch_param_chain(headt, "m"))
  if gens == [True]:
    gens = []
  for param in headt.params:
    pn = pname(param)
    pv = unicode(param.value)
    # BUG FIX: this formerly read `pn not in "head"`, a substring test that
    # silently accepted any param whose name is a substring of "head"
    # (h, e, a, d, he, ea, ad, ...). Only head= itself is meant to be allowed,
    # and only on adjectival nouns.
    if pn not in ["1", "2", "3", "4", "m", "f", "old"] and not re.search("^(g|gen|pl|dim|m|f)[0-9]+$", pn) and (
        not adjectival or pn != "head"):
      pagemsg("WARNING: Unrecognized param %s=%s: %s" % (pn, pv, unicode(headt)))
      return
  if not genders:
    pagemsg("WARNING: No genders in head template: %s" % unicode(headt))
    return
  if "p" in genders and len(genders) > 1:
    pagemsg("WARNING: Saw gender 'p' and another gender: %s" % unicode(headt))
    return
  if "p" in genders and (gens or pls):
    pagemsg("WARNING: Saw genitive(s) or plural(s) with plural-only: %s" % unicode(headt))
    return
  saw_mn = "m" in genders or "n" in genders
  if not saw_mn and not adjectival:
    # Feminines have no distinct genitive; a genitive equal to the lemma is
    # merely redundant, anything else is an error.
    if gens and gens == [pagetitle]:
      gens = []
    if gens:
      pagemsg("WARNING: Saw genitive(s) with feminine-only gender: %s" % unicode(headt))
      return
  if adjectival:
    if len(declts) > 1:
      pagemsg("WARNING: Saw adjectival declension along with multiple declension templates, can't handle: %s"
        % declts_to_unicode(declts))
      return
    declt = declts[0]
    def getp(param):
      return getparam(declt, param)
    tn = tname(declt)
    m = re.search(r"^de-decl-adj\+noun(-sg)?-([mfn])$", tn)
    if m:
      default_equiv = None
      is_sg, gender = m.groups()
      adj = getp("1")
      noun = getp("2")
      if gender in ["m", "f"]:
        default_equiv = adj + ("e" if gender == "m" else "er")
        if noun:
          default_equiv += " " + construct_default_equiv(noun, gender)
      if gender in ["m", "n"]:
        noun_gen = getp("3")
        noun_pl = getp("4")
      else:
        noun_gen = "-"
        noun_pl = getp("3")
      noun_pl_full = getp("pl")
      adj_ending = "er" if gender == "m" else "e" if gender == "f" else "es"
      expected_lemma = adj + adj_ending
      if gender == "f":
        # Should be '-er' but we often see '-en' (weak form) instead
        expected_gens = [adj + "er", adj + "en"]
      else:
        expected_gens = [adj + "en"]
      if is_sg:
        expected_pls = []
      else:
        expected_pls = [adj + "e", adj + "en"]
      if not noun:
        # Pure adjectival noun (no separate noun word).
        if noun_gen != "-" or noun_pl_full or (noun_pl and noun_pl != "-"):
          pagemsg("WARNING: Bad parameters for adjectival noun: %s" % unicode(declt))
          return
        all_decl_genders = [gender]
      else:
        # Adjective + noun combination: synthesize a plain noun declension
        # template for the noun part and analyze it.
        fake_declt = "{{de-decl-noun-%s%s|%s|pl=%s%s}}" % (gender,
          "" if gender == "f" else "|" + noun_gen, noun_pl, noun_pl_full,
          "|n=sg" if is_sg else "")
        fake_declt = list(blib.parse_text(fake_declt).filter_templates())[0]
        def analyze_headword_parts_for_noun(parts, desc):
          # Pull the noun word out of two-word headword forms like "alter Mann".
          noun_headword_parts = []
          for part in parts:
            m = re.search("^([^ ]+) ([^ ]+)$", part.strip())
            if not m:
              pagemsg("WARNING: Can't analyze headword %s '%s' into adjective and noun, continuing: head=%s, decl=%s"
                % (desc, part, unicode(headt), unicode(declt)))
              return []
            part_adj, part_noun = m.groups()
            noun_headword_parts.append(part_noun)
          return noun_headword_parts
        noun_headword_gens = analyze_headword_parts_for_noun(gens, "genitive")
        noun_headword_pls = analyze_headword_parts_for_noun(pls, "plural")
        retval = analyze_declts([fake_declt], noun, noun_headword_gens, noun_headword_pls)
        if retval is None:
          return
        declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
        expected_lemma = "%s %s" % (expected_lemma, noun)
        expected_gens = ["%s %s" % (expected_gen, gen) for expected_gen in expected_gens
          for gen in ([noun] if gender == "f" else all_decl_gens)]
        if is_sg:
          expected_pls = []
        else:
          expected_pls = ["%se %s" % (adj, pl) for pl in all_decl_pls]
      if pagetitle != expected_lemma:
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected lemma '%s' but saw '%s': head=%s, decl=%s"
          % (expected_lemma, pagetitle, unicode(headt), unicode(declt)))
        return
      if set(genders) != set(all_decl_genders):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected gender(s) '%s' but saw '%s': head=%s, decl=%s"
          % (",".join(all_decl_genders), ",".join(genders), unicode(headt), unicode(declt)))
        return
      if not (set(gens) <= set(expected_gens)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected genitive(s) '%s' but saw '%s': head=%s, decl=%s"
          % (",".join(expected_gens), ",".join(gens), unicode(headt), unicode(declt)))
        return
      if pls == ["-"]:
        if expected_pls:
          pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
          return
      elif not (set(pls) <= set(expected_pls)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
          % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
        return
      if not noun:
        declspec = "+"
        if is_sg:
          declspec += ".sg"
      else:
        # Build the linked "[[adj]]<+> [[noun]]<...>" spec, linking the
        # adjective form to its lemma.
        if re.search("^" + CAP, adj):
          adj_lemma = adj.lower()
        else:
          adj_lemma = adj
        if adj_lemma in ["erst", "zweit", "dritt", "viert", u"fünft", "sechst", "siebent", "acht", "neunt", "zehnt"]:
          adj_lemma += "e"
        adj_form = adj + adj_ending
        if adj_form.startswith(adj_lemma):
          adj_link = "[[%s]]%s" % (adj_lemma, adj_form[len(adj_lemma):])
        else:
          adj_link = "[[%s|%s]]" % (adj_lemma, adj_form)
        noun_link = "[[%s]]" % noun
        # This is less accurate than deriving the links from head=; often head= is wrong, so we
        # intentionally don't try to recover them from head= here.
        declspec = "%s<+> %s<%s>" % (adj_link, noun_link, declspec)
      headspec = declspec
      is_both = is_proper and not is_sg
    else:
      pagemsg("WARNING: Unrecognized decl template(s): %s" % declts_to_unicode(declts))
      return
  else:
    # Not adjectival.
    if len(genders) == 1 and genders[0] in ["m", "f"]:
      default_equiv = construct_default_equiv(pagetitle, genders[0])
    headspec = ":".join(genders)
    is_sg = False
    is_both = False
    is_weak = False
    headword_gens = []
    headword_pls = []
    if headspec != "p":
      pls = convert_pls(pagetitle, pls, is_proper=is_proper)
      headword_pls = pls
      if saw_mn:
        gens = convert_gens(pagetitle, gens)
        headword_gens = gens
        # Weak nouns: -(e)n(s) genitive together with an -(e)n (or missing) plural.
        if (len(gens) == 1 and any(gens[0] == pagetitle + ending for ending in ["n", "en", "ns", "ens"])
            and len(pls) == 1 and (pls[0] == "-" or any(pls[0] == pagetitle + ending for ending in ["n", "en"]))):
          is_weak = True
        def_gens = []
        for gender in genders:
          def_gen = pagetitle + get_default_gen(pagetitle, gender, is_weak)
          if def_gen not in def_gens:
            def_gens.append(def_gen)
        if set(def_gens) == set(gens):
          headspec += ","
        else:
          headspec += ",%s" % analyze_forms(pagetitle, gens, None)
      def_pls = []
      for gender in genders:
        def_pl = pagetitle + get_default_pl(pagetitle, gender, is_weak)
        if def_pl not in def_pls:
          def_pls.append(def_pl)
      # NOTE(review): a proper noun with the default plural is treated as
      # usable in both numbers; "-" marks singular-only — confirm against the
      # original (pre-reformat) control flow here.
      if set(def_pls) == set(pls):
        headspec += ","
        if is_proper:
          is_both = True
      elif pls == ["-"]:
        is_sg = True
      else:
        headspec += ",%s" % analyze_forms(pagetitle, pls, None)
    headspec = re.sub(",*$", "", headspec)
    if is_weak:
      headspec += ".weak"
    if is_sg:
      headspec += ".sg"
    if ss:
      headspec += ".ss"
  extraspec = ""
  if dims:
    extraspec += "|dim=%s" % analyze_forms(pagetitle, dims, None, do_stem=True, joiner=",")
  if fems:
    extraspec += "|f=%s" % analyze_forms(pagetitle, fems, default_equiv, do_stem=True, joiner=",")
  if mascs:
    extraspec += "|m=%s" % analyze_forms(pagetitle, mascs, default_equiv, do_stem=True, joiner=",")
  if declts and not adjectival:
    retval = analyze_declts(declts, pagetitle, headword_gens, headword_pls)
    if retval is None:
      return
    declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
    # NOTE: `declt` below is the last declension template, leaked from the
    # Python 2 list comprehension over declts above.
    if headspec != declspec:
      if set(all_decl_gens) <= set(headword_gens) and set(all_decl_pls) <= set(headword_pls):
        if set(all_decl_genders) == set(headword_genders):
          pagemsg("NOTE: Headword spec '%s' not same as declension spec '%s', but decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s and gender(s) %s agree: headt=%s, declt=%s"
            % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
              ",".join(headword_pls), ",".join(all_decl_genders), unicode(headt), unicode(declt)))
          declspec = headspec
        else:
          pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s', decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s, but decl gender(s) %s don't agree with headword gender(s) %s: headt=%s, declt=%s"
            % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
              ",".join(headword_pls), ",".join(all_decl_genders), ",".join(headword_genders), unicode(headt), unicode(declt)))
          return
      else:
        pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s' and either decl gens %s not a subset of headword gens %s or decl pls %s not a subset of headword pls %s, with decl gender(s) %s and headword gender(s) %s: headt=%s, declt=%s"
          % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
            ",".join(headword_pls), ",".join(all_decl_genders), ",".join(headword_genders), unicode(headt), unicode(declt)))
        return
  if is_proper:
    # Proper nouns default to singular, so .sg is redundant; .both marks ones
    # usable in both numbers.
    headspec = headspec.replace(".sg", "")
    if is_both:
      if ".ss" in headspec:
        headspec = headspec.replace(".ss", ".both.ss")
      else:
        headspec += ".both"
  newheadt = "{{de-%s|%s%s}}" % ("proper noun" if is_proper else "noun", headspec, extraspec)
  headt_outmsg = "convert %s to new-format %s" % (unicode(headt), newheadt)
  outmsg = "Would " + headt_outmsg
  if declts:
    newdeclt = "{{de-ndecl|%s}}" % declspec
    declt_outmsg = "convert %s to %s" % (declts_to_unicode(declts), newdeclt)
    outmsg += " and " + declt_outmsg
  pagemsg(outmsg)
  if unicode(headt) != newheadt:
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_head], unicode(headt), newheadt,
      pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(headt_outmsg)
    subsections[subsection_with_head] = newsectext
  if declts:
    declts_existing = "\n".join(unicode(declt) for declt in declts)
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_declts], declts_existing, newdeclt,
      pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(declt_outmsg)
    subsections[subsection_with_declts] = newsectext
  return notes
def parse_syns(syns):
  """Parse one line's worth of raw wikitext synonyms/antonyms into a list of
  (synonym, other_params, joiner_after) tuples, where other_params is a list
  of (param_name, value) pairs (tr/t/q/g/pos/lit) and joiner_after is set only
  on the last item of the row.  Returns None (after logging a warning) on any
  parse failure.

  Relies on free variables from an enclosing scope: args, pagemsg, syntype
  and line — presumably this is a nested function; TODO(review): confirm
  against the full file.
  """
  retval = []
  syns = syns.strip()
  orig_syns = syns
  qualifier = None
  # One pass to peel off at most one qualifier, in one of several notations.
  while True:
    # check for qualifiers specified using a qualifier template
    m = re.search(
      "^(.*?)\{\{(?:qualifier|qual|q|i)\|([^{}|=]*)\}\}(.*?)$", syns)
    if m:
      before_text, qualifier, after_text = m.groups()
      syns = before_text + after_text
      break
    # check for qualifiers using e.g. {{lb|ru|...}}
    m = re.search(
      "^(.*?)\{\{(?:lb)\|%s\|([^{}=]*)\}\}(.*?)$" % re.escape(args.lang),
      syns)
    if m:
      before_text, qualifier, after_text = m.groups()
      # do this before handling often/sometimes/etc. in case the label has often|_|pejorative or similar
      qualifier = qualifier.replace("|_|", " ")
      # Label words that join onto the next label with a space rather than a
      # comma when the {{lb}} parts are flattened into free text.
      terms_no_following_comma = [
        "also", "and", "or", "by", "with", "except", "outside", "in",
        "chiefly", "mainly", "mostly", "primarily", "especially",
        "particularly", "excluding", "extremely", "frequently", "humorously",
        "including", "many", "markedly", "mildly", "now", "occasionally",
        "of", "often", "sometimes", "originally", "possibly", "rarely",
        "slightly", "somewhat", "strongly", "then", "typically", "usually",
        "very"
      ]
      qualifier = re.sub(
        r"\b(%s)\|" % "|".join(terms_no_following_comma), r"\1 ",
        qualifier)
      qualifier = qualifier.replace("|", ", ")
      syns = before_text + after_text
      break
    # check for qualifier-like ''(...)''
    m = re.search("^(.*?)''\(([^'{}]*)\)''(.*?)$", syns)
    if m:
      before_text, qualifier, after_text = m.groups()
      syns = before_text + after_text
      break
    # check for qualifier-like (''...'')
    m = re.search("^(.*?)\(''([^'{}]*)''\)(.*?)$", syns)
    if m:
      before_text, qualifier, after_text = m.groups()
      syns = before_text + after_text
      break
    break
  # Split on commas, semicolons, slashes but don't split commas etc. inside
  # of braces or brackets
  split_by_brackets_braces = re.split(
    r"(\{\{[^{}]*\}\}|\[\[[^\[\]]*\]\])", syns.strip())
  comma_separated_runs = blib.split_alternating_runs(
    split_by_brackets_braces, "(?: *[,;] *| +/ +)")
  syns = [
    "".join(comma_separated_run) for comma_separated_run in comma_separated_runs
  ]
  if qualifier and len(syns) > 1:
    pagemsg(
      "WARNING: Saw qualifier along with multiple synonyms, not sure how to proceed: <%s>"
      % orig_syns)
    return None
  joiner_after = ";" if qualifier or len(syns) > 1 else ","
  for synindex, syn in enumerate(syns):
    orig_syn = syn
    # A synonym that is exactly one {{l}}/{{m}} template: pull it apart.
    m = re.search(
      r"^\{\{[lm]\|%s\|([^{}]*)\}\}$" % re.escape(args.lang), syn)
    if m:
      decl = blib.parse_text(syn).filter_templates()[0]
      gender = None
      translit = None
      raw_syn = None
      alt = None
      gloss = None
      lit = None
      pos = None
      for param in decl.params:
        pn = pname(param)
        pv = unicode(param.value)
        if pn in ["1"]:
          pass
        elif pn == "2":
          raw_syn = pv
        elif pn == "3":
          alt = pv
        elif pn in ["4", "t", "gloss"]:
          gloss = pv
        elif pn == "g":
          gender = pv
        elif pn in ["g2", "g3", "g4"]:
          if not gender:
            pagemsg(
              "WARNING: Saw %s=%s without g= in %s <%s> in line: %s"
              % (pn, pv, syntype, orig_syn, line))
            return None
          gender += "," + pv
        elif pn == "tr":
          translit = pv
        elif pn == "lit":
          lit = pv
        elif pn == "pos":
          pos = pv
        else:
          pagemsg(
            "WARNING: Unrecognized param %s=%s in %s <%s> in line: %s"
            % (pn, pv, syntype, orig_syn, line))
          return None
      if not raw_syn:
        pagemsg(
          "WARNING: Couldn't find raw synonym in %s <%s> in line: %s"
          % (syntype, orig_syn, line))
        return None
      # Rebuild the synonym as a plain [[link]] or [[link|display]].
      if raw_syn and alt:
        if "[[" in raw_syn or "[[" in alt:
          pagemsg(
            "WARNING: Saw both synonym=%s and alt=%s with brackets in one or both in %s <%s> in line: %s"
            % (raw_syn, alt, syntype, orig_syn, line))
          return None
        syn = "[[%s|%s]]" % (raw_syn, alt)
      elif raw_syn:
        if "[[" in raw_syn:
          syn = raw_syn
        else:
          syn = "[[%s]]" % raw_syn
      elif alt:
        pagemsg(
          "WARNING: Saw alt=%s but no link text in %s <%s> in line: %s"
          % (alt, syntype, orig_syn, line))
        return
    else:
      # Free-text synonym, possibly with embedded {{l}}/{{m}} templates.
      def add_brackets_if_not_already(m):
        raw_syn = m.group(1)
        if "[[" not in raw_syn:
          raw_syn = "[[%s]]" % raw_syn
        return raw_syn
      syn = re.sub(
        r"\{\{[lm]\|%s\|([^{}=]*)\}\}" % re.escape(args.lang),
        add_brackets_if_not_already, syn)
      gender = None
      translit = None
      gloss = None
      lit = None
      pos = None
      if "{{" in syn or "}}" in syn:
        pagemsg(
          "WARNING: Unmatched braces in %s <%s> in line: %s"
          % (syntype, orig_syn, line))
        return None
      if "''" in syn:
        pagemsg(
          "WARNING: Italicized text in %s <%s> in line: %s"
          % (syntype, orig_syn, line))
        return None
      if "(" in syn or ")" in syn:
        pagemsg(
          "WARNING: Unmatched parens in %s <%s> in line: %s"
          % (syntype, orig_syn, line))
        return None
      if ":" in syn:
        pagemsg(
          "WARNING: Unmatched colon in %s <%s> in line: %s"
          % (syntype, orig_syn, line))
        return None
      # Strip brackets around entire synonym
      syn = re.sub(r"^\[\[([^\[\]|{}]*)\]\]$", r"\1", syn)
      # If there are brackets around some words but not all, put brackets around the remaining words
      if "[[" in syn:
        split_by_brackets = re.split(
          r"([^ ]*\[\[[^\[\]]*\]\][^ ]*)", syn)
        def maybe_add_brackets(m):
          text = m.group(1)
          if "[" in text or "]" in text:
            pagemsg(
              "WARNING: Saw nested brackets in %s in %s <%s> in line: %s"
              % (text, syntype, orig_syn, line))
            return text
          if not re.search(r"\w", text, re.U):
            pagemsg(
              "Not adding brackets around '%s', saw no letters in %s <%s> in line: %s"
              % (text, syntype, orig_syn, line))
            return text
          return "[[%s]]" % text
        # Put brackets around the remaining words not already bracketed or partially bracketed. But don't put
        # brackets around words inside of HTML comments, and don't include punctuation inside the brackets.
        for i in xrange(0, len(split_by_brackets), 2):
          split_out_comments = re.split(
            "(<!--.*?-->)", split_by_brackets[i])
          for j in xrange(0, len(split_out_comments), 2):
            split_out_comments[j] = re.sub(
              "([^ ,*/{}:;()?!+<>]+)", maybe_add_brackets,
              split_out_comments[j])
          split_by_brackets[i] = "".join(split_out_comments)
        new_syn = "".join(split_by_brackets)
        if new_syn != syn:
          pagemsg("Add brackets to '%s', producing '%s'"
            % (syn, new_syn))
          syn = new_syn
    other_params = [
      ("tr", translit),
      ("t", gloss),
      ("q", qualifier),
      ("g", gender),
      ("pos", pos),
      ("lit", lit),
    ]
    # Set the joiner_after to None for everything but the last synonym on the row; we will then change
    # all commas to semicolons if there is any semicolon, so we are consistently using commas or
    # semicolons to separate groups of synonyms.
    retval.append(
      (syn, other_params, joiner_after if synindex == len(syns) - 1 else None))
  return retval
def process_noun_headt(t, declt=None):
  """Convert an old-style Belarusian noun headword template (either
  {{be-noun}}-style or generic {{head}}) in-place to the new param convention,
  cross-checking head/genitive/plural forms against the accompanying
  declension template `declt` when given.

  Relies on pagemsg, pagetitle, notes and the belib/blib helpers from the
  enclosing scope (this is presumably a nested function).  Returns True on
  success, False when an unrecognized param forces us to skip the template.
  """
  origt = unicode(t)
  origdeclt = declt and unicode(declt) or "None"
  def getp(param):
    return getparam(t, param)
  if tname(t) == "head":
    # Generic {{head}}: only a restricted param set is convertible.
    pos = getp("2")
    head = getp("head")
    headtr = getp("tr")
    g = getp("g")
    g2 = getp("g2")
    g3 = getp("g3")
    anim = ""
    decl = ""
    gen = ""
    gentr = ""
    pl = ""
    pltr = ""
    f = ""
    ftr = ""
    m = ""
    mtr = ""
    collective = ""
    collectivetr = ""
    must_continue = False
    for param in t.params:
      pn = pname(param)
      if pn not in ["1", "2", "head", "tr", "g", "g2", "g3",
          # extra params to ignore
          "sc"]:
        pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" % (pn, unicode(param.value), origt))
        must_continue = True
        break
    if must_continue:
      return False
  else:
    # Old-style language-specific headword template.
    pos = getp("pos")
    head = getp("1") or getp("head") or getp("sg")
    headtr = getp("tr")
    g = getp("2") or getp("g")
    g2 = getp("g2")
    g3 = getp("g3")
    anim = getp("a")
    decl = getp("decl")
    gen = getp("gen") or getp("3")
    gentr = getp("gentr")
    pl = getp("pl") or getp("4")
    pltr = getp("pltr")
    f = getp("f")
    ftr = getp("ftr")
    m = getp("m")
    mtr = getp("mtr")
    collective = getp("collective")
    collectivetr = getp("collectivetr")
    must_continue = False
    for param in t.params:
      pn = pname(param)
      if pn not in ["pos", "1", "head", "sg", "tr", "2", "g", "g2", "g3", "a", "decl", "gen",
          "gentr", "3", "pl", "pltr", "4", "f", "ftr", "m", "mtr", "collective", "collectivetr",
          # extra params to ignore
          "sc"]:
        pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" % (pn, unicode(param.value), origt))
        must_continue = True
        break
    if must_continue:
      return False

  def clean_gender(g):
    # Normalize a gender spec like "m-in-p" plus the separate a= animacy
    # param into canonical "GENDER-ANIMACY[-p]" form, warning on anything
    # unrecognized or inconsistent.
    gparts = g.split("-")
    realg = "?"
    realan = "?"
    realpl = ""
    for part in gparts:
      if part in ["m", "f", "n"]:
        realg = part
      elif part in ["an", "in"]:
        realan = part
      elif part == "p":
        realpl = part
      elif part != "?":
        pagemsg("WARNING: Encountered unrecognized gender part '%s' in gender '%s': %s" % (
          part, g, origt))
    an = anim
    if an in ["a", "an"]:
      an = "an"
    elif an in ["i", "in"]:
      an = "in"
    elif an:
      pagemsg("WARNING: Unrecognized animacy a=%s: %s" % (an, origt))
      an = "?"
    if realan != "?" and an and an != "?" and an != realan:
      pagemsg("WARNING: Animacy mismatch, anim %s in gender spec %s but a=%s: %s" % (
        realan, g, anim, origt))
    if realan == "?" and an:
      realan = an
    pl = ""
    if realpl:
      pl = "-%s" % realpl
    if realg == "?":
      pagemsg("WARNING: Unknown gender in gender spec %s: %s" % (g, origt))
    if realan == "?":
      pagemsg("WARNING: Unknown animacy in gender spec %s and a=%s: %s" % (g, anim, origt))
    if realg == "?" and realan == "?":
      return "?%s" % pl
    else:
      return "%s-%s%s" % (realg, realan, pl)

  if not g and not g2 and not g3:
    pagemsg("WARNING: No gender specified: %s" % origt)
    g = "?"
  genders = []
  if g:
    genders.append(clean_gender(g))
  if g2:
    genders.append(clean_gender(g2))
  if g3:
    genders.append(clean_gender(g3))
  if not head:
    head = pagetitle
  if decl and decl not in ["off", "no", "indeclinable"]:
    pagemsg("WARNING: Unrecognized value for decl=%s: %s" % (decl, origt))
    decl = ""
  if decl:
    # Indeclinable: the genitive slot becomes "-" unless an explicit genitive
    # was (inconsistently) supplied.
    if gen and gen != "-":
      pagemsg("WARNING: Indeclinable but gen=%s specified: %s" % (gen, origt))
    else:
      gen = "-"
  # Rebuild the template params from scratch.
  del t.params[:]
  if tname(t) == "head":
    blib.set_template_name(t, "be-" + pos)
  elif pos:
    t.add("pos", pos)

  def split_form(form):
    # Split a comma-separated form value into accented, link-free forms,
    # warning about problems and dropping "-" placeholders.
    forms = re.split(r",\s*", form.strip())
    forms = [re.sub(r"^\[\[([^\[\]]*)\]\]$", r"\1", f) for f in forms]
    forms = [belib.add_accent_to_o(f) for f in forms]
    for f in forms:
      if "[[" in f:
        pagemsg("WARNING: Link in form %s: headword=%s, decl=%s" % (f, origt, origdeclt))
      if belib.needs_accents(f):
        pagemsg("WARNING: Form %s missing accents: headword=%s, decl=%s" % (f, origt, origdeclt))
    forms = [f for f in forms if f != "-"]
    return forms

  def handle_multiform(firstparam, restparam, form, formtr, declparam=None):
    # Write one (possibly multi-valued) form slot into the new template,
    # falling back to / cross-checking against the declension template param.
    if form:
      form = split_form(form)
    if declparam:
      if declparam == "-":
        declforms = ["-"]
      else:
        declforms = split_form(getparam(declt, declparam))
      if not form:
        form = declforms
      elif set(form) != set(declforms):
        pagemsg("WARNING: For %s=, headword form(s) %s disagree with decl form(s) %s: headword=%s, decl=%s" % (
          restparam, ",".join(form), ",".join(declforms), origt, origdeclt))
    if form:
      blib.set_param_chain(t, form, firstparam, restparam)
    if formtr:
      trparam = ("" if restparam == "head" else restparam) + "tr"
      if not form:
        # BUG FIX: this message formerly interpolated the literal string
        # "trparam" instead of the trparam variable, so it always printed
        # the word "trparam" rather than the actual parameter name.
        pagemsg("WARNING: Saw %s=%s but no %s=: %s" % (trparam, formtr, restparam, origt))
      elif len(form) > 1:
        pagemsg("WARNING: Saw %s=%s and multiple %ss %s: %s" % (trparam, formtr, restparam,
          ",".join(form), origt))
      t.add(trparam, formtr)

  # Map declension-template params onto the headword slots.
  decl_headparam = None
  decl_genparam = None
  decl_plparam = None
  if declt:
    decl_headparam = "1"
    tn = tname(declt)
    if tn == "be-decl-noun":
      decl_genparam = "3"
      decl_plparam = "2"
    elif tn == "be-decl-noun-unc":
      decl_genparam = "2"
      decl_plparam = "-"
    else:
      decl_genparam = "2"
    # Plural-only declension templates must go with plural-only genders, and
    # vice versa.
    if tn == "be-decl-noun-pl":
      for g in genders:
        if not g.endswith("-p"):
          pagemsg("WARNING: Mismatch between headword gender %s and decl template %s: %s" % (
            g, unicode(declt), origt))
    else:
      for g in genders:
        if g.endswith("-p"):
          pagemsg("WARNING: Mismatch between headword gender %s and decl template %s: %s" % (
            g, unicode(declt), origt))
  handle_multiform("1", "head", head, headtr, decl_headparam)
  blib.set_param_chain(t, genders, "2", "g")
  handle_multiform("3", "gen", gen, gentr, decl_genparam)
  # Keep positional ordering: if there's a plural but no genitive, reserve 3=.
  if not getp("3") and pl:
    t.add("3", "")
  handle_multiform("4", "pl", pl, pltr, decl_plparam)
  handle_multiform("m", "m", m, mtr)
  handle_multiform("f", "f", f, ftr)
  handle_multiform("collective", "collective", collective, collectivetr)
  if origt != unicode(t):
    notes.append("fix up {{%s}} to use new param convention" % tname(t))
    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return True
def process_text_on_page(index, pagetitle, text):
  """Convert {{autocat}} and {{langcatboiler}} on a category page to
  {{auto cat}}.

  Returns (new_text, notes) normally; returns None (page skipped, no save)
  when a langcatboiler param we don't understand is encountered.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  notes = []

  pagemsg("Processing")

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn == "autocat":
      blib.set_template_name(t, "auto cat")
      notes.append("{{autocat}} -> {{auto cat}}")
    elif tn == "langcatboiler":
      # First pattern keeps "Language" in the captured name (presumably for
      # sign languages like "American Sign Language" — verify); second handles
      # the normal "Category:Foo language" form.
      m = re.search("^Category:(.* Language)$", pagetitle)
      if not m:
        m = re.search("^Category:(.*) language$", pagetitle)
      if not m:
        pagemsg("WARNING: Can't parse page title")
        continue
      langname = m.group(1)
      t_lang = getparam(t, "1")
      if langname not in blib.languages_byCanonicalName:
        pagemsg("WARNING: Unrecognized language name: %s" % langname)
        continue
      langobj = blib.languages_byCanonicalName[langname]
      if langobj["code"] != t_lang:
        pagemsg(
          "WARNING: Auto-determined code %s for language name %s != manually specified %s"
          % (langobj["code"], langname, t_lang))
        continue
      numbered_params = []
      non_numbered_params = []
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value).strip()
        if pn == "1" or not pv:
          pass
        elif re.search("^[0-9]+$", pn):
          numbered_params.append(pv)
        elif pn not in [
          "setwiki", "setwikt", "setsister", "entryname"
        ]:
          # Note: aborts the whole page (returns None), not just this template.
          pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" % (pn, pv, unicode(t)))
          return
        elif (pn in ["setwiki", "setsister"] and pv == langname + " language"
            or pn == "entryname" and pv == langname
            or pn == "setwikt" and pv == langobj["code"]):
          # Param just restates the auto-derivable default; drop it.
          pagemsg("WARNING: Unnecessary param %s=%s, omitting: %s" % (pn, pv, unicode(t)))
        else:
          non_numbered_params.append((pn, pv))
      if len(numbered_params) == 0:
        if langobj["type"] == "reconstructed" or langobj[
            "family"] == "art":
          pagemsg(
            "Reconstructed or constructed language, allowing no countries"
          )
        else:
          pagemsg(
            "WARNING: No countries and not reconstructed or constructed language, adding UNKNOWN"
          )
          numbered_params.append("UNKNOWN")
      blib.set_template_name(t, "auto cat")
      del t.params[:]
      # BUG FIX: the loop variable here used to be `index`, shadowing the
      # page-index parameter that pagemsg() closes over, so every message
      # logged after this loop reported the enumeration index instead of the
      # page index.
      for i, numbered_param in enumerate(numbered_params):
        t.add(str(i + 1), numbered_param, preserve_spacing=False)
      for name, value in non_numbered_params:
        t.add(name, value, preserve_spacing=False)
      notes.append("convert {{%s}} to {{auto cat}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Clean up Sanskrit noun declension templates.

  For each {{sa-decl-noun-*}} following an {{sa-noun}} headword:
  * check the declension's gender against the headword gender(s);
  * supply a missing or Devanagari-valued 1= translit from the headword's
    tr= (or, failing that, the transliterated pagetitle);
  * strip hyphens and extraneous spaces from an existing translit;
  * convert old-style templates (keys of old_template_to_gender) to the
    gender-specific {{sa-decl-noun-<g>}} form.

  Returns (new_text, notes), or None if the page isn't applicable.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  notes = []
  if "sa-noun" not in text and "sa-decl-noun" not in text:
    return
  if ":" in pagetitle:
    pagemsg("Skipping non-mainspace title")
    return
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  headt = None
  saw_decl = False
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn == "sa-noun":
      pagemsg("Saw headt=%s" % unicode(t))
      if headt and not saw_decl:
        pagemsg("WARNING: Saw two {{sa-noun}} without {{sa-decl-noun}}: %s and %s" % (
          unicode(headt), unicode(t)))
      headt = t
      saw_decl = False
      continue
    if tn in ["sa-decl-noun", "sa-decl"]:
      pagemsg("WARNING: Saw raw {{%s}}: %s, headt=%s" % (
        tn, unicode(t), headt and unicode(headt) or None))
      continue
    if tn.startswith("sa-decl-noun-"):
      pagemsg("Saw declt=%s" % unicode(t))
      if not headt:
        pagemsg("WARNING: Saw {{%s}} without {{sa-noun}}: %s" % (tn, unicode(t)))
        continue
      saw_decl = True
      tr = getparam(headt, "tr")
      accented_tr = False
      if not tr:
        tr = expand_text("{{xlit|sa|%s}}" % pagetitle)
        pagemsg("WARNING: No translit in %s, using %s from pagetitle: declt=%s" % (
          unicode(headt), tr, unicode(t)))
      else:
        if "-" in tr:
          pagemsg("WARNING: Saw translit %s in head with hyphen: headt=%s, declt=%s" % (
            tr, unicode(headt), unicode(t)))
          tr = tr.replace("-", "")
        # NFD-decompose so acute/grave accents are separate combining chars;
        # restore ś (s + acute) first so it isn't mistaken for an accent.
        decomptr = unicodedata.normalize("NFD", tr).replace("s" + AC, u"ś")
        if AC not in decomptr and GR not in decomptr:
          pagemsg("WARNING: Saw translit %s in head without accent: headt=%s, declt=%s" % (
            tr, unicode(headt), unicode(t)))
        else:
          accented_tr = True
      # Normalize headword genders to atomic m/f/n, expanding combined codes.
      genders = blib.fetch_param_chain(headt, "g")
      genders = [g.replace("-p", "").replace("bysense", "") for g in genders]
      genders = [g for gs in genders for g in (
        ["m", "f"] if gs in ["mf", "fm"] else ["m", "n"] if gs in ["mn", "nm"] else [gs]
      )]
      if tn in ["sa-decl-noun-m", "sa-decl-noun-f", "sa-decl-noun-n"]:
        tg = tn[-1]
        if tg not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            tg, ",".join(genders), unicode(headt), unicode(t)))
          continue
        decltr = getparam(t, "1")
        if not decltr:
          if not accented_tr:
            pagemsg("WARNING: No param in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s"
              % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add (unaccented) translit %s to {{%s}}" % (tr, tn))
          else:
            pagemsg("WARNING: No param in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s"
              % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add accented translit %s to {{%s}}" % (tr, tn))
        elif re.search(u"[\u0900-\u097F]", decltr): # translit is actually Devanagari
          if not accented_tr:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s"
              % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            # FIX: args were previously swapped as (tr, tn), putting the
            # translit inside the {{...}} and the template name after it.
            notes.append("replace Devanagari in {{%s}} with (unaccented) translit %s" % (tn, tr))
          else:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s"
              % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            # FIX: args were previously swapped as (tr, tn); see above.
            notes.append("replace Devanagari in {{%s}} with accented translit %s" % (tn, tr))
        else:
          decompdecltr = unicodedata.normalize("NFD", decltr).replace("s" + AC, u"ś")
          subbed = False
          if AC not in decompdecltr and GR not in decompdecltr:
            if accented_tr:
              pagemsg("WARNING: Saw translit %s in decl without accent, subbing accented tr %s from head: headt=%s, declt=%s"
                % (decltr, tr, unicode(headt), unicode(t)))
              t.add("1", tr)
              notes.append("replace existing translit %s with accented translit %s in {{%s}}" % (decltr, tr, tn))
              subbed = True
            else:
              pagemsg("WARNING: Saw translit %s in decl without accent and unable to replace with accented tr from head: headt=%s, declt=%s"
                % (decltr, unicode(headt), unicode(t)))
          if not subbed and "-" in decltr:
            pagemsg("WARNING: Saw translit %s in decl with hyphen: headt=%s, declt=%s" % (
              decltr, unicode(headt), unicode(t)))
            notes.append("remove hyphen from existing translit %s in {{%s}}" % (decltr, tn))
            decltr = decltr.replace("-", "")
            t.add("1", decltr)
            subbed = True
          stripped_decltr = decltr.strip()
          if "\n" not in decltr and stripped_decltr != decltr:
            pagemsg("WARNING: Saw translit '%s' in decl with extraneous space: headt=%s, declt=%s" % (
              decltr, unicode(headt), unicode(t)))
            notes.append("remove extraneous space from existing translit '%s' in {{%s}}" % (decltr, tn))
            decltr = stripped_decltr
            t.add("1", decltr)
            subbed = True
        continue
      if tn in [u"sa-decl-noun-ī", u"sa-decl-noun-ī-f"] and getparam(t, "mono"):
        pagemsg("WARNING: Saw mono=, skipping: headt=%s, declt=%s" % (
          unicode(headt), unicode(t)))
        continue
      if tn in old_template_to_gender:
        must_continue = False
        for param in t.params:
          pn = pname(param)
          if pn not in ["1", "2", "3", "4", "n"]:
            pagemsg("WARNING: Saw unknown param %s=%s in %s: headt=%s" % (
              pn, unicode(param.value), unicode(t), unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue
        g = old_template_to_gender[tn]
        if g not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            g, ",".join(genders), unicode(headt), unicode(t)))
          continue
        blib.set_template_name(t, "sa-decl-noun-%s" % g)
        rmparam(t, "n")
        rmparam(t, "4")
        rmparam(t, "3")
        rmparam(t, "2")
        t.add("1", tr)
        notes.append("convert {{%s}} to {{sa-decl-noun-%s}}" % (tn, g))
      else:
        pagemsg("WARNING: Saw unrecognized decl template: %s" % unicode(t))
      if origt != unicode(t):
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  if headt:
    # NOTE(review): headt is never cleared once a declension is seen, so
    # this fires even when the final {{sa-noun}} did get a declension;
    # possibly intended to be `if headt and not saw_decl`. Left as-is
    # pending confirmation of intent.
    pagemsg("WARNING: Saw {{sa-noun}} without {{sa-decl-noun-*}}: %s" % unicode(headt))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Replace {{pt-noun form of}} with the generic form-of templates.

  Mapping (2=gender, 3=number):
  * masculine/combined gender + pl -> {{plural of|pt|LEMMA}}
  * f + sg                         -> {{female equivalent of|pt|LEMMA}}
  * f + pl -> {{plural of|pt|SG}} where SG is the pagetitle minus final -s
  A t= gloss is carried over as 4= (with empty 3=). Any unhandled
  combination logs a warning and leaves the page untouched (returns None).

  Returns (new_text, notes) normally.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  if not re.search(r"\{\{pt-noun form of", text):
    return
  pagemsg("Processing")
  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    # (A previously defined feminize_noun() helper was never called and has
    # been removed.)
    def singularize_feminine_noun(noun):
      # Only the regular -as -> -a pattern is handled; anything else is
      # reported and aborts conversion of this page.
      if noun.endswith("as"):
        return noun[:-1]
      pagemsg(
        "WARNING: Don't know how to compute singular equivalent of feminine noun %s: %s"
        % (noun, unicode(t)))
      return None
    if tn == "pt-noun form of":
      for param in t.params:
        pn = pname(param)
        if pn not in ["1", "2", "3", "4", "t", "nocap", "nodot"]:
          # FIX: was `unicode(param.value, origt)`, which passed origt as an
          # encoding argument to unicode() and left the 3-slot format string
          # one argument short, raising at runtime.
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (
            pn, unicode(param.value), origt))
          return
      lemma = blib.remove_links(getparam(t, "1"))
      gender = getparam(t, "2")
      number = getparam(t, "3")
      dimaug = getparam(t, "4")
      gloss = getparam(t, "t")
      if dimaug:
        pagemsg("WARNING: Not sure what to do with 4=%s: %s" % (dimaug, origt))
        return
      if gender in ["m", "mf", "m-f", "onlym", "onlyf"]:
        if number == "sg":
          pagemsg("WARNING: Not sure what to do with 2=%s 3=s: %s" % (gender, origt))
          return
        if number != "pl":
          pagemsg("WARNING: Unrecognized number 3=%s: %s" % (number, origt))
          return
        newname = "plural of"
      elif gender != "f":
        pagemsg("WARNING: Unrecognized gender 2=%s: %s" % (gender, origt))
        return
      else:
        if number == "sg":
          newname = "female equivalent of"
        elif number != "pl":
          pagemsg("WARNING: Unrecognized number 3=%s: %s" % (number, origt))
          return
        else:
          # Feminine plural: point {{plural of}} at the feminine singular,
          # derived from the page title rather than the (masculine) lemma.
          lemma = singularize_feminine_noun(pagetitle)
          if not lemma:
            return
          newname = "plural of"
      del t.params[:]
      blib.set_template_name(t, newname)
      t.add("1", "pt")
      t.add("2", lemma)
      if gloss:
        t.add("3", "")
        t.add("4", gloss)
      notes.append("replace {{pt-noun form of}} with {{%s|pt}}" % newname)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Convert German adjective-form entries to {{de-adj form of}}.

  Within a single-etymology German section:
  * {{inflection of|de|LEMMA||TAGS}} with recognized tags becomes
    {{de-adj form of|LEMMA}} (plus an explicit 2=ENDING when the lemma +
    ending doesn't derive the pagetitle), updating the {{head|de|...}} POS
    for comparative/superlative tags;
  * a bare {{comparative of}}/{{superlative of}} gets a new ===Adjective===
    subsection with {{de-adj form of}} appended after it;
  * an {{inflection of}} in -sten/-esten with no {{superlative of}} gets a
    superlative lemma section inserted before it.

  Returns (new_text, notes) on success, None when the page is skipped.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  if not re.search(
      r"\{\{head\|de\|(adjective (|comparative |superlative )|participle )form", text):
    return
  pagemsg("Processing")
  notes = []
  retval = blib.find_modifiable_lang_section(text, "German", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find German section")
    return
  sections, j, secbody, sectail, has_non_lang = retval
  if re.search("== *Etymology 1 *==", secbody):
    pagemsg("WARNING: Multiple etymology sections, skipping")
    return
  parsed = blib.parse_text(secbody)
  headt = None
  comparative_of_t = None
  superlative_of_t = None
  inflection_of_t = None
  need_superlative_of_t_lemma = None
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    def do_comparative_superlative_of(pos, existing_t, should_end):
      # Shared handler for {{comparative of}}/{{superlative of}}; returns
      # the template on success (so the caller records it) or False to make
      # the caller abort the page.
      if getparam(t, "1") != "de":
        pagemsg("WARNING: Saw wrong language in {{%s of}}, skipping: %s" % (pos, origt))
        return False
      if existing_t:
        pagemsg("WARNING: Saw two {{%s of}} templates, skipping: %s and %s" % (
          pos, unicode(existing_t), origt))
        return False
      if not headt:
        pagemsg("WARNING: Saw {{%s of}} without head template, skipping: %s" % (pos, origt))
        return False
      if not pagetitle.endswith(should_end):
        pagemsg("WARNING: Incorrect ending for %s, should be -%s, skipping" % (pos, should_end))
        return False
      param2 = getparam(headt, "2")
      if param2 != "%s adjective" % pos:
        headt.add("2", "%s adjective" % pos)
        notes.append("convert {{head|de|%s}} to {{head|de|%s adjective}}" % (param2, pos))
      return t
    if tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") in [
        "adjective form", "adjective comparative form", "adjective superlative form",
        "participle form"]:
      if headt:
        pagemsg("WARNING: Saw two head templates, skipping: %s and %s" % (
          unicode(headt), origt))
        return
      headt = t
    elif tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") == "verb form":
      pagemsg("Allowing and ignoring {{head|de|verb form}}: %s" % origt)
    elif tn == "head":
      pagemsg("WARNING: Saw unrecognized head template, skipping: %s" % origt)
      return
    elif tn == "comparative of":
      comparative_of_t = do_comparative_superlative_of(
        "comparative", comparative_of_t, "er")
      if not comparative_of_t:
        return
    elif tn == "superlative of":
      superlative_of_t = do_comparative_superlative_of(
        "superlative", superlative_of_t, "sten")
      if not superlative_of_t:
        return
    elif tn == "de-adj form of":
      pagemsg("Saw {{de-adj form of}}, assuming already converted: %s" % origt)
      return
    elif tn in ["inflection of", "infl of"]:
      if getparam(t, "1") != "de":
        pagemsg("WARNING: Saw wrong language in {{inflection of}}, skipping: %s" % origt)
        return
      if not headt:
        pagemsg("WARNING: Saw {{inflection of}} without head template, skipping: %s" % origt)
        return
      if inflection_of_t:
        pagemsg("WARNING: Saw {{inflection of}} twice, skipping: %s and %s" % (
          unicode(inflection_of_t), origt))
        return
      inflection_of_t = t
      lemma = getparam(t, "2")
      if getparam(t, "3"):
        pagemsg("WARNING: Saw alt form in {{inflection of}}, skipping: %s" % origt)
        return
      infl_tags = []
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value)
        if not re.search("^[0-9]+$", pn):
          pagemsg("WARNING: Saw unrecognized param %s=%s in {{inflection of}}, skipping: %s" % (
            pn, pv, origt))
          return
        # Tags start at 4=; 1=lang, 2=lemma, 3=alt (checked empty above).
        if int(pn) >= 4:
          infl_tags.append(pv)
      tags = "|".join(infl_tags)
      if tags not in tags_to_ending:
        pagemsg("WARNING: Saw unrecognized tags in {{inflection of}}, skipping: %s" % origt)
        return
      del t.params[:]
      ending = tags_to_ending[tags]
      if ending in ["sten", "esten"]:
        # Remember the lemma so a missing {{superlative of}} lemma section
        # can be synthesized after the scan.
        need_superlative_of_t_lemma = lemma
      blib.set_template_name(t, "de-adj form of")
      t.add("1", lemma)
      no_explicit = check_if_lemma_and_ending_match_pagetitle(
        lemma, ending, pagetitle, allow_umlaut=True)
      if not no_explicit:
        pagemsg("WARNING: Explicit ending %s required for lemma %s" % (ending, lemma))
        t.add("2", ending)
      notes.append("convert {{inflection of|de|...}} to {{de-adj form of}}")
      if "comd" in tags:
        param2 = getparam(headt, "2")
        if param2 != "comparative adjective form":
          headt.add("2", "comparative adjective form")
          notes.append("convert {{head|de|%s}} to {{head|de|comparative adjective form}}" % param2)
      elif "supd" in tags:
        param2 = getparam(headt, "2")
        if param2 != "superlative adjective form":
          headt.add("2", "superlative adjective form")
          notes.append("convert {{head|de|%s}} to {{head|de|superlative adjective form}}" % param2)
  secbody = unicode(parsed)
  def add_adj_form_of(secbody, pos, comparative_superlative_t, ending):
    # Append a new ===Adjective=== subsection with {{de-adj form of}} right
    # after the given {{comparative of}}/{{superlative of}} template.
    # Returns (secbody, ok); ok=False means abort the page.
    lemma = getparam(comparative_superlative_t, "2")
    if check_if_lemma_and_ending_match_pagetitle(lemma, ending, pagetitle,
        allow_umlaut=False):
      form_pos = "superlative adjective form" if pos == "superlative" else "adjective form"
      newsec = """

===Adjective===
{{head|de|%s}}

# {{de-adj form of|%s}}""" % (form_pos, lemma)
      secbody, replaced = blib.replace_in_text(secbody,
        unicode(comparative_superlative_t),
        unicode(comparative_superlative_t) + newsec, pagemsg, abort_if_warning=True)
      if not replaced:
        # FIX: previously referenced the closure variable comparative_of_t
        # here, logging the wrong template on the superlative path.
        pagemsg("WARNING: Couldn't add -%s inflection, skipping: %s" % (
          ending, unicode(comparative_superlative_t)))
        return secbody, False
      notes.append("add {{de-adj form of}} for %s" % pos)
    else:
      pagemsg("WARNING: Lemma %s + %s ending %s doesn't match pagetitle" % (
        lemma, pos, ending))
    return secbody, True
  if comparative_of_t and not inflection_of_t:
    secbody, ok = add_adj_form_of(secbody, "comparative", comparative_of_t, "er")
    if not ok:
      return
  if superlative_of_t and not inflection_of_t:
    secbody, ok = add_adj_form_of(secbody, "superlative", superlative_of_t, "sten")
    if not ok:
      return
  if inflection_of_t and not superlative_of_t and need_superlative_of_t_lemma:
    cursec = """===Adjective===
{{head|de|superlative adjective form}}

# %s""" % unicode(inflection_of_t)
    newsec = """===Adjective===
{{head|de|superlative adjective}}

# {{superlative of|de|%s}}

""" % need_superlative_of_t_lemma
    secbody, replaced = blib.replace_in_text(secbody, cursec, newsec + cursec,
      pagemsg, abort_if_warning=True)
    if not replaced:
      pagemsg("WARNING: Couldn't add {{superlative of}}, skipping: %s" %
        unicode(inflection_of_t))
      return
    notes.append("add {{superlative of|de|...}}")
  sections[j] = secbody + sectail
  text = "".join(sections)
  if not notes:
    pagemsg("WARNING: Couldn't convert page")
  return text, notes
def process_text_on_page(index, pagetitle, text):
  """Scan the Italian section of a page and emit "<respelling> ..." lines
  describing existing or defaulted pronunciation respellings, for offline
  post-processing.

  Walks the ==Pronunciation== subsections (a shared one above Etymology 1
  and/or per-etymology ones), extracts respellings from {{it-IPA}}/{{it-pr}}
  (rewriting ref params that cite known dictionaries into n:/<r:...>
  notation), and falls back to a default pronunciation derived from the
  pagetitle when none exists. Output is via msg() only; the page text is
  never modified and nothing is returned.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  retval = blib.find_modifiable_lang_section(text,
    None if args.partial_page else "Italian", pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  # Alternating [text, header, text, header, ...]; headers at odd indices.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  has_etym_sections = "==Etymology 1==" in secbody
  saw_pronun_section_at_top = False
  split_pronun_sections = False
  saw_pronun_section_this_etym_section = False
  saw_existing_pron = False
  saw_existing_pron_this_etym_section = False
  etymsection = "top" if has_etym_sections else "all"
  etymsections_to_first_subsection = {}
  if etymsection == "top":
    # Determine whether pronunciation lives above Etymology 1 (shared) or
    # is split among the individual etymology sections.
    after_etym_1 = False
    for k in xrange(2, len(subsections), 2):
      if "==Etymology 1==" in subsections[k - 1]:
        after_etym_1 = True
      if "==Pronunciation==" in subsections[k - 1]:
        if after_etym_1:
          split_pronun_sections = True
        else:
          saw_pronun_section_at_top = True
      m = re.search("==Etymology ([0-9]*)==", subsections[k - 1])
      if m:
        etymsections_to_first_subsection[int(m.group(1))] = k
  msgs = []
  def append_msg(txt):
    if txt not in msgs:
      msgs.append(txt)
  def apply_default_pronun_to_pagetitle():
    # FIX: loop variable renamed from `msg`, which shadowed the global
    # msg() logging function inside this helper.
    respellings, this_msgs = apply_default_pronun(pagetitle)
    for this_msg in this_msgs:
      append_msg(this_msg)
    return respellings
  # FIX: previously defined inside the loop below, so the call after the
  # loop raised NameError whenever the loop body never ran (Italian section
  # with no subsection headers). Defined once up front; it closes over the
  # same function-level state, so behavior is otherwise unchanged.
  def check_missing_pronun(etymsection):
    if split_pronun_sections and not saw_existing_pron_this_etym_section:
      pagemsg("WARNING: Missing pronunciations in etym section %s" % etymsection)
      append_msg("MISSING_PRONUN")
      append_msg("NEW_DEFAULTED")
      respellings = apply_default_pronun_to_pagetitle()
      pagemsg("<respelling> %s: %s <end> %s" % (
        etymsection, " ".join(respellings), " ".join(msgs)))
      #pagemsg("<respelling> %s: %s <end> %s" % ("top" if has_etym_sections else "all",
      #  " ".join(x.replace(" ", "_") for x in respellings), " ".join(msgs)))
  for k in xrange(2, len(subsections), 2):
    msgs = []
    m = re.search("==Etymology ([0-9]*)==", subsections[k - 1])
    if m:
      if etymsection != "top":
        check_missing_pronun(etymsection)
      etymsection = m.group(1)
      saw_pronun_section_this_etym_section = False
      saw_existing_pron_this_etym_section = False
    if "==Pronunciation " in subsections[k - 1]:
      pagemsg("WARNING: Saw Pronunciation N section header: %s" %
        subsections[k - 1].strip())
    if "==Pronunciation==" in subsections[k - 1]:
      if saw_pronun_section_this_etym_section:
        pagemsg("WARNING: Saw two Pronunciation sections under etym section %s" % etymsection)
      if saw_pronun_section_at_top and etymsection != "top":
        pagemsg("WARNING: Saw Pronunciation sections both at top and in etym section %s" % etymsection)
      saw_pronun_section_this_etym_section = True
      parsed = blib.parse_text(subsections[k])
      respellings = []
      prev_it_IPA_t = None
      prev_it_pr_t = None
      must_continue = False
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "it-IPA":
          saw_existing_pron = True
          saw_existing_pron_this_etym_section = True
          if prev_it_IPA_t:
            pronun_lines = re.findall(r"^.*\{\{it-IPA.*$", subsections[k], re.M)
            pagemsg("WARNING: Saw multiple {{it-IPA}} templates in a single Pronunciation section: %s"
              % " ||| ".join(pronun_lines))
            must_continue = True
            break
          prev_it_IPA_t = t
          this_respellings = []
          saw_pronun = False
          last_numbered_param = 0
          for param in t.params:
            pn = pname(param)
            pv = unicode(param.value).strip().replace(" ", "_")
            if re.search("^[0-9]+$", pn):
              last_numbered_param += 1
              saw_pronun = True
              if pv == "+":
                append_msg("EXISTING_DEFAULTED")
                this_respellings.extend(apply_default_pronun_to_pagetitle())
              else:
                append_msg("EXISTING")
                this_respellings.append(pv)
            elif re.search("^ref[0-9]*$", pn) and int(pn[3:] or "1") == last_numbered_param:
              # A refN= directly attached to the Nth respelling citing a
              # known dictionary becomes compact n:NAME notation.
              m = re.search(r"^\{\{R:it:(DiPI|Olivetti|Treccani|Trec)(\|[^{}]*)?\}\}$", pv)
              if m:
                refname, refparams = m.groups()
                refname = "Treccani" if refname == "Trec" else refname
                this_respellings.append("n:%s%s" % (refname, refparams or ""))
              else:
                this_respellings.append("%s=%s" % (pn, pv))
            else:
              this_respellings.append("%s=%s" % (pn, pv))
          if not saw_pronun:
            append_msg("EXISTING_DEFAULTED")
            this_respellings.extend(apply_default_pronun_to_pagetitle())
          respellings.extend(this_respellings)
        if tn == "it-pr":
          saw_existing_pron = True
          saw_existing_pron_this_etym_section = True
          if prev_it_pr_t:
            pronun_lines = re.findall(r"^.*\{\{it-pr.*$", subsections[k], re.M)
            pagemsg("WARNING: Saw multiple {{it-pr}} templates in a single Pronunciation section: %s"
              % " ||| ".join(pronun_lines))
            must_continue = True
            break
          prev_it_pr_t = t
          this_respellings = []
          saw_pronun = False
          for param in t.params:
            pn = pname(param)
            pv = unicode(param.value).strip().replace(" ", "_")
            if re.search("^[0-9]+$", pn):
              saw_pronun = True
              #if pv == "+":
              #  append_msg("EXISTING_DEFAULTED")
              #  this_respellings.extend(apply_default_pronun_to_pagetitle())
              #else:
              def fix_ref(m):
                refname, refparams = m.groups()
                refname = "Treccani" if refname == "Trec" else refname
                return "<r:%s%s>" % (refname, refparams or "")
              pv = re.sub(r"<ref:\{\{R:it:(DiPI|Olivetti|Treccani|Trec|DOP)(\|[^{}]*)?\}\}>",
                fix_ref, pv)
              append_msg("EXISTING")
              this_respellings.append(pv)
            else:
              this_respellings.append("%s=%s" % (pn, pv))
          if not saw_pronun:
            append_msg("EXISTING_DEFAULTED")
            #this_respellings.extend(apply_default_pronun_to_pagetitle())
            this_respellings.append("+")
          respellings.extend(this_respellings)
      if must_continue:
        continue
      if args.include_defns and etymsection not in ["top", "all"]:
        first_etym_subsec = etymsections_to_first_subsection.get(int(etymsection), None)
        next_etym_subsec = etymsections_to_first_subsection.get(1 + int(etymsection), None)
        if first_etym_subsec is None:
          pagemsg("WARNING: Internal error: Unknown first etym section for =Etymology %s=" % etymsection)
        else:
          if next_etym_subsec is None:
            next_etym_subsec = len(subsections)
          defns = blib.find_defns(
            "".join(subsections[first_etym_subsec:next_etym_subsec]), "it")
          append_msg("defns: %s" % ";".join(defns))
      if respellings:
        pagemsg("<respelling> %s: %s <end> %s" % (
          etymsection, " ".join(respellings), " ".join(msgs)))
  # Catch a final etym section with no pronunciation of its own.
  check_missing_pronun(etymsection)
  if not saw_existing_pron:
    if args.include_defns and has_etym_sections:
      for etymsec in sorted(list(etymsections_to_first_subsection.keys())):
        msgs = []
        first_etym_subsec = etymsections_to_first_subsection[etymsec]
        next_etym_subsec = etymsections_to_first_subsection.get(1 + etymsec, None)
        if next_etym_subsec is None:
          next_etym_subsec = len(subsections)
        append_msg("NEW_DEFAULTED")
        defns = blib.find_defns(
          "".join(subsections[first_etym_subsec:next_etym_subsec]), "it")
        append_msg("defns: %s" % ";".join(defns))
        respellings = apply_default_pronun_to_pagetitle()
        pagemsg("<respelling> %s: %s <end> %s" % (
          etymsec, " ".join(respellings), " ".join(msgs)))
    else:
      msgs = []
      append_msg("NEW_DEFAULTED")
      respellings = apply_default_pronun_to_pagetitle()
      pagemsg("<respelling> %s: %s <end> %s" % (
        "top" if has_etym_sections else "all",
        " ".join(respellings), " ".join(msgs)))
def process_page(page, index, parsed):
  """Modernize Bulgarian adjective-form entries.

  Pass 1 rewrites {{bg-adj-form}} as {{head|bg|adjective form}}, preserving
  head=/3= (gender) and defaulting head= to the pagetitle when it needs no
  accents. Pass 2 rewrites {{bg-adj form of|...|adj=LEMMA}} as
  {{inflection of|bg|ACCENTED_LEMMA||TAGS}}, mapping the old
  definiteness/case/gender params to inflection tags, and warns about head
  templates left without any inflection template.

  Returns (new_text, notes).
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  pagemsg("Processing")
  # Pass 1: {{bg-adj-form}} -> {{head|bg|adjective form}}.
  for t in parsed.filter_templates():
    if tname(t) == "bg-adj-form":
      origt = unicode(t)
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "head"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (
            pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      rmparam(t, "1")
      rmparam(t, "2")
      head = getparam(t, "head")
      rmparam(t, "head")
      g = getparam(t, "3")
      rmparam(t, "3")
      blib.set_template_name(t, "head")
      t.add("1", "bg")
      t.add("2", "adjective form")
      if head:
        t.add("head", head)
      else:
        if bglib.needs_accents(pagetitle):
          pagemsg("WARNING: Can't add head= to {{bg-adj-form}} missing it because pagetitle is multisyllabic: %s"
            % unicode(t))
        else:
          t.add("head", pagetitle)
      if g:
        t.add("g", g)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("replace {{bg-adj-form}} with {{head|bg|adjective form}}")
  # Pass 2: {{bg-adj form of}} -> {{inflection of}}.
  headt = None
  saw_infl_after_head = False
  saw_headt = False
  saw_inflt = False
  # (Unused locals saw_infl/already_fetched_forms and a dead tn=tname(t)
  # reassignment were removed.)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") == "adjective form":
      saw_headt = True
      if headt and not saw_infl_after_head:
        pagemsg("WARNING: Saw two head templates %s and %s without intervening inflection" % (
          unicode(headt), origt))
      saw_infl_after_head = False
      headt = t
    if tn == "bg-adj form of":
      saw_inflt = True
      if not headt:
        pagemsg("WARNING: Saw {{bg-adj form of}} without head template: %s" % origt)
        continue
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "adj"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (
            pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      saw_infl_after_head = True
      adj = getparam(t, "adj")
      if not adj:
        pagemsg("WARNING: Didn't see adj=: %s" % origt)
        continue
      infls = []
      param2 = getparam(t, "2")
      if param2 == "indefinite":
        infls.append("indef")
      elif param2 == "definite":
        infls.append("def")
      elif param2 == "extended":
        infls.append("voc")
      else:
        pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
        continue
      param3 = getparam(t, "3")
      if param3 == "subject":
        infls.append("sbjv")
      elif param3 == "object":
        infls.append("objv")
      elif param3:
        pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
        continue
      param1 = getparam(t, "1")
      if param1 == "masculine":
        infls.extend(["m", "s"])
      elif param1 == "feminine":
        infls.extend(["f", "s"])
      elif param1 == "neuter":
        infls.extend(["n", "s"])
      elif param1 == "plural":
        infls.append("p")
      else:
        pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
        continue
      blib.set_template_name(t, "inflection of")
      del t.params[:]
      t.add("1", "bg")
      if adj in adjs_to_accents:
        adj = adjs_to_accents[adj]
      else:
        pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (adj, origt))
      t.add("2", adj)
      t.add("3", "")
      # Inflection tags start at 4=.
      for i, infl in enumerate(infls):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{bg-adj form of}} to {{inflection of}}")
    elif tn == "inflection of" and getparam(t, "1") == "bg":
      saw_inflt = True
  if saw_headt and not saw_inflt:
    pagemsg("WARNING: Saw head template %s but no inflection template" % unicode(headt))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text): def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) def expand_text(tempcall): return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose) def verify_template_is_full_line(tn, line): templates = list(blib.parse_text(line).filter_templates()) if type(tn) is list: tns = tn else: tns = [tn] tntext = "/".join(tns) if len(templates) == 0: pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" % (tntext, line)) return None t = templates[0] if tname(t) not in tns: pagemsg("WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s" % (tntext, tntext, line)) return None if unicode(t) != line: pagemsg("WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s" % (tntext, tntext, line)) return None return t notes = [] retval = blib.find_modifiable_lang_section(text, None if program_args.partial_page else "Italian", pagemsg, force_final_nls=True) if retval is None: return sections, j, secbody, sectail, has_non_lang = retval subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M) sect_for_wiki = 0 for k in xrange(1, len(subsections), 2): if re.search(r"==\s*Etymology [0-9]+\s*==", subsections[k]): sect_for_wiki = k + 1 elif re.search(r"==\s*Pronunciation\s*==", subsections[k]): secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation", subsections[k]) if secheader != subsections[k]: subsections[k] = secheader notes.append("remove extraneous spaces in ==Pronunciation== header") extra_notes = [] parsed = blib.parse_text(subsections[k + 1]) num_it_IPA = 0 saw_it_pr = False for t in parsed.filter_templates(): tn = tname(t) if tn in ["it-pr", "it-pronunciation"]: saw_it_pr = True break if tn == "it-IPA": num_it_IPA += 1 if saw_it_pr: pagemsg("Already saw {{it-pr}}, skipping: %s" % unicode(t)) continue if num_it_IPA == 0: pagemsg("WARNING: Didn't see {{it-IPA}} in Pronunciation section, skipping") continue if num_it_IPA > 1: pagemsg("WARNING: Saw 
multiple {{it-IPA}} in Pronunciation section, skipping") continue lines = subsections[k + 1].strip().split("\n") # Remove blank lines. lines = [line for line in lines if line] hyph_lines = [] homophone_lines = [] rfap_lines = [] rhyme_lines = [] must_continue = False audioarg = "" args = [] bare_args = [] args_for_hyph = [] lines_so_far = [] for lineind, line in enumerate(lines): origline = line lines_so_far.append(line) # In case of "* {{it-IPA|...}}", chop off the "* ". line = re.sub(r"^\*\s*(\{\{it-IPA)", r"\1", line) if line.startswith("{{it-IPA"): if args: pagemsg("WARNING: Something wrong, already saw {{it-IPA}}?: %s" % origline) must_continue = True break outer_ref_arg = None m = re.search("^(.*?) *<ref>(.*?)</ref>$", line) if m: line, outer_ref_arg = m.groups() ipat = verify_template_is_full_line("it-IPA", line) if ipat is None: must_continue = True break bare_args = blib.fetch_param_chain(ipat, "1") or [u"+"] bare_args = [u"+" if arg == pagetitle else arg for arg in bare_args] bare_args = [adjust_initial_capital(arg, pagetitle, pagemsg, origline) for arg in bare_args] bare_args = [re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], arg) for arg in bare_args] normalized_bare_args = [ normalize_bare_arg(arg, pagetitle, lambda msg: pagemsg("%s: %s" % (msg, origline))) for arg in bare_args ] if None in normalized_bare_args: must_continue = True break args = [x for x in bare_args] args_for_hyph = [] for arg in normalized_bare_args: hypharg = ( arg.replace("ddz", "zz").replace("tts", "zz").replace("dz", "z").replace("ts", "z") .replace("Dz", "Z").replace("Ts", "Z").replace("[s]", "s").replace("[z]", "z") ) hypharg = re.sub(pron_sign_c, "", hypharg) putative_pagetitle = remove_secondary_stress(hypharg.replace(".", "").replace("_", "")) putative_pagetitle = remove_non_final_accents(putative_pagetitle) # Check if the normalized pronunciation is the same as the page title, if so use the semi-normalized # pronunciation for hyphenation. 
If a word in the page title is a single syllable, it may or may not # have an accent on it, so also remove final monosyllabic accents from the normalized pronunciation # when comparing. (Don't remove from both normalized pronunciation and page title because we don't want # pronunciation rè to match page title ré or vice versa.) if putative_pagetitle == pagetitle or remove_final_monosyllabic_accents(putative_pagetitle) == pagetitle: args_for_hyph.append(hypharg) for param in ipat.params: pn = pname(param) pv = unicode(param.value) if re.search("^[0-9]+$", pn): continue m = re.search("^(ref|qual)([0-9]*)$", pn) if m: parampref, argnum = m.groups() argnum = int(argnum or "1") - 1 if argnum >= len(args): pagemsg("WARNING: Argument %s=%s specifies nonexistent pronun, skipping: %s" % ( pn, pv, origline)) must_continue = True break args[argnum] += "<%s:%s>" % (parampref, pv) else: pagemsg("WARNING: Unrecognized param %s=%s in {{it-IPA}}, skipping: %s" % ( pn, pv, origline)) must_continue = True break if must_continue: break if outer_ref_arg: if "<ref:" in args[-1]: pagemsg("WARNING: Trying to add outside ref %s into {{it-IPA}} but already has ref in arg %s, skipping: %s" % (outer_ref_arg, args[-1], origline)) must_continue = True break else: args[-1] += "<ref:%s>" % outer_ref_arg extra_notes.append("incorporate outer <ref>...</ref> into {{it-pr}}") continue if line.startswith("{{rfap"): line = "* " + line if line.startswith("{{wiki"): subsections[sect_for_wiki] = line + "\n" + subsections[sect_for_wiki] # Remove the {{wikipedia}} line from lines seen so far. Put back the remaining lines in case we # run into a problem later on, so we don't end up duplicating the {{wikipedia}} line. We accumulate # lines like this in case for some reason we have two {{wikipedia}} lines in the Pronunciation section. 
del lines_so_far[-1] subsections[k + 1] = "%s\n\n" % "\n".join(lines_so_far + lines[lineind + 1:]) notes.append("move {{wikipedia}} line to top of etym section") continue if not line.startswith("* ") and not line.startswith("*{"): pagemsg("WARNING: Pronunciation section line doesn't start with '* ', skipping: %s" % origline) must_continue = True break if line.startswith("* "): line = line[2:] else: line = line[1:] if line.startswith("{{hyph"): hyph_lines.append("* " + line) elif line.startswith("{{homophone"): homophone_lines.append("* " + line) elif line.startswith("{{rfap"): rfap_lines.append(line) elif line.startswith("{{audio"): audiot = verify_template_is_full_line("audio", line) if audiot is None: must_continue = True break if getparam(audiot, "1") != "it": pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % origline) must_continue = True break audiofile = getparam(audiot, "2") audiogloss = getparam(audiot, "3") for param in audiot.params: pn = pname(param) pv = unicode(param.value) if pn not in ["1", "2", "3"]: pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % ( pn, pv, origline)) must_continue = True break if must_continue: break if audiogloss in ["Audio", "audio"]: audiogloss = "" if audiogloss: audiogloss = ";%s" % audiogloss audiopart = "<audio:%s%s>" % (audiofile, audiogloss) audioarg += audiopart pagemsg("Replacing %s with argument part %s" % (unicode(audiot), audiopart)) extra_notes.append("incorporate %s into {{it-pr}}" % unicode(audiot)) elif line.startswith("{{rhyme"): rhyme_lines.append(line) elif remove_accents(line) == remove_accents(pagetitle): pagemsg("Ignoring Pronunciation section line that looks like a possibly-accented page title: %s" % origline) else: pagemsg("WARNING: Unrecognized Pronunciation section line, skipping: %s" % origline) must_continue = True break if must_continue: continue if rhyme_lines: rhyme_error = False rhyme_pronuns = [] for bare_arg in normalized_bare_args: pronun = 
expand_text(u"{{#invoke:it-pronunciation|to_phonemic_bot|%s}}" % re.sub(pron_sign_c, "", bare_arg)) if not pronun: rhyme_error = True break rhyme_pronun = ( re.sub(u"^[^aeiouɛɔ]*", "", re.sub(u".*[ˌˈ]", "", pronun)).replace(TIE, "") .replace(".", "")) if rhyme_pronun not in rhyme_pronuns: rhyme_pronuns.append(rhyme_pronun) if not rhyme_error: saw_non_matching_rhyme = False normalized_rhymes = [] rhyme_line_text = ", ".join(rhyme_lines) normalized_bare_arg_text = ",".join(normalized_bare_args) rhyme_pronun_text = ",".join(rhyme_pronuns) for rhyme_line in rhyme_lines: rhymet = verify_template_is_full_line(["rhyme", "rhymes"], rhyme_line) if not rhymet: break if getparam(rhymet, "1") != "it": pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(rhymet), rhyme_line)) break rhymes = [] must_break = False num_syl = "" rhyme_specific_num_syl = [] for param in rhymet.params: pn = pname(param) pv = unicode(param.value) if not re.search("^s?[0-9]*$", pn): pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" % (pn, pv, tname(rhymet), rhyme_line)) must_break = True break if pn == "s": num_syl = "<s:%s>" % pv elif pn.startswith("s"): rhyme_no = int(pn[1:]) - 1 rhyme_specific_num_syl.append((rhyme_no, pv)) elif int(pn) > 1: if pv: rhymes.append([pv, ""]) if must_break: break for rhyme_no, this_num_syl in rhyme_specific_num_syl: if rhyme_no >= len(rhymes): pagemsg("WARNING: Argument s%s=%s specifies nonexistent rhyme, skipping: %s" % ( rhyme_no + 1, this_num_syl, rhyme_line)) must_break = True break rhymes[rhyme_no][1] = "<s:%s>" % this_num_syl if must_break: break for rhyme, this_num_syl in rhymes: normalized_rhyme = re.sub(u"([aeɛoɔu])i", r"\1j", rhyme).replace("sm", "zm") normalized_rhyme = re.sub(u"a[uu̯](" + C + ")", r"aw\1", normalized_rhyme) this_num_syl = this_num_syl or num_syl if this_num_syl and not args_for_hyph and not hyph_lines: pagemsg("WARNING: Explicit number of syllables %s given for explicit rhyme %s and no default or 
explicit hyphenation: %s" % (this_num_syl, rhyme, rhyme_line_text)) saw_non_matching_rhyme = True normalized_rhymes.append(normalized_rhyme + this_num_syl) else: normalized_rhymes.append(normalized_rhyme) if rhyme in rhyme_pronuns: pagemsg("Removing explicit rhyme %s, same as pronunciation-based rhyme for spelling(s) '%s': %s" % (rhyme, normalized_bare_arg_text, rhyme_line_text)) elif normalized_rhyme in rhyme_pronuns: pagemsg("Removing explicit rhyme %s normalized to %s, same as pronunciation-based rhyme for spelling(s) '%s': %s" % (rhyme, normalized_rhyme, normalized_bare_arg_text, rhyme_line_text)) elif rhyme != normalized_rhyme: pagemsg("WARNING: Explicit rhyme %s normalized to %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s" % (rhyme, normalized_rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text)) saw_non_matching_rhyme = True else: pagemsg("WARNING: Explicit rhyme %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s" % (rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text)) saw_non_matching_rhyme = True else: # no break if saw_non_matching_rhyme: pagemsg("Not all explicit rhymes (%s) could be matched against pronunciation-based rhyme(s) (%s) for spelling(s) '%s', adding explicitly: %s" % (",".join(normalized_rhymes), rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text)) args[-1] += "<rhyme:%s>" % ",".join(normalized_rhymes) extra_notes.append("incorporate non-default rhymes into {{it-pr}}") else: extra_notes.append("remove rhymes that are generated automatically by {{it-pr}}") rhyme_lines = [] if not args: pagemsg("WARNING: Something wrong, didn't see {{it-IPA}}?") continue args[-1] += audioarg if hyph_lines: if len(hyph_lines) > 1: pagemsg("WARNING: Multiple hyphenation lines, not removing: %s" % ", ".join(hyph_lines)) else: assert hyph_lines[0].startswith("* ") hyph_line = hyph_lines[0][2:] hyph_templates = re.split(", *", hyph_line) hyphs = [] for hyph_template in 
hyph_templates: hypht = verify_template_is_full_line(["hyph", "hyphenation"], hyph_template) if not hypht: break syls = [] if getparam(hypht, "1") != "it": pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hypht), hyph_template)) break else: must_break = False for param in hypht.params: pn = pname(param) pv = unicode(param.value) if not re.search("^[0-9]+$", pn) and pn != "nocaption": pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" % (pn, pv, tname(hypht), hyph_line)) must_break = True break if pn != "nocaption" and int(pn) > 1: if not pv: hyphs.append(syls) syls = [] else: syls.append(pv) if must_break: break if syls: hyphs.append(syls) else: # no break if hyphs: specified_hyphenations = [".".join(syls) for syls in hyphs] specified_hyphenations = [ re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], hyph) for hyph in specified_hyphenations] specified_hyphenations = [re.sub("''+", "", hyph) for hyph in specified_hyphenations] specified_hyphenations = [ adjust_initial_capital(hyph, pagetitle, pagemsg, hyph_line) for hyph in specified_hyphenations] specified_hyphenations = [re.sub(u"î([ -]|$)", r"i\1", hyph) for hyph in specified_hyphenations] hyphenations = [syllabify_from_spelling(arg) for arg in args_for_hyph] if set(specified_hyphenations) < set(hyphenations): pagemsg("Removing explicit hyphenation(s) %s that are a subset of auto-hyphenation(s) %s: %s" % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line)) elif set(specified_hyphenations) != set(hyphenations): hyphenations_without_accents = [remove_accents(hyph) for hyph in hyphenations] rehyphenated_specified_hyphenations = [syllabify_from_spelling(hyph) for hyph in specified_hyphenations] def indices_of_syllable_markers(hyph): # Get the character indices of the syllable markers, but not counting the syllable markers themselves # (i.e. return the number of characters preceding the syllable marker). 
raw_indices = [ind for ind, ch in enumerate(hyph) if ch == "."] adjusted_indices = [ind - offset for offset, ind in enumerate(raw_indices)] return set(adjusted_indices) if set(specified_hyphenations) == set(hyphenations_without_accents): pagemsg("Removing explicit hyphenation(s) %s that are missing accents but otherwise same as auto-hyphenation(s) %s: %s" % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line)) elif set(rehyphenated_specified_hyphenations) == set(hyphenations): pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified by rehyphenation): %s" % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line)) elif (len(specified_hyphenations) == 1 and len(hyphenations) == 1 and specified_hyphenations[0].replace(".", "") == hyphenations[0].replace(".", "") and indices_of_syllable_markers(specified_hyphenations[0]) < indices_of_syllable_markers(hyphenations[0])): pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified that explicit hyphenation indices are subset of auto-hyphenation indices): %s" % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line)) else: if not hyphenations: pagemsg("WARNING: Explicit hyphenation(s) %s but no auto-hyphenations, adding explicitly: %s" % (",".join(specified_hyphenations), hyph_line)) else: pagemsg("WARNING: Explicit hyphenation(s) %s not equal to auto-hyphenation(s) %s, adding explicitly: %s" % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line)) args[-1] += "<hyph:%s>" % ",".join(specified_hyphenations) extra_notes.append("incorporate non-default hyphenations into {{it-pr}}") else: pagemsg("Removed explicit hyphenation(s) same as auto-hyphenation(s): %s" % hyph_line) extra_notes.append("remove hyphenations that are generated automatically by {{it-pr}}") hyph_lines = [] if homophone_lines: if len(homophone_lines) > 
1: pagemsg("WARNING: Multiple homophone lines, not removing: %s" % ", ".join(homophone_lines)) else: assert homophone_lines[0].startswith("* ") homophone_line = homophone_lines[0][2:] homophones = {} homophone_qualifiers = {} hmpt = verify_template_is_full_line(["hmp", "homophone", "homophones"], homophone_line) if hmpt: if getparam(hmpt, "1") != "it": pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hmpt), homophone_line)) else: for param in hmpt.params: pn = pname(param) pv = unicode(param.value) if not re.search("^q?[0-9]+$", pn): pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" % (pn, pv, tname(hmpt), homophone_line)) break if pn.startswith("q"): homophone_qualifiers[int(pn[1:])] = pv elif int(pn) > 1: homophones[int(pn) - 1] = pv else: # no break hmp_args = [] for pn, pv in sorted(homophones.items()): hmp_args.append(pv) if pn in homophone_qualifiers: hmp_args[-1] += "<qual:%s>" % homophone_qualifiers[pn] args[-1] += "<hmp:%s>" % ",".join(hmp_args) extra_notes.append("incorporate homophones into {{it-pr}}") homophone_lines = [] if args == ["+"]: it_pr = "{{it-pr}}" else: it_pr = "{{it-pr|%s}}" % ",".join(args) pagemsg("Replaced %s with %s" % (unicode(ipat), it_pr)) all_lines = "\n".join([it_pr] + rhyme_lines + rfap_lines + hyph_lines + homophone_lines) newsubsec = "%s\n\n" % all_lines if subsections[k + 1] != newsubsec: this_notes = ["convert {{it-IPA}} to {{it-pr}}"] + extra_notes notes.extend(this_notes) subsections[k + 1] = newsubsec secbody = "".join(subsections) # Strip extra newlines added to secbody sections[j] = secbody.rstrip("\n") + sectail return "".join(sections), notes