def process_text_on_page(index, pagetitle, text):
  """Sanity-check all Latin headword templates on a page: each headword,
  with links and macrons stripped, must match the page title. Mismatches
  are logged as warnings. Always returns (None, None), i.e. this pass
  never modifies the page.

  index: numeric index of the page (for logging).
  pagetitle: title of the page being processed.
  text: full wikitext of the page.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  if not args.stdin:
    pagemsg("Processing")
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in lalib.la_headword_templates:
      # A headword template can supply multiple heads; check each one.
      for head in lalib.la_get_headword_from_template(
          t, pagetitle, pagemsg):
        no_macrons_head = remove_macrons(blib.remove_links(head))
        if pagetitle.startswith("Reconstruction"):
          # Reconstruction pages are titled "Reconstruction:LANG/TERM";
          # the headword for such entries is conventionally "*TERM".
          unprefixed_title = "*" + re.sub(".*/", "", pagetitle)
        else:
          unprefixed_title = pagetitle
        if no_macrons_head != unprefixed_title:
          pagemsg("WARNING: Bad Latin head: %s" % unicode(t))
  return None, None
def investigate_possible_adj(index, adj_pagename, adv, adv_defns):
  """Look up a candidate adjective page for the adverb ``adv``; if the page
  exists and its Latin section contains an adjective or participle headword
  ({{la-adj}} or {{la-part}}), log a "///"-separated line pairing the adverb
  and its definitions with the adjective and its definitions, for later
  manual review.

  index: numeric index of the page being processed (for logging).
  adj_pagename: title of the candidate adjective page.
  adv: the adverb (possibly with macrons).
  adv_defns: list of definition strings found for the adverb.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, adj_pagename, txt))
  pagemsg("Trying for adverb %s" % adv)
  page = pywikibot.Page(site, adj_pagename)
  if not page.exists():
    pagemsg("Doesn't exist for adverb %s" % adv)
    return
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  # Odd-indexed entries are the captured subsection headers; even-indexed
  # entries (starting at 2) are the subsection bodies.
  subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    for t in parsed.filter_templates():
      # Fix: dropped the unused local `origt = unicode(t)` present in the
      # original; it was assigned but never referenced.
      tn = tname(t)
      if tn in ["la-adj", "la-part"]:
        # Take the first returned headword as the adjective lemma.
        adj = lalib.la_get_headword_from_template(
          t, adj_pagename, pagemsg)[0]
        adj_defns = lalib.find_defns(subsections[k])
        msg("%s /// %s /// %s /// %s" % (adv, adj, ";".join(adv_defns), ";".join(adj_defns)))
def process_page(page, index, add_dot_after_i, convert_j):
  """Adjust the first parameter of {{la-IPA}} templates in the Latin
  section of a page so that an initial i/j after a known prefix gets the
  intended pronunciation:

  - if ``add_dot_after_i``, insert a dot after prefix-initial "i" before a
    vowel to force a vocalic reading;
  - if ``convert_j``, convert prefix-initial "j" before a vowel to "i"
    (the j is no longer needed to force a consonantal reading).

  Returns (newtext, notes) on success or (None, None) if no Latin section
  was found.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    if tname(t) == "la-IPA":
      # 1= defaults to the page title when absent.
      param1 = getparam(t, "1") or pagetitle
      for prefix in prefixes:
        # Entries in `prefixes` are either a plain string, or a
        # [plain, macroned] pair -- TODO confirm against the definition
        # of `prefixes` elsewhere in this file.
        if type(prefix) is list:
          prefix, macron_prefix = prefix
        else:
          macron_prefix = prefix
        orig_param1 = param1
        if re.search("^%s[ij]" % macron_prefix, param1):
          if re.search(u"^%si%s" % (macron_prefix, vowel_re), param1) and add_dot_after_i:
            param1 = re.sub("^%si" % macron_prefix, "%si." % macron_prefix, param1)
            notes.append("add dot after i in {{la-IPA}} to force vocalic pronunciation")
          elif re.search("^%sj%s" % (macron_prefix, vowel_re), param1) and convert_j:
            param1 = re.sub("^%sj" % macron_prefix, "%si" % macron_prefix, param1)
            notes.append("convert j to i in {{la-IPA}} to match pagename; j no longer necessary to force consonantal pronunciation")
          if param1 != orig_param1:
            origt = unicode(t)
            # Fetch all params.
            params = []
            for param in t.params:
              pname = unicode(param.name)
              if pname.strip() not in ["1"]:
                params.append((pname, param.value, param.showkey))
            # Erase all params.
            del t.params[:]
            t.add("1", param1)
            # Put remaining parameters in order.
            for name, value, showkey in params:
              t.add(name, value, showkey=showkey, preserve_spacing=False)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
          # A prefix matched (whether or not anything changed); stop
          # trying further prefixes.
          break
      else: # no break
        pagemsg("WARNING: Unable to match pronun template against any prefixes: %s" % unicode(t))
  secbody = unicode(parsed)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index):
  """For each {{la-adv}} on the page, infer the adverb's stem and build the
  list of plausible corresponding adjective lemmas, then hand each candidate
  to investigate_possible_adj() for logging. Pages with spaces in the title
  are skipped.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  if " " in pagetitle:
    pagemsg("WARNING: Space in page title, skipping")
    return
  pagemsg("Processing")
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsections), 2):
    for t in blib.parse_text(subsections[k]).filter_templates():
      if tname(t) != "la-adv":
        continue
      origt = unicode(t)
      adv = blib.remove_links(getparam(t, "1")) or pagetitle
      macron_stem, is_stem = lalib.infer_adv_stem(adv)
      if not is_stem:
        pagemsg(
          "WARNING: Couldn't infer stem from adverb %s, not standard: %s" %
          (adv, origt))
        continue
      adv_defns = lalib.find_defns(subsections[k])
      stem = lalib.remove_macrons(macron_stem)
      # Always try the 1st/2nd-declension and 3rd-declension i-stem lemmas.
      candidates = [stem + "us", stem + "is"]
      if stem.endswith("nt"):
        candidates.append(stem[:-2] + "ns")
      if stem.endswith("plic"):
        candidates.append(stem[:-2] + "ex")
      if stem.endswith("c"):
        candidates.append(stem[:-1] + "x")
      if re.search("[aeiou]r$", stem):
        candidates.append(stem)
      elif stem.endswith("r"):
        candidates.append(stem[:-1] + "er")
      if adv.endswith(u"iē"):
        candidates.append(stem + "ius")
      for candidate in candidates:
        investigate_possible_adj(index, candidate, adv, adv_defns)
def process_page(page, index, parsed):
  """Rename Latin ==Inflection== section headers to ==Conjugation== (when
  the section holds verb inflection templates) or ==Declension== (anything
  else). Sections with zero or mixed-POS inflection templates are left
  alone with a warning. Returns (newtext, notes) or (None, None) if no
  Latin section was found.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  subsections = re.split("(^==.*==\n)", secbody, 0, re.M)
  notes = []
  for k in xrange(2, len(subsections), 2):
    if "==Inflection==" not in subsections[k - 1]:
      continue
    parsed = blib.parse_text(subsections[k])
    # Collect the distinct parts of speech of all inflection templates
    # in this subsection.
    poses = sorted({
      pos for pos in (
        lalib.la_infl_template_pos(t) for t in parsed.filter_templates())
      if pos})
    if len(poses) > 1:
      pagemsg(
        "WARNING: Saw inflection templates for multiple parts of speech: %s"
        % ",".join(poses))
    elif len(poses) == 0:
      pagemsg(
        "WARNING: Saw no inflection templates in ==Inflection== section")
    elif poses[0] == "verb":
      subsections[k - 1] = subsections[k - 1].replace(
        "Inflection", "Conjugation")
      notes.append("convert Latin ==Inflection== header to ==Conjugation==")
    else:
      subsections[k - 1] = subsections[k - 1].replace(
        "Inflection", "Declension")
      notes.append("convert Latin ==Inflection== header to ==Declension==")
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
  """In Latin {{l}}, {{m}}, {{alternative form of}} and {{alt form}}
  templates, when the display-text (alt) parameter is just the macroned
  version of the link target, move it into the link parameter itself.
  Returns (newtext, notes) or (None, None) if no Latin section was found.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["l", "m", "alternative form of", "alt form"]:
      # Work out where the language code lives and at which positional
      # index the link term starts.
      if tn in ["l", "m"]:
        lang = getparam(t, "1")
        termparam = 2
      elif getparam(t, "lang"):
        # Old-style {{alternative form of|lang=la|TERM|...}}.
        lang = getparam(t, "lang")
        termparam = 1
      else:
        lang = getparam(t, "1")
        termparam = 2
      if lang != "la":
        #pagemsg("WARNING: Wrong language in template: %s" % unicode(t))
        continue
      term = getparam(t, str(termparam))
      alt = getparam(t, str(termparam + 1))
      gloss = getparam(t, str(termparam + 2))
      # Only move the alt text when it is exactly the macroned form of
      # the link target.
      if alt and lalib.remove_macrons(alt) == term:
        origt = unicode(t)
        t.add(str(termparam), alt)
        if gloss:
          # A gloss follows positionally, so keep the now-empty alt slot
          # to preserve positional parameter numbering.
          t.add(str(termparam + 1), "")
        else:
          rmparam(t, str(termparam + 1))
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("move alt param to link param in %s" % tn)
  secbody = unicode(parsed)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
  """Clean up Latin non-lemma form subsections: remove stray blank lines
  before a {{la-*-form}} headword at the start of a subsection, and insert
  a missing part-of-speech header (derived from the template name via
  ``tempname_to_header``) before any {{la-*-form}} template that lacks one.
  Returns (newtext, notes) or (None, None) if no Latin section was found.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  notes = []
  for k in xrange(2, len(subsections), 2):
    newtext = re.sub(r"^\n*(\{\{la-.*?-form)", r"\1", subsections[k])
    if newtext != subsections[k]:
      notes.append("remove extraneous newlines before Latin non-lemma headword")
    # Header depth of this subsection, taken from the preceding header's
    # run of '=' signs.
    indent = len(re.sub("^(=+).*\n", r"\1", subsections[k - 1]))
    def add_header(m):
      # m captures the character before the newline and the template name;
      # rebuild the text with an inserted header of the right depth.
      lastchar, tempname = m.groups()
      if tempname in tempname_to_header:
        header_pos = tempname_to_header[tempname]
      else:
        pagemsg("WARNING: Unrecognized template name: %s" % tempname)
        return m.group(0)
      header = "=" * indent + header_pos + "=" * indent
      preceding_newline = "\n" if lastchar != "\n" else ""
      return lastchar + "\n" + preceding_newline + header + "\n{{" + tempname
    newnewtext = re.sub(r"([^=])\n\{\{(la-[a-z -]*?-form)", add_header, newtext)
    if newnewtext != newtext:
      notes.append("add missing header before Latin non-lemma form")
    subsections[k] = newnewtext
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, sectext, comment):
  """Replace the page's entire Latin section with ``sectext``, using
  ``comment`` as the change note. The leading ==Latin== header is stripped
  from the supplied text (find_latin_section's slot already includes it)
  and the section is terminated with a blank line. Returns (newtext,
  notes) or (None, None) if no Latin section was found.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  section_info = lalib.find_latin_section(unicode(page.text), pagemsg)
  if section_info is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = section_info
  # Drop the language header from the replacement text and make sure it
  # ends with a blank line before splicing it into the page.
  replacement = re.sub(r"^==Latin==\n", "", sectext) + "\n\n"
  sections[j] = replacement
  return "".join(sections).rstrip("\n"), [comment]
def process_text_on_page(index, pagetitle, text):
  """Check Latin non-lemma headword templates ({{la-noun-form}} etc.) for
  a missing 1= parameter or any parameter other than 1=/g=/g2=/g3=/g4=,
  logging warnings. Always returns (None, None), i.e. this pass never
  modifies the page.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  if not args.stdin:
    pagemsg("Processing")
  # Greatly speed things up when --stdin by ignoring non-Latin pages
  if "==Latin==" not in text:
    return None, None
  if not re.search("la-(noun|proper noun|pronoun|verb|adj|num|suffix)-form", text):
    return None, None
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in [
      "la-noun-form", "la-proper noun-form", "la-pronoun-form",
      "la-verb-form", "la-adj-form", "la-num-form", "la-suffix-form"
    ]:
      if not getparam(t, "1"):
        pagemsg("WARNING: Missing 1=: %s" % unicode(t))
      for param in t.params:
        pn = pname(param)
        if pn not in ["1", "g", "g2", "g3", "g4"]:
          pagemsg("WARNING: Extraneous param %s=: %s" % (pn, unicode(t)))
  return None, None
def process_page(page, index, parsed):
  """Convert {{la-decl-3rd-I}} templates whose stem ends in -polis to
  {{la-decl-3rd-polis}} (chopping the ending off the stem), and rename any
  {{la-noun}} template to {{la-proper noun}}, also replacing the ==Noun==
  header with ==Proper noun==. Returns (newtext, notes) or (None, None)
  if no Latin section was found.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-decl-3rd-I":
      stem = getparam(t, "1")
      if stem.endswith("polis"):
        blib.set_template_name(t, "la-decl-3rd-polis")
        # {{la-decl-3rd-polis}} takes the stem without the -polis suffix.
        t.add("1", stem[:-5])
        notes.append("Fix noun in -polis to use {{la-decl-3rd-polis}}")
      else:
        pagemsg(
          "WARNING: Found la-decl-3rd-I without stem in -polis: %s" %
          unicode(t))
    elif tn == "la-noun":
      blib.set_template_name(t, "la-proper noun")
      # NOTE(review): this branch appends nothing to `notes`, unlike every
      # other transformation here, so the conversion would produce no
      # change-log entry of its own -- confirm this is intended.
  # Header replacement happens unconditionally on the whole section.
  secbody = unicode(parsed).replace("==Noun==", "==Proper noun==")
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
  """Convert old-style Latin noun headword templates -- {{head|la|noun}},
  {{head|la|proper noun}}, {{la-location}}, and old-style {{la-noun}}/
  {{la-proper noun}} -- to the new style, by copying the parameters of the
  accompanying {{la-ndecl}} declension template into the headword template
  (plus an explicit gender when needed). Numerous sanity checks (multiple
  templates, lemma/genitive/declension/gender mismatches, extraneous
  params) cause a given subsection to be skipped with a warning.
  Returns (newtext, notes) or (None, None) if no Latin section was found.
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  notes = []
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)
  saw_a_template = False
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    # Find exactly one headword template and one declension template in
    # this subsection; bail out of the subsection on duplicates.
    la_noun_template = None
    la_ndecl_template = None
    must_continue = False
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "la-ndecl":
        if la_ndecl_template:
          pagemsg("WARNING: Saw multiple noun declension templates in subsection, %s and %s, skipping" % (
            unicode(la_ndecl_template), unicode(t)))
          must_continue = True
          break
        la_ndecl_template = t
        saw_a_template = True
      if tn in ["la-noun", "la-proper noun", "la-location"] or (
        tn == "head" and getparam(t, "1") == "la" and
        getparam(t, "2") in ["noun", "proper noun"]
      ):
        if la_noun_template:
          pagemsg("WARNING: Saw multiple noun headword templates in subsection, %s and %s, skipping" % (
            unicode(la_noun_template), unicode(t)))
          must_continue = True
          break
        la_noun_template = t
        saw_a_template = True
    if must_continue:
      continue
    if not la_noun_template and not la_ndecl_template:
      continue
    # "New style" = {{la-noun}}/{{la-proper noun}} with none of the
    # old-style positional/decl params; those need no conversion.
    new_style_headword_template = (
      la_noun_template and
      tname(la_noun_template) in ["la-noun", "la-proper noun"] and
      not getparam(la_noun_template, "head2") and
      not getparam(la_noun_template, "2") and
      not getparam(la_noun_template, "3") and
      not getparam(la_noun_template, "4") and
      not getparam(la_noun_template, "decl")
    )
    if la_noun_template and not la_ndecl_template:
      # Indeclinable nouns legitimately have no {{la-ndecl}}; convert the
      # old-style 2=/3= params to g= in place.
      if (tname(la_noun_template) in ["la-noun", "la-proper noun"] and
          getparam(la_noun_template, "indecl")):
        if new_style_headword_template:
          pagemsg("Found new-style indeclinable noun headword template, skipping: %s" % unicode(la_noun_template))
          continue
        if (getparam(la_noun_template, "head2") or
            getparam(la_noun_template, "decl") or
            getparam(la_noun_template, "2") and
            getparam(la_noun_template, "2") != getparam(la_noun_template, "1") or
            not getparam(la_noun_template, "3")):
          pagemsg("WARNING: Found old-style indeclinable noun headword template and don't know how to convert: %s" % unicode(la_noun_template))
          continue
        gender = getparam(la_noun_template, "3")
        orig_la_noun_template = unicode(la_noun_template)
        # Keep only the first letter of the gender spec (e.g. "m-s" -> "m").
        la_noun_template.add("g", gender[0], before="3")
        rmparam(la_noun_template, "3")
        rmparam(la_noun_template, "2")
        pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
        notes.append("convert indeclinable {{la-noun}}/{{la-proper noun}} template to new style")
        subsections[k] = unicode(parsed)
        continue
      else:
        pagemsg("WARNING: Saw noun headword template but no declension template: %s" % unicode(la_noun_template))
        continue
    if la_ndecl_template and not la_noun_template:
      pagemsg("WARNING: Saw noun declension template but no headword template: %s" % unicode(la_ndecl_template))
      continue
    orig_la_noun_template = unicode(la_noun_template)
    if new_style_headword_template:
      pagemsg("Found new-style noun headword template, skipping: %s" % orig_la_noun_template)
      continue
    def render_headword_and_decl():
      # Logging helper: shows the headword and declension templates in a
      # <from>/<to> format used by downstream tooling.
      return "headword template <from> %s <to> %s <end>, declension template <from> %s <to> %s <end>" % (
        orig_la_noun_template, orig_la_noun_template,
        unicode(la_ndecl_template), unicode(la_ndecl_template)
      )
    # Determine the lemma (headword) form(s) from the headword template.
    if tname(la_noun_template) == "head":
      explicit_head_param_head = blib.fetch_param_chain(la_noun_template, ["head", "head1"], "head")
      lemma = explicit_head_param_head or [pagetitle]
    elif tname(la_noun_template) == "la-location":
      explicit_head_param_head = [getparam(la_noun_template, "1")]
    else:
      explicit_head_param_head = \
        blib.fetch_param_chain(la_noun_template, ["1", "head", "head1"], "head")
    lemma = explicit_head_param_head or [pagetitle]
    if "[[" in lemma[0]:
      # The headword carries embedded links; try to interpolate the same
      # links into the declension template's lemma so they are preserved.
      if len(lemma) > 1:
        pagemsg("WARNING: Multiple lemmas %s and lemmas with links in them, can't handle, skipping: %s" % (
          ",".join(lemma), render_headword_and_decl()))
        continue
      ndecl_lemma = getparam(la_ndecl_template, "1")
      if "[[" not in ndecl_lemma:
        must_continue = False
        for m in re.finditer(r"(\[\[.*?\]\])", lemma[0]):
          link = m.group(1)
          plainlink = blib.remove_links(link)
          if plainlink not in ndecl_lemma:
            pagemsg("WARNING: Can't interpolate link %s into declension template, skipping: %s" % (
              link, render_headword_and_decl()))
            must_continue = True
            break
          ndecl_lemma = ndecl_lemma.replace(plainlink, link, 1)
        if must_continue:
          continue
        new_ndecl_template = blib.parse_text(unicode(la_ndecl_template)).filter_templates()[0]
        new_ndecl_template.add("1", ndecl_lemma)
        pagemsg("Adding links to decl template %s to produce %s" % (
          unicode(la_ndecl_template), unicode(new_ndecl_template)))
        la_ndecl_template = new_ndecl_template
    noun_props = new_generate_noun_forms(unicode(la_ndecl_template), errandpagemsg, expand_text, include_props=True)
    if noun_props is None:
      continue
    decl_gender = noun_props.get("g", None)
    # Determine the headword-declared gender(s), per template type.
    if tname(la_noun_template) == "head":
      noun_gender = blib.fetch_param_chain(la_noun_template, ["g", "g1"], "g")
      if not noun_gender and not decl_gender:
        pagemsg("WARNING: No gender in {{head|la|...}} and no declension gender, can't proceed, skipping: %s" % render_headword_and_decl())
        continue
    elif tname(la_noun_template) == "la-location":
      noun_gender = [getparam(la_noun_template, "4")]
    else:
      noun_gender = blib.fetch_param_chain(la_noun_template, ["3", "g", "g1"], "g")
      if not noun_gender:
        pagemsg("WARNING: No gender in old-style headword, skipping: %s" % render_headword_and_decl())
        continue
    def do_compare_headword_decl_forms(id_slot, headword_forms, decl_slots,
        adjust_for_missing_gen_forms=False, remove_headword_links=False):
      # Thin wrapper binding the loop-local noun_props / logging context.
      return \
        compare_headword_decl_forms(id_slot, headword_forms, decl_slots,
          noun_props, render_headword_and_decl(), pagemsg,
          adjust_for_missing_gen_forms=adjust_for_missing_gen_forms,
          remove_headword_links=remove_headword_links)
    def check_headword_vs_decl_decls(regularized_noun_decl):
      # Compare the headword's declared declension(s) with those parsed
      # out of the {{la-ndecl}} lemma spec. Returns truthy to skip.
      must_continue = False
      decl_lemma = getparam(la_ndecl_template, "1")
      if "((" in decl_lemma:
        pagemsg("WARNING: (( in decl_lemma, can't handle, skipping: %s" % render_headword_and_decl())
        must_continue = True
        # NOTE(review): bare `return` yields None (falsy), so the caller's
        # `if must_continue: continue` will NOT skip here despite the
        # "skipping" message -- likely should `return must_continue`.
        return
      segments = re.split(r"([^<> -]+<[^<>]*>)", decl_lemma)
      decl_decls = []
      for i in xrange(1, len(segments) - 1, 2):
        m = re.search("^([^<> -]+)<([^<>]*)>$", segments[i])
        stem_spec, decl_and_subtype_spec = m.groups()
        decl_and_subtypes = decl_and_subtype_spec.split(".")
        decl_decl = decl_and_subtypes[0]
        decl_decls.append(decl_decl)
      if set(regularized_noun_decl) != set(decl_decls):
        if set(regularized_noun_decl) <= set(decl_decls):
          pagemsg("headword decl %s subset of declension decl %s, allowing: %s" % (
            ",".join(regularized_noun_decl), ",".join(decl_decls), render_headword_and_decl()))
        else:
          pagemsg("WARNING: headword decl %s not same as or subset of declension decl %s, skipping: %s" % (
            ",".join(regularized_noun_decl), ",".join(decl_decls), render_headword_and_decl()))
          must_continue = True
      return must_continue
    def check_headword_vs_decl_gender():
      # Decide whether an explicit g= must be written into the converted
      # headword, and whether a neuter mismatch forces a skip.
      # Returns (need_explicit_gender, must_continue).
      must_continue = False
      if len(noun_gender) == 1 and noun_gender[0] == decl_gender:
        need_explicit_gender = False
      else:
        need_explicit_gender = True
        if len(noun_gender) > 1:
          pagemsg("WARNING: Saw multiple headword genders %s, please verify: %s" % (
            ",".join(noun_gender), render_headword_and_decl()))
        elif (noun_gender and
            noun_gender[0].startswith("n") != (decl_gender == "n")):
          pagemsg("WARNING: Headword gender %s is neuter and decl gender %s isn't, or vice-versa, need to correct, skipping: %s" % (
            noun_gender[0], decl_gender, render_headword_and_decl()))
          must_continue = True
      return need_explicit_gender, must_continue
    def erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender):
      # Erase all params
      del la_noun_template.params[:]
      # Copy params from decl template
      for param in la_ndecl_template.params:
        pname = unicode(param.name)
        la_noun_template.add(pname, param.value, showkey=param.showkey, preserve_spacing=False)
      # Add explicit gender if needed
      if need_explicit_gender:
        explicit_genders = []
        for ng in noun_gender:
          # First letter only, e.g. "m-p" -> "m"; dedupe while keeping order.
          ng = ng[0]
          if ng not in explicit_genders:
            explicit_genders.append(ng)
        blib.set_param_chain(la_noun_template, explicit_genders, "g", "g")
    if tname(la_noun_template) == "head":
      if explicit_head_param_head and not do_compare_headword_decl_forms("lemma",
          explicit_head_param_head, ["linked_nom_sg", "linked_nom_pl"]):
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue
      # Check for extraneous {{head|la|...}} parameters
      must_continue = False
      # NOTE(review): reads param "2" from the *declension* template, not
      # the {{head}} template where the part of speech lives -- confirm
      # this is intended.
      is_proper_noun = getparam(la_ndecl_template, "2") == "proper noun"
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2"] or re.search("^(head|g)[0-9]*$", pname.strip()):
          continue
        pagemsg("WARNING: Saw extraneous param %s in {{head}} template, skipping: %s" % (
          pname, render_headword_and_decl()))
        must_continue = True
        break
      if must_continue:
        continue
      # Copy params from decl template
      blib.set_template_name(la_noun_template, "la-proper noun" if is_proper_noun else "la-noun")
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{head|la|...}} to new-style {{la-noun}}/{{la-proper noun}} template")
    elif tname(la_noun_template) == "la-location":
      noun_decl = [getparam(la_noun_template, "6")]
      if not noun_decl:
        pagemsg("WARNING: No noun decl in {{la-location}}, skipping: %s" % render_headword_and_decl())
        continue
      genitive = [getparam(la_noun_template, "2")]
      if not do_compare_headword_decl_forms("lemma", lemma,
          ["linked_nom_sg", "linked_nom_pl"]):
        continue
      if not do_compare_headword_decl_forms("genitive", genitive, ["gen_sg", "gen_pl"],
          adjust_for_missing_gen_forms=True, remove_headword_links=True):
        continue
      regularized_noun_decl = []
      must_continue = False
      for nd in noun_decl:
        if nd not in noun_decl_to_decl_type:
          pagemsg("WARNING: Unrecognized noun decl=%s, skipping: %s" % (
            nd, render_headword_and_decl()))
          must_continue = True
          break
        regularized_noun_decl.append(noun_decl_to_decl_type[nd])
      if must_continue:
        continue
      must_continue = check_headword_vs_decl_decls(regularized_noun_decl)
      if must_continue:
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue
      # Check for extraneous {{la-location}} parameters
      must_continue = False
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2", "3", "4", "5", "6"]:
          continue
        pagemsg("WARNING: Saw extraneous param %s in {{la-location}} template, skipping: %s" % (
          pname, render_headword_and_decl()))
        must_continue = True
        break
      if must_continue:
        continue
      blib.set_template_name(la_noun_template, "la-proper noun")
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{la-location}} to new-style {{la-proper noun}} template")
    else: # old-style {{la-noun}} or {{la-proper noun}}
      noun_decl = blib.fetch_param_chain(la_noun_template, ["4", "decl", "decl1"], "decl")
      if not noun_decl:
        pagemsg("WARNING: No noun decl in old-style headword, skipping: %s" % render_headword_and_decl())
        continue
      genitive = blib.fetch_param_chain(la_noun_template, ["2", "gen", "gen1"], "gen")
      if not do_compare_headword_decl_forms("lemma", lemma,
          ["linked_nom_sg", "linked_nom_pl"]):
        continue
      if not do_compare_headword_decl_forms("genitive", genitive, ["gen_sg", "gen_pl"],
          adjust_for_missing_gen_forms=True, remove_headword_links=True):
        continue
      regularized_noun_decl = []
      must_continue = False
      for nd in noun_decl:
        if nd not in noun_decl_to_decl_type:
          pagemsg("WARNING: Unrecognized noun decl=%s, skipping: %s" % (
            nd, render_headword_and_decl()))
          must_continue = True
          break
        regularized_noun_decl.append(noun_decl_to_decl_type[nd])
      if must_continue:
        continue
      must_continue = check_headword_vs_decl_decls(regularized_noun_decl)
      if must_continue:
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue
      # Fetch remaining params from headword template
      headword_params = []
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2", "3", "4"] or re.search("^(head|gen|g|decl)[0-9]*$", pname.strip()):
          continue
        headword_params.append((pname, param.value, param.showkey))
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      # Copy remaining params from headword template
      for name, value, showkey in headword_params:
        la_noun_template.add(name, value, showkey=showkey, preserve_spacing=False)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{la-noun}}/{{la-proper noun}} params to new style")
    subsections[k] = unicode(parsed)
  if not saw_a_template:
    pagemsg("WARNING: Saw no noun headword or declension templates")
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def correct_nom_sg_n_participle(page, index, participle, lemma):
  """Rewrite (or insert) a ===Participle=== subsection for an impersonal
  perfect passive participle ``participle`` of verb ``lemma``. An existing
  participle subsection is replaced wholesale; otherwise a new one is
  inserted before the References or supine subsection, or appended at the
  end of the Latin section. Returns (newtext, notes) or (None, None) on
  any condition it can't handle (no Latin section, multiple etymologies,
  multiple participle subsections).
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  if "===Etymology 1===" in secbody:
    pagemsg("WARNING: Multiple etymologies, don't know what to do")
    return None, None
  notes = []
  subsections = re.split("(^===[^=\n]*===\n)", secbody, 0, re.M)
  participle_text = """{{head|la|participle|[[indeclinable]]|head=%s}}

# {{inflection of|la|%s||perf|pasv|part}}\n\n""" % (participle, lemma)
  saw_participle = False
  for k in xrange(2, len(subsections), 2):
    if subsections[k - 1] == "===Participle===\n":
      if saw_participle:
        pagemsg("WARNING: Saw multiple participles, skipping")
        return None, None
      saw_participle = True
      # Replace the existing participle body entirely.
      subsections[k] = participle_text
      notes.append("correct participle %s of %s to be impersonal" % (participle, lemma))
  secbody = "".join(subsections)
  if not saw_participle:
    # No existing participle subsection: find a sensible insertion point.
    for k in xrange(2, len(subsections), 2):
      insert_before = False
      if subsections[k - 1] == "===References===\n":
        pagemsg(
          "Inserting new participle subsection before references subsection"
        )
        insert_before = True
      elif re.search(r"\{\{inflection of.*\|sup", subsections[k]):
        pagemsg(
          "Inserting new participle subsection before supine subsection"
        )
        insert_before = True
      if insert_before:
        # Splice header + body in before subsection k's header.
        subsections[k - 1:k - 1] = ["===Participle===\n" + participle_text]
        secbody = "".join(subsections)
        break
    else: # no break
      # No anchor found; append at the end of the Latin section.
      if not secbody.endswith("\n\n"):
        secbody += "\n\n"
      secbody += "===Participle===\n" + participle_text
    notes.append("add impersonal participle %s of %s" % (participle, lemma))
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
  """Rewrite a Latin verb-form page (a single ===Verb=== subsection with
  one {{la-verb-form}} and one {{inflection of}}) as a proper participle
  entry (forms in -us, with etymology/pronunciation/declension) or as a
  gerund + participle-form entry (forms in -um). Returns (newtext,
  comment) or (None, None) for any layout it doesn't recognize.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  subsections = re.split("(^==.*==\n)", secbody, 0, re.M)
  # Expect exactly one subsection: [before, header, body].
  if len(subsections) != 3:
    pagemsg("WARNING: Not right # of sections (expected 1): %s" % ",".join(
      subsections[k].strip() for k in xrange(1, len(subsections), 2)))
    return None, None
  if subsections[1] != "===Verb===\n":
    pagemsg("WARNING: Expected ===Verb=== in subsections[1] but saw %s" %
      subsections[1].strip())
    return None, None
  parsed = blib.parse_text(subsections[2])
  infl = None
  lemma = None
  infloft = None
  for t in parsed.filter_templates():
    if tname(t) == "la-verb-form":
      if infl:
        pagemsg("WARNING: Saw more than one {{la-verb-form}} call: %s" % unicode(t))
        return None, None
      infl = getparam(t, "1")
    elif tname(t) == "inflection of":
      if lemma:
        pagemsg("WARNING: Saw more than one {{inflection of}} call: %s" % unicode(t))
        return None, None
      if getparam(t, "lang"):
        # Old-style {{inflection of|lang=la|LEMMA|...}}.
        lemma = getparam(t, "1")
      else:
        lemma = getparam(t, "2")
      infloft = t
    else:
      pagemsg("WARNING: Saw unexpected template: %s" % unicode(t))
      return None, None
  if not infl or not lemma:
    pagemsg("WARNING: Didn't find both inflection %s and lemma %s" % (
      infl, lemma))
    return None, None
  # Strip a trailing auxiliary (future infinitive periphrases).
  infl = re.sub(u" (esse|īrī)$", "", infl)
  if infl.endswith(u"us"):
    if infl.endswith(u"ūrus"):
      partdesc = "Future active participle"
      head_template = "{{la-future participle|%s}}" % infl[:-2]
      infl_template = "{{la-decl-1&2|%s}}" % infl[:-2]
    else:
      if "perf|act" in unicode(infloft):
        partdesc = "Perfect active participle"
      else:
        partdesc = "Perfect passive participle"
      head_template = "{{la-perfect participle|%s}}" % infl[:-2]
      infl_template = "{{la-decl-1&2|%s}}" % infl[:-2]
    sectext = """
===Etymology===
%s of {{m|la|%s}}.

===Pronunciation===
* {{la-IPA|%s}}

===Participle===
%s

# {{rfdef|la}}

====Declension====
%s""" % (partdesc, lemma, infl, head_template, infl_template)
    comment = "correct Latin form to participle"
  elif infl.endswith("um"):
    sectext = """
===Etymology===
From {{m|la|%s}}.

===Pronunciation===
* {{la-IPA|%s}}

===Gerund===
{{la-gerund|%s}}

# {{rfdef|la}}

====Declension====
{{la-decl-gerund|%s}}

===Participle===
{{la-part-form|%s}}

# {{inflection of|la|%s||acc|m|s|;|nom//acc//voc|n|s}}""" % (
      lemma, infl, infl[:-2], infl[:-2], infl, infl[:-2] + "us"
    )
    comment = "correct Latin form to gerund/participle form"
  else:
    pagemsg("WARNING: Unrecognized ending for participle/gerund %s" % infl)
    return None, None
  sections[j] = sectext + sectail
  return "".join(sections), comment
def process_page(page, index):
  """For a page with exactly one of {{la-noun}} or {{la-proper noun}},
  expand a derived {{la-generate-noun-forms}} call to enumerate all
  inflected forms, then dispatch process_form() on each distinct
  macron-stripped form page (via blib.do_edit) so its header/headword can
  be corrected to match this lemma's noun vs. proper-noun status.
  Returns nothing; all conditions it can't handle just log and return.
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  parsed = blib.parse_text(secbody)
  saw_noun = None
  saw_proper_noun = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-noun":
      if saw_noun:
        pagemsg(
          "WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping"
          % (unicode(saw_noun), unicode(t)))
        return
      saw_noun = t
    elif tn == "la-proper noun":
      if saw_proper_noun:
        pagemsg(
          "WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping"
          % (unicode(saw_proper_noun), unicode(t)))
        return
      saw_proper_noun = t
  if saw_noun and saw_proper_noun:
    pagemsg(
      "WARNING: Saw both noun and proper noun, can't correct header/headword"
    )
    return
  if not saw_noun and not saw_proper_noun:
    pagemsg(
      "WARNING: Saw neither noun nor proper noun, can't correct header/headword"
    )
    return
  pos = "pn" if saw_proper_noun else "n"
  ht = saw_proper_noun or saw_noun
  if getparam(ht, "indecl"):
    pagemsg("Noun is indeclinable, skipping: %s" % unicode(ht))
    return
  # Clone the headword template and turn it into a forms-generation call,
  # stripping all headword-only parameters.
  generate_template = blib.parse_text(unicode(ht)).filter_templates()[0]
  blib.set_template_name(generate_template, "la-generate-noun-forms")
  blib.remove_param_chain(generate_template, "lemma", "lemma")
  blib.remove_param_chain(generate_template, "m", "m")
  blib.remove_param_chain(generate_template, "f", "f")
  blib.remove_param_chain(generate_template, "g", "g")
  rmparam(generate_template, "type")
  rmparam(generate_template, "indecl")
  rmparam(generate_template, "id")
  rmparam(generate_template, "pos")
  result = expand_text(unicode(generate_template))
  if not result:
    pagemsg("WARNING: Error generating forms, skipping")
    return
  tempargs = blib.split_generate_args(result)
  forms_seen = set()
  slots_and_forms_to_process = []
  for slot, formarg in tempargs.iteritems():
    forms = formarg.split(",")
    for form in forms:
      # Skip forms with embedded links/pipes; dedupe on the
      # macron-stripped form and skip the lemma page itself.
      if "[" in form or "|" in form:
        continue
      form_no_macrons = lalib.remove_macrons(form)
      if form_no_macrons == pagetitle:
        continue
      if form_no_macrons in forms_seen:
        continue
      forms_seen.add(form_no_macrons)
      slots_and_forms_to_process.append((slot, form))
  for index, (slot, form) in blib.iter_items(
      sorted(slots_and_forms_to_process,
        key=lambda x: lalib.remove_macrons(x[1]))):
    # handler is defined per iteration and used immediately, so the
    # closure over slot/form is safe despite late binding.
    def handler(page, index, parsed):
      return process_form(page, index, slot, form, pos)
    blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)), index,
      handler, save=args.save, verbose=args.verbose, diff=args.diff)
def process_form(page, index, slot, form, pos):
  """Convert one non-lemma form page between noun form and proper noun form.

  pos is the *target* part of speech: "pn" converts Noun -> Proper noun,
  "n" converts Proper noun -> Noun. Rewrites the section header, the
  {{la-*-form}} headword template and any {{head|la|...}} template.
  Returns (newtext, notes), or (None, None) if the page doesn't exist or
  has no Latin section.
  """
  def pagemsg(txt):
    msg("Page %s %s %s: %s" % (index, slot, form, txt))

  notes = []
  pagemsg("Processing")
  if not page.exists():
    pagemsg("Skipping form value %s, page doesn't exist" % form)
    return None, None
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval

  # Per-target conversion table: (headers, headword templates,
  # {{head}} POS values, lemma POS names), each as (from, to).
  conversions = {
    "pn": (("==Noun==", "==Proper noun=="),
           ("la-noun-form", "la-proper noun-form"),
           ("noun form", "proper noun form"),
           ("noun", "proper noun")),
    "n": (("==Proper noun==", "==Noun=="),
          ("la-proper noun-form", "la-noun-form"),
          ("proper noun form", "noun form"),
          ("proper noun", "noun")),
  }
  if pos not in conversions:
    raise ValueError("Unrecognized POS %s" % pos)
  ((from_header, to_header),
   (from_headword_template, to_headword_template),
   (from_pos, to_pos),
   (from_lemma_pos, to_lemma_pos)) = conversions[pos]

  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  headword_re = r"\{\{%s([|}])" % from_headword_template
  head_re = r"\{\{head\|la\|%s([|}])" % from_pos
  # Even indices >= 2 are subsection bodies; k - 1 is the matching header.
  for k in xrange(2, len(subsections), 2):
    body = subsections[k]
    if not (re.search(headword_re, body) or re.search(head_re, body)):
      continue
    converted = re.sub(headword_re, r"{{%s\1" % to_headword_template, body)
    converted = re.sub(head_re, r"{{head|la|%s\1" % to_pos, converted)
    new_header = subsections[k - 1].replace(from_header, to_header)
    if converted != body or new_header != subsections[k - 1]:
      notes.append("non-lemma %s -> %s in header and headword" %
                   (from_lemma_pos, to_lemma_pos))
      subsections[k] = converted
      subsections[k - 1] = new_header

  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  text = "".join(sections)
  return text, notes
def process_page(page, index, parsed):
  """Convert old-style {{la-verb}} headword parameters to the new style by
  copying the parameters of the accompanying {{la-conj}} template, after
  verifying that the headword's principal parts, conjugation class and
  subtype pattern agree with what the conjugation template generates.

  Returns (newtext, notes) on success, (None, None) if no Latin section.

  Fix in this revision: the "unrecognized conj_subtype" warning used a
  format string with one %s but two arguments, raising TypeError whenever
  that branch was reached; it now mirrors the conj_type message.
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  notes = []
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)
  saw_a_template = False
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    # Require exactly one {{la-verb}} and one {{la-conj}} per subsection.
    la_verb_template = None
    la_conj_template = None
    must_continue = False
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "la-conj":
        if la_conj_template:
          pagemsg(
            "WARNING: Saw multiple verb conjugation templates in subsection, %s and %s, skipping"
            % (unicode(la_conj_template), unicode(t)))
          must_continue = True
          break
        la_conj_template = t
        saw_a_template = True
      if tn == "la-verb":
        if la_verb_template:
          pagemsg(
            "WARNING: Saw multiple verb headword templates in subsection, %s and %s, skipping"
            % (unicode(la_verb_template), unicode(t)))
          must_continue = True
          break
        la_verb_template = t
        saw_a_template = True
    if must_continue:
      continue
    if not la_verb_template and not la_conj_template:
      continue
    if la_verb_template and not la_conj_template:
      pagemsg(
        "WARNING: Saw verb headword template but no conjugation template: %s"
        % unicode(la_verb_template))
      continue
    if la_conj_template and not la_verb_template:
      pagemsg(
        "WARNING: Saw verb conjugation template but no headword template: %s"
        % unicode(la_conj_template))
      continue
    orig_la_verb_template = unicode(la_verb_template)
    # New-style headwords put the conjugation class in 1= (e.g. "1+", "irreg",
    # possibly with ".subtype" suffixes); those need no conversion.
    if re.search(r"^(irreg|[0-9]\+*)(\..*)?$",
                 getparam(la_verb_template, "1")):
      pagemsg("Found new-style verb headword template, skipping: %s"
              % orig_la_verb_template)
      continue
    def render_headword_and_conj():
      # Diagnostic rendering of both templates for log messages.
      return ("headword template <from> %s <to> %s <end>, conjugation template <from> %s <to> %s <end>"
              % (orig_la_verb_template, orig_la_verb_template,
                 unicode(la_conj_template), unicode(la_conj_template)))
    verb_props = new_generate_verb_forms(unicode(la_conj_template),
                                         errandpagemsg, expand_text,
                                         include_props=True)
    if verb_props is None:
      continue
    subtypes = [
      x.replace("-", "") for x in safe_split(verb_props["subtypes"], ".")
    ]
    conj_type = verb_props["conj_type"]
    conj_subtype = verb_props.get("conj_subtype", None)
    def compare_headword_conj_forms(id_slot, headword_forms, conj_slots,
                                    adjust_for_missing_perf_forms=False,
                                    remove_conj_links=False):
      # Compare one principal part between headword and conjugation,
      # normalizing final -ns/-nf vowel length; returns False (with a
      # warning) on mismatch.
      conj_forms = ""
      for slot in conj_slots:
        if slot in verb_props:
          conj_forms = verb_props[slot]
          break
      conj_forms = safe_split(conj_forms, ",")
      if remove_conj_links:
        conj_forms = [blib.remove_links(x) for x in conj_forms]
      corrected_headword_forms = [lengthen_ns_nf(x) for x in headword_forms]
      corrected_conj_forms = [lengthen_ns_nf(x) for x in conj_forms]
      if adjust_for_missing_perf_forms:
        # There are several instances of 4++ verbs where only the -īvī variant,
        # not the -iī variant, is listed in the headword. Don't get tripped up
        # by that.
        ivi_conj_forms = [
          x for x in corrected_conj_forms if x.endswith(u"īvī")
        ]
        for ivi_conj_form in ivi_conj_forms:
          ii_conj_form = re.sub(u"īvī$", u"iī", ivi_conj_form)
          if ii_conj_form in corrected_conj_forms and ii_conj_form not in corrected_headword_forms:
            corrected_headword_forms.append(ii_conj_form)
      if set(corrected_headword_forms) != set(corrected_conj_forms):
        macronless_headword_forms = set(
          lalib.remove_macrons(x) for x in corrected_headword_forms)
        macronless_conj_forms = set(
          lalib.remove_macrons(x) for x in corrected_conj_forms)
        if macronless_headword_forms == macronless_conj_forms:
          pagemsg(
            "WARNING: Headword %s=%s different from conj %s=%s in macrons only, skipping: %s"
            % (id_slot, ",".join(headword_forms), id_slot,
               ",".join(conj_forms), render_headword_and_conj()))
        else:
          pagemsg(
            "WARNING: Headword %s=%s different from conj %s=%s in more than just macrons, skipping: %s"
            % (id_slot, ",".join(headword_forms), id_slot,
               ",".join(conj_forms), render_headword_and_conj()))
        return False
      return True
    verb_conj = getparam(la_verb_template, "conj") or getparam(
      la_verb_template, "c")
    pattern = getparam(la_verb_template, "pattern")
    lemma = blib.fetch_param_chain(la_verb_template, ["1", "head", "head1"],
                                   "head")
    inf = blib.fetch_param_chain(la_verb_template, ["2", "inf", "inf1"],
                                 "inf")
    perf = blib.fetch_param_chain(la_verb_template, ["3", "perf", "perf1"],
                                  "perf")
    sup = blib.fetch_param_chain(la_verb_template, ["4", "sup", "sup1"],
                                 "sup")
    # Hack to handle cases like abeō where the headword normally lists perfect
    # abiī but the conj lists abiī, abīvī.
    if verb_conj == "irreg" and len(lemma) > 0 and lemma[0].endswith(u"eō"):
      ivi = re.sub(u"eō$", u"īvī", lemma[0])
      if ivi not in perf:
        perf.append(ivi)
    if not compare_headword_conj_forms("lemma", lemma, [
        "1s_pres_actv_indc", "3s_pres_actv_indc", "1s_perf_actv_indc",
        "3s_perf_actv_indc"
    ]):
      continue
    if "depon" in subtypes or "semidepon" in subtypes:
      if sup:
        pagemsg(
          "WARNING: Saw supine in conjunction with deponent verb, skipping: %s"
          % render_headword_and_conj())
        continue
      # Deponent "perfect" is really the supine stem; derive supine forms
      # by rewriting the participle ending (e.g. "secūtus sum" -> "secūtum").
      sup = [re.sub("[sm]( (sum|est))?$", "m", x) for x in perf]
    else:
      if not compare_headword_conj_forms(
          "perfect", perf, ["1s_perf_actv_indc", "3s_perf_actv_indc"],
          adjust_for_missing_perf_forms=True,
          # Remove links from perfect to handle cases like adsoleō where the
          # perfect is adsoluī,[[adsolitus]] [[sum]] and the headword says
          # adsoluī,adsolitus sum.
          remove_conj_links=True):
        continue
    if len(sup) > 0 and sup[0].endswith(u"ūrus"):
      # Headword 4= holds a future active participle, not a supine.
      if not compare_headword_conj_forms("future participle", sup,
                                         ["futr_actv_ptc"]):
        continue
      if "supfutractvonly" not in subtypes:
        if len(lemma) > 0 and lemma[0].endswith("sum"):
          # Compounds of sum get this subtype automatically; don't warn.
          pass
        else:
          pagemsg(
            "WARNING: Expected supfutractvonly in subtypes=%s, skipping: %s"
            % (".".join(sorted(subtypes)), render_headword_and_conj()))
          continue
    else:
      if not compare_headword_conj_forms("supine", sup, ["sup_acc"]):
        continue
    if not verb_conj:
      pagemsg("WARNING: No conj in headword template: %s"
              % render_headword_and_conj())
    else:
      # Map conjugation-module class names to headword conj= codes.
      conj_type_to_verb_conj = {
        "1st": "1",
        "2nd": "2",
        "3rd": "3",
        "3rd-io": "io",
        "4th": "4",
        "irreg": "irreg",
      }
      if conj_type not in conj_type_to_verb_conj:
        pagemsg(
          "WARNING: Something wrong, saw unrecognized conj_type=%s: %s"
          % (conj_type, render_headword_and_conj()))
        continue
      conj_type = conj_type_to_verb_conj[conj_type]
      if conj_subtype:
        if conj_subtype not in conj_type_to_verb_conj:
          # BUGFIX: the format string previously had a single %s with two
          # arguments, which raised TypeError instead of logging.
          pagemsg(
            "WARNING: Something wrong, saw unrecognized conj_subtype=%s: %s"
            % (conj_subtype, render_headword_and_conj()))
          continue
        conj_subtype = conj_type_to_verb_conj[conj_subtype]
      if verb_conj != conj_type and verb_conj != conj_subtype:
        pagemsg(
          "WARNING: Conjugation template has conj=%s, subconj=%s but headword template has conj=%s, skipping: %s"
          % (conj_type, conj_subtype, verb_conj,
             render_headword_and_conj()))
        continue
    # Normalize hyphenated pattern= values to the hyphenless subtype names
    # used by the conjugation module, then drop subtypes that carry no
    # information for the comparison.
    pattern = pattern.replace("opt-semi-depon", "optsemidepon")
    pattern = pattern.replace("semi-depon", "semidepon")
    pattern = pattern.replace("pass-3only", "pass3only")
    pattern = pattern.replace("pass-impers", "passimpers")
    pattern = pattern.replace("no-actv-perf", "noactvperf")
    pattern = pattern.replace("no-pasv-perf", "nopasvperf")
    pattern = pattern.replace("perf-as-pres", "perfaspres")
    pattern = pattern.replace("short-imp", "shortimp")
    pattern = pattern.replace("sup-futr-actv-only", "supfutractvonly")
    pattern = safe_split(pattern, "-")
    pattern = [
      x for x in pattern
      if x not in ["noperf", "nosup", "irreg", "def", "facio", "shortimp",
                   "depon"]
    ]
    subtypes = [
      x for x in subtypes
      if x not in ["I", "noperf", "nosup", "irreg", "depon"]
    ]
    if len(lemma) > 0 and lemma[0].endswith("sum"):
      # This is added automatically by [[sum]]
      subtypes = [x for x in subtypes if x != "supfutractvonly"]
    if set(pattern) != set(subtypes):
      if set(subtypes) >= set(pattern) and (
          set(subtypes) - set(pattern) <= {
            "nopass", "p3inf", "poetsyncperf", "optsyncperf",
            "alwayssyncperf"
          }):
        pagemsg(
          "Subtypes=%s of conjugation template have extra, ignorable subtypes %s compared with pattern=%s of headword template: %s"
          % (".".join(sorted(subtypes)),
             ".".join(sorted(list(set(subtypes) - set(pattern)))),
             ".".join(sorted(pattern)), render_headword_and_conj()))
      else:
        pagemsg(
          "WARNING: Conjugation template has subtypes=%s but headword template has pattern=%s, skipping: %s"
          % (".".join(sorted(subtypes)), ".".join(sorted(pattern)),
             render_headword_and_conj()))
        continue
    # Fetch remaining params from headword template
    headword_params = []
    for param in la_verb_template.params:
      pname = unicode(param.name)
      if pname.strip() in [
          "1", "2", "3", "4", "44", "conj", "c", "pattern"
      ] or re.search("^(head|inf|perf|sup)[0-9]*$", pname.strip()):
        continue
      headword_params.append((pname, param.value, param.showkey))
    # Erase all params
    del la_verb_template.params[:]
    # Copy params from conj template
    for param in la_conj_template.params:
      pname = unicode(param.name)
      la_verb_template.add(pname, param.value, showkey=param.showkey,
                           preserve_spacing=False)
    # Copy remaining params from headword template
    for name, value, showkey in headword_params:
      la_verb_template.add(name, value, showkey=showkey,
                           preserve_spacing=False)
    pagemsg("Replaced %s with %s" % (orig_la_verb_template,
                                     unicode(la_verb_template)))
    notes.append("convert {{la-verb}} params to new style")
    subsections[k] = unicode(parsed)
  if not saw_a_template:
    pagemsg("WARNING: Saw no verb headword or conjugation templates")
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def delete_form_1(page, index, lemma, formind, formval, pos,
                  tag_sets_to_delete, preserve_diaeresis):
  """Delete or excise bad Latin non-lemma entries for the given lemma.

  pos selects the expected headword template / section header
  ("verbform", "nounform", "adjform", "partform", "numform").
  tag_sets_to_delete is either True (delete all tag sets for the lemma)
  or a collection of tag sets; only matching {{inflection of}} tag sets
  are removed. May delete individual inflection lines, whole subsections,
  the whole Latin section, or flag the whole page for deletion (appended
  to the module-level pages_to_delete).

  Returns (newtext, notes), or (None, None) when nothing is deletable or
  when deletion would lose surrounding content.
  """
  notes = []
  # Normalize: keep the sentinel True, otherwise sort for stable log output
  # and build a set of frozensets for O(1) membership tests.
  tag_sets_to_delete = True if tag_sets_to_delete is True else (
    sorted(tag_sets_to_delete))
  frozenset_tag_sets_to_delete = True if tag_sets_to_delete is True else set(
    frozenset(tag_set) for tag_set in tag_sets_to_delete)
  def pagemsg(txt):
    msg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval,
                                              txt))
  # Expected headword template name, L3 header text and {{head}} POS value
  # for each supported part of speech.
  if pos == "verbform":
    expected_head_template = "la-verb-form"
    expected_header_pos = "Verb"
    expected_head_pos = "verb form"
  elif pos == "nounform":
    expected_head_template = "la-noun-form"
    expected_header_pos = "Noun"
    expected_head_pos = "noun form"
  elif pos == "adjform":
    expected_head_template = "la-adj-form"
    expected_header_pos = "Adjective"
    expected_head_pos = "adjective form"
  elif pos == "partform":
    expected_head_template = "la-part-form"
    expected_header_pos = "Participle"
    expected_head_pos = "participle form"
  elif pos == "numform":
    expected_head_template = "la-num-form"
    expected_header_pos = "Numeral"
    expected_head_pos = "numeral form"
  else:
    raise ValueError("Unrecognized part of speech %s" % pos)
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  # FIXME!
  #if "==Etymology 1==" in secbody:
  #  etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
  #  for k in xrange(2, len(etym_sections), 2):
  #    etym_sections[k] = fix_up_section(etym_sections[k], warn_on_multiple_heads=True)
  #  secbody = "".join(etym_sections)
  subsections_to_delete = []
  subsections_to_remove_inflections_from = []
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  # First pass: classify each subsection (even indices are bodies,
  # odd indices are the headers preceding them).
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    saw_head = False             # saw the expected headword template
    saw_infl = False             # saw a deletable {{inflection of}} tag set
    saw_other_infl = False       # saw an inflection we must NOT delete
    remove_deletable_tag_sets_from_subsection = False
    saw_bad_template = False
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == expected_head_template:
        saw_head = True
      elif tn == "head" and getparam(t, "1") == "la" and getparam(
          t, "2") == expected_head_pos:
        saw_head = True
      elif tn == "inflection of":
        # Old-style templates put the language in lang=, new-style in 1=;
        # lemma_param is the numbered param holding the lemma.
        lang = getparam(t, "lang")
        if lang:
          lemma_param = 1
        else:
          lang = getparam(t, "1")
          lemma_param = 2
        if lang != "la":
          errandpagemsg(
            "WARNING: In Latin section, found {{inflection of}} for different language %s: %s"
            % (lang, unicode(t)))
          return None, None
        actual_lemma = getparam(t, str(lemma_param))
        # Allow mismatch in macrons, which often happens, e.g. because
        # a macron was added to the lemma page but not to the inflections
        if remove_macrons(actual_lemma, preserve_diaeresis) == remove_macrons(
            lemma, preserve_diaeresis):
          # fetch tags
          tags = []
          for param in t.params:
            pname = unicode(param.name).strip()
            pval = unicode(param.value).strip()
            if re.search("^[0-9]+$", pname):
              if int(pname) >= lemma_param + 2:
                if pval:
                  tags.append(pval)
          for tag in tags:
            if "//" in tag:
              pagemsg(
                "WARNING: Don't know how to handle multipart tags yet: %s"
                % unicode(t))
              saw_other_infl = True
              break
          else: # no break
            tag_sets = lalib.split_tags_into_tag_sets(tags)
            for tag_set in tag_sets:
              if tag_sets_to_delete is True or frozenset(
                  lalib.canonicalize_tag_set(tag_set)
              ) in frozenset_tag_sets_to_delete:
                saw_infl = True
              else:
                pagemsg(
                  "Found {{inflection of}} for correct lemma but wrong tag set %s, expected one of %s: %s"
                  % ("|".join(tag_set), ",".join(
                    "|".join(x) for x in tag_sets_to_delete), unicode(t)))
                saw_other_infl = True
        else:
          pagemsg(
            "Found {{inflection of}} for different lemma %s: %s"
            % (actual_lemma, unicode(t)))
          saw_other_infl = True
    if saw_head and saw_infl:
      if saw_other_infl:
        # Mixed subsection: excise only the deletable tag sets instead of
        # dropping the whole subsection.
        pagemsg(
          "Found subsection #%s to delete but has inflection-of template for different lemma or nondeletable tag set, will remove only deletable tag sets"
          % (k // 2))
        remove_deletable_tag_sets_from_subsection = True
      # A subsection is only fully deletable if it contains nothing beyond
      # the headword and inflection-of templates.
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn not in [
            expected_head_template, "inflection of"
        ] and not (tn == "head" and getparam(t, "1") == "la"
                   and getparam(t, "2") == expected_head_pos):
          pagemsg(
            "WARNING: Saw unrecognized template in otherwise deletable subsection #%s: %s"
            % (k // 2, unicode(t)))
          saw_bad_template = True
          break
      else: # No break
        if "===%s===" % expected_header_pos in subsections[k - 1]:
          if remove_deletable_tag_sets_from_subsection:
            subsections_to_remove_inflections_from.append(k)
          else:
            subsections_to_delete.append(k)
        else:
          pagemsg(
            "WARNING: Wrong header in otherwise deletable subsection #%s: %s"
            % (k // 2, subsections[k - 1].strip()))
  if not subsections_to_delete and not subsections_to_remove_inflections_from:
    pagemsg(
      "Found Latin section but no deletable or excisable subsections")
    return None, None
  #### Now, we can delete an inflection, a subsection or the whole section or page
  for k in subsections_to_remove_inflections_from:
    newsubsec = subsections[k]
    if not newsubsec.endswith("\n"):
      # This applies to the last subsection on the page
      newsubsec += "\n"
    def remove_inflections(m):
      # re.sub callback: given one "# {{inflection of|...}}" line, strip the
      # deletable tag sets; return "" to drop the line entirely when no tag
      # sets survive.
      parsed = blib.parse_text(m.group(0))
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "inflection of":
          lang = getparam(t, "lang")
          if lang:
            lemma_param = 1
          else:
            lang = getparam(t, "1")
            lemma_param = 2
          assert lang == "la"
          actual_lemma = getparam(t, str(lemma_param))
          # Allow mismatch in macrons, which often happens, e.g. because
          # a macron was added to the lemma page but not to the inflections
          if remove_macrons(actual_lemma, preserve_diaeresis) == remove_macrons(
              lemma, preserve_diaeresis):
            tr = getparam(t, "tr")
            alt = getparam(t, "alt") or getparam(t, str(lemma_param + 1))
            # fetch tags
            tags = []
            params = []
            for param in t.params:
              pname = unicode(param.name).strip()
              pval = unicode(param.value).strip()
              if re.search("^[0-9]+$", pname):
                if int(pname) >= lemma_param + 2:
                  if pval:
                    tags.append(pval)
              elif pname not in ["lang", "tr", "alt"]:
                params.append((pname, pval, param.showkey))
            tag_sets = lalib.split_tags_into_tag_sets(tags)
            # Keep only tag sets NOT slated for deletion.
            filtered_tag_sets = []
            for tag_set in tag_sets:
              if tag_sets_to_delete is not True and frozenset(
                  lalib.canonicalize_tag_set(tag_set)
              ) not in frozenset_tag_sets_to_delete:
                filtered_tag_sets.append(tag_set)
            if not filtered_tag_sets:
              return ""
            # Erase all params.
            del t.params[:]
            # Put back new params.
            t.add("1", lang)
            t.add("2", actual_lemma)
            if tr:
              t.add("tr", tr)
            t.add("3", alt)
            next_tag_param = 4
            for tag in lalib.combine_tag_set_group(filtered_tag_sets):
              t.add(str(next_tag_param), tag)
              next_tag_param += 1
      return unicode(parsed)
    newnewsubsec = re.sub(r"^# \{\{inflection of\|[^{}\n]*\}\}\n",
                          remove_inflections, newsubsec, 0, re.M)
    if newnewsubsec != newsubsec:
      notes.append("removed inflection(s) for bad Latin form(s)")
      subsections[k] = newnewsubsec
  for k in reversed(subsections_to_delete):
    # Do in reverse order so indices don't change
    del subsections[k]
    del subsections[k - 1]
  # NOTE: relies on `and` binding tighter than `or` — whole section is
  # deletable when nothing remains, or only a References subsection remains.
  if len(subsections) == 1 or len(subsections) == 3 and re.search(
      "^==+References==+$", subsections[1].strip()):
    # Whole section deletable
    if subsections[0].strip():
      pagemsg(
        "WARNING: Whole Latin section deletable except that there's text above all subsections: <%s>"
        % subsections[0].strip())
      return None, None
    if "[[Category:" in sectail:
      pagemsg(
        "WARNING: Whole Latin section deletable except that there's a category at the end: <%s>"
        % sectail.strip())
      return None, None
    if not has_non_latin:
      # Can delete the whole page, but check for non-blank section 0
      cleaned_sec0 = re.sub("^\{\{also\|.*?\}\}\n", "", sections[0])
      if cleaned_sec0.strip():
        pagemsg(
          "WARNING: Whole page deletable except that there's text above all sections: <%s>"
          % cleaned_sec0.strip())
        return None, None
      pagetitle = unicode(page.title())
      pagemsg("Page %s should be deleted" % pagetitle)
      pages_to_delete.append(pagetitle)
      return None, None
    del sections[j]
    del sections[j - 1]
    notes.append(
      "excised %s subsection%s for bad Latin forms, leaving no Latin section"
      % (len(subsections_to_delete),
         "" if len(subsections_to_delete) == 1 else "s"))
    if j > len(sections):
      # We deleted the last section, remove the separator at the end of the
      # previous section.
      sections[-1] = re.sub(r"\n+--+\n*\Z", "", sections[-1])
    text = "".join(sections)
  else:
    # Some but not all subsections remain
    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    if subsections_to_delete and subsections_to_remove_inflections_from:
      deletable_subsec_text = "Subsection(s) %s deletable and subsection(s) %s excisable" % (
        ",".join(str(k // 2) for k in subsections_to_delete), ",".join(
          str(k // 2) for k in subsections_to_remove_inflections_from))
      deletable_subsec_note_text = "deleted %s subsection%s and partly excised %s subsection%s" % (
        len(subsections_to_delete),
        "" if len(subsections_to_delete) == 1 else "s",
        len(subsections_to_remove_inflections_from),
        "" if len(subsections_to_remove_inflections_from) == 1 else "s")
    elif subsections_to_delete:
      deletable_subsec_text = "Subsection(s) %s deletable" % (",".join(
        str(k // 2) for k in subsections_to_delete))
      deletable_subsec_note_text = "deleted %s subsection%s" % (
        len(subsections_to_delete),
        "" if len(subsections_to_delete) == 1 else "s")
    else:
      deletable_subsec_text = "Subsection(s) %s excisable" % (",".join(
        str(k // 2) for k in subsections_to_remove_inflections_from))
      deletable_subsec_note_text = "partly excised %s subsection%s" % (
        len(subsections_to_remove_inflections_from),
        "" if len(subsections_to_remove_inflections_from) == 1 else "s")
    # Etymology/Pronunciation subsections may refer to the deleted content;
    # bail out rather than leave dangling structure.
    if "==Etymology" in sections[j]:
      pagemsg(
        "WARNING: %s but found Etymology subsection, don't know how to handle"
        % deletable_subsec_text)
      return None, None
    if "==Pronunciation" in sections[j]:
      pagemsg(
        "WARNING: %s but found Pronunciation subsection, don't know how to handle"
        % deletable_subsec_text)
      return None, None
    notes.append(
      "%s for bad Latin forms, leaving some subsections remaining"
      % deletable_subsec_note_text)
    text = "".join(sections)
  return text, notes
def process_page(page, index, headword_template, decl_template):
  """Insert a Declension subsection containing decl_template immediately
  after the subsection holding headword_template.

  Skips (returning (None, None)) when there are already at least as many
  declension templates or Declension/Inflection headers as (proper) noun
  headwords, or when the headword template can't be located.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []

  # Tally headword vs. declension templates across the whole Latin section.
  parsed = blib.parse_text(secbody)
  headword_count = 0
  ndecl_count = 0
  adecl_count = 0
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["la-noun", "la-proper noun"]:
      headword_count += 1
    if tn == "la-ndecl":
      ndecl_count += 1
    if tn == "la-adecl":
      adecl_count += 1

  # FIXME, also add something for manually-specified declensions (synaeresis?)
  if "\n===Declension===\n" in secbody:
    pagemsg("WARNING: Saw misindented Declension header")
  if adecl_count >= 1:
    pagemsg("WARNING: Saw {{la-adecl}} in noun section")
  decl_count = ndecl_count + adecl_count
  if decl_count >= headword_count:
    pagemsg(
      "WARNING: Already seen %s decl template(s) >= %s headword template(s), skipping"
      % (decl_count, headword_count))
    return None, None

  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  # Odd indices are headers; count existing Declension/Inflection headers.
  header_count = 0
  for k in xrange(1, len(subsections), 2):
    if "Declension" in subsections[k] or "Inflection" in subsections[k]:
      header_count += 1
  if header_count >= headword_count:
    pagemsg(
      "WARNING: Already seen %s Declension/Inflection header(s) >= %s headword template(s), skipping"
      % (header_count, headword_count))
    return None, None

  # Insert the new subsection right after the one containing the headword,
  # one header level deeper than that subsection's header.
  for k in xrange(2, len(subsections), 2):
    if headword_template not in subsections[k]:
      continue
    pagemsg("Inserting declension section after subsection %s" % k)
    subsections[k] = subsections[k].rstrip('\n') + "\n\n"
    depth = len(re.sub("^(=+).*", r"\1", subsections[k - 1].strip()))
    bars = "=" * (depth + 1)
    subsections[k + 1:k + 1] = [
      "%sDeclension%s\n%s\n\n" % (bars, bars, decl_template)
    ]
    notes.append("add section for Latin declension %s" % decl_template)
    break
  else:
    pagemsg("WARNING: Couldn't locate headword template, skipping: %s"
            % headword_template)
    return None, None

  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  text = "".join(sections)
  text = re.sub("\n\n\n+", "\n\n", text)
  if not notes:
    notes.append("convert 3+ newlines to 2")
  return text, notes
def process_page(page, index, parsed):
  """Move inline "comparative: ... superlative: ..." text into comp= and
  sup= parameters of the {{la-adj}} headword template, deleting the inline
  line from the subsection.

  Returns (newtext, notes), or (None, None) if no Latin section.
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  notes = []
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)
  saw_a_template = False
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    # Require exactly one {{la-adj}} per subsection.
    la_adj_template = None
    must_continue = False
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "la-adj":
        if la_adj_template:
          pagemsg(
            "WARNING: Saw multiple adjective headword templates in subsection, %s and %s, skipping"
            % (unicode(la_adj_template), unicode(t)))
          must_continue = True
          break
        la_adj_template = t
        saw_a_template = True
    if must_continue:
      continue
    if not la_adj_template:
      continue
    # Match e.g. "''comparative'': '''foo''', ''superlative'': '''bar'''"
    # with flexible quoting.
    m = re.search(
      r"'*comparative'*: '*(.*?)'+,* *'*superlative'*: '*(.*?)'+",
      subsections[k])
    if m:
      comp, sup = m.groups()
      def parse_comp_sup(cs):
        # Unwrap {{l|la|...}}/{{m|la|...}}, [[page|display]] or [[page]];
        # return None (with a warning) if the value isn't in one of those
        # shapes.
        m = re.search(r"^\{\{[lm]\|la\|(.*?)\}\}$", cs)
        if m:
          return m.group(1)
        m = re.search(r"^\[\[.*?\|(.*?)\]\]$", cs)
        if m:
          return m.group(1)
        m = re.search(r"^\[\[(.*?)\]\]$", cs)
        if m:
          return m.group(1)
        pagemsg("WARNING: Can't parse comp/sup %s" % cs)
        return None
      comp = parse_comp_sup(comp)
      sup = parse_comp_sup(sup)
      if comp and sup:
        orig_la_adj_template = unicode(la_adj_template)
        la_adj_template.add("comp", comp)
        la_adj_template.add("sup", sup)
        pagemsg("Replaced %s with %s" % (orig_la_adj_template,
                                         unicode(la_adj_template)))
        # NOTE(review): "superative" below is a typo for "superlative", but
        # it is a runtime string (edit-summary note), left unchanged here.
        notes.append(
          "move comparative/superative to {{la-adj}} headword params"
        )
        subsections[k] = unicode(parsed)
        # Delete the now-redundant inline bullet line (first occurrence only).
        subsections[k] = re.sub(
          r"\n+\* *'*comparative'*: '*(.*?)'+,* *'*superlative'*: '*(.*?)'+\n+",
          "\n\n", subsections[k], 1)
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
  """Merge ==Pronunciation 1==/==Pronunciation 2== section pairs.

  If the two halves cover the same POS and lemmas, combine them into a
  single Pronunciation subsection (annotating each {{la-IPA}} with ann=1);
  if they differ in POS, convert them into ===Etymology 1===/
  ===Etymology 2=== sections instead. Works both at the top level of the
  Latin section and inside existing Etymology N sections (one header level
  deeper). Returns (newtext, notes), or (None, None) if no Latin section.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []
  def process_etym_section(sectext, is_etym_section):
    # Process one chunk: either the whole Latin section body or the body of
    # one Etymology N section. Returns the (possibly rewritten) chunk.
    if "==Pronunciation 1==" not in sectext:
      pagemsg("No ==Pronunciation 1== in %s" %
              ("etym section" if is_etym_section else "text"))
      return sectext
    # Header depth: L4 inside an etym section, L3 at top level.
    if is_etym_section:
      equalsigns = "===="
    else:
      equalsigns = "==="
    # After the split, odd indices are headers and even indices bodies;
    # the offset below shifts all fixed indices when a leading Etymology
    # section is present.
    subsections = re.split("(^==.*==\n)", sectext, 0, re.M)
    if len(subsections) > 2 and subsections[1] == "===Etymology===\n":
      # Allow for an Etymology section at the beginning (many examples have one,
      # saying e.g. "Inflected form of {{m|la|pulchellus||beautiful little}}.".
      offset = 2
    else:
      offset = 0
    # Expected layout: Pronunciation 1, POS, Pronunciation 2, POS,
    # optionally followed by References.
    if not (len(subsections) == 9 + offset or
            (len(subsections) == 11 + offset
             and subsections[9 + offset] == "===References===\n")):
      pagemsg(
        "WARNING: Not right # of sections (normally four, potentially five or six with ===Etymology=== and/or ===References===): %s"
        % (",".join(subsections[k].strip()
                    for k in xrange(1, len(subsections), 2))))
      return sectext
    if (subsections[1 + offset] != "%sPronunciation 1%s\n" %
        (equalsigns, equalsigns) or
        subsections[5 + offset] != "%sPronunciation 2%s\n" %
        (equalsigns, equalsigns)):
      pagemsg(
        "WARNING: Expected %sPronunciation N%s headers but saw %s and %s"
        % (equalsigns, equalsigns, subsections[1 + offset].strip(),
           subsections[5 + offset].strip()))
      return sectext
    if subsections[3 + offset] != subsections[7 + offset]:
      # Different POS headers: can only restructure at top level without an
      # existing Etymology section.
      if is_etym_section:
        pagemsg(
          "WARNING: Already in etym section and saw different POS headers %s and %s, can't convert to etym sections"
          % (subsections[3 + offset].strip(),
             subsections[7 + offset].strip()))
        return sectext
      elif offset > 0:
        pagemsg(
          "WARNING: Already have ===Etymology=== section and saw different POS headers %s and %s, can't convert to etym sections"
          % (subsections[3 + offset].strip(),
             subsections[7 + offset].strip()))
        return sectext
      else:
        pagemsg("Saw different POS headers %s and %s" %
                (subsections[3 + offset].strip(),
                 subsections[7 + offset].strip()))
        subsections[
          1 + offset] = "===Etymology 1===\n\n====Pronunciation====\n"
        subsections[2 + offset] = re.sub(r"^\{\{rfc-pron-n\|.*?\}\}\n", "",
                                         subsections[2 + offset], 0, re.M)
        subsections[
          5 + offset] = "===Etymology 2===\n\n====Pronunciation====\n"
        notes.append(
          "Combined ===Pronunciation 1=== and ===Pronunciation 2=== to ===Etymology 1=== and ===Etymology 2=== because different parts of speech/lemmas"
        )
        return "".join(subsections)
    else:
      def find_lemmas(text):
        # Collect lemmas referenced by {{inflection of}}, handling both
        # old-style (lang= + 1=) and new-style (1= + 2=) parameter layouts.
        lemmas = set()
        parsed = blib.parse_text(text)
        for t in parsed.filter_templates():
          if tname(t) == "inflection of":
            if getparam(t, "lang"):
              lemmas.add(getparam(t, "1"))
            else:
              lemmas.add(getparam(t, "2"))
        return lemmas
      first_lemmas = find_lemmas(subsections[4 + offset])
      second_lemmas = find_lemmas(subsections[8 + offset])
      if first_lemmas != second_lemmas:
        pagemsg(
          "WARNING: Different lemmas in two POS sections: %s and %s"
          % (",".join(first_lemmas), ",".join(second_lemmas)))
        return sectext
      # For verbs with the infinitive in the second section, swap the
      # sections to put the infinitive first.
      if re.search(r"\|inf[|}]", subsections[8 + offset]):
        # Preserve the newlines at the end of each section; only swap the text.
        m = re.match(r"\A(.*?)(\n*)\Z", subsections[4 + offset], re.S)
        text4, newlines4 = m.groups()
        m = re.search(r"\A(.*?)(\n*)\Z", subsections[8 + offset], re.S)
        text8, newlines8 = m.groups()
        subsections[4 + offset] = text8 + newlines4
        subsections[8 + offset] = text4 + newlines8
        temptext = subsections[2 + offset]
        subsections[2 + offset] = subsections[6 + offset]
        subsections[6 + offset] = temptext
        notes.append("swap non-lemma sections to put infinitive first")
      # Merge into one Pronunciation subsection and demote the two POS
      # headers back to the base level.
      subsections[1 + offset] = "%sPronunciation%s\n" % (equalsigns,
                                                         equalsigns)
      subsections[3 + offset] = re.sub(
        "^=+", equalsigns,
        re.sub("=+\n$", equalsigns + "\n", subsections[3 + offset]))
      subsections[7 + offset] = re.sub(
        "^=+", equalsigns,
        re.sub("=+\n$", equalsigns + "\n", subsections[7 + offset]))
      subsections[2 + offset] = subsections[2 + offset].strip(
      ) + "\n" + subsections[6 + offset].strip() + "\n\n"
      # Mark each {{la-IPA}} with ann=1 so the merged pronunciations stay
      # annotated with the form they apply to.
      parsed = blib.parse_text(subsections[2 + offset])
      for t in parsed.filter_templates():
        if tname(t) == "la-IPA":
          t.add("ann", "1")
      subsections[2 + offset] = unicode(parsed)
      subsections[2 + offset] = re.sub(r"^\{\{rfc-pron-n\|.*?\}\}\n", "",
                                       subsections[2 + offset], 0, re.M)
      # Drop the now-merged second Pronunciation header and body
      # (higher index first so the second delete's index stays valid).
      del subsections[6 + offset]
      del subsections[5 + offset]
      notes.append(
        "combine %sPronunciation 1%s and %sPronunciation 2%s"
        % (equalsigns, equalsigns, equalsigns, equalsigns))
      return "".join(subsections)
  has_etym_1 = "==Etymology 1==" in secbody
  if not has_etym_1:
    secbody = process_etym_section(secbody, is_etym_section=False)
  else:
    etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0,
                             re.M)
    for k in xrange(2, len(etym_sections), 2):
      etym_sections[k] = process_etym_section(etym_sections[k],
                                              is_etym_section=True)
    secbody = "".join(etym_sections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
    """Convert {{head|la|adverb ...}} headwords of Latin comparative/superlative
    adverbs into the dedicated {{la-adv-comp}}/{{la-adv-sup}} templates, fixing
    up the positive-degree link in {{comparative of}}/{{superlative of}} along
    the way.

    Standard blib page-processing signature; `parsed` is immediately shadowed
    below and effectively unused as passed in.

    Returns (newtext, notes) on success, or (None, None) if the page has no
    Latin section.

    NOTE(review): this body was reconstructed from whitespace-mangled source;
    the nesting of a few statements (e.g. `posdeg = ...` inside the dup-check
    `else`, and the `if compt:` block sitting alongside `if real_head:`) was
    inferred — confirm against the original script.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    text = unicode(page.text)
    origtext = text
    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None
    sections, j, secbody, sectail, has_non_latin = retval
    notes = []
    # Split the Latin section body into (text, header, text, header, ...);
    # even indices >= 2 are subsection bodies, odd indices are the headers.
    subsections = re.split("(^===[^=\n]*===\n)", secbody, 0, re.M)
    for k in xrange(2, len(subsections), 2):
        if "==Adverb==" in subsections[k - 1]:
            parsed = blib.parse_text(subsections[k])
            # posdeg: positive-degree lemma extracted from the definition
            # template; compt/supt: the {{comparative of}}/{{superlative of}}
            # template objects (at most one of each expected).
            posdeg = None
            compt = None
            supt = None
            for t in parsed.filter_templates():
                if tname(t) == "comparative of":
                    if compt:
                        pagemsg(
                            "WARNING: Saw multiple {{comparative of}}: %s and %s"
                            % (unicode(compt), unicode(t)))
                    else:
                        compt = t
                        posdeg = blib.remove_links(getparam(t, "1"))
                        if not posdeg:
                            pagemsg(
                                "WARNING: Didn't see positive degree in {{comparative of}}: %s"
                                % unicode(t))
                elif tname(t) == "superlative of":
                    if supt:
                        pagemsg(
                            "WARNING: Saw multiple {{superlative of}}: %s and %s"
                            % (unicode(supt), unicode(t)))
                    else:
                        supt = t
                        posdeg = blib.remove_links(getparam(t, "1"))
                        if not posdeg:
                            pagemsg(
                                "WARNING: Didn't see positive degree in {{superlative of}}: %s"
                                % unicode(t))
            # Need exactly one of comparative/superlative to know which
            # headword template to convert to.
            if compt and supt:
                pagemsg(
                    "WARNING: Saw both comparative and superlative, skipping: %s and %s"
                    % (unicode(compt), unicode(supt)))
                continue
            if not compt and not supt:
                pagemsg(
                    "WARNING: Didn't see {{comparative of}} or {{superlative of}} in section %s"
                    % k)
                continue
            # Second pass: locate the {{head}} template and convert it.
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn in ["la-adv-comp", "la-adv-sup"]:
                    # Already converted; nothing to do in this subsection.
                    pagemsg("Already saw fixed headword: %s" % unicode(t))
                    break
                if tn == "head":
                    if not getparam(t, "1") == "la":
                        pagemsg("WARNING: Saw wrong language in {{head}}: %s"
                                % unicode(t))
                    else:
                        pos = getparam(t, "2")
                        head = blib.remove_links(getparam(t, "head")) or pagetitle
                        if pos not in [
                            "adverb", "adverbs", "adverb form", "adverb forms",
                            "adverb comparative form",
                            "adverb comparative forms",
                            "adverb superlative form",
                            "adverb superlative forms",
                        ]:
                            pagemsg(
                                "WARNING: Unrecognized part of speech '%s': %s"
                                % (pos, unicode(t)))
                        else:
                            # Look up the canonical (macron-bearing) positive,
                            # comparative and superlative forms.
                            real_head, real_comp, real_sup = find_head_comp_sup(
                                lalib.remove_macrons(posdeg), pagemsg)
                            if real_head:
                                # Only substitute the positive degree if it
                                # differs solely in macrons.
                                if lalib.remove_macrons(
                                        real_head) != lalib.remove_macrons(
                                            posdeg):
                                    pagemsg(
                                        "WARNING: Can't replace positive degree %s with %s because they differ when macrons are removed"
                                        % (posdeg, real_head))
                                else:
                                    pagemsg(
                                        "Using real positive degree %s instead of %s"
                                        % (real_head, posdeg))
                                    inflt = compt or supt
                                    origt = unicode(inflt)
                                    inflt.add("1", real_head)
                                    pagemsg("Replaced %s with %s"
                                            % (origt, unicode(inflt)))
                            if compt:
                                newname = "la-adv-comp"
                                infldeg = "comparative"
                                # "-" means "no comparative exists".
                                if real_comp and real_comp != "-":
                                    if lalib.remove_macrons(
                                            real_comp) != lalib.remove_macrons(
                                                head):
                                        pagemsg(
                                            "WARNING: Can't replace comparative degree %s with %s because they differ when macrons are removed"
                                            % (head, real_comp))
                                    else:
                                        pagemsg(
                                            "Using real comparative degree %s instead of %s"
                                            % (real_comp, head))
                                        head = real_comp
                                else:
                                    pagemsg(
                                        "WARNING: Couldn't retrieve real comparative for positive degree %s"
                                        % real_head)
                            else:
                                newname = "la-adv-sup"
                                infldeg = "superlative"
                                if real_sup and real_sup != "-":
                                    if lalib.remove_macrons(
                                            real_sup) != lalib.remove_macrons(
                                                head):
                                        pagemsg(
                                            "WARNING: Can't replace superlative degree %s with %s because they differ when macrons are removed"
                                            % (head, real_sup))
                                    else:
                                        pagemsg(
                                            "Using real superlative degree %s instead of %s"
                                            % (real_sup, head))
                                        head = real_sup
                                else:
                                    pagemsg(
                                        "WARNING: Couldn't retrieve real superlative for positive degree %s"
                                        % real_head)
                            # Rewrite {{head|la|...}} in place as
                            # {{la-adv-comp|head}} / {{la-adv-sup|head}}.
                            origt = unicode(t)
                            rmparam(t, "head")
                            rmparam(t, "2")
                            rmparam(t, "1")
                            blib.set_template_name(t, newname)
                            t.add("1", head)
                            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                            notes.append(
                                "replace {{head|la|...}} with {{%s}} and fix up positive/%s"
                                % (newname, infldeg))
            subsections[k] = unicode(parsed)
    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
def process_page(page, index, parsed):
    """Replace manual Latin pronunciations with {{la-IPA}} in a page's Latin
    section.

    Two transformations are applied per (sub)section:
      1. Manual "{{a|Classical}} {{IPA...}}" lines (or bare "* {{IPA...|lang=la}}"
         lines) are replaced with "{{la-IPA|<head>}}", using the single headword
         found in that section.
      2. A manual "{{a|Ecclesiastical}} {{IPA...}}" line is removed and
         |eccl=yes is added to the (single) {{la-IPA}} template instead.

    Standard blib page-processing signature; `parsed` is unused as passed in.
    Returns (newtext, notes) on success, or (None, None) if there is no Latin
    section.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    text = unicode(page.text)
    origtext = text
    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None
    sections, j, secbody, sectail, has_non_latin = retval
    notes = []

    def fix_up_section(sectext, warn_on_multiple_heads):
        # Collect all headwords and existing {{la-IPA}} templates in this
        # stretch of text; we can only substitute a pronunciation when there
        # is exactly one candidate headword.
        parsed = blib.parse_text(sectext)
        heads = set()
        pronun_templates = []
        for t in parsed.filter_templates():
            tn = tname(t)
            if lalib.la_template_is_head(t):
                heads |= set(blib.remove_links(x) for x in
                             lalib.la_get_headword_from_template(t, pagetitle, pagemsg))
            elif tn == "la-IPA":
                pronun_templates.append(t)
        if len(heads) > 1:
            if warn_on_multiple_heads:
                pagemsg("WARNING: Found multiple possible heads, not modifying: %s" % ",".join(heads))
            return sectext
        if len(heads) == 0:
            pagemsg("WARNING: Found no possible heads, not modifying: %s" % ",".join(heads))
            return sectext
        # Replace manual Classical pronunciations with {{la-IPA|<head>}}.
        newsectext = re.sub(r"\{\{a\|Classical\}\} \{\{IPA(char)?\|.*?\}\}",
            "{{la-IPA|%s}}" % list(heads)[0], sectext)
        newsectext = re.sub(r"^\* \{\{IPA(char)?\|.*?\|lang=la\}\}",
            "{{la-IPA|%s}}" % list(heads)[0], newsectext, 0, re.M)
        if newsectext != sectext:
            notes.append("replaced manual Latin pronun with {{la-IPA|%s}}" % list(heads)[0])
            sectext = newsectext
        # Recompute pronun templates as we may have added one.
        parsed = blib.parse_text(sectext)
        pronun_templates = []
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "la-IPA":
                pronun_templates.append(t)
        if "{{a|Ecclesiastical}} {{IPA" in sectext:
            if len(pronun_templates) == 0:
                pagemsg("WARNING: Found manual Ecclesiastical pronunciation but not {{la-IPA}} template")
            elif len(pronun_templates) > 1:
                pagemsg("WARNING: Found manual Ecclesiastical pronunciation and multiple {{la-IPA}} templates: %s" % ",".join(unicode(tt) for tt in pronun_templates))
            else:
                origt = unicode(pronun_templates[0])
                pronun_templates[0].add("eccl", "yes")
                pagemsg("Replaced %s with %s" % (origt, unicode(pronun_templates[0])))
                # BUG FIX: |eccl=yes was added to the parse tree above but the
                # tree was never serialized back into `sectext`, so the change
                # (which the log line and changelog note both claim) was
                # silently dropped. Serialize before removing the manual line.
                sectext = unicode(parsed)
                newsectext = re.sub(r"^\* \{\{a\|Ecclesiastical\}\} \{\{IPA(char)?\|.*?\}\}\n", "", sectext, 0, re.M)
                if newsectext == sectext:
                    pagemsg("WARNING: Unable to remove manual Ecclesiastical prounciation")
                else:
                    notes.append("removed manual Ecclesiastical pronunciation and added |eccl=yes to {{la-IPA}}")
                    sectext = newsectext
        return sectext

    # If there are multiple Etymology sections, the pronunciation may be above all of
    # them if all have the same pronunciation, else it will be within each section.
    # Cater to both situations. We first try without splitting on etym sections; if that
    # doesn't change anything, it may be because there were multiple heads found and
    # separate pronunciation sections, so we then try splitting on etym sections.
    has_etym_1 = "==Etymology 1==" in secbody
    newsecbody = fix_up_section(secbody, warn_on_multiple_heads=not has_etym_1)
    if newsecbody != secbody:
        secbody = newsecbody
    elif has_etym_1:
        etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
        for k in xrange(2, len(etym_sections), 2):
            etym_sections[k] = fix_up_section(etym_sections[k],
                warn_on_multiple_heads=True)
        secbody = "".join(etym_sections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
def delete_term(index, term, expected_head_templates, save, verbose):
    """Check whether the Latin section for `term` consists solely of expected
    form-of content and, if so, delete it (or flag the whole page for
    deletion via the module-level `pages_to_delete` list).

    `expected_head_templates` is a collection of headword-template names that
    justify deletion; any other non-whitelisted template blocks it.

    NOTE(review): early exits are bare `return` (i.e. None) while the success
    path returns (text, notes) — callers presumably handle both; confirm
    before unpacking the result.
    """
    notes = []

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, term, txt))

    page = pywikibot.Page(site, term)
    if not page.exists():
        pagemsg("Skipping form value %s, page doesn't exist" % term)
        return
    text = unicode(page.text)
    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_latin = retval
    # (text, header, text, header, ...); even indices >= 2 are subsection
    # bodies, odd indices are the headers they follow.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
    # NOTE(review): the *_etym flags are set up but never written in this
    # function (unlike delete_participle_1, which shares this skeleton).
    saw_lemma_in_etym = False
    saw_wrong_lemma_in_etym = False
    saw_head = False
    infl_template = None
    saw_bad_template = False
    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn in expected_head_templates:
                saw_head = True
            elif tn in ["inflection of", "rfdef", "la-IPA"]:
                # Harmless templates that don't block deletion.
                pass
            else:
                pagemsg(
                    "WARNING: Saw unrecognized template in subsection #%s %s: %s"
                    % (k // 2, subsections[k - 1].strip(), unicode(t)))
                saw_bad_template = True
    # Delete only if we saw an expected headword template and nothing
    # unrecognized anywhere in the section.
    delete = False
    if saw_head:
        if saw_bad_template:
            pagemsg(
                "WARNING: Would delete but saw unrecognized template, not deleting"
            )
        else:
            delete = True
    if not delete:
        return
    # Structures we don't know how to delete from safely.
    if "==Etymology" in sections[j]:
        pagemsg(
            "WARNING: Found Etymology subsection, don't know how to handle")
        return
    if "==Pronunciation " in sections[j]:
        pagemsg(
            "WARNING: Found Pronunciation N subsection, don't know how to handle"
        )
        return
    #### Now, we can maybe delete the whole section or page
    if subsections[0].strip():
        pagemsg(
            "WARNING: Whole Latin section deletable except that there's text above all subsections: <%s>"
            % subsections[0].strip())
        return
    if "[[Category:" in sectail:
        pagemsg(
            "WARNING: Whole Latin section deletable except that there's a category at the end: <%s>"
            % sectail.strip())
        return
    if not has_non_latin:
        # Can delete the whole page, but check for non-blank section 0
        cleaned_sec0 = re.sub("^\{\{also\|.*?\}\}\n", "", sections[0])
        if cleaned_sec0.strip():
            pagemsg(
                "WARNING: Whole page deletable except that there's text above all sections: <%s>"
                % cleaned_sec0.strip())
            return
        pagetitle = unicode(page.title())
        pagemsg("Page %s should be deleted" % pagetitle)
        # Whole-page deletions are queued for a human/admin pass rather than
        # performed here.
        pages_to_delete.append(pagetitle)
        return
    # Drop the Latin section and the language separator preceding it.
    del sections[j]
    del sections[j - 1]
    notes.append("removed Latin section for bad term")
    if j > len(sections):
        # We deleted the last section, remove the separator at the end of the
        # previous section.
        sections[-1] = re.sub(r"\n+--+\n*\Z", "", sections[-1])
    text = "".join(sections)
    return text, notes
def delete_participle_1(page, index, lemma, formind, formval, pos,
        preserve_diaeresis, save, verbose, diff):
    """Delete the Latin section of a bad participle page `formval` (belonging
    to verb `lemma`), after first deleting all of the participle's own
    inflected forms via delete_form().

    Deletion only proceeds when the section contains exactly the expected
    content: an {{m|...}} reference to the correct lemma in the Etymology
    subsection, a {{la-part}} headword, one {{la-adecl}} inflection table,
    and otherwise only whitelisted templates.

    Returns (newtext, notes) when the section was removed, else (None, None)
    (including the case where the whole page is instead queued on the
    module-level `pages_to_delete` list).
    """
    notes = []

    def pagemsg(txt):
        msg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval, txt))

    def expand_text(tempcall):
        # Expand templates in the context of the macron-less page name.
        return blib.expand_text(tempcall,
            remove_macrons(formval, preserve_diaeresis), pagemsg, verbose)

    expected_head_template = "la-part"
    text = unicode(page.text)
    origtext = text
    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None
    sections, j, secbody, sectail, has_non_latin = retval
    # (text, header, text, header, ...); even indices >= 2 are subsection
    # bodies, odd indices are the headers they follow.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
    saw_lemma_in_etym = False
    saw_wrong_lemma_in_etym = False
    saw_head = False
    infl_template = None
    saw_bad_template = False
    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "m" and "==Etymology==" in subsections[k - 1]:
                # The Etymology section should link to the source verb;
                # compare modulo macrons (and optionally diaereses).
                actual_lemma = getparam(t, "2")
                if remove_macrons(lemma, preserve_diaeresis) == remove_macrons(
                        actual_lemma, preserve_diaeresis):
                    saw_lemma_in_etym = True
                else:
                    pagemsg(
                        "WARNING: Saw wrong lemma %s != %s in Etymology section: %s"
                        % (actual_lemma, lemma, unicode(t)))
                    saw_wrong_lemma_in_etym = True
            elif tn == expected_head_template:
                saw_head = True
            elif tn == "la-adecl":
                if not saw_head:
                    pagemsg(
                        "WARNING: Saw inflection template without (or before) head template, skipping: %s"
                        % unicode(t))
                elif infl_template:
                    pagemsg(
                        "WARNING: Saw two possible inflection templates: first %s, second %s"
                        % (infl_template, unicode(t)))
                else:
                    # Stored as wikitext, not as a template object, since it
                    # is later passed to generate_adj_forms().
                    infl_template = unicode(t)
            elif tn in [
                "rfdef", "R:L&S", "R:Elementary Lewis", "R:du Cange",
                "R:Gaffiot", "R:NLW", "alternative form of", "la-IPA"
            ]:
                # Whitelisted reference/maintenance templates; don't block
                # deletion.
                pass
            else:
                pagemsg(
                    "WARNING: Saw unrecognized template in subsection #%s %s: %s"
                    % (k // 2, subsections[k - 1].strip(), unicode(t)))
                saw_bad_template = True
    # Only delete when everything checked out: correct lemma referenced,
    # expected headword + inflection table, nothing unrecognized.
    delete = False
    if saw_head and infl_template:
        if not saw_lemma_in_etym:
            pagemsg(
                "WARNING: Would delete but didn't see reference to correct lemma %s in Etymology section, not deleting"
                % lemma)
        elif saw_wrong_lemma_in_etym:
            pagemsg(
                "WARNING: Would delete but saw reference to wrong lemma in Etymology section, not deleting"
            )
        elif saw_bad_template:
            pagemsg(
                "WARNING: Would delete but saw unrecognized template, not deleting"
            )
        else:
            delete = True
    if not delete:
        return None, None
    # First delete every inflected form of this participle.
    args = lalib.generate_adj_forms(infl_template, errandpagemsg, expand_text)
    if args is None:
        return None, None
    single_forms_to_delete = []
    for key, form in args.iteritems():
        # Values may contain comma-separated alternative forms.
        single_forms_to_delete.extend(form.split(","))
    for formformind, formformval in blib.iter_items(single_forms_to_delete):
        delete_form(index, formval, formformind, formformval, "partform",
            True, preserve_diaeresis, save, verbose, diff)
    #### Now, we can maybe delete the whole section or page
    if subsections[0].strip():
        pagemsg(
            "WARNING: Whole Latin section deletable except that there's text above all subsections: <%s>"
            % subsections[0].strip())
        return None, None
    if "[[Category:" in sectail:
        pagemsg(
            "WARNING: Whole Latin section deletable except that there's a category at the end: <%s>"
            % sectail.strip())
        return None, None
    if not has_non_latin:
        # Can delete the whole page, but check for non-blank section 0
        cleaned_sec0 = re.sub("^\{\{also\|.*?\}\}\n", "", sections[0])
        if cleaned_sec0.strip():
            pagemsg(
                "WARNING: Whole page deletable except that there's text above all sections: <%s>"
                % cleaned_sec0.strip())
            return None, None
        pagetitle = unicode(page.title())
        pagemsg("Page %s should be deleted" % pagetitle)
        # Whole-page deletions are queued for a human/admin pass rather than
        # performed here.
        pages_to_delete.append(pagetitle)
        return None, None
    # Drop the Latin section and the language separator preceding it.
    del sections[j]
    del sections[j - 1]
    notes.append("removed Latin section for bad participle")
    if j > len(sections):
        # We deleted the last section, remove the separator at the end of the
        # previous section.
        sections[-1] = re.sub(r"\n+--+\n*\Z", "", sections[-1])
    text = "".join(sections)
    return text, notes