def remove_inflections(m):
  """Regex-substitution callback: given a match over a chunk of wikitext,
  filter the tag sets of any Latin {{inflection of}} templates pointing at
  `lemma` (from the enclosing scope), dropping the tag sets selected by
  `tag_sets_to_delete`/`frozenset_tag_sets_to_delete`.  Returns the rewritten
  wikitext, or "" if no tag sets survive (i.e. the whole chunk should go)."""
  parsed = blib.parse_text(m.group(0))
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "inflection of":
      # Support both calling conventions:
      # {{inflection of|lang=la|LEMMA|...}} and {{inflection of|la|LEMMA|...}}.
      lang = getparam(t, "lang")
      if lang:
        lemma_param = 1
      else:
        lang = getparam(t, "1")
        lemma_param = 2
      # Caller guarantees only Latin templates reach here.
      assert lang == "la"
      actual_lemma = getparam(t, str(lemma_param))
      # Allow mismatch in macrons, which often happens, e.g. because
      # a macron was added to the lemma page but not to the inflections
      if remove_macrons(actual_lemma, preserve_diaeresis) == remove_macrons(
          lemma, preserve_diaeresis):
        tr = getparam(t, "tr")
        alt = getparam(t, "alt") or getparam(t, str(lemma_param + 1))
        # fetch tags
        tags = []
        params = []
        for param in t.params:
          pname = unicode(param.name).strip()
          pval = unicode(param.value).strip()
          if re.search("^[0-9]+$", pname):
            # Numbered params past lang/lemma/alt are inflection tags.
            if int(pname) >= lemma_param + 2:
              if pval:
                tags.append(pval)
          elif pname not in ["lang", "tr", "alt"]:
            params.append((pname, pval, param.showkey))
        # NOTE(review): `params` collects the remaining named params but is
        # never re-added after `del t.params[:]` below, so any such params
        # are silently dropped — confirm this is intended.
        tag_sets = lalib.split_tags_into_tag_sets(tags)
        filtered_tag_sets = []
        for tag_set in tag_sets:
          # Keep only tag sets NOT slated for deletion; if
          # tag_sets_to_delete is True, everything is deleted.
          if tag_sets_to_delete is not True and frozenset(
              lalib.canonicalize_tag_set(tag_set)
              ) not in frozenset_tag_sets_to_delete:
            filtered_tag_sets.append(tag_set)
        if not filtered_tag_sets:
          return ""
        # Erase all params.
        del t.params[:]
        # Put back new params.
        t.add("1", lang)
        t.add("2", actual_lemma)
        if tr:
          t.add("tr", tr)
        t.add("3", alt)
        next_tag_param = 4
        for tag in lalib.combine_tag_set_group(filtered_tag_sets):
          t.add(str(next_tag_param), tag)
          next_tag_param += 1
  return unicode(parsed)
def process_text_on_page(index, pagetitle, text):
  """Check the Latin section of a page: warn about any headword whose
  macron-less form doesn't match the page title.  Read-only: always
  returns (None, None) (no page change, no notes)."""
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  if not args.stdin:
    pagemsg("Processing")
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in lalib.la_headword_templates:
      for head in lalib.la_get_headword_from_template(t, pagetitle, pagemsg):
        no_macrons_head = remove_macrons(blib.remove_links(head))
        # Reconstruction pages are titled "Reconstruction:Latin/FOO" but the
        # headword appears as "*FOO".
        if pagetitle.startswith("Reconstruction"):
          unprefixed_title = "*" + re.sub(".*/", "", pagetitle)
        else:
          unprefixed_title = pagetitle
        if no_macrons_head != unprefixed_title:
          pagemsg("WARNING: Bad Latin head: %s" % unicode(t))
  return None, None
def delete_form(index, lemma, formind, formval, pos, tag_sets_to_delete, preserve_diaeresis, save, verbose, diff):
  """Queue deletion work for one non-lemma form of `lemma`.

  Skips form values containing links and form pages that don't exist;
  otherwise delegates the actual edit to delete_form_1() via blib.do_edit().
  """
  def pagemsg(txt):
    msg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval, txt))
  # A bracketed form value embeds a wikilink and can't name a page directly.
  if "[" in formval:
    pagemsg("Skipping form value %s with link in it" % formval)
    return
  form_page = pywikibot.Page(site, remove_macrons(formval, preserve_diaeresis))
  # Nothing to delete if the form page was never created.
  if not form_page.exists():
    pagemsg("Skipping form value %s, page doesn't exist" % formval)
    return
  # Adapter with the (page, index, parsed) signature blib.do_edit expects.
  def do_delete_form_1(page, index, parsed):
    return delete_form_1(page, index, lemma, formind, formval, pos, tag_sets_to_delete, preserve_diaeresis)
  blib.do_edit(form_page, index, do_delete_form_1, save=save, verbose=verbose, diff=diff)
def compare_headword_conj_forms(id_slot, headword_forms, conj_slots, adjust_for_missing_perf_forms=False, remove_conj_links=False):
  """Compare headword forms against the conjugation-table forms for the
  first of `conj_slots` found in verb_props (enclosing scope).  Returns True
  if the two sets agree after normalization; otherwise emits a warning (via
  the enclosing pagemsg) distinguishing macron-only mismatches from real
  mismatches, and returns False."""
  conj_forms = ""
  # Use the first listed slot that the conjugation actually produced.
  for slot in conj_slots:
    if slot in verb_props:
      conj_forms = verb_props[slot]
      break
  conj_forms = safe_split(conj_forms, ",")
  if remove_conj_links:
    conj_forms = [blib.remove_links(x) for x in conj_forms]
  # Normalize both sides: vowels before ns/nf are always long in Latin.
  corrected_headword_forms = [lengthen_ns_nf(x) for x in headword_forms]
  corrected_conj_forms = [lengthen_ns_nf(x) for x in conj_forms]
  if adjust_for_missing_perf_forms:
    # There are several instances of 4++ verbs where only the -īvī variant,
    # not the -iī variant, is listed in the headword. Don't get tripped up
    # by that.
    ivi_conj_forms = [x for x in corrected_conj_forms if x.endswith(u"īvī")]
    for ivi_conj_form in ivi_conj_forms:
      ii_conj_form = re.sub(u"īvī$", u"iī", ivi_conj_form)
      if ii_conj_form in corrected_conj_forms and ii_conj_form not in corrected_headword_forms:
        corrected_headword_forms.append(ii_conj_form)
  if set(corrected_headword_forms) != set(corrected_conj_forms):
    # Distinguish "differs only in macrons" from a genuine form mismatch.
    macronless_headword_forms = set(lalib.remove_macrons(x) for x in corrected_headword_forms)
    macronless_conj_forms = set(lalib.remove_macrons(x) for x in corrected_conj_forms)
    if macronless_headword_forms == macronless_conj_forms:
      pagemsg("WARNING: Headword %s=%s different from conj %s=%s in macrons only, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(conj_forms), render_headword_and_conj()))
    else:
      pagemsg("WARNING: Headword %s=%s different from conj %s=%s in more than just macrons, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(conj_forms), render_headword_and_conj()))
    return False
  return True
def process_page(page, index):
  """For each {{la-adv}} on the page, infer the possible positive-degree
  adjectives that the adverb could derive from and hand each candidate to
  investigate_possible_adj()."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  if " " in pagetitle:
    pagemsg("WARNING: Space in page title, skipping")
    return
  pagemsg("Processing")
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  # Split on ===...=== headers; odd indices are headers, even are bodies.
  subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    for t in parsed.filter_templates():
      origt = unicode(t)
      tn = tname(t)
      if tn == "la-adv":
        adv = blib.remove_links(getparam(t, "1")) or pagetitle
        macron_stem, is_stem = lalib.infer_adv_stem(adv)
        if not is_stem:
          pagemsg("WARNING: Couldn't infer stem from adverb %s, not standard: %s" % (adv, origt))
          continue
        adv_defns = lalib.find_defns(subsections[k])
        # Generate candidate adjective lemmas from the (macron-less) stem.
        possible_adjs = []
        stem = lalib.remove_macrons(macron_stem)
        possible_adjs.append(stem + "us")
        possible_adjs.append(stem + "is")
        if stem.endswith("nt"):
          possible_adjs.append(stem[:-2] + "ns")
        # NOTE(review): a stem ending in "plic" matches both this branch and
        # the following "c" branch, yielding both -ex and -x candidates —
        # presumably harmless since candidates are just investigated, but
        # verify an elif wasn't intended.
        if stem.endswith("plic"):
          possible_adjs.append(stem[:-2] + "ex")
        if stem.endswith("c"):
          possible_adjs.append(stem[:-1] + "x")
        if re.search("[aeiou]r$", stem):
          possible_adjs.append(stem)
        elif stem.endswith("r"):
          possible_adjs.append(stem[:-1] + "er")
        if adv.endswith(u"iē"):
          possible_adjs.append(stem + "ius")
        for possible_adj in possible_adjs:
          investigate_possible_adj(index, possible_adj, adv, adv_defns)
def get_lemmas_of_form_page(parsed):
  """Return the set of macron-less lemmas that the given parsed non-lemma
  page points to through {{inflection of}}, {{comparative of}} or
  {{superlative of}} templates.

  parsed: a mwparserfromhell-style parse tree (anything with
  filter_templates()).
  """
  lemmas = set()
  for t in parsed.filter_templates():
    # FIX: this local was previously named `tname`, shadowing the
    # module-level tname() helper used everywhere else in this file.
    tn = unicode(t.name)
    first_param = None
    if tn in ["inflection of", "comparative of", "superlative of"]:
      first_param = get_first_param(t)
    if first_param:
      # Normalize away links and macrons so lemmas compare by page title.
      lemma = lalib.remove_macrons(blib.remove_links(getparam(t, first_param)))
      lemmas.add(lemma)
  return lemmas
def process_page(page, index, parsed):
  """In the page's Latin section, for {{l}}/{{m}}/{{alternative form of}}/
  {{alt form}} templates whose alt text is just the macronned version of the
  link target, move the alt text into the link param.  Returns (newtext,
  notes)."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["l", "m", "alternative form of", "alt form"]:
      # Work out where the term param lives: {{l}}/{{m}} always take lang
      # as |1=; the alt-form templates may use lang= or |1=.
      if tn in ["l", "m"]:
        lang = getparam(t, "1")
        termparam = 2
      elif getparam(t, "lang"):
        lang = getparam(t, "lang")
        termparam = 1
      else:
        lang = getparam(t, "1")
        termparam = 2
      if lang != "la":
        #pagemsg("WARNING: Wrong language in template: %s" % unicode(t))
        continue
      term = getparam(t, str(termparam))
      alt = getparam(t, str(termparam + 1))
      gloss = getparam(t, str(termparam + 2))
      # Only fold alt into the link when it differs solely by macrons.
      if alt and lalib.remove_macrons(alt) == term:
        origt = unicode(t)
        t.add(str(termparam), alt)
        if gloss:
          # Keep an empty placeholder so the gloss stays at its position.
          t.add(str(termparam + 1), "")
        else:
          rmparam(t, str(termparam + 1))
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("move alt param to link param in %s" % tn)
  secbody = unicode(parsed)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
  """Fold the alt param of Latin {{inflection of}} templates into the term
  param when the two differ only in macrons.  Returns (newtext, notes)."""
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  for template in parsed.filter_templates():
    if tname(template) != "inflection of":
      continue
    # Old-style calls put the language in lang=; new-style in |1=.
    lang = getparam(template, "lang")
    term_param = 1 if lang else 2
    if not lang:
      lang = getparam(template, "1")
    if lang != "la":
      continue
    term = getparam(template, str(term_param))
    alt = getparam(template, str(term_param + 1))
    if not alt:
      continue
    if lalib.remove_macrons(alt) != lalib.remove_macrons(term):
      pagemsg("WARNING: alt not same as term modulo macrons: %s" % unicode(template))
      continue
    origt = unicode(template)
    # Promote the (macronned) alt text into the term slot and blank alt.
    template.add(str(term_param), alt)
    template.add(str(term_param + 1), "")
    pagemsg("Replaced %s with %s" % (origt, unicode(template)))
    notes.append("move alt param to term param in Latin {{inflection of}}")
  return unicode(parsed), notes
def compare_headword_decl_forms(id_slot, headword_forms, decl_slots, noun_props, headword_and_decl_text, pagemsg, adjust_for_missing_gen_forms=False, adjust_for_e_ae_gen=False, remove_headword_links=False):
  """Compare headword forms against the declension-table forms for the first
  of `decl_slots` present in noun_props.

  Returns True if the two sets agree after normalization; otherwise warns
  (distinguishing macron-only mismatches from real mismatches) and returns
  False.

  adjust_for_missing_gen_forms: tolerate headwords missing the shortened
    -ī genitive of -ius/-ium nouns.
  adjust_for_e_ae_gen: treat a headword genitive in -ē as equivalent to -ae.
  remove_headword_links: strip wikilinks from headword forms first.
  """
  decl_forms = ""
  # Use the first listed slot that the declension actually produced.
  for slot in decl_slots:
    if slot in noun_props:
      decl_forms = noun_props[slot]
      break
  decl_forms = safe_split(decl_forms, ",")
  if remove_headword_links:
    headword_forms = [blib.remove_links(x) for x in headword_forms]
  # Normalize both sides: vowels before ns/nf are always long in Latin.
  corrected_headword_forms = [lengthen_ns_nf(x) for x in headword_forms]
  corrected_decl_forms = [lengthen_ns_nf(x) for x in decl_forms]
  if adjust_for_e_ae_gen:
    # FIX: apply the ē→ae substitution on top of corrected_headword_forms;
    # previously this rebuilt the list from raw headword_forms, silently
    # discarding the lengthen_ns_nf() normalization above (compare the
    # parallel compare_headword_conj_forms()).
    corrected_headword_forms = [re.sub(u"ē$", "ae", x) for x in corrected_headword_forms]
  if adjust_for_missing_gen_forms:
    # Nouns in -ius and -ium are commonly missing the shortened genitive
    # variants. Don't get tripped up by that.
    ii_decl_forms = [x for x in corrected_decl_forms if x.endswith(u"iī")]
    for ii_decl_form in ii_decl_forms:
      i_decl_form = re.sub(u"iī$", u"ī", ii_decl_form)
      if i_decl_form in corrected_decl_forms and i_decl_form not in corrected_headword_forms:
        corrected_headword_forms.append(i_decl_form)
  if set(corrected_headword_forms) != set(corrected_decl_forms):
    # Distinguish "differs only in macrons" from a genuine form mismatch.
    macronless_headword_forms = set(lalib.remove_macrons(x) for x in corrected_headword_forms)
    macronless_decl_forms = set(lalib.remove_macrons(x) for x in corrected_decl_forms)
    if macronless_headword_forms == macronless_decl_forms:
      pagemsg("WARNING: Headword %s=%s different from decl %s=%s in macrons only, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(decl_forms), headword_and_decl_text))
    else:
      pagemsg("WARNING: Headword %s=%s different from decl %s=%s in more than just macrons, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(decl_forms), headword_and_decl_text))
    return False
  return True
def process_lemma(index, pagetitle, slots, program_args):
  """For the lemma page `pagetitle`, generate all inflected forms from its
  inflection templates, and run process_page() (via blib.do_edit) on each
  existing form page whose slot matches one of `slots`."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose)
  page = pywikibot.Page(site, pagetitle)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    tn = tname(t)
    # Map the inflection template to its part of speech.
    pos = None
    if tn == "la-conj":
      pos = "verb"
    elif tn == "la-ndecl":
      pos = "noun"
    elif tn == "la-adecl":
      pos = "adj"
    if pos:
      args = lalib.generate_infl_forms(pos, unicode(t), errandpagemsg, expand_text)
      for slot in args:
        # A slot matches either literally or through a slot spec pattern.
        matches = False
        for spec in slots:
          if spec == slot:
            matches = True
            break
          if lalib.slot_matches_spec(slot, spec):
            matches = True
            break
        if matches:
          for formpagename in re.split(",", args[slot]):
            if "[" in formpagename or "|" in formpagename:
              pagemsg("WARNING: Skipping page %s with links in it" % formpagename)
            else:
              formpagename = lalib.remove_macrons(formpagename)
              formpage = pywikibot.Page(site, formpagename)
              if not formpage.exists():
                pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
              elif formpagename == pagetitle:
                pagemsg("WARNING: Skipping dictionary form")
              else:
                # Adapter with the signature blib.do_edit expects; called
                # immediately by do_edit, so the closure is safe in-loop.
                def do_process_page(page, index, parsed):
                  return process_page(index, page, program_args)
                blib.do_edit(formpage, index, do_process_page, save=program_args.save, verbose=program_args.verbose, diff=program_args.diff)
def merge_forms_for_slot(slot, this_inflargs):
  # Merge the forms of all inflection templates under the given
  # lemma headword
  #
  # Returns a 4-tuple: (all valid forms, ditto plus syncopated verb
  # variants, the subset of each matching the current pagetitle modulo
  # macrons).  `pos` and `pagetitle` come from the enclosing scope.
  all_valid_forms = []
  all_valid_forms_with_syncopated = []
  for inflargs in this_inflargs:
    if slot not in inflargs:
      continue
    # NOTE(review): in Python 2 there is no `nonlocal`, so this assignment
    # creates a local rather than updating the enclosing function's
    # saw_slot_in_inflargs flag — presumably the flag was meant to be
    # shared; verify against the caller.
    saw_slot_in_inflargs = True
    forms = inflargs[slot].split(",")
    # Forms containing links can't be matched against page titles.
    valid_forms = [form for form in forms if "[" not in form and "|" not in form]
    for form in valid_forms:
      if form not in all_valid_forms:
        all_valid_forms.append(form)
      if form not in all_valid_forms_with_syncopated:
        all_valid_forms_with_syncopated.append(form)
      # For verbs, also admit the syncopated perfect variant (e.g.
      # amāvisse → amāsse) as a possible match.
      if pos == "verb" and re.search(u"v[eiē]", form):
        syncopated_form = re.sub(u"^(.*)v[eiē]", r"\1", form)
        if syncopated_form not in all_valid_forms_with_syncopated:
          all_valid_forms_with_syncopated.append(syncopated_form)
  all_matchable_forms = [
    form for form in all_valid_forms
    if lalib.remove_macrons(form) == pagetitle
  ]
  all_matchable_forms_with_syncopated = [
    form for form in all_valid_forms_with_syncopated
    if lalib.remove_macrons(form) == pagetitle
  ]
  return (all_valid_forms, all_valid_forms_with_syncopated, all_matchable_forms, all_matchable_forms_with_syncopated)
def yield_infl_of_templates_and_properties():
  """Generator over the {{inflection of}} templates of the current headword
  (from the enclosing scope), yielding (template, lemma_param, lemma,
  inflargs_sets, tag_sets) for each Latin template whose lemma can be looked
  up.  Skips and warns on non-Latin templates, linked lemmas, and lemmas
  with no usable heads."""
  for t in headword['infl_of_templates']:
    # Support both {{inflection of|lang=la|...}} and {{inflection of|la|...}}.
    lang = getparam(t, "lang")
    if lang:
      lemma_param = 1
    else:
      lang = getparam(t, "1")
      lemma_param = 2
    if lang != "la":
      errandstagemsg("WARNING: In Latin section, found {{inflection of}} for different language %s: %s" % (lang, unicode(t)))
      continue
    lemma = getparam(t, str(lemma_param))
    if "[" in lemma or "|" in lemma:
      stagemsg("WARNING: Link in lemma %s, skipping: %s" % (lemma, unicode(t)))
      continue
    inflargs_sets = lookup_inflection(lalib.remove_macrons(lemma), pos,
      expected_headtemps, expected_infltemps, stagemsg, errandstagemsg)
    if inflargs_sets is None:
      stagemsg("WARNING: Lemma %s doesn't exist or has no %s heads" % (lemma, pos))
      continue
    # fetch tags
    tags = []
    for param in t.params:
      pname = unicode(param.name).strip()
      pval = unicode(param.value).strip()
      if re.search("^[0-9]+$", pname):
        # Numbered params past lang/lemma/alt are inflection tags.
        if int(pname) >= lemma_param + 2:
          if pval:
            tags.append(pval)
    # split tags into tag sets (which may be multipart) and further
    # split any multipart tag sets into component tag sets
    tag_sets = [
      tag_set
      for maybe_multipart_tag_set in lalib.split_tags_into_tag_sets(tags)
      for tag_set in lalib.split_multipart_tag_set(maybe_multipart_tag_set)
    ]
    yield t, lemma_param, lemma, inflargs_sets, tag_sets
def check_participle(form, pagemsg):
  """Check that the page for `form` carries a {{la-part}} headword whose
  participle matches `form`; warn through `pagemsg` on mismatch.  Skips
  forms containing link syntax and nonexistent pages."""
  orig_pagemsg = pagemsg
  def pagemsg(txt):
    # Prefix every message with the form being checked.
    orig_pagemsg("%s: %s" % (form, txt))
  if "[" in form or "|" in form:
    pagemsg("Skipping form with brackets or vertical bar")
    return
  page = pywikibot.Page(site, lalib.remove_macrons(form))
  if not blib.safe_page_exists(page, pagemsg):
    pagemsg("Skipping nonexistent page")
    # FIX: previously control fell through here and the nonexistent page
    # was parsed anyway, contradicting the "Skipping" message (compare the
    # bracket guard above, which does return).
    return
  parsed = blib.parse_text(unicode(page.text))
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-part":
      # |1= may carry a "/..." suffix (e.g. declension info); strip it.
      actual_part = re.sub("/.*", "", getparam(t, "1"))
      if actual_part != form:
        pagemsg("WARNING: Found actual participle %s, expected %s" % (actual_part, form))
def process_non_lemma_page(page, index):
  """For each {{la-adj-comp}}/{{la-adj-sup}} on the page, edit the page of
  the positive-degree adjective named by its pos= param (via blib.do_edit
  and process_lemma_page); warn when pos= is missing."""
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  for template in blib.parse_text(unicode(page.text)).filter_templates():
    template_name = tname(template)
    if template_name not in ["la-adj-comp", "la-adj-sup"]:
      continue
    lemma = getparam(template, "1") or pagetitle
    positive = getparam(template, "pos")
    if not positive:
      pagemsg("WARNING: Didn't see positive degree: %s" % unicode(template))
      continue
    # Adapter with the signature blib.do_edit expects; do_edit calls it
    # immediately, so capturing loop variables here is safe.
    def do_process(page, index, parsed):
      return process_lemma_page(page, index, template_name == "la-adj-comp", lemma)
    blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(positive)), index,
      do_process, save=args.save, verbose=args.verbose, diff=args.diff)
def process_page(index, page, save, verbose, diff):
  """For each {{la-conj}} on the page, derive the non-impersonal participle
  from every accusative supine form, log a deletion line for it, and run
  correct_nom_sg_n_participle() over the supine's form page."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    if tname(t) == "la-conj":
      # NOTE(review): generate_verb_forms() appears able to return None
      # elsewhere in this codebase; no None check here before .get() —
      # verify this cannot crash.
      args = lalib.generate_verb_forms(unicode(t), errandpagemsg, expand_text)
      supforms = args.get("sup_acc", "")
      if supforms:
        supforms = supforms.split(",")
        for supform in supforms:
          # Supine in -um → masculine participle in -us.
          non_impers_part = re.sub("um$", "us", supform)
          pagemsg("Line to delete: part %s allbutnomsgn {{la-adecl|%s}}" % (non_impers_part, non_impers_part))
          # Adapter for blib.do_edit; called immediately, so capturing the
          # loop variable supform is safe.
          def do_correct_nom_sg_n_participle(page, index, parsed):
            return correct_nom_sg_n_participle(page, index, supform, args["1s_pres_actv_indc"])
          blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(supform)), index,
            do_correct_nom_sg_n_participle, save=save, verbose=verbose, diff=diff)
def process_page(index, pos, lemma, subs, infl, save, verbose):
  """Generate all inflected forms of `lemma` from the inflection template
  call `infl` and run process_form() (via blib.do_edit) over each
  corresponding form page."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, lemma, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, lemma, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, remove_macrons(lemma), pagemsg, verbose)
  pagemsg("Processing")
  args = lalib.generate_infl_forms(pos, infl, errandpagemsg, expand_text)
  if args is None:
    return
  # Flatten the comma-separated form values of every slot into one list.
  forms_to_delete = [
    single_form
    for key, form in args.iteritems()
    for single_form in form.split(",")
  ]
  for formind, form in blib.iter_items(forms_to_delete):
    # Adapter for blib.do_edit; called immediately, so capturing the loop
    # variable `form` is safe.
    def handler(page, formind, parsed):
      return process_form(index, page, lemma, formind, form, subs)
    blib.do_edit(pywikibot.Page(site, remove_macrons(form)), formind, handler,
      save=save, verbose=verbose)
pagemsg("Replaced %s with %s" % (origt, unicode(template_to_fix))) return unicode(parsed), notes parser = blib.create_argparser( "Add Latin adverbs to adjectives based on the output of find_latin_adj_for_adv.py" ) parser.add_argument("--direcfile", required=True) args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) lines = [x.rstrip('\n') for x in codecs.open(args.direcfile, "r", "utf-8")] for i, line in blib.iter_items(lines, start, end): m = re.search("^(.*?) /// (.*?) /// .*? /// .*?$", line) if not m: msg("Page %s: Unrecognized line: %s" % (i, line)) continue adv, adj = m.groups() def do_process_page(page, index, parsed): return process_page(page, index, adv) blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(adj)), i, do_process_page, save=args.save, verbose=args.verbose, diff=args.diff) blib.elapsed_time()
def check_for_tag_set_match(tag_set, allow_lemma_mismatch):
  """For one tag set of the current {{inflection of}} template, check whether
  the headword forms match the forms the lemma's inflection tables generate
  for the corresponding slot.  Returns the list of actual lemma forms whose
  tables matched (empty on no match).  Uses many names from the enclosing
  scope: t, lemma, headword_forms, inflargs_sets, tag_set_groups,
  possible_slots, stagemsg, merge_forms_for_slot."""
  slot = lalib.tag_set_to_slot(tag_set, tag_set_groups, stagemsg)
  if slot is None:
    # Already issued warning
    return []
  if slot not in possible_slots:
    stagemsg("WARNING: Unrecognized slot %s from tag set: %s" % (slot, unicode(t)))
    return []
  # NOTE(review): this flag is only ever assigned here; the assignment in
  # merge_forms_for_slot() creates a local there (Python 2 has no nonlocal),
  # so the "didn't see slot" warning below presumably fires more often than
  # intended — verify.
  saw_slot_in_inflargs = False
  matching_actual_lemmas = []
  for actual_lemmas, this_inflargs in inflargs_sets:
    # Only consider inflection sets whose lemma matches ours (exactly, or
    # modulo macrons when allow_lemma_mismatch).
    saw_matching_lemma = False
    for actual_lemma in actual_lemmas:
      actual_lemma = blib.remove_links(actual_lemma)
      if (lalib.remove_macrons(lemma) == lalib.remove_macrons(actual_lemma)
          if allow_lemma_mismatch else lemma == actual_lemma):
        saw_matching_lemma = True
    if not saw_matching_lemma:
      continue
    (all_valid_forms, all_valid_forms_with_syncopated, all_matchable_forms,
      all_matchable_forms_with_syncopated) = (
      merge_forms_for_slot(slot, this_inflargs))
    # Try progressively looser matches: exact, subset, then the same two
    # with syncopated verb variants admitted.
    matched_form = False
    if set(headword_forms) == set(all_matchable_forms):
      stagemsg("Matched headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) %s)" % (
        ",".join(headword_forms), slot, lemma, ",".join(all_valid_forms)))
      matched_form = True
    elif set(headword_forms) <= set(all_matchable_forms):
      stagemsg("Matched headword form(s) %s as subset of all matchable slot form(s) %s (slot %s, lemma %s, all valid slot forms(s) %s)" % (
        ",".join(headword_forms), ",".join(all_matchable_forms), slot, lemma, ",".join(all_valid_forms)))
      matched_form = True
    elif set(headword_forms) == set(all_matchable_forms_with_syncopated):
      stagemsg("Matched syncopated headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)" % (
        ",".join(headword_forms), slot, lemma, ",".join(all_valid_forms_with_syncopated)))
      matched_form = True
    elif set(headword_forms) <= set(all_matchable_forms_with_syncopated):
      stagemsg("Matched syncopated headword form(s) %s as subset of all matchable slot form(s) + syncopation %s (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)" % (
        ",".join(headword_forms),
        ",".join(all_matchable_forms_with_syncopated), slot, lemma,
        ",".join(all_valid_forms_with_syncopated)))
      matched_form = True
    if matched_form:
      for actual_lemma in actual_lemmas:
        if actual_lemma not in matching_actual_lemmas:
          matching_actual_lemmas.append(actual_lemma)
  if not matching_actual_lemmas:
    if not saw_slot_in_inflargs:
      if "pasv" in slot:
        stagemsg("WARNING: For headword forms %s, didn't see passive slot %s in inflections of lemma %s, probably need to delete passive forms of verb" % (
          ",".join(headword_forms), slot, lemma))
      else:
        stagemsg("WARNING: For headword forms %s, didn't see slot %s in inflections of lemma %s" % (
          ",".join(headword_forms), slot, lemma))
  return matching_actual_lemmas
def process_form(index, page, lemma, formind, formval, subs):
  """On a non-lemma form page, correct bad stems in headword templates and in
  Latin inflection-of templates, using the (badstem, goodstem) pairs in
  `subs`.  Returns (newtext, notes)."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval, txt))
  notes = []
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    # Replace a bad stem prefix of `head` per `subs`, updating the template
    # param in place.  Returns the new head, or False if no sub applied.
    def fix_head(headparam, head, tn):
      for badstem, goodstem in subs:
        if head.startswith(badstem):
          newhead = goodstem + head[len(badstem):]
          t.add(headparam, newhead)
          notes.append("correct stem %s -> %s in {{%s}}" % (badstem, goodstem, tn))
          return newhead
      else: # no break
        pagemsg("WARNING: Head %s not same as page title and doesn't begin with bad stem %s: %s" % (
          head, " or ".join(badstem for badstem, goodstem in subs), unicode(t)))
        return False
    # la-suffix-form has its own format, don't handle
    if tn in lalib.la_nonlemma_headword_templates and tn != "la-suffix-form":
      # The head may live in head= or in |1=.
      headparam = "head"
      head = getparam(t, headparam)
      if not head:
        headparam = "1"
        head = getparam(t, headparam)
      if remove_macrons(head) != pagetitle:
        newhead = fix_head(headparam, head, tn)
        if newhead and remove_macrons(newhead) != pagetitle:
          pagemsg("WARNING: Replacement head %s not same as page title: %s" % (newhead, unicode(t)))
    elif tn in lalib.la_infl_of_templates:
      # Support both lang=la and |1=la calling conventions.
      langparam = "lang"
      headparam = "1"
      altparam = "2"
      lang = getparam(t, langparam)
      if not lang:
        langparam = "1"
        headparam = "2"
        altparam = "3"
        lang = getparam(t, langparam)
      if lang == "la":
        link = getparam(t, headparam)
        alt = getparam(t, altparam)
        # Display text takes precedence over the link target.
        head = alt or link
        if remove_macrons(head) != remove_macrons(lemma):
          if subs:
            newhead = fix_head(headparam, head, tn + "|la")
            if newhead:
              t.add(altparam, "")
              if remove_macrons(newhead) != remove_macrons(lemma):
                pagemsg("WARNING: Replacement lemma %s not same as lemma %s: %s" % (newhead, lemma, unicode(t)))
          else:
            if link != lemma or alt != "":
              t.add(headparam, lemma)
              t.add(altparam, "")
              notes.append("correct lemma and/or move alt text to link text in {{%s|la}}" % tn)
    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def lookup_inflection(lemma_no_macrons, pos, expected_headtemps, expected_infltemps, pagemsg, errandpagemsg):
  """Fetch and expand the inflections of the given (macron-less) lemma.

  Returns a list of (heads, this_inflargs) pairs, one per headword whose
  template is in `expected_headtemps` and whose head matches the lemma
  modulo macrons, where this_inflargs is the list of expanded inflection
  form dicts; or None if the page doesn't exist, can't be parsed, or no
  head matches.  Results are memoized in the module-level
  heads_and_defns_cache / expand_text_cache."""
  global args
  lemma_pagetitle = lemma_no_macrons
  # Reconstructed lemmas ("*foo") live under the Reconstruction namespace.
  if lemma_pagetitle.startswith("*"):
    lemma_pagetitle = "Reconstruction:Latin/" + lemma_pagetitle[1:]
  orig_pagemsg = pagemsg
  orig_errandpagemsg = errandpagemsg
  def pagemsg(txt):
    orig_pagemsg("%s: %s" % (lemma_no_macrons, txt))
  def errandpagemsg(txt):
    orig_errandpagemsg("%s: %s" % (lemma_no_macrons, txt))
  def expand_text(tempcall):
    # Memoized template expansion, keyed on (template call, page title).
    cache_key = (tempcall, lemma_pagetitle)
    if cache_key in expand_text_cache:
      retval = expand_text_cache[cache_key]
      if args.verbose:
        pagemsg("Found (%s, %s)=%s in expand_text_cache" % (tempcall, lemma_pagetitle, retval))
      return retval
    if args.verbose:
      pagemsg("Couldn't find (%s, %s) in expand_text_cache" % (tempcall, lemma_pagetitle))
    result = blib.expand_text(tempcall, lemma_pagetitle, pagemsg, args.verbose)
    expand_text_cache[cache_key] = result
    return result
  if lemma_pagetitle in heads_and_defns_cache:
    if args.verbose:
      pagemsg("Found %s in heads_and_defns_cache" % lemma_pagetitle)
    retval = heads_and_defns_cache[lemma_pagetitle]
  else:
    if args.verbose:
      pagemsg("Couldn't find %s in heads_and_defns_cache" % lemma_pagetitle)
    page = pywikibot.Page(site, lemma_pagetitle)
    try:
      exists = blib.try_repeatedly(lambda: page.exists(), pagemsg,
        "determine if page exists")
    except pywikibot.exceptions.InvalidTitle as e:
      pagemsg("WARNING: Invalid title %s, skipping" % lemma_pagetitle)
      heads_and_defns_cache[lemma_pagetitle] = "nonexistent"
      traceback.print_exc(file=sys.stdout)
      return None
    if not exists:
      pagemsg("WARNING: Lemma %s doesn't exist" % lemma_no_macrons)
      heads_and_defns_cache[lemma_pagetitle] = "nonexistent"
      return None
    retval = lalib.find_heads_and_defns(unicode(page.text), pagemsg)
    heads_and_defns_cache[lemma_pagetitle] = retval
  if retval == "nonexistent":
    pagemsg("WARNING: Lemma %s doesn't exist (cached)" % lemma_no_macrons)
    return None
  if retval is None:
    return None
  (sections, j, secbody, sectail, has_non_latin, subsections,
    parsed_subsections, headwords, pronun_sections, etym_sections) = retval
  matched_head = False
  inflargs_sets = []
  seen_heads = []
  seen_infltns = []
  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    heads = lalib.la_get_headword_from_template(ht, lemma_pagetitle, pagemsg, expand_text)
    # Track everything we saw, for the diagnostic at the bottom.
    for head in heads:
      if head not in seen_heads:
        seen_heads.append(head)
    for inflt in headword['infl_templates']:
      infltn = tname(inflt)
      if infltn not in seen_infltns:
        seen_infltns.append(infltn)
    if tn in expected_headtemps:
      oright = unicode(ht)
      # Require at least one head to match the lemma modulo macrons.
      for head in heads:
        head_no_links = blib.remove_links(head)
        if lalib.remove_macrons(head_no_links) == lemma_no_macrons:
          break
      else: # no break
        continue
      this_inflargs = []
      for inflt in headword['infl_templates']:
        infltn = tname(inflt)
        if infltn not in expected_infltemps:
          # NOTE(review): the format string already wraps %s in {{...}} and
          # the joined items are wrapped again, so this prints
          # {{{{name}}}} — cosmetic; confirm before changing.
          pagemsg("WARNING: Saw bad declension template for %s, expected one of {{%s}}: %s" % (
            pos, ",".join("{{%s}}" % temp for temp in expected_infltemps), unicode(inflt)))
          continue
        originflt = unicode(inflt)
        inflargs = lalib.generate_infl_forms(pos, originflt, errandpagemsg, expand_text)
        if inflargs is None:
          continue
        this_inflargs.append(inflargs)
      matched_head = True
      inflargs_sets.append((heads, this_inflargs))
  if not matched_head:
    pagemsg("WARNING: Couldn't find any matching heads, even allowing macron differences (seen heads %s, seen infl template names %s)" % (
      ",".join(seen_heads), ",".join(seen_infltns)))
    return None
  return inflargs_sets
def process_text_on_page(index, pagetitle, text):
  """Fix macron mismatches on a Latin non-lemma page.

  For each non-lemma headword on the page, runs up to three stages of
  matching of the page's {{inflection of}} templates against the actual
  inflections generated from the lemma page(s); see the long comment below
  for the algorithm.  Returns (newtext, notes) or (None, None) if nothing
  applies."""
  global args
  # Pages under Reconstruction:Latin/ are referred to internally as "*foo".
  if pagetitle.startswith("Reconstruction:Latin/"):
    pagetitle = re.sub("^Reconstruction:Latin/", "*", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  if not args.stdin:
    pagemsg("Processing")
  # Greatly speed things up when --stdin by ignoring non-Latin pages
  if "==Latin==" not in text:
    return None, None
  retval = lalib.find_heads_and_defns(text, pagemsg)
  if retval is None:
    return None, None
  (sections, j, secbody, sectail, has_non_latin, subsections,
   parsed_subsections, headwords, pronun_sections, etym_sections) = retval
  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    # Map the headword template to a part of speech plus the tag groups,
    # slots, head templates and inflection templates expected for it.
    if tn == "la-noun-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "noun form":
      pos = "noun"
      tag_set_groups = lalib.noun_tag_groups
      possible_slots = lalib.la_noun_decl_overrides
      expected_headtemps = ["la-noun"]
      expected_infltemps = ["la-ndecl"]
    elif tn == "la-proper noun-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "proper noun form":
      pos = "pn"
      tag_set_groups = lalib.noun_tag_groups
      possible_slots = lalib.la_noun_decl_overrides
      expected_headtemps = ["la-proper noun"]
      expected_infltemps = ["la-ndecl"]
    #elif tn == "la-pronoun-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "pronoun form":
    #  pos = "pronoun"
    #  tag_set_groups = lalib.adj_tag_groups
    #  possible_slots = lalib.la_adj_decl_overrides
    #  expected_headtemp = ???
    elif tn == "la-verb-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "verb form":
      pos = "verb"
      tag_set_groups = lalib.verb_tag_groups
      possible_slots = lalib.la_verb_overrides
      expected_headtemps = ["la-verb"]
      expected_infltemps = ["la-conj"]
    elif tn == "la-adj-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "adjective form":
      pos = "adj"
      tag_set_groups = lalib.adj_tag_groups
      possible_slots = lalib.la_adj_decl_overrides
      expected_headtemps = ["la-adj", "la-adj-comp", "la-adj-sup"]
      expected_infltemps = ["la-adecl"]
    elif tn == "la-part-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "participle form":
      pos = "part"
      tag_set_groups = lalib.adj_tag_groups
      possible_slots = lalib.la_adj_decl_overrides
      expected_headtemps = ["la-part"]
      expected_infltemps = ["la-adecl"]
    #elif tn == "la-suffix-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "suffix form":
    #  pos = "suffix"
    elif tn == "la-num-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "numeral form":
      pos = "numadj"
      tag_set_groups = lalib.adj_tag_groups
      possible_slots = lalib.la_adj_decl_overrides
      expected_headtemps = ["la-num-adj"]
      expected_infltemps = ["la-adecl"]
    else:
      continue
    #
    # We have the following:
    #
    # 1. The non-lemma headword, with one or (potentially but unlikely) more
    #    than one headword form.
    # 2. Under the headword, multiple {{inflection of}} templates, each of
    #    which specifies a single lemma under which the non-lemma form
    #    belongs, and one or more corresponding tag sets.
    # 3. The lemma page corresponding to the lemma specified in an
    #    {{inflection of}} template may have one or more lemmas of the right
    #    part of speech. Each lemma specifies one or (potentially but
    #    unlikely) more than one lemma form. Some, all or none of the lemmas
    #    might match the lemma specified in the {{inflection of}} template
    #    in macrons (i.e. there's an exact match between the lemma in the
    #    {{inflection of}} template and one of the actual lemma forms of a
    #    lemma on the page).
    # 4. Under each lemma on the lemma page is one or more inflection
    #    templates specifying the inflections of the lemma. Each inflection
    #    template specifies the non-lemma form(s) (potentially more than one)
    #    for each slot.
    #
    # When looking up a given {{inflection of}} template, the ideal case is
    # that the specified lemma matches one of the actual lemmas, and all
    # corresponding specified non-lemma forms match the corresponding actual
    # non-lemma form(s) for all tag sets. (If there are multiple specified
    # non-lemma forms, they may match across inflection templates if there's
    # more than one, e.g. the first matches the first inflection template and
    # the second matches the second inflection template.)
    #
    # What if there are mismatches?
    #
    # 1. If the specified non-lemma forms are a subset of the actual
    #    non-lemma forms for a given {{inflection of}} template and lemma,
    #    this is still considered a match but we make a note of it (not a
    #    warning).
    # 2. If a single {{inflection of}} template has multiple tag sets in it
    #    and for some but not all tag sets the specified non-lemma forms
    #    match, we consider this a match but issue a warning. (In the future,
    #    we might consider removing the bad tag sets, conditioned on a
    #    separate command-line flag.)
    # 3. If the specified lemma of a given {{inflection of}} template
    #    doesn't match any actual lemmas, we look at all actual lemmas that
    #    match except for macrons and see if, for any of them, the specified
    #    non-lemma forms match the actual non-lemma forms per (1) and (2).
    #    If so, we gather the set of lemma forms for all such lemmas. If
    #    there's only one, we can update the specified lemma in the
    #    {{inflection of}} template (and issue a warning). If there are
    #    multiple, we issue a warning and don't update the specified lemma.
    # 4. We first loop through all {{inflection of}} templates for the given
    #    specified non-lemma forms and check for matches according to
    #    (1), (2) and (3). If some but not all templates match, we issue
    #    a warning and we're done with this non-lemma headword.
    # 5. If there are no matches per (4), we look for the set of actual forms
    #    that match all tag sets of all {{inflection of}} templates when
    #    ignoring macron differences. If there is such a non-empty set,
    #    we can update the specified non-lemma forms in the non-lemma
    #    headword (and issue a warning). When doing so, we may need to
    #    update the corresponding pronunciation template(s), according to
    #    logic still to be determined (FIXME), but similar to or identical to
    #    existing logic in clean_latin_long_vowels.py.
    # 6. If there are no matches per (5), we first look at the possible
    #    assignments of actual lemmas to each possible {{inflection of}}
    #    template (ignoring macron differences). If there's only one such
    #    assignment (i.e. each {{inflection of}} template can be assigned to
    #    only one actual lemma), then for that assignment, we find the
    #    actual forms that match the non-lemma pagename except in macrons and
    #    are common among all the sets of inflections, and update the
    #    specified non-lemma forms in the non-lemma headword using those
    #    forms (and issue a warning). When doing so, we may need to update
    #    the corresponding pronunciation template(s) as in (5). If there are
    #    no forms in common, issue a warning and do nothing.
    # 7. If there are multiple assignments of actual lemmas to
    #    {{inflection of}} templates, we loop over all possible assignments.
    #    For each assignment, we find the set of actual common non-lemma
    #    forms as in (6). If there is more than one assignment with a
    #    non-empty set of actual common non-lemma forms, or no assignment,
    #    we issue a warning and do nothing. Otherwise, we update the
    #    specified non-lemma forms in the non-lemma headword (and
    #    corresponding pronunciation template(s)) as in (6).
    headword_forms = lalib.la_get_headword_from_template(
      ht, pagetitle, pagemsg)
    # Sanity-check the headword forms against the page title; keep only
    # well-formed, non-duplicate forms.
    matching_headword_forms = []
    for headword_form in headword_forms:
      if "[" in headword_form or "|" in headword_form:
        pagemsg(
          "WARNING: Bracket or pipe symbol in non-lemma headword form, should not happen: %s"
          % unicode(ht))
      headword_form = blib.remove_links(headword_form)
      if lalib.remove_macrons(headword_form) != pagetitle:
        pagemsg("WARNING: Bad headword form %s, doesn't match page title: %s"
          % (headword_form, unicode(ht)))
      elif headword_form in matching_headword_forms:
        pagemsg("WARNING: Duplicate headword form %s: %s" % (headword_form,
          unicode(ht)))
      else:
        matching_headword_forms.append(headword_form)
    headword_forms = matching_headword_forms
    # Stages 1, 2, 3 implement points (4), (5) and (6)/(7) above; as soon
    # as one stage succeeds we break out.
    for stage in [1, 2, 3]:
      def stagemsg(txt):
        pagemsg("Stage %s: %s" % (stage, txt))
      def errandstagemsg(txt):
        errandpagemsg("Stage %s: %s" % (stage, txt))
      def yield_infl_of_templates_and_properties():
        # Yield (template, lemma_param, lemma, inflargs_sets, tag_sets) for
        # every well-formed Latin {{inflection of}} under this headword.
        for t in headword['infl_of_templates']:
          lang = getparam(t, "lang")
          if lang:
            lemma_param = 1
          else:
            lang = getparam(t, "1")
            lemma_param = 2
          if lang != "la":
            errandstagemsg(
              "WARNING: In Latin section, found {{inflection of}} for different language %s: %s"
              % (lang, unicode(t)))
            continue
          lemma = getparam(t, str(lemma_param))
          if "[" in lemma or "|" in lemma:
            stagemsg("WARNING: Link in lemma %s, skipping: %s" % (lemma,
              unicode(t)))
            continue
          inflargs_sets = lookup_inflection(lalib.remove_macrons(lemma),
            pos, expected_headtemps, expected_infltemps, stagemsg,
            errandstagemsg)
          if inflargs_sets is None:
            stagemsg("WARNING: Lemma %s doesn't exist or has no %s heads"
              % (lemma, pos))
            continue
          # fetch tags
          tags = []
          for param in t.params:
            pname = unicode(param.name).strip()
            pval = unicode(param.value).strip()
            if re.search("^[0-9]+$", pname):
              if int(pname) >= lemma_param + 2:
                if pval:
                  tags.append(pval)
          # split tags into tag sets (which may be multipart) and further
          # split any multipart tag sets into component tag sets
          tag_sets = [
            tag_set
            for maybe_multipart_tag_set in lalib.split_tags_into_tag_sets(tags)
            for tag_set in lalib.split_multipart_tag_set(
              maybe_multipart_tag_set)
          ]
          yield t, lemma_param, lemma, inflargs_sets, tag_sets
      def merge_forms_for_slot(slot, this_inflargs):
        # Merge the forms of all inflection templates under the given
        # lemma headword
        all_valid_forms = []
        all_valid_forms_with_syncopated = []
        for inflargs in this_inflargs:
          if slot not in inflargs:
            continue
          # NOTE(review): this assignment creates a local variable; the
          # saw_slot_in_inflargs flag in check_for_tag_set_match (stage 1)
          # is never set to True by it, so the "didn't see slot" warnings
          # there fire whenever no lemma matches -- looks like a bug;
          # TODO confirm intent.
          saw_slot_in_inflargs = True
          forms = inflargs[slot].split(",")
          valid_forms = [
            form for form in forms if "[" not in form and "|" not in form
          ]
          for form in valid_forms:
            if form not in all_valid_forms:
              all_valid_forms.append(form)
            if form not in all_valid_forms_with_syncopated:
              all_valid_forms_with_syncopated.append(form)
            # Latin perfect forms in -vi- etc. have syncopated variants
            # with the -v- syllable dropped; consider those too for verbs.
            if pos == "verb" and re.search(u"v[eiē]", form):
              syncopated_form = re.sub(u"^(.*)v[eiē]", r"\1", form)
              if syncopated_form not in all_valid_forms_with_syncopated:
                all_valid_forms_with_syncopated.append(syncopated_form)
        all_matchable_forms = [
          form for form in all_valid_forms
          if lalib.remove_macrons(form) == pagetitle
        ]
        all_matchable_forms_with_syncopated = [
          form for form in all_valid_forms_with_syncopated
          if lalib.remove_macrons(form) == pagetitle
        ]
        return (all_valid_forms, all_valid_forms_with_syncopated,
          all_matchable_forms, all_matchable_forms_with_syncopated)
      if stage == 1:
        # Stage 1: per-template matching, first with exact lemma match,
        # then allowing macron differences in the lemma (point (4)).
        matched_infl_of_templates = False
        for t, lemma_param, lemma, inflargs_sets, tag_sets in \
            yield_infl_of_templates_and_properties():
          def check_for_tag_set_match(tag_set, allow_lemma_mismatch):
            # Return the list of actual lemmas whose inflections for this
            # tag set's slot cover the headword forms; [] on no match.
            slot = lalib.tag_set_to_slot(tag_set, tag_set_groups, stagemsg)
            if slot is None:
              # Already issued warning
              return []
            if slot not in possible_slots:
              stagemsg("WARNING: Unrecognized slot %s from tag set: %s"
                % (slot, unicode(t)))
              return []
            saw_slot_in_inflargs = False
            matching_actual_lemmas = []
            for actual_lemmas, this_inflargs in inflargs_sets:
              saw_matching_lemma = False
              for actual_lemma in actual_lemmas:
                actual_lemma = blib.remove_links(actual_lemma)
                if (lalib.remove_macrons(lemma) ==
                    lalib.remove_macrons(actual_lemma)
                    if allow_lemma_mismatch else lemma == actual_lemma):
                  saw_matching_lemma = True
              if not saw_matching_lemma:
                continue
              (all_valid_forms, all_valid_forms_with_syncopated,
               all_matchable_forms, all_matchable_forms_with_syncopated) = (
                merge_forms_for_slot(slot, this_inflargs))
              matched_form = False
              # Exact match or subset (point (1)), with or without
              # syncopated variants.
              if set(headword_forms) == set(all_matchable_forms):
                stagemsg(
                  "Matched headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) %s)"
                  % (",".join(headword_forms), slot, lemma,
                    ",".join(all_valid_forms)))
                matched_form = True
              elif set(headword_forms) <= set(all_matchable_forms):
                stagemsg(
                  "Matched headword form(s) %s as subset of all matchable slot form(s) %s (slot %s, lemma %s, all valid slot forms(s) %s)"
                  % (",".join(headword_forms),
                    ",".join(all_matchable_forms), slot, lemma,
                    ",".join(all_valid_forms)))
                matched_form = True
              elif set(headword_forms) == set(
                  all_matchable_forms_with_syncopated):
                stagemsg(
                  "Matched syncopated headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)"
                  % (",".join(headword_forms), slot, lemma,
                    ",".join(all_valid_forms_with_syncopated)))
                matched_form = True
              elif set(headword_forms) <= set(
                  all_matchable_forms_with_syncopated):
                stagemsg(
                  "Matched syncopated headword form(s) %s as subset of all matchable slot form(s) + syncopation %s (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)"
                  % (",".join(headword_forms),
                    ",".join(all_matchable_forms_with_syncopated), slot,
                    lemma, ",".join(all_valid_forms_with_syncopated)))
                matched_form = True
              if matched_form:
                for actual_lemma in actual_lemmas:
                  if actual_lemma not in matching_actual_lemmas:
                    matching_actual_lemmas.append(actual_lemma)
            if not matching_actual_lemmas:
              if not saw_slot_in_inflargs:
                if "pasv" in slot:
                  stagemsg(
                    "WARNING: For headword forms %s, didn't see passive slot %s in inflections of lemma %s, probably need to delete passive forms of verb"
                    % (",".join(headword_forms), slot, lemma))
                else:
                  stagemsg(
                    "WARNING: For headword forms %s, didn't see slot %s in inflections of lemma %s"
                    % (",".join(headword_forms), slot, lemma))
            return matching_actual_lemmas
          saw_matching_lemma = False
          for actual_lemmas, this_inflargs in inflargs_sets:
            if lemma in [blib.remove_links(x) for x in actual_lemmas]:
              saw_matching_lemma = True
              break
          if saw_matching_lemma:
            # Exact lemma present: match tag sets with strict lemma
            # comparison.
            tag_set_matches = []
            tag_set_mismatches = []
            for tag_set in tag_sets:
              matching_lemmas = check_for_tag_set_match(tag_set,
                allow_lemma_mismatch=False)
              if matching_lemmas:
                tag_set_matches.append(tag_set)
              else:
                tag_set_mismatches.append(tag_set)
            if len(tag_set_matches) > 0:
              matched_infl_of_templates = True
              if len(tag_set_mismatches) > 0:
                stagemsg(
                  "WARNING: Matched tag sets %s but not %s, counting as a match: %s"
                  % (",".join("|".join(tag_set)
                      for tag_set in tag_set_matches),
                    ",".join("|".join(tag_set)
                      for tag_set in tag_set_mismatches), unicode(t)))
            else:
              stagemsg("WARNING: Couldn't match any tag sets: %s"
                % unicode(t))
          else:
            # No exact lemma: retry allowing macron mismatches and maybe
            # fix the lemma spelling in the template (point (3)).
            stagemsg(
              "WARNING: Couldn't match lemma %s among potential lemmas %s, trying without lemma matches: %s"
              % (lemma,
                ",".join(actual_lemma
                  for actual_lemmas, this_inflargs in inflargs_sets
                  for actual_lemma in actual_lemmas), unicode(t)))
            tag_set_matches = []
            tag_set_mismatches = []
            all_matching_lemmas = []
            for tag_set in tag_sets:
              matching_lemmas = check_for_tag_set_match(tag_set,
                allow_lemma_mismatch=True)
              if matching_lemmas:
                tag_set_matches.append(tag_set)
                for matching_lemma in matching_lemmas:
                  if matching_lemma not in all_matching_lemmas:
                    all_matching_lemmas.append(matching_lemma)
              else:
                tag_set_mismatches.append(tag_set)
            if len(tag_set_matches) > 0:
              matched_infl_of_templates = True
              if len(all_matching_lemmas) == 1:
                # Unambiguous: rewrite the lemma param in place.
                notes.append(
                  "fix macrons in lemma of '%s' (stage 1): %s -> %s"
                  % (tname(t), lemma, all_matching_lemmas[0]))
                if len(tag_set_mismatches) > 0:
                  stagemsg(
                    "WARNING: Fixing macrons in lemma %s -> %s despite only some tag sets %s but not %s matching, counting as a match: %s"
                    % (lemma, all_matching_lemmas[0],
                      ",".join("|".join(tag_set)
                        for tag_set in tag_set_matches),
                      ",".join("|".join(tag_set)
                        for tag_set in tag_set_mismatches), unicode(t)))
                else:
                  stagemsg(
                    "WARNING: Fixing macrons in lemma %s -> %s; all tag sets match: %s"
                    % (lemma, all_matching_lemmas[0], unicode(t)))
                origt = unicode(t)
                t.add(str(lemma_param), all_matching_lemmas[0])
                stagemsg("Replaced %s with %s" % (origt, unicode(t)))
              else:
                if len(tag_set_mismatches) > 0:
                  stagemsg(
                    "WARNING: Multiple possible lemmas %s match some tag sets %s but not %s, counting as a match but not updating lemma %s: %s"
                    % (",".join(all_matching_lemmas),
                      ",".join("|".join(tag_set)
                        for tag_set in tag_set_matches),
                      ",".join("|".join(tag_set)
                        for tag_set in tag_set_mismatches), lemma,
                      unicode(t)))
                else:
                  stagemsg(
                    "WARNING: Multiple possible lemmas %s match tag sets, with all tag sets matching, counting as a match but not updating lemma %s: %s"
                    % (",".join(all_matching_lemmas), lemma, unicode(t)))
            else:
              stagemsg(
                "WARNING: Couldn't match any tag sets even when allowing macron mismatches with lemma %s: %s"
                % (lemma, unicode(t)))
        if matched_infl_of_templates:
          break
      elif stage == 2:
        # Stage 2: intersect the matchable actual forms across all tag
        # sets of all templates (exact lemma match) and rewrite the
        # headword form(s) to the common set (point (5)).
        common_forms = None
        no_common_forms = False
        for t, lemma_param, lemma, inflargs_sets, tag_sets in \
            yield_infl_of_templates_and_properties():
          for tag_set in tag_sets:
            slot = lalib.tag_set_to_slot(tag_set, tag_set_groups, stagemsg)
            if slot is None or slot not in possible_slots:
              # Already issued warning
              no_common_forms = True
              break
            this_tag_set_matching_forms = []
            combined_this_inflargs = []
            for actual_lemmas, this_inflargs in inflargs_sets:
              for actual_lemma in actual_lemmas:
                actual_lemma = blib.remove_links(actual_lemma)
                if lemma == actual_lemma:
                  combined_this_inflargs.extend(this_inflargs)
                  break
            if not combined_this_inflargs:
              continue
            (all_valid_forms, all_valid_forms_with_syncopated,
             all_matchable_forms, all_matchable_forms_with_syncopated) = (
              merge_forms_for_slot(slot, combined_this_inflargs))
            for form in all_matchable_forms:
              if form not in this_tag_set_matching_forms:
                this_tag_set_matching_forms.append(form)
            if common_forms is None:
              common_forms = this_tag_set_matching_forms
              if len(common_forms) == 0:
                no_common_forms = True
                break
            else:
              new_common_forms = []
              for form in common_forms:
                if form in this_tag_set_matching_forms:
                  new_common_forms.append(form)
              common_forms = new_common_forms
              if len(common_forms) == 0:
                no_common_forms = True
                break
          if no_common_forms:
            break
        if no_common_forms or common_forms is None:
          stagemsg(
            "WARNING: No forms match pagetitle %s across all {{inflection of}} tags and tag sets, not changing headword form(s) but trying again allowing macron differences in lemmas: %s"
            % (pagetitle, unicode(ht)))
        else:
          notes.append("fix macrons in forms of '%s' (stage 2): %s -> %s"
            % (tname(ht), ",".join(headword_forms),
              ",".join(common_forms)))
          oright = unicode(ht)
          if tname(ht) == "head":
            blib.set_param_chain(ht, common_forms, "head", "head")
          else:
            blib.set_param_chain(ht, common_forms, "1", "head")
          stagemsg("Replaced %s with %s" % (oright, unicode(ht)))
          if len(common_forms) > 1:
            stagemsg(
              "WARNING: FIXME: No support yet for pronunciation for multiple headword forms %s"
              % ",".join(common_forms))
          else:
            assert len(common_forms) == 1
            clean_latin_long_vowels.process_pronun_templates(
              headword['pronun_section'], common_forms[0], stagemsg, notes,
              "fix macrons in pronun of '%%s' (stage 2): %s -> %s"
              % (",".join(headword_forms), ",".join(common_forms)))
          break
      else:
        assert stage == 3
        # Stage 3: allow macron differences in lemmas; enumerate all
        # possible assignments of actual lemmas to {{inflection of}}
        # templates and accept only a unique workable one (points (6)/(7)).
        multiple_assignments = False
        infl_of_assignments = []
        for t, lemma_param, lemma, inflargs_sets, tag_sets in \
            yield_infl_of_templates_and_properties():
          matching_lemmas = []
          for actual_lemmas, this_inflargs in inflargs_sets:
            for actual_lemma in actual_lemmas:
              actual_lemma = blib.remove_links(actual_lemma)
              if lalib.remove_macrons(lemma) == lalib.remove_macrons(
                  actual_lemma):
                if actual_lemma not in matching_lemmas:
                  matching_lemmas.append(actual_lemma)
          if len(matching_lemmas) > 1:
            stagemsg(
              "WARNING: Multiple actual lemmas %s match {{inflection of}} lemma %s, hence multiple assignments, doing things the hard way: %s"
              % (",".join(matching_lemmas), lemma, unicode(t)))
            multiple_assignments = True
          infl_of_assignments.append(matching_lemmas)
        cur_assignment = None
        cur_common_forms = None
        for assignment in itertools.product(*infl_of_assignments):
          common_forms = None
          no_common_forms = False
          for actual_lemma, (t, lemma_param, lemma, inflargs_sets,
              tag_sets) in zip(assignment,
                yield_infl_of_templates_and_properties()):
            for tag_set in tag_sets:
              slot = lalib.tag_set_to_slot(tag_set, tag_set_groups,
                stagemsg)
              if slot is None or slot not in possible_slots:
                # Already issued warning
                no_common_forms = True
                break
              this_tag_set_matching_forms = []
              combined_this_inflargs = []
              for actual_lemmas, this_inflargs in inflargs_sets:
                if actual_lemma in actual_lemmas:
                  combined_this_inflargs.extend(this_inflargs)
              (all_valid_forms, all_valid_forms_with_syncopated,
               all_matchable_forms,
               all_matchable_forms_with_syncopated) = (
                merge_forms_for_slot(slot, combined_this_inflargs))
              for form in all_matchable_forms:
                if form not in this_tag_set_matching_forms:
                  this_tag_set_matching_forms.append(form)
              if common_forms is None:
                common_forms = this_tag_set_matching_forms
                if len(common_forms) == 0:
                  no_common_forms = True
                  break
              else:
                new_common_forms = []
                for form in common_forms:
                  if form in this_tag_set_matching_forms:
                    new_common_forms.append(form)
                common_forms = new_common_forms
                if len(common_forms) == 0:
                  no_common_forms = True
                  break
            if no_common_forms:
              break
          if not no_common_forms and common_forms is not None:
            if cur_assignment:
              stagemsg(
                "WARNING: Multiple assignments of lemmas have common forms, at least %s -> %s and %s -> %s, not changing: %s"
                % (",".join(cur_assignment), ",".join(cur_common_forms),
                  ",".join(assignment), ",".join(common_forms),
                  unicode(ht)))
            else:
              cur_assignment = assignment
              cur_common_forms = common_forms
        if cur_assignment is None:
          stagemsg(
            "WARNING: No forms match pagetitle %s across all {{inflection of}} tags and tag sets when allowing macron differences in lemmas, not changing headword form(s): %s"
            % (pagetitle, unicode(ht)))
        else:
          # Unique workable assignment: rewrite lemmas and headword forms.
          for actual_lemma, (t, lemma_param, lemma, inflargs_sets,
              tag_sets) in zip(cur_assignment,
                yield_infl_of_templates_and_properties()):
            notes.append("fix macrons in lemma of '%s' (stage 3): %s -> %s"
              % (tname(t), lemma, actual_lemma))
            stagemsg(
              "WARNING: found common forms %s, updating lemma %s to %s: %s"
              % (",".join(cur_common_forms), lemma, actual_lemma,
                unicode(t)))
            origt = unicode(t)
            t.add(str(lemma_param), actual_lemma)
            stagemsg("Replaced %s with %s" % (origt, unicode(t)))
          notes.append("fix macrons in forms of '%s' (stage 3): %s -> %s"
            % (tname(ht), ",".join(headword_forms),
              ",".join(cur_common_forms)))
          oright = unicode(ht)
          if tname(ht) == "head":
            blib.set_param_chain(ht, cur_common_forms, "head", "head")
          else:
            blib.set_param_chain(ht, cur_common_forms, "1", "head")
          stagemsg("Replaced %s with %s" % (oright, unicode(ht)))
          if len(cur_common_forms) > 1:
            stagemsg(
              "WARNING: FIXME: No support yet for pronunciation for multiple headword forms %s"
              % ",".join(cur_common_forms))
          else:
            assert len(cur_common_forms) == 1
            clean_latin_long_vowels.process_pronun_templates(
              headword['pronun_section'], cur_common_forms[0], stagemsg,
              notes, "fix macrons in pronun of '%%s' (stage 3): %s -> %s"
              % (",".join(headword_forms), ",".join(cur_common_forms)))
          break
  # Reassemble the Latin section from the (possibly modified) parsed
  # subsections.
  secbody = "".join(unicode(x) for x in parsed_subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Read lines from stdin and echo each one with its Latin macrons removed.

import blib
from blib import msg
import sys
import lalib

parser = blib.create_argparser("Remove Latin macrons from input",
    no_beginning_line=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for line_index, raw_line in blib.iter_items(sys.stdin, start, end):
  # Decode from UTF-8 (Python 2 byte string) before stripping macrons.
  cleaned = raw_line.strip().decode('utf-8')
  msg(lalib.remove_macrons(cleaned))
# Command-line setup: either process an explicit list of lemmas (from a
# file or a comma-separated option) or fall back to iterating over pages
# in the default Latin categories.
parser.add_argument('--lemma-file',
    help="File containing lemmas to process, one per line; non-lemma forms will be done")
parser.add_argument('--lemmas',
    help="List of comma-separated lemmas to process; non-lemma forms will be done")
parser.add_argument("--slots",
    help="Slots to process in conjunction with --lemmas and --lemma-file.")
parser.add_argument('--override-pronun', action="store_true",
    help="Override existing pronunciations")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.lemma_file or args.lemmas:
  slots = args.slots.split(",")
  if args.lemma_file:
    lemmas = read_pages(args.lemma_file, start, end)
  else:
    lemmas = blib.iter_items(re.split(",", args.lemmas.decode("utf-8")),
        start, end)
  for i, lemma in lemmas:
    process_lemma(i, lalib.remove_macrons(lemma), slots, args)
else:
  def do_process_page(page, index, parsed):
    # NOTE(review): argument order here is (index, page) -- looks like this
    # script's process_page takes (index, page, args); confirm against its
    # definition elsewhere in the file.
    return process_page(index, page, args)
  blib.do_pagefile_cats_refs(args, start, end, do_process_page,
      default_cats=["Latin lemmas", "Latin non-lemma forms"], edit=True)

def subval_to_string(subval):
  # Render a manual-pronunciation substitution value: either a
  # (pron, extra_params, pre, post) tuple (formatted via FoundPronun) or a
  # plain string returned unchanged.
  if type(subval) is tuple:
    pron, extra_params, pre, post = subval
    return unicode(FoundPronun(pron, extra_params, pre, post))
  else:
    return subval

# Loop body lies beyond this chunk of the file.
for regex, subvals in manual_pronun_mapping:
def expand_text(tempcall):
  # Expand a template call against the macron-less lemma page title.
  # NOTE(review): closure fragment -- `lemma`, `pagemsg` and `verbose` are
  # free variables captured from an enclosing scope not visible here.
  return blib.expand_text(tempcall, remove_macrons(lemma), pagemsg, verbose)
def process_page(page, index):
  """For a participle or proper-noun lemma page, generate all of its
  non-lemma inflected forms and edit each form's page via process_form.

  Only acts when the page has participle headwords but no adjective ones,
  or proper-noun headwords but no common-noun ones (to avoid ambiguous
  part-of-speech cases).  Returns (None, None) or falls through with no
  return value after dispatching the per-form edits."""
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  text = unicode(page.text)
  retval = lalib.find_heads_and_defns(text, pagemsg)
  if retval is None:
    return None, None
  (sections, j, secbody, sectail, has_non_latin, subsections,
   parsed_subsections, headwords, pronun_sections, etym_sections) = retval
  # Bucket the page's headwords by part of speech.
  part_headwords = []
  adj_headwords = []
  pn_headwords = []
  noun_headwords = []
  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    if tn == "la-part" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(
        ht, "2") in ["participle", "participles"]:
      part_headwords.append(headword)
    elif tn == "la-adj" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(
        ht, "2") in ["adjective", "adjectives"]:
      adj_headwords.append(headword)
    elif tn == "la-proper noun" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(
        ht, "2") in ["proper noun", "proper nouns"]:
      pn_headwords.append(headword)
    elif tn == "la-noun" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") in ["noun", "nouns"]:
      noun_headwords.append(headword)
  headwords_to_do = None
  if part_headwords and not adj_headwords:
    pos = "part"
    headwords_to_do = part_headwords
    expected_inflt = "la-adecl"
  elif pn_headwords and not noun_headwords:
    pos = "pn"
    headwords_to_do = pn_headwords
    expected_inflt = "la-ndecl"
  if not headwords_to_do:
    return None, None
  for headword in headwords_to_do:
    for inflt in headword['infl_templates']:
      infltn = tname(inflt)
      if infltn != expected_inflt:
        pagemsg(
          "WARNING: Saw bad declension template for %s, expected {{%s}}: %s"
          % (pos, expected_inflt, unicode(inflt)))
        continue
      inflargs = lalib.generate_infl_forms(pos, unicode(inflt),
        errandpagemsg, expand_text)
      # Collect each distinct non-lemma form (deduplicated on the
      # macron-less spelling, and skipping the lemma page itself).
      forms_seen = set()
      slots_and_forms_to_process = []
      for slot, formarg in inflargs.iteritems():
        forms = formarg.split(",")
        for form in forms:
          if "[" in form or "|" in form:
            continue
          form_no_macrons = lalib.remove_macrons(form)
          if form_no_macrons == pagetitle:
            continue
          if form_no_macrons in forms_seen:
            continue
          forms_seen.add(form_no_macrons)
          slots_and_forms_to_process.append((slot, form))
      for formindex, (slot, form) in blib.iter_items(
          sorted(slots_and_forms_to_process,
            key=lambda x: lalib.remove_macrons(x[1]))):
        # NOTE(review): `handler` closes over `slot`/`form`; this is safe
        # only if blib.do_edit invokes it synchronously within this
        # iteration -- presumably it does; confirm in blib.
        def handler(page, formindex, parsed):
          return process_form(page, formindex, slot, form, pos, pagemsg)
        blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
          "%s.%s" % (index, formindex), handler, save=args.save,
          verbose=args.verbose, diff=args.diff)
def process_page(page, index, parsed):
  """Convert old per-participle headword templates ({{la-future participle}},
  {{la-perfect participle}}, {{la-gerundive}}, {{la-present participle}})
  on a page into the unified {{la-part}} template.

  Returns (newtext, notes)."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  pagemsg("Processing")
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    allow_2 = False
    lemma = None
    if tn in ["la-future participle", "la-perfect participle",
        "la-gerundive"]:
      # These templates hold the participle stem in param 1; the lemma is
      # stem + "us".
      base = getparam(t, "1")
      if tn == "la-gerundive":
        param2 = getparam(t, "2")
        if param2:
          if lalib.remove_macrons(base) != lalib.remove_macrons(param2):
            pagemsg("WARNING: Unrecognized param 2: %s" % origt)
            continue
          # Param 2 is the macron-bearing variant of the stem; prefer it.
          allow_2 = True
          base = param2
      if not base:
        pagemsg("WARNING: Empty param 1: %s" % origt)
        continue
      lemma = base + "us"
    elif tn == "la-present participle":
      base = getparam(t, "1")
      ending = getparam(t, "2")
      if not base:
        pagemsg("WARNING: Empty param 1: %s" % origt)
        continue
      if not ending:
        pagemsg("WARNING: Empty param 2: %s" % origt)
        continue
      # Build the nominative singular from the stem and ending class.
      if ending == "iens":
        lemma = u"%siēns/%seunt" % (base, base)
      elif ending in ("ans", "ens"):
        lemma = base + (u"āns" if ending == "ans" else u"ēns")
      else:
        pagemsg("WARNING: Unrecognized param 2: %s" % origt)
        continue
      allow_2 = True
    if lemma:
      # Refuse to convert if the template carries any parameter beyond
      # those we know how to translate.
      bad_param = False
      for param in t.params:
        pname = unicode(param.name)
        stripped = pname.strip()
        if stripped == "1" or (allow_2 and stripped == "2"):
          continue
        pagemsg("WARNING: Unrecognized param %s=%s: %s"
          % (pname, param.value, origt))
        bad_param = True
      if bad_param:
        continue
      rmparam(t, "2")
      t.add("1", lemma)
      blib.set_template_name(t, "la-part")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append(u"convert {{%s}} to {{la-part}}" % tn)
  return unicode(parsed), notes
def expand_text(tempcall):
  # Expand a template call against the lemma's page title with macrons
  # removed but diaereses preserved.
  # NOTE(review): closure fragment -- `lemma`, `preserve_diaeresis`,
  # `pagemsg` and `verbose` are free variables from an enclosing scope
  # not visible here.
  return blib.expand_text(tempcall,
    remove_macrons(lemma, preserve_diaeresis), pagemsg, verbose)
def process_page(page, index):
  """For a Latin noun or proper-noun lemma page, generate all declined
  forms (via a synthesized {{la-generate-noun-forms}} call) and edit each
  form's page via process_form.

  Skips pages with ambiguous headwords (multiple nouns, both noun and
  proper noun, or neither) and indeclinable nouns.  Returns None."""
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  parsed = blib.parse_text(secbody)
  # Locate exactly one {{la-noun}} or {{la-proper noun}} headword.
  saw_noun = None
  saw_proper_noun = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-noun":
      if saw_noun:
        pagemsg(
          "WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping"
          % (unicode(saw_noun), unicode(t)))
        return
      saw_noun = t
    elif tn == "la-proper noun":
      if saw_proper_noun:
        pagemsg(
          "WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping"
          % (unicode(saw_proper_noun), unicode(t)))
        return
      saw_proper_noun = t
  if saw_noun and saw_proper_noun:
    pagemsg(
      "WARNING: Saw both noun and proper noun, can't correct header/headword"
    )
    return
  if not saw_noun and not saw_proper_noun:
    pagemsg(
      "WARNING: Saw neither noun nor proper noun, can't correct header/headword"
    )
    return
  pos = "pn" if saw_proper_noun else "n"
  ht = saw_proper_noun or saw_noun
  if getparam(ht, "indecl"):
    pagemsg("Noun is indeclinable, skipping: %s" % unicode(ht))
    return
  # Clone the headword template, rename it to the forms-generating module
  # invocation and strip the params irrelevant to declension.
  generate_template = blib.parse_text(unicode(ht)).filter_templates()[0]
  blib.set_template_name(generate_template, "la-generate-noun-forms")
  blib.remove_param_chain(generate_template, "lemma", "lemma")
  blib.remove_param_chain(generate_template, "m", "m")
  blib.remove_param_chain(generate_template, "f", "f")
  blib.remove_param_chain(generate_template, "g", "g")
  rmparam(generate_template, "type")
  rmparam(generate_template, "indecl")
  rmparam(generate_template, "id")
  rmparam(generate_template, "pos")
  result = expand_text(unicode(generate_template))
  if not result:
    pagemsg("WARNING: Error generating forms, skipping")
    return
  tempargs = blib.split_generate_args(result)
  # Collect each distinct non-lemma form (deduplicated on the macron-less
  # spelling, skipping the lemma page itself and linked/piped forms).
  forms_seen = set()
  slots_and_forms_to_process = []
  for slot, formarg in tempargs.iteritems():
    forms = formarg.split(",")
    for form in forms:
      if "[" in form or "|" in form:
        continue
      form_no_macrons = lalib.remove_macrons(form)
      if form_no_macrons == pagetitle:
        continue
      if form_no_macrons in forms_seen:
        continue
      forms_seen.add(form_no_macrons)
      slots_and_forms_to_process.append((slot, form))
  # NOTE(review): this loop variable shadows the function parameter
  # `index`; harmless here since the parameter isn't used afterwards, but
  # worth renaming.
  for index, (slot, form) in blib.iter_items(
      sorted(slots_and_forms_to_process,
        key=lambda x: lalib.remove_macrons(x[1]))):
    def handler(page, index, parsed):
      return process_form(page, index, slot, form, pos)
    blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)), index,
      handler, save=args.save, verbose=args.verbose, diff=args.diff)
def process_page(page, index, parsed):
  """Fix ===Adverb=== subsections of a Latin entry that use a generic
  {{head|la|...}} headword together with {{comparative of}}/{{superlative of}}:
  replace the {{head}} with {{la-adv-comp}}/{{la-adv-sup}} and correct the
  positive/comparative/superlative degrees using find_head_comp_sup().

  Returns (newtext, notes) on success, (None, None) if no Latin section.

  NOTE(review): this source was recovered from a whitespace-mangled copy; the
  nesting below the find_head_comp_sup() call was reconstructed and should be
  verified against the original script.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []
  # Split on L3 headers; odd indexes are the "===...===" header lines and the
  # following even index is that subsection's body.
  subsections = re.split("(^===[^=\n]*===\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsections), 2):
    if "==Adverb==" in subsections[k - 1]:
      parsed = blib.parse_text(subsections[k])
      # First pass: find exactly one {{comparative of}} or {{superlative of}}
      # and pull out the positive degree from its arg 1.
      posdeg = None
      compt = None
      supt = None
      for t in parsed.filter_templates():
        if tname(t) == "comparative of":
          if compt:
            pagemsg("WARNING: Saw multiple {{comparative of}}: %s and %s" % (unicode(compt), unicode(t)))
          else:
            compt = t
            posdeg = blib.remove_links(getparam(t, "1"))
            if not posdeg:
              pagemsg("WARNING: Didn't see positive degree in {{comparative of}}: %s" % unicode(t))
        elif tname(t) == "superlative of":
          if supt:
            pagemsg("WARNING: Saw multiple {{superlative of}}: %s and %s" % (unicode(supt), unicode(t)))
          else:
            supt = t
            posdeg = blib.remove_links(getparam(t, "1"))
            if not posdeg:
              pagemsg("WARNING: Didn't see positive degree in {{superlative of}}: %s" % unicode(t))
      if compt and supt:
        pagemsg("WARNING: Saw both comparative and superlative, skipping: %s and %s" % (unicode(compt), unicode(supt)))
        continue
      if not compt and not supt:
        pagemsg("WARNING: Didn't see {{comparative of}} or {{superlative of}} in section %s" % k)
        continue
      # Second pass: rewrite the {{head|la|adverb ...}} headword template.
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["la-adv-comp", "la-adv-sup"]:
          # Already converted by a previous run; nothing to do here.
          pagemsg("Already saw fixed headword: %s" % unicode(t))
          break
        if tn == "head":
          if not getparam(t, "1") == "la":
            pagemsg("WARNING: Saw wrong language in {{head}}: %s" % unicode(t))
          else:
            pos = getparam(t, "2")
            head = blib.remove_links(getparam(t, "head")) or pagetitle
            if pos not in ["adverb", "adverbs", "adverb form", "adverb forms", "adverb comparative form", "adverb comparative forms", "adverb superlative form", "adverb superlative forms",]:
              pagemsg("WARNING: Unrecognized part of speech '%s': %s" % (pos, unicode(t)))
            else:
              # Look up the canonical (macron-bearing) positive, comparative
              # and superlative for this adverb's positive degree.
              real_head, real_comp, real_sup = find_head_comp_sup(lalib.remove_macrons(posdeg), pagemsg)
              if real_head:
                if lalib.remove_macrons(real_head) != lalib.remove_macrons(posdeg):
                  pagemsg("WARNING: Can't replace positive degree %s with %s because they differ when macrons are removed" % (posdeg, real_head))
                else:
                  # Fix arg 1 of the {{comparative of}}/{{superlative of}}.
                  pagemsg("Using real positive degree %s instead of %s" % (real_head, posdeg))
                  inflt = compt or supt
                  origt = unicode(inflt)
                  inflt.add("1", real_head)
                  pagemsg("Replaced %s with %s" % (origt, unicode(inflt)))
                # Pick the replacement headword template name, and correct
                # the head itself against the canonical comp/sup when known.
                if compt:
                  newname = "la-adv-comp"
                  infldeg = "comparative"
                  if real_comp and real_comp != "-":
                    if lalib.remove_macrons(real_comp) != lalib.remove_macrons(head):
                      pagemsg("WARNING: Can't replace comparative degree %s with %s because they differ when macrons are removed" % (head, real_comp))
                    else:
                      pagemsg("Using real comparative degree %s instead of %s" % (real_comp, head))
                      head = real_comp
                  else:
                    pagemsg("WARNING: Couldn't retrieve real comparative for positive degree %s" % real_head)
                else:
                  newname = "la-adv-sup"
                  infldeg = "superlative"
                  if real_sup and real_sup != "-":
                    if lalib.remove_macrons(real_sup) != lalib.remove_macrons(head):
                      pagemsg("WARNING: Can't replace superlative degree %s with %s because they differ when macrons are removed" % (head, real_sup))
                    else:
                      pagemsg("Using real superlative degree %s instead of %s" % (real_sup, head))
                      head = real_sup
                  else:
                    pagemsg("WARNING: Couldn't retrieve real superlative for positive degree %s" % real_head)
                # Swap {{head|la|...}} for the dedicated headword template.
                origt = unicode(t)
                rmparam(t, "head")
                rmparam(t, "2")
                rmparam(t, "1")
                blib.set_template_name(t, newname)
                t.add("1", head)
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                notes.append("replace {{head|la|...}} with {{%s}} and fix up positive/%s" % (newname, infldeg))
      subsections[k] = unicode(parsed)
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, adverb):
  """Add `adverb` to the adv= parameter chain of the Latin adjective
  ({{la-adj}}) or participle ({{la-part}}) headword template on `page`.

  If an adverb matching `adverb` up to macrons is already listed, its macrons
  are updated in place instead of appending a duplicate.

  Returns (newtext, notes) if a suitable template was found, else (None, None).
  """
  # Bug fix: the body referred to `adv`, which was never defined (the
  # parameter is named `adverb`). Alias it locally to keep the external
  # parameter name stable for callers.
  adv = adverb
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []
  pagemsg("Processing")
  text = unicode(page.text)
  parsed = blib.parse_text(text)
  # Find the (single) adjective and/or participle headword template.
  adj_template = None
  part_template = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-adj":
      if adj_template:
        pagemsg("WARNING: Saw multiple adjective templates: %s and %s" % (unicode(adj_template), unicode(t)))
      else:
        adj_template = t
    if tn == "la-part":
      if part_template:
        pagemsg("WARNING: Saw multiple participle templates: %s and %s" % (unicode(part_template), unicode(t)))
      else:
        part_template = t
  if adj_template and part_template:
    pagemsg("Saw both %s and %s, modifying adjective" % (unicode(adj_template), unicode(part_template)))
  # Prefer the adjective when both exist.
  if adj_template:
    template_to_fix = adj_template
  elif part_template:
    template_to_fix = part_template
  else:
    pagemsg("WARNING: Didn't see adjective or participle template")
    return None, None
  existing_advs = blib.fetch_param_chain(template_to_fix, "adv", "adv")
  changed = False
  for i in xrange(len(existing_advs)):
    if lalib.remove_macrons(existing_advs[i]) == lalib.remove_macrons(adv):
      if existing_advs[i] != adv:
        pagemsg("Updating macrons of %s -> %s in %s" % (existing_advs[i], adv, unicode(template_to_fix)))
        # Bug fix: record the note *before* overwriting the slot, so the
        # changelog shows old -> new rather than new -> new.
        notes.append("update macrons of adv=, changing %s -> %s" % (existing_advs[i], adv))
        existing_advs[i] = adv
        changed = True
      else:
        pagemsg("Already saw %s: %s" % (adv, unicode(template_to_fix)))
      break
  else: # no break: no existing adverb matches even up to macrons; append it.
    existing_advs.append(adv)
    changed = True
    notes.append("add adv %s to adjective" % adv)
  if changed:
    origt = unicode(template_to_fix)
    blib.set_param_chain(template_to_fix, existing_advs, "adv", "adv")
    pagemsg("Replaced %s with %s" % (origt, unicode(template_to_fix)))
  return unicode(parsed), notes