def check_for_al(param):
    """Clean up the value of PARAM in the headword template and hand it to putp().

    The parameter name itself is first passed through remove_links(). If the
    headword template has a non-empty value for it, any link markup ('[',
    ']' or '|') is reported via pagemsg(), recorded via add_note() and
    stripped; the value is then passed through remove_al() and given to
    putp(). Nothing happens when the value is empty.
    """
    param = remove_links(param)
    value = getparam(headword_template, param)
    if not value:
        return
    if any(ch in value for ch in "[]|"):
        pagemsg("Param %s value %s has link in it" % (param, value))
        add_note("removed links from %s" % param)
        value = remove_links(value)
    putp(param, remove_al(value))
def check_for_al(param):
    """Strip links from the headword-template value of PARAM and store it.

    PARAM is normalized with remove_links() before lookup. A non-empty value
    containing any of '[', ']' or '|' triggers a page message and a note and
    has its links removed. The (possibly cleaned) value is then run through
    remove_al() and passed to putp(). Empty values are left alone.
    """
    param = remove_links(param)
    current = getparam(headword_template, param)
    if current:
        # Any bracket or pipe character is treated as evidence of a link.
        if set("[]|").intersection(current):
            pagemsg("Param %s value %s has link in it" % (param, current))
            add_note("removed links from %s" % param)
            current = remove_links(current)
        putp(param, remove_al(current))
def canonicalize_existing(forms):
    """Return FORMS flattened into individual link-free forms.

    Each entry has " or " separators (optionally wrapped in apostrophes, as
    in wiki italics) turned into commas, is split on commas, and every
    non-empty piece is passed through blib.remove_links(). Order is
    preserved.
    """
    canonical = []
    for entry in forms:
        normalized = re.sub(" '*or'* ", ",", entry)
        for piece in normalized.split(","):
            if piece:
                canonical.append(blib.remove_links(piece))
    return canonical
def process_section(index, pagetitle, sectext):
    """Report how the pronunciation templates in one section should change.

    Finds the head(s) of the section using get_head_param(), then logs, for
    every {{IPA|ang|...}} template, a <from>/<to> line proposing an
    {{ang-IPA}} replacement. Existing {{ang-IPA}} templates are logged as
    is, and a <new> line is logged when no pronunciation template was seen
    at all. Nothing is returned; all output goes through msg().

    Parameters:
      index: page index, used only in log messages.
      pagetitle: page title, used in log messages and as a fallback head.
      sectext: wikitext of the section to inspect.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    parsed = blib.parse_text(sectext)
    head = None
    for t in parsed.filter_templates():
        newhead = get_head_param(t, pagetitle)
        if newhead is not None:
            newhead = [blib.remove_links(x) for x in newhead]
            if head and head != newhead:
                pagemsg("WARNING: Saw multiple heads %s and %s" %
                        (",".join(head), ",".join(newhead)))
            head = newhead
    if not head:
        pagemsg("WARNING: Couldn't find head")
        # Continue with an empty list: the joins below would raise TypeError
        # on None, and the "<<pagetitle>>" fallback expects an empty join.
        head = []

    saw_pronun = False
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "IPA":
            if getparam(t, "1") != "ang":
                pagemsg("WARNING: Wrong-language IPA template: %s" % unicode(t))
                continue
            pagemsg("<from> %s <to> {{ang-IPA|%s}} <end>" %
                    (unicode(t), "|".join(head) or "<<%s>>" % pagetitle))
            saw_pronun = True
        elif tn == "ang-IPA":
            pagemsg("Saw existing pronunciation: %s" % unicode(t))
            saw_pronun = True
    if not saw_pronun:
        pagemsg(
            "WARNING: Didn't see pronunciation for headword %s <new> {{ang-IPA|%s}} <end>"
            % (",".join(head), "|".join(head)))
def add(val, tr, is_lemma):
    """Add the head VAL (links removed) with translit TR to this_heads.

    The stored item is the tuple (head, translit, is_lemma), where head and
    translit are the results of remove_monosyllabic_accents().
    """
    # Remove monosyllabic accents to correctly handle the case of
    # рад, which has some heads with an accent and some without.
    head, translit = remove_monosyllabic_accents(blib.remove_links(val), tr)
    this_heads.add((head, translit, is_lemma))
def get_headword_pronuns(parsed, pagetitle, pagemsg, expand_text):
    """Collect the headword pronunciations found in PARSED.

    Returns a de-duplicated list of headwords (links and a final "!" or "?"
    removed) taken from every Latin headword template, or None when the
    page has a letter headword, when any headword is non-syllabic, or when
    no headword was found. A possible acronym only produces a warning.
    """
    collected = []
    for t in parsed.filter_templates():
        tn = tname(t)
        is_letter_headword = tn == "la-letter" or (
            tn == "head" and getparam(t, "1") == "la"
            and getparam(t, "2") == "letter")
        if is_letter_headword:
            pagemsg("WARNING: Skipping page with letter headword")
            return None
        if lalib.la_template_is_head(t):
            collected.extend(lalib.la_get_headword_from_template(
                t, pagetitle, pagemsg, expand_text))

    # Canonicalize by removing links and final !, ?
    collected = [re.sub("[!?]$", "", blib.remove_links(x)) for x in collected]

    # Check for acronym/non-syllabic.
    for pronun in collected:
        if lalib.is_nonsyllabic(pronun):
            pagemsg("WARNING: Pronunciation is non-syllabic, skipping: %s" % pronun)
            return None
        # Two uppercase letters in a row, optionally separated by a
        # combining accent.
        acronym_re = ("[" + lalib.uppercase + "][" + lalib.combining_accent_str
                      + "]?[" + lalib.uppercase + "]")
        if re.search(acronym_re, pronun):
            pagemsg("WARNING: Pronunciation may be an acronym, please check: %s" % pronun)

    collected = remove_list_duplicates(collected)
    if len(collected) < 1:
        pagemsg("WARNING: Can't find headword template")
        return None
    return collected
def clean(value):
    """Normalize VALUE: strip whitespace, remove links, tighten commas.

    A value that is exactly "-" after normalization becomes the empty
    string.
    """
    cleaned = re.sub(", +", ",", remove_links(value.strip()))
    return "" if cleaned == "-" else cleaned
def process_text_on_page_for_full_conj(index, pagename, text, verbs):
    """Insert the conjugation spec for PAGENAME into its Spanish verb template.

    VERBS maps page names to conjugation entries. An entry equal to the
    default ("first<> [[word]] [[word]]...") is replaced by an empty entry;
    an entry differing from the default only in the contents of its <...>
    spec has its links removed. {{es-verb}} templates carrying attn= get the
    entry as 1=, and {{head|es|verb}} is converted into {{es-verb}}.

    Returns (new text, notes), or None if PAGENAME is not in VERBS.
    NOTE(review): PAGENAME is unpacked from a split on the first space, so a
    title without a space raises ValueError -- presumably only multiword
    titles are passed in; confirm against the caller.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")
    notes = []

    if pagename not in verbs:
        pagemsg("WARNING: Can't find entry, skipping")
        return

    origentry = verbs[pagename]
    entry = origentry
    first, rest = pagename.split(" ", 1)
    linked_rest = " ".join("[[%s]]" % word for word in rest.split(" "))
    def_link = "%s<> %s" % (first, linked_rest)
    if def_link == entry:
        pagemsg("Replacing entry '%s' with a blank entry because it's the default" % entry)
        entry = ""
    elif re.sub("<.*?>", "<>", entry) == def_link:
        unlinked_entry = blib.remove_links(entry)
        pagemsg("Replacing entry '%s' with entry without links '%s'" % (entry, unlinked_entry))
        entry = unlinked_entry

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn == "es-verb":
            if not getparam(t, "attn"):
                pagemsg("Didn't see attn=1: %s" % unicode(t))
                continue
            rmparam(t, "attn")
            if entry:
                t.add("1", entry)
                notes.append("add conjugation '%s' to Spanish verb" % entry)
            else:
                notes.append("add conjugation (default) to Spanish verb")
        elif tn == "head" and getparam(t, "1") == "es" and getparam(t, "2") == "verb":
            head = getparam(t, "head")
            if head:
                pagemsg("WARNING: Removing head=%s compared with entry '%s', original entry '%s': %s" %
                        (head, entry, origentry, unicode(t)))
                rmparam(t, "head")
            rmparam(t, "2")
            rmparam(t, "1")
            blib.set_template_name(t, "es-verb")
            if entry:
                t.add("1", entry)
                notes.append("convert {{head|es|verb}} to {{es-verb|%s}}" % entry)
            else:
                notes.append("convert {{head|es|verb}} to {{es-verb}}")
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Warn about Latin headwords that do not match the page title.

    Every head from each Latin headword template has its links and macrons
    removed and is compared with the page title (for "Reconstruction" pages:
    "*" plus the part after the last slash). Mismatches are logged. The
    function never modifies the page and always returns (None, None).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if not args.stdin:
        pagemsg("Processing")

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None
    sections, j, secbody, sectail, has_non_latin = retval

    for t in blib.parse_text(secbody).filter_templates():
        if tname(t) not in lalib.la_headword_templates:
            continue
        for head in lalib.la_get_headword_from_template(t, pagetitle, pagemsg):
            bare_head = remove_macrons(blib.remove_links(head))
            if pagetitle.startswith("Reconstruction"):
                expected = "*" + re.sub(".*/", "", pagetitle)
            else:
                expected = pagetitle
            if bare_head != expected:
                pagemsg("WARNING: Bad Latin head: %s" % unicode(t))
    return None, None
def check_need_accent(text):
    """Return True if some word of TEXT still needs an accent mark.

    TEXT is split on runs of spaces and links are removed from each word. A
    word needs an accent when it contains neither U+0301 (combining acute)
    nor ё and ru.is_monosyllabic() is false for it.
    """
    for raw_word in re.split(" +", text):
        plain = blib.remove_links(raw_word)
        already_marked = u"\u0301" in plain or u"ё" in plain
        if not already_marked and not ru.is_monosyllabic(plain):
            return True
    return False
def process_page(page, index):
    """Look for adjectives that may underlie the Latin adverbs on PAGE.

    For each {{la-adv}} template in a subsection of the Latin section, the
    adverb stem is inferred with lalib.infer_adv_stem(), a list of candidate
    adjective lemmas is built from the macron-less stem, and each candidate
    is passed to investigate_possible_adj() together with the adverb and the
    definitions found in that subsection. Returns None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []
    if " " in pagetitle:
        pagemsg("WARNING: Space in page title, skipping")
        return
    pagemsg("Processing")

    text = unicode(page.text)
    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_latin = retval

    # The split keeps the headers, so subsection bodies sit at even indices
    # starting from 2.
    subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)
    for k in xrange(2, len(subsections), 2):
        subsection = subsections[k]
        for t in blib.parse_text(subsection).filter_templates():
            origt = unicode(t)
            if tname(t) != "la-adv":
                continue
            adv = blib.remove_links(getparam(t, "1")) or pagetitle
            macron_stem, is_stem = lalib.infer_adv_stem(adv)
            if not is_stem:
                pagemsg(
                    "WARNING: Couldn't infer stem from adverb %s, not standard: %s"
                    % (adv, origt))
                continue
            adv_defns = lalib.find_defns(subsection)
            stem = lalib.remove_macrons(macron_stem)

            # Candidate adjective lemmas, in the order they get investigated.
            candidates = [stem + "us", stem + "is"]
            if stem.endswith("nt"):
                candidates.append(stem[:-2] + "ns")
            if stem.endswith("plic"):
                candidates.append(stem[:-2] + "ex")
            if stem.endswith("c"):
                candidates.append(stem[:-1] + "x")
            if re.search("[aeiou]r$", stem):
                candidates.append(stem)
            elif stem.endswith("r"):
                candidates.append(stem[:-1] + "er")
            if adv.endswith(u"iē"):
                candidates.append(stem + "ius")

            for candidate in candidates:
                investigate_possible_adj(index, candidate, adv, adv_defns)
def fetch(param): val = getparam(t, param).strip() val = blib.remove_links(val) vals = re.split(r",\s*", val) retval = [] for v in vals: # Remove final footnote symbols are per [[Module:table tools]] v = re.sub(ur"[*~@#$%^&+0-9_\u00A1-\u00BF\u00D7\u00F7\u2010-\u2027\u2030-\u205E\u2070-\u20CF\u2100-\u2B5F\u2E00-\u2E3F]*$", "", v) retval.append(uk.add_monosyllabic_stress(v)) return ", ".join(retval)
def process_text_on_page(index, pagetitle, text):
    """Convert {{vi-hantu}} templates on a single-character page to {{vi-readings}}.

    A template is skipped (with a warning) when the title is not a single
    character per one_char(), when it has pos=, when chu= is set to anything
    other than "Nom", when its reading (1=) is empty, or when it has any
    parameter other than 1, rs and chu. Otherwise 1= is moved to nom= (for
    chu=Nom) or reading= and the template is renamed.

    Returns (new text, notes).
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    global args
    notes = []
    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        if tname(t) == "vi-hantu":
            if not one_char(pagetitle):
                pagemsg("WARNING: Length of page title is %s > 1, skipping" % len(pagetitle))
                continue
            if getparam(t, "pos"):
                pagemsg("WARNING: Saw pos=, skipping: %s" % unicode(t))
                continue
            chu = getparam(t, "chu")
            if chu and chu != "Nom":
                pagemsg("WARNING: Saw chu=%s not 'Nom', skipping: %s" % (chu, unicode(t)))
                continue
            newparam = "nom" if chu == "Nom" else "reading"
            reading = blib.remove_links(getparam(t, "1"))
            if not reading:
                pagemsg("WARNING: Empty reading, skipping: %s" % unicode(t))
                continue

            # Only the first unrecognized parameter is reported.
            saw_unrecognized = False
            for param in t.params:
                pn = pname(param)
                if pn not in ["1", "rs", "chu"]:
                    pagemsg(
                        "WARNING: Unrecognized parameter %s=%s, skipping: %s" %
                        (pn, unicode(param.value), unicode(t)))
                    saw_unrecognized = True
                    break
            if saw_unrecognized:
                continue

            t.add(newparam, reading, before="1")
            rmparam(t, "1")
            blib.set_template_name(t, "vi-readings")
            notes.append("{{vi-hantu}} -> {{vi-readings}}")
        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
def get_lemmas_of_form_page(parsed):
    """Return the set of lemmas that the form-of templates in PARSED point to.

    Only {{inflection of}}, {{comparative of}} and {{superlative of}} are
    considered. The lemma parameter is located with get_first_param(), and
    its value has links and macrons removed.
    """
    form_of_templates = ["inflection of", "comparative of", "superlative of"]
    lemmas = set()
    for t in parsed.filter_templates():
        if unicode(t.name) not in form_of_templates:
            continue
        lemma_param = get_first_param(t)
        if not lemma_param:
            continue
        lemmas.add(
            lalib.remove_macrons(blib.remove_links(getparam(t, lemma_param))))
    return lemmas
def process_text_on_page(index, pagetitle, text):
    """Fix 2= of {{alternative spelling of}} on an Old English wynn entry.

    The single head of the page is taken from {{head|ang|...}} or one of the
    ang-* headword templates (head= or the page title). The expected target
    of {{alternative spelling of|ang|...}} is that head with links removed
    and every wynn (ƿ) replaced by "w"; 2= is rewritten when it differs.
    Runs of three or more newlines are also condensed to two.

    Returns (new text, notes). Returns None (page skipped) when more than
    one head or more than one {{alternative spelling of}} is seen, when that
    template is not for language "ang", or when no head is found.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []
    origtext = text
    parsed = blib.parse_text(text)

    ang_head_templates = [
        "ang-noun", "ang-noun-form", "ang-verb", "ang-verb-form", "ang-adj",
        "ang-adj-form", "ang-adv", "ang-con", "ang-prep", "ang-prefix",
        "ang-proper noun", "ang-suffix"
    ]
    head = None
    for t in parsed.filter_templates():
        tn = tname(t)
        newhead = None
        if tn == "head" and getparam(t, "1") == "ang" or tn in ang_head_templates:
            newhead = getparam(t, "head") or pagetitle
        if newhead:
            if head:
                pagemsg("WARNING: Saw head=%s and newhead=%s, skipping" % (head, newhead))
                return
            head = newhead
    if head is None:
        # Without a head the wynn check and the 2= computation below would
        # fail on None, so skip the page like the other error paths do.
        pagemsg("WARNING: Couldn't find head, skipping")
        return
    if u"ƿ" not in head:
        pagemsg("WARNING: Something wrong, didn't see wynn in head: %s" % head)

    saw_altspell = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "alternative spelling of":
            if saw_altspell:
                pagemsg(
                    "WARNING: Saw multiple {{alternative spelling of}}, skipping: %s and %s"
                    % (unicode(saw_altspell), unicode(t)))
                return
            saw_altspell = unicode(t)
            if getparam(t, "1") != "ang":
                pagemsg(
                    "WARNING: {{alternative spelling of}} without language 'ang', skipping: %s"
                    % unicode(t))
                return
            param2 = getparam(t, "2")
            should_param2 = blib.remove_links(head).replace(u"ƿ", "w")
            if param2 != should_param2:
                origt = unicode(t)
                t.add("2", should_param2)
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                notes.append(
                    "fix 2= in {{alternative spelling of}} in wynn Old English entries"
                )

    text = re.sub("\n\n+", "\n\n", unicode(parsed))
    if origtext != text and not notes:
        notes.append("condense 3+ newlines to 2")
    return text, notes
def hi_adj_is_indeclinable(t, pagetitle):
    """Return True if the {{hi-adj}} template T describes an indeclinable adjective.

    Always False for templates other than hi-adj. The lemma is head= (or the
    page title) with links removed.
    """
    if tname(t) != "hi-adj":
        return False
    lemma = blib.remove_links(getparam(t, "head") or pagetitle)
    # If the lemma doesn't end with any of the declinable suffixes, it's
    # definitely indeclinable. Some indeclinable adjectives end with these
    # same suffixes, but we have no way to know that these are indeclinable,
    # so assume declinable.
    declinable_endings = (AA, IND_AA, AA + M, IND_AA + M, AA + N, IND_AA + N)
    return not any(lemma.endswith(ending) for ending in declinable_endings)
def tr(text, lang=None, sc=None, msgfun=msg):
    """Transliterate Greek TEXT.

    Links are removed and the text is canonicalized with
    tr_canonicalize_greek(); γ before γ/κ/ξ/χ becomes "n", ρρ becomes "rrh",
    and every remaining character is replaced through `tt`. The result is
    passed through tr_canonicalize_latin(). LANG, SC and MSGFUN are accepted
    but not used in this function.
    """
    result = tr_canonicalize_greek(remove_links(text))
    substitutions = [
        (u"γ([γκξχ])", r"n\1"),
        (u"ρρ", "rrh"),
        ('.', tt),
    ]
    for pattern, replacement in substitutions:
        result = rsub(result, pattern, replacement)
    # compose accented characters, fix hA and similar
    return tr_canonicalize_latin(result)
def check_lemma(lemma):
    """Decide which Kurdish variety LEMMA (links removed) belongs to.

    Returns a (language code, language name, reason) tuple. The lemma is
    looked up, in order, in the Northern and Central Kurdish lemma
    collections and then in the known-term collections; failing that, a
    lemma starting with a character from arabic_charset counts as Central
    Kurdish and anything else as Northern Kurdish.
    """
    lemma = blib.remove_links(lemma)
    lookups = [
        (northern_kurdish_lemmas,
         ("kmr", "Northern Kurdish", "existence of Northern Kurdish lemma")),
        (central_kurdish_lemmas,
         ("ckb", "Central Kurdish", "existence of Central Kurdish lemma")),
        (known_northern_kurdish_terms,
         ("kmr", "Northern Kurdish", "Kurdish Wiktionary")),
        (known_central_kurdish_terms,
         ("ckb", "Central Kurdish", "Kurdish Wiktionary")),
    ]
    for collection, verdict in lookups:
        if lemma in collection:
            return verdict
    if re.search("^[%s]" % arabic_charset, lemma):
        return "ckb", "Central Kurdish", "Arabic charset"
    return "kmr", "Northern Kurdish", "Latin charset"
def hi_lemma_is_indeclinable(t, pagetitle, pagemsg):
    """Return True if the Hindi lemma template T is indeclinable.

    For hi-noun and hi-proper noun this is simply whether ind= is set. For
    hi-adj it is True when ind= is set or when the lemma (head= or the page
    title, links removed) ends in none of the declinable suffixes. Any other
    template yields False. PAGEMSG is accepted but not used here.
    """
    template_name = tname(t)
    if template_name in ["hi-noun", "hi-proper noun"]:
        return bool(getparam(t, "ind"))
    if template_name != "hi-adj":
        return False
    if getparam(t, "ind"):
        return True
    lemma = blib.remove_links(getparam(t, "head") or pagetitle)
    # If the lemma doesn't end with any of the declinable suffixes, it's
    # definitely indeclinable. Some indeclinable adjectives end with these
    # same suffixes, but we have no way to know that these are indeclinable,
    # so assume declinable.
    return not any(lemma.endswith(ending) for ending in (AA, IND_AA, AA + M))
def process_page(page, index, parsed):
    """Add alt= (the accented form) to Russian ordinalbox/cardinalbox templates.

    The accented ordinal is 1= of {{ru-adj}} and the accented cardinal is
    head= of {{head|ru|numeral}}, both with links removed. When a box
    template for "ru" is found and the accented form differs from the page
    title, alt= is added. A cardinalbox on a page lacking
    [[Category:Russian cardinal numbers]] produces a warning.

    Returns (parsed, notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    # First pass: find the accented forms.
    adjval = None
    numval = None
    for t in parsed.filter_templates():
        template_name = unicode(t.name)
        if template_name == "ru-adj":
            adjval = blib.remove_links(getparam(t, "1"))
        if (template_name == "head" and getparam(t, "1") == "ru"
                and getparam(t, "2") == "numeral"):
            numval = blib.remove_links(getparam(t, "head"))

    # Second pass: update the box templates.
    for t in parsed.filter_templates():
        origt = unicode(t)
        if unicode(t.name) == "ordinalbox" and getparam(t, "1") == "ru":
            if not adjval:
                pagemsg("WARNING: Can't find accented ordinal form")
            elif adjval != pagetitle:
                t.add("alt", adjval)
                notes.append("Add alt=%s to ordinalbox" % adjval)
        if unicode(t.name) == "cardinalbox" and getparam(t, "1") == "ru":
            if not numval:
                pagemsg("WARNING: Can't find accented cardinal form")
            elif numval != pagetitle:
                t.add("alt", numval)
                notes.append("Add alt=%s to cardinalbox" % numval)
            if "[[Category:Russian cardinal numbers]]" not in unicode(parsed):
                pagemsg("WARNING: Numeral not in [[Category:Russian cardinal numbers]]")
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return parsed, notes
def replace_decl(page, index, parsed, decl, declforms):
    """Replace old-style noun declension templates with {{LANG-ndecl|DECL}}.

    For each {{LANG-decl-noun}}, {{LANG-decl-noun-unc}} or
    {{LANG-decl-noun-pl}} template (LANG is args.lang), the positional
    parameters are read into a slot -> forms dict using the matching slot
    list; forms have links removed, spaces around commas removed and
    monosyllabic stress added. If compare_forms() accepts the forms against
    DECLFORMS, the template is renamed, emptied and given 1=DECL.

    Returns (new text, notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing decl %s" % decl)
    notes = []
    is_uk = args.lang == "uk"

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == args.lang + "-decl-noun":
            getslots = uk_decl_noun_slots if is_uk else be_decl_noun_slots
        elif tn == args.lang + "-decl-noun-unc":
            getslots = uk_decl_noun_unc_slots if is_uk else be_decl_noun_unc_slots
        elif tn == args.lang + "-decl-noun-pl":
            getslots = uk_decl_noun_pl_slots if is_uk else be_decl_noun_pl_slots
        else:
            continue

        forms = {}
        # Slot N always corresponds to positional parameter N. Using
        # enumerate() keeps the two in step even when a form is empty; the
        # earlier manual counter was skipped by `continue` on an empty form,
        # which shifted every later slot onto the wrong parameter.
        for i, slot in enumerate(getslots, 1):
            if not slot:
                continue
            form = getparam(t, i).strip()
            if not form:
                continue
            form = blib.remove_links(form)
            # eliminate spaces around commas
            form = re.sub(r"\s*,\s*", ",", form)
            slotforms = [
                (uk.add_monosyllabic_stress(f) if is_uk
                 else be.add_monosyllabic_accent(f))
                for f in form.split(",")
            ]
            forms[slot] = ",".join(slotforms)

        if compare_forms(forms, declforms, pagemsg):
            origt = unicode(t)
            t.name = args.lang + "-ndecl"
            del t.params[:]
            t.add("1", decl)
            newt = unicode(t)
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("replace {{%s|...}} with %s" % (tn, newt))

    return unicode(parsed), notes
def get_lemmas(line):
    """Extract the lemma list from one input LINE.

    The line is split on whitespace with do_split(); the lemma field is the
    first element when args.pos is set and the second otherwise (fatal() is
    called if there is no second element). A leading "!" is stripped and
    remembered, links are removed, and the field is split on commas.

    Returns (lemmas, first lemma without accents, whether "!" was present).
    """
    fields = do_split(r"\s+", line)
    if args.pos:
        lemma_field = fields[0]
    else:
        if len(fields) < 2:
            fatal(line, "Not enough elements in line")
        lemma_field = fields[1]

    starts_with_exclamation_point = lemma_field.startswith("!")
    if starts_with_exclamation_point:
        lemma_field = lemma_field[1:]

    lemmas = remove_links(lemma_field).split(",")
    first_lemma_no_accents = module.remove_accents(lemmas[0])
    return lemmas, first_lemma_no_accents, starts_with_exclamation_point
def process_page_for_modification(index, pagetitle, text, new_pronuns):
    """Add the pronunciations in NEW_PRONUNS to the Old English section of a page.

    NEW_PRONUNS maps page titles to pronunciation data. When the section has
    numbered etymology sections whose heads differ, each etymology section
    is processed separately (header level 4); otherwise the whole section
    body is processed once (header level 3) by
    process_section_for_modification().

    Returns (new text, comment), or None when PAGETITLE is not in
    NEW_PRONUNS or no Old English section could be found.
    """
    if pagetitle not in new_pronuns:
        return

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    comment = "add pronunciation(s) to Old English lemma(s)"

    retval = blib.find_modifiable_lang_section(text, "Old English", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find Old English section")
        return
    sections, j, secbody, sectail, has_non_lang = retval

    heads = None
    if "Etymology 1" in secbody:
        etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
        # The split keeps the headers; section bodies sit at even indices
        # starting from 2.
        body_indices = range(2, len(etym_sections), 2)
        for k in body_indices:
            parsed = blib.parse_text(etym_sections[k])
            secheads = []
            for t in parsed.filter_templates():
                this_heads = get_head_param(t, pagetitle)
                if this_heads:
                    for head in [blib.remove_links(x) for x in this_heads]:
                        if head not in secheads:
                            secheads.append(head)
            if heads is None:
                heads = secheads
            elif set(heads) != set(secheads):
                pagemsg(
                    "Saw head(s) %s in one etym section and %s in another, splitting pronuns per etym section"
                    % (",".join(heads), ",".join(secheads)))
                for kk in body_indices:
                    etym_sections[kk] = process_section_for_modification(
                        index, pagetitle, etym_sections[kk], 4,
                        new_pronuns[pagetitle])
                sections[j] = "".join(etym_sections) + sectail
                return "".join(sections), comment
        # `heads` is still None if "Etymology 1" appeared in the text but no
        # header matched the split pattern; join an empty list in that case
        # instead of failing on None.
        pagemsg(
            "All etym sections have same head(s) %s, creating a single pronun section"
            % ",".join(heads or []))

    secbody = process_section_for_modification(index, pagetitle, secbody, 3,
                                               new_pronuns[pagetitle])
    sections[j] = secbody + sectail
    return "".join(sections), comment
def fix_up_section(sectext, warn_on_multiple_heads):
    """Replace manual Latin pronunciations in SECTEXT with {{la-IPA}}.

    The section must contain exactly one distinct head (links removed);
    otherwise SECTEXT is returned unchanged (with a warning, which for
    multiple heads is only emitted when WARN_ON_MULTIPLE_HEADS is true).
    Manual Classical {{IPA}} lines become {{la-IPA|head}}. If a manual
    Ecclesiastical pronunciation is present alongside exactly one
    {{la-IPA}}, that template gets eccl=yes and the manual line is removed.
    Changes are recorded in the enclosing `notes` list.

    Returns the (possibly modified) section text.
    """
    parsed = blib.parse_text(sectext)
    heads = set()
    for t in parsed.filter_templates():
        if lalib.la_template_is_head(t):
            heads |= set(
                blib.remove_links(x)
                for x in lalib.la_get_headword_from_template(t, pagetitle, pagemsg))
    if len(heads) > 1:
        if warn_on_multiple_heads:
            pagemsg("WARNING: Found multiple possible heads, not modifying: %s" % ",".join(heads))
        return sectext
    if len(heads) == 0:
        pagemsg("WARNING: Found no possible heads, not modifying: %s" % ",".join(heads))
        return sectext

    head = list(heads)[0]
    la_ipa = "{{la-IPA|%s}}" % head
    newsectext = re.sub(r"\{\{a\|Classical\}\} \{\{IPA(char)?\|.*?\}\}", la_ipa, sectext)
    newsectext = re.sub(r"^\* \{\{IPA(char)?\|.*?\|lang=la\}\}", la_ipa, newsectext, 0, re.M)
    if newsectext != sectext:
        notes.append("replaced manual Latin pronun with {{la-IPA|%s}}" % head)
        sectext = newsectext

    # Recompute pronun templates as we may have added one.
    parsed = blib.parse_text(sectext)
    pronun_templates = [t for t in parsed.filter_templates() if tname(t) == "la-IPA"]

    if "{{a|Ecclesiastical}} {{IPA" in sectext:
        if len(pronun_templates) == 0:
            pagemsg("WARNING: Found manual Ecclesiastical pronunciation but not {{la-IPA}} template")
        elif len(pronun_templates) > 1:
            pagemsg("WARNING: Found manual Ecclesiastical pronunciation and multiple {{la-IPA}} templates: %s" %
                    ",".join(unicode(tt) for tt in pronun_templates))
        else:
            origt = unicode(pronun_templates[0])
            pronun_templates[0].add("eccl", "yes")
            pagemsg("Replaced %s with %s" % (origt, unicode(pronun_templates[0])))
            # The template was changed in the parsed tree, so the manual line
            # must be removed from the re-serialized text. Working from the
            # old `sectext` string would silently drop the eccl=yes just added.
            text_with_eccl = unicode(parsed)
            newsectext = re.sub(r"^\* \{\{a\|Ecclesiastical\}\} \{\{IPA(char)?\|.*?\}\}\n", "",
                                text_with_eccl, 0, re.M)
            if newsectext == text_with_eccl:
                # Leave the section untouched rather than keep both the
                # manual line and eccl=yes.
                pagemsg("WARNING: Unable to remove manual Ecclesiastical prounciation")
            else:
                notes.append("removed manual Ecclesiastical pronunciation and added |eccl=yes to {{la-IPA}}")
                sectext = newsectext
    return sectext
def process_text_on_page(index, pagetitle, text):
    """Compare new and old automatic transliteration of Hindi headwords.

    For every Hindi headword template, each head slot up to the highest trN=
    that is set (at least slot 1) is examined. Slots with a manual
    transliteration are only logged. Otherwise the head (headN= with links
    removed, or the page title) is expanded through {{xlit|hi|...}} and
    through the old transliteration module, and the outcome of the
    comparison is logged. The page is not modified; returns None.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def numbered(base, num):
        # Slot 1 uses the bare name; later slots append the number.
        return base if num == 1 else "%s%s" % (base, num)

    notes = []
    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn not in hindi_head_templates:
            continue

        maxtr = 1
        for i in range(1, 10):
            if getparam(t, numbered("tr", i)):
                maxtr = i

        for i in range(1, maxtr + 1):
            tr = getparam(t, numbered("tr", i))
            if tr:
                pagemsg("Manual translit tr=%s in %s, not checking" % (tr, unicode(t)))
                continue
            head = getparam(t, numbered("head", i))
            head = blib.remove_links(head) if head else pagetitle
            newtr = expand_text("{{xlit|hi|%s}}" % head)
            oldtr = expand_text(
                "{{#invoke:User:Benwing2/hi-translit|tr|%s}}" % head)
            if not (newtr and oldtr):
                continue
            if newtr == oldtr:
                pagemsg(
                    "Auto translit %s same in new and old: %s" %
                    (newtr, unicode(t)))
            else:
                pagemsg(
                    "WARNING: Different translit, new=%s, old=%s: %s" %
                    (newtr, oldtr, unicode(t)))
def tr(text, lang=None, sc=None, msgfun=msg): text = remove_links(text) text = tr_canonicalize_bulgarian(text) # Remove word-final hard sign text = rsub(text, u"[Ъъ]($|[- \]])", ur"\1") # ьо becomes jo, Ьо becomes Jo text = rsub(text, u"ь(?=[Оо])", ur"j") text = rsub(text, u"Ь(?=[Оо])", ur"J") text = rsub(text, '.', tt) # compose accented characters text = tr_canonicalize_latin(text) return text
def infer_decl(t, pagemsg):
    """Read the short-adjective forms out of template T.

    Builds a dict mapping each case in short_adj_cases_params to the value
    of that parameter (falling back to its numbered parameter), stripped and
    with links removed.

    NOTE(review): the body visible here ends right after defining
    get_form(), which is never called and nothing is returned -- presumably
    the rest of the function is outside this view; confirm.
    """
    if verbose:
        pagemsg("Processing %s" % unicode(t))

    # Initialize all cases to blank in case we don't set them again later
    forms = {}
    for case, numparam in short_adj_cases_params:
        raw = getparam(t, case) or getparam(t, numparam)
        forms[case] = blib.remove_links(raw.strip())

    def get_form(case):
        # A lone "-" marks a missing form.
        return "" if forms[case] == "-" else forms[case]
def compare_headword_conj_forms(id_slot,
                                headword_forms,
                                conj_slots,
                                adjust_for_missing_perf_forms=False,
                                remove_conj_links=False):
    """Check that HEADWORD_FORMS agree with the conjugation's forms.

    The conjugation forms come from the first slot in CONJ_SLOTS present in
    verb_props, split on commas (links removed when REMOVE_CONJ_LINKS is
    set). Both sides are passed through lengthen_ns_nf() and compared as
    sets. On a mismatch a warning is logged, saying whether the difference
    is in macrons only, and False is returned; otherwise True.
    """
    raw_conj_forms = ""
    for slot in conj_slots:
        if slot in verb_props:
            raw_conj_forms = verb_props[slot]
            break
    conj_forms = safe_split(raw_conj_forms, ",")
    if remove_conj_links:
        conj_forms = [blib.remove_links(form) for form in conj_forms]

    headword_fixed = [lengthen_ns_nf(form) for form in headword_forms]
    conj_fixed = [lengthen_ns_nf(form) for form in conj_forms]

    if adjust_for_missing_perf_forms:
        # There are several instances of 4++ verbs where only the -īvī variant,
        # not the -iī variant, is listed in the headword. Don't get tripped up
        # by that.
        for ivi_form in [f for f in conj_fixed if f.endswith(u"īvī")]:
            ii_form = re.sub(u"īvī$", u"iī", ivi_form)
            if ii_form in conj_fixed and ii_form not in headword_fixed:
                headword_fixed.append(ii_form)

    if set(headword_fixed) == set(conj_fixed):
        return True

    headword_bare = set(lalib.remove_macrons(f) for f in headword_fixed)
    conj_bare = set(lalib.remove_macrons(f) for f in conj_fixed)
    if headword_bare == conj_bare:
        pagemsg(
            "WARNING: Headword %s=%s different from conj %s=%s in macrons only, skipping: %s"
            % (id_slot, ",".join(headword_forms), id_slot,
               ",".join(conj_forms), render_headword_and_conj()))
    else:
        pagemsg(
            "WARNING: Headword %s=%s different from conj %s=%s in more than just macrons, skipping: %s"
            % (id_slot, ",".join(headword_forms), id_slot,
               ",".join(conj_forms), render_headword_and_conj()))
    return False
def compare_single_form(f1, f2):
    """Compare existing form F1 with proposed form F2 word by word.

    Both are split on spaces and hyphens. Returns None when the word counts
    differ or when some pair of words cannot be reconciled, True otherwise.
    """
    existing_words = re.split("[ -]", f1)
    proposed_words = re.split("[ -]", f2)
    if len(existing_words) != len(proposed_words):
        return None
    for existing, proposed in zip(existing_words, proposed_words):
        if existing == proposed:
            continue
        existing = fixup_link(existing)
        # Allow case where existing is monosyllabic and missing a stress
        # compared with proposed
        existing_variants = {existing, try_to_stress(existing)}
        # Allow case where existing is missing a link as compared to
        # proposed (but not other way around; we don't want a link
        # disappearing)
        proposed_variants = {proposed, blib.remove_links(proposed)}
        if not (existing_variants & proposed_variants):
            return None
    return True
def compare(old, new, stuff, nocanon=False):
    """Return True if the OLD forms are absent or agree with the NEW forms.

    Unless NOCANON is set, both lists first have monosyllabic stress/accents
    removed (using the uk or be helper according to args.lang), and OLD also
    has links removed. The lists are compared as sets; on a mismatch a
    warning naming STUFF is logged and False is returned.
    """
    if not old:
        return True
    if not nocanon:
        if args.lang == "uk":
            strip_accents = uk.remove_monosyllabic_stress
        else:
            strip_accents = be.remove_monosyllabic_accents
        old = [strip_accents(blib.remove_links(form)) for form in old]
        new = [strip_accents(form) for form in new]
    if set(old) == set(new):
        return True
    pagemsg(
        "WARNING: Old %ss %s disagree with new %ss %s: head=%s, decl=%s" %
        (stuff, ",".join(old), stuff, ",".join(new), unicode(headt),
         unicode(t)))
    return False
def compare_headword_decl_forms(id_slot, headword_forms, decl_slots, noun_props,
                                headword_and_decl_text, pagemsg,
                                adjust_for_missing_gen_forms=False,
                                adjust_for_e_ae_gen=False,
                                remove_headword_links=False):
    """Check that HEADWORD_FORMS agree with the declension's forms.

    The declension forms come from the first slot in DECL_SLOTS present in
    NOUN_PROPS, split on commas. Both sides are passed through
    lengthen_ns_nf() and compared as sets, with these optional allowances:

      adjust_for_e_ae_gen: a final ē in a headword form is treated as "ae".
      adjust_for_missing_gen_forms: a declension form in -ī that pairs with
        one in -iī may be missing from the headword.
      remove_headword_links: links are removed from the headword forms.

    On a mismatch a warning is logged through PAGEMSG (including
    HEADWORD_AND_DECL_TEXT and whether the difference is in macrons only)
    and False is returned; otherwise True.
    """
    decl_forms = ""
    for slot in decl_slots:
        if slot in noun_props:
            decl_forms = noun_props[slot]
            break
    decl_forms = safe_split(decl_forms, ",")
    if remove_headword_links:
        headword_forms = [blib.remove_links(x) for x in headword_forms]

    corrected_headword_forms = [lengthen_ns_nf(x) for x in headword_forms]
    corrected_decl_forms = [lengthen_ns_nf(x) for x in decl_forms]

    if adjust_for_e_ae_gen:
        # Apply the substitution on top of the lengthened forms. Rebuilding
        # the list from the raw headword forms threw away the
        # lengthen_ns_nf() correction that the declension side still has.
        corrected_headword_forms = [
            re.sub(u"ē$", "ae", x) for x in corrected_headword_forms
        ]

    if adjust_for_missing_gen_forms:
        # Nouns in -ius and -ium are commonly missing the shortened genitive
        # variants. Don't get tripped up by that.
        ii_decl_forms = [x for x in corrected_decl_forms if x.endswith(u"iī")]
        for ii_decl_form in ii_decl_forms:
            i_decl_form = re.sub(u"iī$", u"ī", ii_decl_form)
            if (i_decl_form in corrected_decl_forms
                    and i_decl_form not in corrected_headword_forms):
                corrected_headword_forms.append(i_decl_form)

    if set(corrected_headword_forms) != set(corrected_decl_forms):
        macronless_headword_forms = set(
            lalib.remove_macrons(x) for x in corrected_headword_forms)
        macronless_decl_forms = set(
            lalib.remove_macrons(x) for x in corrected_decl_forms)
        if macronless_headword_forms == macronless_decl_forms:
            pagemsg("WARNING: Headword %s=%s different from decl %s=%s in macrons only, skipping: %s" % (
                id_slot, ",".join(headword_forms), id_slot, ",".join(decl_forms),
                headword_and_decl_text
            ))
        else:
            pagemsg("WARNING: Headword %s=%s different from decl %s=%s in more than just macrons, skipping: %s" % (
                id_slot, ",".join(headword_forms), id_slot, ",".join(decl_forms),
                headword_and_decl_text
            ))
        return False
    return True
def process_page(page, index):
    """Report which translations listed on PAGE already exist as pages.

    Collects the page title plus 2= (links removed) of every translation
    template ({{t}}, {{t+}}, {{t-}}, {{t+check}}, {{t-check}}), without
    duplicates, and logs a line for each one for which
    blib.safe_page_exists() is true. Returns None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    translation_templates = ["t", "t+", "t-", "t+check", "t-check"]
    seen_trans = [pagetitle]
    for t in blib.parse(page).filter_templates():
        if tname(t) not in translation_templates:
            continue
        candidate = blib.remove_links(getparam(t, "2"))
        if candidate not in seen_trans:
            seen_trans.append(candidate)

    for trans in seen_trans:
        # Defined per iteration and used immediately, so it reports the
        # translation currently being checked.
        def pagemsg_with_trans(txt):
            pagemsg("%s: %s" % (trans, txt))

        target = pywikibot.Page(site, trans)
        if blib.safe_page_exists(target, pagemsg_with_trans):
            msg("Page %s %s: Found existing translation for %s" % (index, trans, pagetitle))
def add(val, tr):
    """Add (VAL without links, TR) to heads, unless the unlinked value is empty."""
    unlinked = blib.remove_links(val)
    if not unlinked:
        return
    heads.add((unlinked, tr))
def split_one_page_etymologies(page, index, pagetext, verbose):
    """Regroup the entries of the Arabic section of a page into etymology sections.

    The Arabic section is split into subsections; consecutive entries are
    grouped by lemma (1= of the Arabic headword template, and additionally
    by the lemma of {{inflection of}}), adjacent groups that are both verb
    form class I are merged, and the section is rebuilt with
    "===Etymology N===" headers when more than one group remains.

    Parameters:
      page: page object; only page.title() is used.
      index: page index, used in log messages.
      pagetext: current text of the page.
      verbose: if true, log the full old and new text on a change.

    Returns (new page text, comment).

    NOTE(review): nothing in this function adds to `notes`, so the returned
    comment is always None. It used to be guarded by `assert(comment)`,
    which could therefore never pass; callers need to supply their own
    comment. The surrounding logic also looks unfinished (e.g. header levels
    reduced for split etymologies are never raised again) -- check it
    against the intended design before relying on the output.
    """
    # Fetch pagename, create pagemsg() fn to output msg with page name included
    pagename = page.title()
    pagetext = unicode(pagetext)

    def pagemsg(text):
        msg("Page %s %s: %s" % (index, pagename, text))

    def get_form_class(etymtext):
        # Return 1= of the last ar-verb/ar-verb-form template in ETYMTEXT
        # (None if there is none), warning about conflicting values.
        formclass = None
        for t in blib.parse_text(etymtext).filter_templates():
            if t.name in ["ar-verb", "ar-verb-form"]:
                newformclass = getparam(t, "1")
                if formclass and newformclass and formclass != newformclass:
                    pagemsg("WARNING: Something wrong: Two different verb form classes in same etymology: %s != %s" %
                            (formclass, newformclass))
                formclass = newformclass
        return formclass

    comment = None
    notes = []

    # Split off interwiki links at end
    m = re.match(r"^(.*?\n)(\n*(\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
                 pagetext, re.S)
    if m:
        pagebody = m.group(1)
        pagetail = m.group(2)
    else:
        pagebody = pagetext
        pagetail = ""

    # Split into sections
    splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)
    # Extract off pagehead and recombine section headers with following text
    pagehead = splitsections[0]
    sections = []
    for i in xrange(1, len(splitsections)):
        if (i % 2) == 1:
            sections.append("")
        sections[-1] += splitsections[i]

    # Go through each section in turn, looking for existing Arabic section
    for i in xrange(len(sections)):
        m = re.match("^==([^=\n]+)==$", sections[i], re.M)
        if not m:
            pagemsg("WARNING: Can't find language name in text: [[%s]]" % (sections[i]))
        elif m.group(1) == "Arabic":
            # Extract off trailing separator
            mm = re.match(r"^(.*?\n)(\n*--+\n*)$", sections[i], re.S)
            if mm:
                sections[i:i+1] = [mm.group(1), mm.group(2)]
            elif i < len(sections) - 1:
                pagemsg("WARNING: Arabic language section %s is non-final and missing trailing separator" % i)

            # The header title needs "+": with a single-character class the
            # check only ever matched headers with one-letter titles.
            for mm in re.finditer("^(==+)[^=\n]+(==+)$", sections[i], re.M):
                if mm.group(1) != mm.group(2):
                    pagemsg("WARNING: Malconstructed header: %s" % mm.group(0))

            subsections = re.split("(^===[^=\n]+=+\n)", sections[i], 0, re.M)
            if len(subsections) < 2:
                pagemsg("WARNING: Page missing any entries")
            etymologies = []
            etymsections = []
            sechead = subsections[0]
            if "\n===Etymology 1=" in sections[i]:
                etyms_were_separate = True
                for j in xrange(1, len(subsections), 2):
                    if not re.match("^===Etymology [0-9]+=", subsections[j]):
                        pagemsg("WARNING: Non-etymology level-3 header when split etymologies: %s" % subsections[j][0:-1])
                etymsections = [subsections[j] for j in
                                xrange(2, len(subsections), 2)]
                # Reduce indent by one. We will increase it again when we split
                # etymologies.
                for j in xrange(len(etymsections)):
                    etymsections[j] = re.sub("^==", "=", etymsections[j], 0, re.M)
            else:
                etyms_were_separate = False
                # Must be a one-element list: iterating over the bare string
                # would process it one character at a time.
                etymsections = [''.join(subsections[1:])]

            for etymsection in etymsections:
                subsections = re.split("(^===[^=\n]+=+\n)", etymsection, 0, re.M)
                if len(subsections) < 2:
                    pagemsg("WARNING: Section missing any entries")
                split_sections = []
                next_split_section = 0

                def append_section(k):
                    # Append header k and its body to the current group,
                    # creating empty groups up to it as needed.
                    while len(split_sections) <= next_split_section:
                        split_sections.append("")
                    split_sections[next_split_section] += \
                        subsections[k] + subsections[k + 1]

                last_lemma = None
                last_inflection_of_lemma = None
                for j in xrange(1, len(subsections), 2):
                    if re.match("^===+(References|Related|See)", subsections[j]):
                        pagemsg("Found level-3 section that should maybe be at higher level: %s" % subsections[j][0:-1])
                        append_section(j)
                    elif re.match("^===+(Alternative|Etymology)", subsections[j]):
                        append_section(j)
                    else:
                        parsed = blib.parse_text(subsections[j + 1])
                        lemma = None
                        inflection_of_lemma = None
                        for t in parsed.filter_templates():
                            if t.name in arabic_all_headword_templates:
                                if lemma:
                                    if t.name not in ["ar-nisba", "ar-noun-nisba", "ar-verb", "ar-verb-form"]:
                                        pagemsg("Found multiple headword templates in section %s: %s" % (j, subsections[j][0:-1]))
                                # Note: For verbs this is the form class, which we match on
                                lemma = reorder_shadda(remove_links(getparam(t, "1")))
                            if t.name == "inflection of":
                                if inflection_of_lemma:
                                    pagemsg("Found multiple 'inflection of' templates in section %s: %s" % (j, subsections[j][0:-1]))
                                inflection_of_lemma = remove_diacritics(
                                    remove_links(getparam(t, "1")))
                        if not lemma:
                            pagemsg("Warning: No headword template in section %s: %s" % (j, subsections[j][0:-1]))
                            append_section(j)
                        else:
                            if lemma != last_lemma:
                                next_split_section += 1
                            elif (inflection_of_lemma and last_inflection_of_lemma
                                  and inflection_of_lemma != last_inflection_of_lemma):
                                pagemsg("Verb forms have different inflection-of lemmas %s and %s, splitting etym" % (
                                    last_inflection_of_lemma, inflection_of_lemma))
                                next_split_section += 1
                            last_lemma = lemma
                            last_inflection_of_lemma = inflection_of_lemma
                            append_section(j)
                etymologies += split_sections

            # Combine adjacent etymologies with same verb form class I.
            # FIXME: We might not want to do this; the etymologies might be
            # legitimately split. Need to check each case.
            j = 0
            while j < len(etymologies) - 1:
                # Each etymology is inspected by its own text; the earlier
                # helper ignored its argument and always read etymologies[j].
                formclassj = get_form_class(etymologies[j])
                formclassj1 = get_form_class(etymologies[j + 1])
                if formclassj == "I" and formclassj1 == "I":
                    if not etymologies[j + 1].startswith("="):
                        pagemsg("WARNING: Can't combine etymologies with same verb form class because second has etymology text")
                    else:
                        pagemsg("Combining etymologies with same verb form class I")
                        etymologies[j] = etymologies[j].rstrip() + "\n\n" + etymologies[j + 1]
                        # Drop the merged entry and look at the same index
                        # again. Without the removal the same pair would be
                        # merged forever.
                        del etymologies[j + 1]
                        continue
                j += 1

            if len(etymologies) > 1:
                for j in xrange(len(etymologies)):
                    # Stuff like "===Alternative forms===" that goes before the
                    # etymology section should be moved after.
                    newetymj = re.sub(r"^(.*?\n)(===Etymology===\n(\n|[^=\n].*?\n)*)",
                                      r"\2\1", etymologies[j], 0, re.S)
                    if newetymj != etymologies[j]:
                        pagemsg("Moved ===Alternative forms=== and such after Etymology")
                    etymologies[j] = newetymj
                    # Remove ===Etymology=== from beginning
                    etymologies[j] = re.sub("^===Etymology===\n", "", etymologies[j])
                    # Fix up newlines around etymology section
                    etymologies[j] = etymologies[j].strip() + "\n\n"
                    if etymologies[j].startswith("="):
                        etymologies[j] = "\n" + etymologies[j]
                sections[i] = (sechead +
                               ''.join(["===Etymology %s===\n" % (j + 1) + etymologies[j]
                                        for j in xrange(len(etymologies))]))
            elif len(etymologies) == 1:
                if etyms_were_separate:
                    # We might need to add an Etymology header at the beginning.
                    pagemsg("Combined formerly separate etymologies")
                    if not re.match(r"^(=|\{\{wikipedia|\[\[File:)", etymologies[0].strip()):
                        etymologies[0] = "===Etymology===\n" + etymologies[0]
                        pagemsg("Added Etymology header when previously separate etymologies combined")
                    # Put Alternative forms section before Etymology.
                    newetym0 = re.sub(r"^((?:\n|[^=\n].*?\n)*)(===Etymology===\n(?:\n|[^=\n].*?\n)*)(===(Alternative.*?)===\n(?:\n|[^=\n].*?\n)*)",
                                      r"\1\3\2", etymologies[0], 0, re.S)
                    if newetym0 != etymologies[0]:
                        pagemsg("Moved ===Alternative forms=== and such before Etymology")
                    etymologies[0] = newetym0
                sections[i] = sechead + etymologies[0]
            else:
                sections[i] = sechead
            break

    # End of loop over sections in existing page; rejoin sections
    newtext = pagehead + ''.join(sections) + pagetail

    # Don't signal a save if only differences are whitespace at end,
    # since it appears that newlines at end get stripped when saving.
    if pagetext.rstrip() == newtext.rstrip():
        pagemsg("No change in text")
    else:
        if verbose:
            pagemsg("Replacing [[%s]] with [[%s]]" % (pagetext, newtext))
        else:
            pagemsg("Text has changed")
        pagetext = newtext

    # Construct and output comment.
    notestext = '; '.join(notes)
    if notestext:
        if comment:
            comment += " (%s)" % notestext
        else:
            comment = notestext
    # pagemsg() takes only the text; the former `simple = True` keyword
    # raised TypeError.
    pagemsg("comment = %s" % comment)
    return pagetext, comment
def check_old_noun_headword_forms(headword_template, args, subpagetitle, pagemsg, laxer_comparison=False):
    """Check an existing noun headword template against proposed forms.

    HEADWORD_TEMPLATE is the existing headword template. ARGS is a dictionary
    of proposed values; the keys read here are "n", "g", the case names
    ("nom_sg", "gen_sg", "nom_pl", "gen_pl"), their "_raw" counterparts and
    "nom_sg_linked"/"nom_pl_linked". SUBPAGETITLE is passed through to
    blib.fetch_param_chain() when fetching the headwords. PAGEMSG is the
    logging callback. If LAXER_COMPARISON is set, manual translit in the
    proposed forms is ignored even when comparing against the lemma.

    Return None if existing and proposed values are incompatible (a message
    is logged in each such case). Otherwise return a list of genders: empty
    if the existing genders are acceptable given the proposed ones, else the
    existing genders (after logging a "Gender mismatch" warning).
    """

    def compare_single_form(f1, f2):
        # Word-by-word comparison of one existing form (F1) against one
        # proposed form (F2), splitting on spaces and hyphens.
        # FIXME: Will still have problems with [[X|Y]].
        existing_words = re.split("[ -]", f1)
        proposed_words = re.split("[ -]", f2)
        if len(existing_words) != len(proposed_words):
            return None
        for existing_word, proposed_word in zip(existing_words, proposed_words):
            if existing_word == proposed_word:
                continue
            fixed = fixup_link(existing_word)
            # Existing may be monosyllabic and missing a stress compared
            # with proposed.
            existing_variants = {fixed, try_to_stress(fixed)}
            # Existing may be missing a link compared with proposed (but not
            # the other way around; we don't want a link disappearing).
            proposed_variants = {proposed_word, blib.remove_links(proposed_word)}
            if not (existing_variants & proposed_variants):
                return None
        return True

    def compare_forms(case, form1, form2, form1_lemma=False):
        # FORM1 is the list of existing forms from the headword template;
        # FORM2 is the comma-joined string of proposed forms. FORM1_LEMMA is
        # true if the FORM1 values come from the headword lemma.
        existing = [fixup_link(re.sub(u"ё́", u"ё", form)) for form in form1]
        proposed = re.split(",", form2)
        if laxer_comparison or not form1_lemma:
            # Ignore manual translit in the proposed forms when comparing
            # non-lemma forms, or whenever laxer_comparison is set.
            proposed = [re.sub("//.*$", "", form) for form in proposed]
        # A missing existing value is always OK.
        if not existing:
            return True
        if set(existing) == set(proposed):
            return True
        # Allow unstressed monosyllabic existing forms to match stressed
        # monosyllabic proposed forms.
        if set(try_to_stress(form) for form in existing) == set(proposed):
            return True
        if (len(existing) == 1 and len(proposed) == 1 and
                compare_single_form(existing[0], proposed[0])):
            return True
        pagemsg("WARNING: case %s, existing forms %s not same as proposed %s" %(
            case, ",".join(existing), ",".join(proposed)))
        return None

    def compare_genders(g1, g2):
        if set(g1) == set(g2):
            return True
        if len(g1) != 1 or len(g2) != 1:
            return None
        # Genders don't match exactly; accept an existing gender that is
        # merely missing the animacy spec, so it gets overwritten.
        if g1[0] != re.sub("-(an|in)", "", g2[0]):
            return None
        pagemsg("Existing gender %s missing animacy spec compared with proposed %s, allowed" % (
            ",".join(g1), ",".join(g2)))
        return True

    headwords = blib.fetch_param_chain(headword_template, "1", "head", subpagetitle)
    translits = blib.fetch_param_chain(headword_template, "tr", "tr")
    # Attach each manual translit to the corresponding headword as HEAD//TR.
    for idx, translit in enumerate(translits):
        if idx >= len(headwords):
            pagemsg("WARNING: Not enough headwords for translit tr%s=%s, skipping" % (
                "" if idx == 0 else str(idx + 1), translit))
            return None
        headwords[idx] += "//" + translit
    genitives = blib.fetch_param_chain(headword_template, "3", "gen")
    plurals = blib.fetch_param_chain(headword_template, "4", "pl")
    genders = blib.fetch_param_chain(headword_template, "2", "g")

    # Each entry: (case name, existing forms, key into ARGS holding the
    # proposed forms, whether the existing forms are the lemma).
    number = args["n"]
    if number == "s":
        comparisons = [
            ("nom_sg", headwords, "nom_sg_linked", True),
            ("gen_sg", genitives, "gen_sg", False),
        ]
    elif number == "p":
        comparisons = [
            ("nom_pl", headwords, "nom_pl_linked", True),
            ("gen_pl", genitives, "gen_pl", False),
        ]
    elif number == "b":
        comparisons = [
            ("nom_sg", headwords, "nom_sg_linked", True),
            ("gen_sg", genitives, "gen_sg", False),
            ("nom_pl", plurals, "nom_pl", False),
        ]
    else:
        pagemsg("WARNING: Unrecognized number spec %s, skipping" % args["n"])
        return None

    # all() over a generator stops at the first failing comparison, so later
    # cases are not compared (or logged) once one has failed.
    if not all(compare_forms(case_name, existing_forms, args[proposed_key], is_lemma)
               for case_name, existing_forms, proposed_key, is_lemma in comparisons):
        pagemsg("Existing and proposed forms not same, skipping")
        return None
    cases_to_check = [entry[0] for entry in comparisons]

    for case in cases_to_check:
        raw_value = args[case + "_raw"]
        stripped_raw = re.sub(u"△", "", blib.remove_links(raw_value))
        if args[case] != stripped_raw:
            pagemsg("WARNING: Raw case %s=%s contains footnote symbol" % (
                case, raw_value))

    proposed_genders = re.split(",", args["g"])
    if compare_genders(genders, proposed_genders):
        return []

    # Check for animacy mismatch, punt if so
    cur_in = [x for x in genders if re.search(r"\bin\b", x)]
    cur_an = [x for x in genders if re.search(r"\ban\b", x)]
    proposed_in = [x for x in proposed_genders if re.search(r"\bin\b", x)]
    proposed_an = [x for x in proposed_genders if re.search(r"\ban\b", x)]
    animacy_mismatch = ((cur_in or not cur_an) and proposed_an or
                        (cur_an or not cur_in) and proposed_in)
    if animacy_mismatch:
        pagemsg("WARNING: Animacy mismatch, skipping: cur=%s proposed=%s" % (
            ",".join(genders), ",".join(proposed_genders)))
        return None

    # Check for number mismatch, punt if so: existing plural gender must go
    # together with n=p, and vice versa.
    cur_pl = [x for x in genders if re.search(r"\bp\b", x)]
    if bool(cur_pl) != (args["n"] == "p"):
        pagemsg("WARNING: Number mismatch, skipping: cur=%s, proposed=%s, n=%s" % (
            ",".join(genders), ",".join(proposed_genders), args["n"]))
        return None

    pagemsg("WARNING: Gender mismatch, existing=%s, new=%s" % (
        ",".join(genders), ",".join(proposed_genders)))
    return genders
lemma = re.sub(u"([кгхшжчщ])ый$", r"\1ий", lemma) pagemsg("WARNING: Inferring adjectival lemma from inflection, please check: lemma=%s, infl=%s" % (lemma, infl)) break else: pagemsg("WARNING: Assuming word is inflected adj or noun, please check: lemma=%s, infl=%s" % (lemma, infl)) else: infl = word lemma = ru.remove_accents(infl) saw_unlinked_word = True lemmas_infls.append((lemma, infl)) if see_template: pagemsg("Found decl-see template: %s" % unicode(see_template)) inflected_words = set(ru.remove_accents(blib.remove_links(unicode(x.value))) for x in see_template.params) if saw_unlinked_word: pagemsg("WARNING: Unlinked word(s) in headword, found decl-see template, proceeding, please check: %s" % headword) else: # Try to figure out which words are inflected and which words aren't pagemsg("No ru-decl-noun-see template, inferring which headword words are inflected") if saw_unlinked_word: pagemsg("WARNING: Unlinked word(s) in headword, no decl-see template, skipping: %s" % headword) return inflected_words = set() saw_noun = False reached_uninflected = False wordind = 0 for word, lemmainfl in zip(headwords, lemmas_infls): wordind += 1
def check_bad_head(text, arg):
    """Warn if the head TEXT (given as param ARG) doesn't match the page title.

    Links are removed from TEXT, and both TEXT and the page title have the
    apostrophe-like characters in the character class below replaced by a
    plain ASCII apostrophe before comparison. `pagetitle` and `pagemsg` come
    from the enclosing scope.
    """
    def canonicalize_apostrophes(value):
        return re.sub(u"[׳’]", "'", value)

    canontext = canonicalize_apostrophes(blib.remove_links(text))
    canonpagetitle = canonicalize_apostrophes(pagetitle)
    if canontext == canonpagetitle:
        return
    pagemsg("WARNING: Canonicalized %s=%s not same as canonicalized page title %s (orig %s=%s)" %
        (arg, canontext, canonpagetitle, arg, text))
def infer_decl(t, pagemsg):
    """Infer a short-accent declension spec from explicit short forms in T.

    T is a parsed adjective declension template whose params "1" and "2" hold
    the stem and declension type and which may carry explicit short forms
    (param names taken from `short_adj_cases_params`). PAGEMSG is the logging
    callback.

    Return a list of new template arguments [STEM, DECLSPEC] (optionally
    followed by a "short_m=..." argument) for which trymatch() succeeds, or
    None if the template should be skipped or nothing matching could be
    inferred (a message is logged in each case).
    """
    if verbose:
        pagemsg("Processing %s" % unicode(t))

    forms = {}

    # Initialize all cases to blank in case we don't set them again later
    for case, numparam in short_adj_cases_params:
        form = getparam(t, case) or getparam(t, numparam)
        form = form.strip()
        form = blib.remove_links(form)
        forms[case] = form

    def get_form(case):
        # A value of "-" is treated the same as a missing form.
        if forms[case] == "-":
            return ""
        return forms[case]

    m = get_form("short_m")
    f = get_form("short_f")
    n = get_form("short_n")
    p = get_form("short_p")

    # Candidate "special" annotations to try in order; any entry other than
    # "", "*", "(1)", "(2)" is treated below as an explicit short masc sg.
    specials = ["", m]
    explicit_msg = None

    stem = getparam(t, "1")
    decl = getparam(t, "2")

    if not m and not f and not n and not p:
        pagemsg("No short forms, skipping")
        return None
    elif not m and f and n and p:
        pagemsg("Missing short masculine but other short forms present, continuing")
    elif m and not f and not n and not p:
        pagemsg("Found only short m")
        stem, decl = combine_stem(stem, decl)
        args = [stem, decl] + ["short_m=%s" % m]
        if trymatch(t, args, pagemsg):
            return args
        else:
            return None
    elif not m or not f or not n or not p:
        pagemsg("WARNING: Some short forms missing, skipping: m=%s, f=%s, n=%s, p=%s" %
            (m or "blank", f or "blank", n or "blank", p or "blank"))
        return None

    if re.search("(^|:)[abc*]", decl):
        # Fixed: the message has a %s placeholder, so DECL must be supplied;
        # previously the literal "%s" was logged.
        pagemsg("WARNING: Decl spec %s already has short accent class but short forms present? Skipping ..." % decl)
        return None
    if not decl:
        newstem, decl = detect_stem(stem, decl)
        if not decl:
            pagemsg("WARNING: Unable to detect stem type for stem=%s" % stem)
            return None
        stem = newstem
    if decl == "short" or decl == "mixed" or decl == u"ьий":
        # NOTE(review): f, n and p are all non-empty by the time we get here
        # (see the checks above), so the second message looks unreachable.
        if f or n or p:
            pagemsg("WARNING: Short forms found when not allowed: f=%s, n=%s, p=%s" %
                (f or "blank", n or "blank", p or "blank"))
            return None
        pagemsg("Skipping decl type %s, no short forms allowed" % decl)
        return None

    if "," in m:
        pagemsg("WARNING: Multiple masculine forms, something wrong: m=%s" % m)
        return None
    # Whether each of fem/neut/plural has two comma-separated variants.
    f2 = "," in f
    n2 = "," in n
    p2 = "," in p

    def get_stressed_form(form):
        # Of up to two comma-separated variants, return the first one that
        # does not end in AC (i.e. is not ending-stressed); return None if
        # there are more than two variants.
        if "," not in form:
            return form
        forms = re.split(r"\s*,\s*", form)
        if len(forms) > 2:
            pagemsg("WARNING: More than two forms in %s" % form)
            return None
        for frm in forms:
            if not re.search(AC + "$", frm):
                return frm
        pagemsg("WARNING: Multiple forms but none stem-stressed: %s" % form)
        return forms[0]

    sf = get_stressed_form(f)
    sn = get_stressed_form(n)
    sp = get_stressed_form(p)
    # Fixed: get_stressed_form() returns None (after logging a warning) when
    # there are more than two variants; skip instead of crashing in
    # re.search() below.
    if sf is None or sn is None or sp is None:
        return None
    fend = re.search(AC + "$", f)
    nend = re.search(AC + "$", n)
    pend = re.search(AC + "$", p)

    # Strip the endings off the fem/neut/plural forms to get their stems.
    mm = re.search(u"^(.*)[ая]́?$", sf)
    if not mm:
        pagemsg("WARNING: Unable to recognize feminine ending: %s" % sf)
        return None
    fstem = mm.group(1)
    mm = re.search(u"^(.*)[оеё]́?$", sn)
    if not mm:
        pagemsg("WARNING: Unable to recognize neuter ending: %s" % sn)
        return None
    nstem = mm.group(1)
    mm = re.search(u"^(.*)[ыи]́?$", sp)
    if not mm:
        pagemsg("WARNING: Unable to recognize plural ending: %s" % sp)
        return None
    pstem = mm.group(1)
    # This pattern matches any string, including the empty one.
    mm = re.search(u"^(.*?)[ъьй]?$", m)
    assert mm
    mstem = mm.group(1)

    # Pick the short stem: the first stressed stem among fem/neut/plural,
    # else the masculine stem if it agrees with the feminine one modulo
    # stress, else the long stem.
    short_stem = stem
    if is_stressed(fstem):
        short_stem = fstem
    elif is_stressed(nstem):
        short_stem = nstem
    elif is_stressed(pstem):
        short_stem = pstem
    else:
        if make_unstressed_once(fstem) == make_unstressed_once(mstem):
            short_stem = mstem
    if is_unstressed(stem):
        stem = make_ending_stressed(stem)
    short_stem = try_to_stress(short_stem)
    if stem == short_stem:
        short_stem = ""
    elif short_stem + u"н" == stem and re.search(u"нн[иы]й$", stem + decl):
        pagemsg("Found special (2): short stem %s, long stem %s" %
            (short_stem, stem))
        specials = ["(2)"]
        short_stem = ""
    else:
        pagemsg("WARNING: Found short stem %s different from long stem %s" %
            (short_stem, stem))
    real_short_stem = short_stem or stem

    if specials != ["(2)"] and mstem != real_short_stem:
        if mstem + u"н" == real_short_stem and re.search(u"нн$", real_short_stem):
            pagemsg("Found special (1): short stem %s, masculine stem %s" % (
                real_short_stem, mstem))
            specials = ["(1)"]
        elif make_unstressed_once(stem) == mstem:
            # Can happen with monosyllabic masculines
            pass
        elif not m:
            pagemsg("Missing short masculine singular")
            if real_short_stem.endswith(u"нн"):
                specials = ["(1)"]
            explicit_msg = "-"
        else:
            pagemsg("Masculine short stem %s differs from short stem %s, presumed reducible" %
                (mstem, real_short_stem))
            if "(1)" in specials or "(2)" in specials:
                pagemsg("WARNING: Can't have reducible and special together")
                return None
            specials = ["*", m]

    # Classify each of fem/neut/plural as stem-stressed, ending-stressed or
    # both, then map the combination onto an accent pattern.
    ff = f2 and "both" or fend and "end" or "stem"
    nn = n2 and "both" or nend and "end" or "stem"
    pp = p2 and "both" or pend and "end" or "stem"

    def match(fval, nval, pval):
        return ff == fval and nn == nval and pp == pval

    stress = (match("stem", "stem", "stem") and "a" or
        match("both", "stem", "stem") and "a'" or
        match("end", "end", "end") and "b" or
        match("end", "end", "both") and "b'" or
        match("end", "stem", "stem") and "c" or
        match("end", "stem", "both") and "c'" or
        match("end", "both", "both") and "c''" or
        None)

    if "*" in specials and not is_monosyllabic(m) and (
            (stress in ["b", "b'"]) != (not not is_ending_stressed(m))):
        pagemsg("WARNING: (De)reducible short masc sg %s has wrong stress for accent pattern %s, setting manual masc sg" %
            (m, stress))
        explicit_msg = m

    if not stress:
        pagemsg("WARNING: Unrecognized stress: m=%s f=%s n=%s p=%s" % (
            m, f, n, p))
        return None

    stem, decl = combine_stem(stem, decl)
    # Try each candidate special in turn and return the first set of
    # arguments that trymatch() accepts.
    for special in specials:
        if special not in ["", "*", "(1)", "(2)"]:
            # SPECIAL is an actual masculine form; pass it as short_m=
            # rather than as part of the decl spec.
            if explicit_msg:
                if special == explicit_msg:
                    pass
                else:
                    pagemsg("WARNING: Something wrong; trying to set explicit short masc sg %s when there's an existing setting %s" % (
                        special, explicit_msg))
            else:
                explicit_msg = special
            special = ""
        special = stress + special
        declspec = special + (short_stem and (":" + short_stem) or "")
        if decl:
            declspec = decl + ":" + declspec
        args = [stem, declspec]
        if explicit_msg:
            args.append("short_m=" + explicit_msg)
        if trymatch(t, args, pagemsg):
            return args

    pagemsg("WARNING: Unable to infer short accent")
    return None
def process_page_section(index, page, section, verbose):
    """Synchronize the ru-noun+/ru-proper noun+ headword and the ru-noun-table
    declension template found in SECTION.

    INDEX and PAGE are used for logging (and PAGE for its title and existence
    check); SECTION is the wikitext to process; VERBOSE enables extra logging
    and is passed to blib.expand_text().

    Return None if the section should be skipped. Otherwise return a 5-tuple
    (new section text, ru_noun_table_cleaned, ru_noun_table_link_copied,
    ru_noun_changed, ru_proper_noun_changed), where the last four are 0/1
    flags recording what kind of change was made.
    """
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
        return None

    parsed = blib.parse_text(section)

    noun_table_templates = []
    noun_old_templates = []

    # Any ru-decl-noun-see template causes the whole section to be skipped.
    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-decl-noun-see":
            pagemsg("Found ru-decl-noun-see, skipping")
            return None

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-noun-table":
            noun_table_templates.append(t)
        if unicode(t.name) == "ru-noun-old":
            noun_old_templates.append(t)

    if len(noun_table_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
        return None
    if len(noun_old_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
        return None
    if len(noun_table_templates) < 1:
        if noun_old_templates:
            pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
                ", ".join(unicode(x) for x in noun_old_templates))
        # No declension table: return the text unchanged with all flags 0.
        return unicode(parsed), 0, 0, 0, 0

    # Sections still using the old-style headword templates are skipped.
    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
            pagemsg("Found ru-noun or ru-proper noun, skipping")
            return None

    headword_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
            headword_templates.append(t)

    if len(headword_templates) > 1:
        pagemsg("WARNING: Found multiple headword templates, skipping")
        return None
    if len(headword_templates) < 1:
        return unicode(parsed), 0, 0, 0, 0

    noun_table_template = noun_table_templates[0]
    noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
    headword_template = headword_templates[0]
    decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

    if verbose:
        pagemsg("Found headword template: %s" % unicode(headword_template))
        pagemsg("Found decl template: %s" % unicode(noun_table_template))
        if noun_old_template:
            pagemsg("Found old decl template: %s" % unicode(noun_old_template))

    # Snapshot the templates before any in-place modification, so changes
    # can be detected at the end.
    orig_headword_template = unicode(headword_template)
    orig_noun_table_template = unicode(noun_table_template)

    # Headword-only params (g*, m*, f*, notrcat) are saved here and left out
    # of the filtered copy used for comparison with the declension.
    genders = blib.fetch_param_chain(headword_template, "g", "g")
    masculines = blib.fetch_param_chain(headword_template, "m", "m")
    feminines = blib.fetch_param_chain(headword_template, "f", "f")
    notrcat = getparam(headword_template, "notrcat")
    filtered_headword_params = []
    for param in headword_template.params:
        name = unicode(param.name)
        if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
            pass
        else:
            filtered_headword_params.append((param.name, param.value))
    # The filtered copy is always named ru-noun+ (even for proper nouns), as
    # is new_headword_template below, so the two can be compared as strings.
    filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
    for name, value in filtered_headword_params:
        filtered_headword_template.add(name, value)

    ru_noun_table_cleaned = 0
    ru_noun_table_link_copied = 0
    ru_noun_changed = 0
    ru_proper_noun_changed = 0

    # Remove any g*, m*, f* params from the declension template (in place).
    new_decl_params = []
    for param in noun_table_template.params:
        name = unicode(param.name)
        if re.search("^[gmf][0-9]*$", name):
            pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
                unicode(noun_table_template))
        else:
            new_decl_params.append((param.name, param.value))
    del noun_table_template.params[:]
    for name, value in new_decl_params:
        noun_table_template.add(name, value)
    if orig_noun_table_template != unicode(noun_table_template):
        ru_noun_table_cleaned = 1

    # Working copy of the declension params, from which the proposed
    # headword template is built; n= may be adjusted on it below.
    modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
    for param in noun_table_template.params:
        modified_noun_table_template.add(param.name, param.value)

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if unicode(headword_template.name) == "ru-proper noun+":
        generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
            unicode(noun_table_template))
        generate_result = expand_text(generate_template)
        if not generate_result:
            pagemsg("WARNING: Error generating noun args, skipping")
            return None
        args = ru.split_generate_args(generate_result)

        # If proper noun and n is both then we need to add n=both because
        # proper noun+ defaults to n=sg
        if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
            pagemsg("Adding n=both to headword template")
            modified_noun_table_template.add("n", "both")
        # Correspondingly, if n is sg then we can usually remove n=sg;
        # but we need to check that the number is actually sg with n=sg
        # removed because of the possibility of plurale tantum lemmas
        if args["n"] == "s":
            generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
            generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
                generate_template_with_ndef)
            generate_result = expand_text(generate_template_with_ndef)
            if not generate_result:
                pagemsg("WARNING: Error generating noun args, skipping")
                return None
            ndef_args = ru.split_generate_args(generate_result)
            if ndef_args["n"] == "s":
                existing_n = getparam(headword_template, "n")
                if existing_n and not re.search(r"^s", existing_n):
                    pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
                        existing_n)
                pagemsg("Removing n=sg from headword template")
                rmparam(modified_noun_table_template, "n")
            else:
                pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
                    ndef_args["n"])

    new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
        unicode(modified_noun_table_template))
    existing_filtered_headword_template = unicode(filtered_headword_template)
    change_existing_headword = False
    if existing_filtered_headword_template != new_headword_template:
        if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
            # Headword has links that the declension lacks: copy headword to
            # declension if that is the only difference, else give up.
            if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
                pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
                del noun_table_template.params[:]
                for param in filtered_headword_template.params:
                    noun_table_template.add(param.name, param.value)
                ru_noun_table_link_copied = 1
                ru_noun_table_cleaned = 0
            else:
                pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
                    % (existing_filtered_headword_template, new_headword_template))
                return None
        else:
            pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
                % (existing_filtered_headword_template, new_headword_template))
            change_existing_headword = True

    # `lemmas` is not defined in this function; when it is non-empty, only
    # pages whose title is in it have their headword overwritten.
    if change_existing_headword and (not lemmas or pagetitle in lemmas):
        del headword_template.params[:]
        for param in modified_noun_table_template.params:
            headword_template.add(param.name, param.value)
        # Restore the headword-only params saved above.
        blib.set_param_chain(headword_template, genders, "g", "g")
        blib.set_param_chain(headword_template, masculines, "m", "m")
        blib.set_param_chain(headword_template, feminines, "f", "f")
        if notrcat:
            headword_template.add("notrcat", notrcat)

    #genders = runoun.check_old_noun_headword_forms(headword_template, args,
    #    subpagetitle, pagemsg)
    #if genders == None:
    #  return None

    #new_params = []
    #for param in noun_table_template.params:
    #  new_params.append((param.name, param.value))

    #params_to_preserve = runoun.fix_old_headword_params(headword_template,
    #    new_params, genders, pagemsg)
    #if params_to_preserve == None:
    #  return None

    # Log what changed relative to the snapshots and set the result flags.
    new_noun_table_template = unicode(noun_table_template)
    if new_noun_table_template != orig_noun_table_template:
        pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
            new_noun_table_template))

    new_headword_template = unicode(headword_template)
    if new_headword_template != orig_headword_template:
        pagemsg("Replacing headword %s with %s" % (orig_headword_template,
            new_headword_template))
        if unicode(headword_template.name) == "ru-noun+":
            ru_noun_changed = 1
        else:
            ru_proper_noun_changed = 1

    return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
def create_declension(page, index, save, pos, tempname, decltempname, sgnum, removeparams, is_proper=False): pagename = page.title() comments = [] def pgmsg(text): msg("Page %s %s: %s" % (index, pagename, text)) # Starts with definite article al- def starts_with_al(text): return re.match(ALIF_ANY + A + "?" + L, text) def sub_if(fr, to, text): if re.search(fr, text): return re.sub(fr, to, text) else: return "" # Remove definite article al- from text def remove_al(text): return (sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?(.)" + SH, r"\1", text) or sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?", "", text) or text) # Remove definite article al- from transliterated text def remove_al_tr(text): return (sub_if(ur"^a?([sšṣtṯṭdḏḍzžẓnrḷ])-\1", r"\1", text) or sub_if("^a?l-", "", text) or text) # Split off interwiki links at end m = re.match(r"^(.*?\n+)((\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$", page.text, re.S) if m: pagebody = m.group(1) pagetail = m.group(2) else: pagebody = page.text pagetail = "" # Split top-level sections (by language) splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M) # Extract off head and recombine section headers with following text pagehead = splitsections[0] sections = [] for i in xrange(1, len(splitsections)): if (i % 2) == 1: sections.append("") sections[-1] += splitsections[i] # Look for Arabic section for seci in xrange(len(sections)): m = re.match("^==([^=\n]+)==$", sections[seci], re.M) if not m: pgmsg("Can't find language name in text: [[%s]]" % (sections[seci])) elif m.group(1) == "Arabic": # Extract off trailing separator mm = re.match(r"^(.*?\n+)(--+\n*)$", sections[seci], re.S) if mm: secbody = mm.group(1) sectail = mm.group(2) else: secbody = sections[seci] sectail = "" # Split into subsections based on headers subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M) # Go through each subsection for j in xrange(len(subsections)): notes = [] def add_note(note): if note not in notes: notes.append(note) # Look for 
subsections matching the given POS if j > 0 and (j % 2) == 0 and re.match("^===+%s===+\n" % pos, subsections[j - 1]): # Call reorder_shadda here so the templates we work with have # shadda in correct order but we don't mess with other text to # avoid unnecessary saving parsed = blib.parse_text(reorder_shadda(subsections[j])) def pagemsg(text): pgmsg("%s: [[%s]]" % (text, subsections[j])) # Check for various conditions causing us to skip this entry and # not try to add a declension table # Skip declension if certain templates found in definition. # We don't check for {{alternative form of|...}}, because it's # used for e.g. different ways of spelling "camera" in Arabic, # some with -ā and some with -a, so we still want to create # declensions for those. altspelling_templates = [temp for temp in parsed.filter_templates() if temp.name in ["alternative spelling of"]] if len(altspelling_templates) > 0: pagemsg("Alternative spelling redirect found in text, skipping") continue if pos == "Adjective": feminine_of_templates = [temp for temp in parsed.filter_templates() if temp.name in ["feminine of"]] if len(feminine_of_templates) > 0: pagemsg("feminine-of template found for adjective, skipping") continue # Retrieve headword_template, make sure exactly one and it is the right type headword_templates = [temp for temp in parsed.filter_templates() if temp.name in ["ar-noun", "ar-proper noun", "ar-coll-noun", "ar-sing-noun", "ar-noun-pl", "ar-noun-dual", "ar-adj-fem", "ar-adj-pl", "ar-noun-inf-cons", "ar-adj-inf-def", "ar-adj-dual", "ar-adj", "ar-nisba", "ar-noun-nisba", "ar-adj-sound", "ar-adj-in", "ar-adj-an"]] if len(headword_templates) == 0: pagemsg("WARNING: Can't find headword template in text, skipping") continue if len(headword_templates) > 1: pagemsg("WARNING: Found multiple headword templates in text, skipping") continue headword_template = headword_templates[0] if headword_template.name != tempname: pagemsg("Headword template should be '%s' but is '%s', skipping" % 
(tempname, headword_template.name)) continue def getp(param): return getparam(headword_template, param) # NOTE: We physically add and remove parameters from the headword # template to get the list of parameters to use in creating the # declension template. These changes don't get propagated to the # headword template because we don't convert the parsed text back # to a string. def putp(param, value): addparam(headword_template, param, value) head = getp("1") orighead = head # Check for declension already present if (j + 1 < len(subsections) and re.match("^===+Declension===+\n", subsections[j + 1]) or j + 3 < len(subsections) and re.match("^===+Usage", subsections[j + 1]) and re.match("^===+Declension===+\n", subsections[j + 3]) ): pagemsg("Declension already found for head %s, skipping" % head) continue # Check for cpl # FIXME: Convert cpl into pl and fpl if getp("cpl"): pagemsg("WARNING: Headword template for head %s has cpl param in it, skipping" % (head)) continue # Check for empty head. If w/o explicit translit, skip; else, # fetch head from page title. 
if not head: if not getp("tr"): pagemsg("WARNING: Headword template head is empty and without explicit translit, skipping") continue else: pagemsg("Headword template head is empty but has explicit translit") add_note("empty head, using page name") head = pagename putp("1", head) # Try to handle cases with a modifier; we can't handle all of them yet headspace = False if ' ' in head: headspace = True words = re.split(r"\s", remove_links(head)) head = words[0] if len(words) > 2: pagemsg("WARNING: Headword template head %s has two or more spaces in it, skipping" % orighead) continue assert(len(words) == 2) # Check for params we don't yet know how to handle must_continue = False for badparam in ["pl2", "pltr", "head2", "sing", "coll"]: if getp(badparam): # FIXME pagemsg("WARNING: Headword template head %s has space in it and param %s, skipping" % (orighead, badparam)) must_continue = True break if must_continue: continue # Now check for various types of construction, all either # construct (ʾidāfa) or adjectival def remove_nom_gen_i3rab(word, nomgen, undia, undiatext, udia, udiatext): if word.endswith(undia): pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, undiatext, word)) add_note("removing %s i3rab (%s)" % (nomgen, undiatext)) return re.sub(undia + "$", "", word) if word.endswith(udia): pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, udiatext, word)) add_note("removing %s i3rab (%s)" % (nomgen, udiatext)) return re.sub(udia + "$", "", word) if re.search(DIACRITIC_ANY_BUT_SH + "$", word): pagemsg("WARNING: Strange diacritic at end of %s %s" % (nomgen, word)) if word[0] == ALIF_WASLA: pagemsg("Changing %s alif wasla to plain alif for %s" % (nomgen, word)) add_note("changing %s alif wasla to plain alif" % (nomgen)) word = ALIF + word[1:] return word def remove_gen_i3rab(word): return remove_nom_gen_i3rab(word, "genitive", IN, "IN", I, "I") def remove_nom_i3rab(word): return remove_nom_gen_i3rab(word, "nominative", UN, "UN", U, "U") def remove_gen_i3rab_tr(word): 
return remove_nom_gen_i3rab(word, "genitive", "in", "in", "i", "i") def remove_nom_i3rab_tr(word): return remove_nom_gen_i3rab(word, "nominative", "un", "un", "u", "u") idafa = False word0al = starts_with_al(words[0]) word1al = starts_with_al(words[1]) words[0] = remove_al(words[0]) words[1] = remove_al(words[1]) putp("1", words[0]) putp("mod", words[1]) if word0al and word1al: pagemsg("Headword template head %s has space in it and found definite adjective construction" % (orighead)) add_note("modifier definite adjective construction") putp("state", "def") elif word0al and not word1al: pagemsg("WARNING: Headword template head %s has space in it and found al-X + Y construction, can't handle, skipping" % (orighead)) continue elif is_proper: if words[0].endswith(ALIF) and word1al: pagemsg("Proper noun headword template head %s has space in it and found ind-def with definite adjectival modifier" % (orighead)) add_note("modifier proper noun + definite adjective construction") putp("state", "ind-def") elif remove_diacritics(words[0]) == u"جمهورية": if word1al: pagemsg("Proper noun headword template head %s has space in it and found definite idafa" % (orighead)) add_note("modifier definite idafa construction") idafa = True assert sgnum == "sg" idafaval = "def" putp("idafa", idafaval) elif words[1].endswith(ALIF): pagemsg("Proper noun headword template head %s has space in it and found idafa with ind-def modifier" % (orighead)) add_note("modifier proper-noun ind-def idafa construction") assert sgnum == "sg" idafaval = "ind-def" putp("idafa", idafaval) else: pagemsg("WARNING: Proper noun headword template head %s has space in it and found idafa construction we can't handle, skipping" % (orighead)) continue else: pagemsg("WARNING: Proper noun headword template head %s has space in it and can't determine whether idafa, skipping" % (orighead)) continue elif not word0al and word1al: # Found an ʾidāfa construction pagemsg("Headword template head %s has space in it and found 
definite idafa" % (orighead)) add_note("modifier definite idafa construction") idafa = True idafaval = "def-" + sgnum if idafaval == "def-sg": idafaval = "def" putp("idafa", idafaval) elif words[1].endswith(I + Y): pagemsg("WARNING: Headword template head %s has space in it and appears to end in badly formatted nisba, FIXME, skipping" % (orighead)) continue elif words[1].endswith(I + Y + SH): pagemsg("Headword template head %s has space in it and found indefinite adjective nisba construction" % (orighead)) add_note("modifier indefinite nisba adjective construction") elif pagename in adjectival_phrases: pagemsg("Headword template head %s has space in it, indefinite, and manually specified to be adjectival" % (orighead)) add_note("modifier indefinite adjective construction") else: pagemsg("Headword template head %s has space in it, indefinite, and not specified to be adjectival, assuming idafa" % (orighead)) add_note("modifier indefinite idafa construction") idafa = True putp("idafa", sgnum) # Now remove any i3rab diacritics putp("1", remove_nom_i3rab(getp("1"))) if idafa: putp("mod", remove_gen_i3rab(getp("mod"))) else: putp("mod", remove_nom_i3rab(getp("mod"))) # Now check if the lemma is plural if re.match(r"\bp\b", getp("2")): pagemsg("Headword template head %s has space in it and is plural" % (orighead)) add_note("plural lemma") if getp("tr"): # FIXME (doesn't occur though) pagemsg("WARNING: Headword template head %s has space in it and manual translit and is plural, skipping" % (orighead)) continue putp("pl", getp("1")) putp("1", "-") if not idafa: putp("modpl", getp("mod")) putp("mod", "-") # Now check if lemma has plural specified elif getp("pl"): pls = re.split(r"\s", remove_links(getp("pl"))) assert(len(pls) == 2) pls[0] = remove_al(pls[0]) pls[1] = remove_al(pls[1]) putp("pl", remove_nom_i3rab(pls[0])) if not idafa: putp("modpl", remove_nom_i3rab(pls[1])) else: if pls[1] != getp("mod"): pagemsg("FIXME: Headword template head %s, plural modifier %s not same 
as singular modifier %s in idafa construction" % (orighead, pls[1], getp("mod"))) # Now check if there's manual translit. We need to split the # manual translit in two and pair up manual translit with # corresponding Arabic words. But first remove -t indicating # construct state, and check to see if manual translit is # same as auto translit, in which case it's unnecessary. if getp("tr"): pagemsg("Headword template head %s has space in it and manual translit" % (orighead)) trwords = re.split(r"\s", getp("tr")) assert(len(trwords) == 2) trwords[0] = remove_nom_i3rab_tr(remove_al_tr(trwords[0])) if idafa: trwords[1] = remove_gen_i3rab_tr(remove_al_tr(trwords[1])) else: trwords[1] = remove_nom_i3rab_tr(remove_al_tr(trwords[1])) # Remove any extraneous -t from translit, either from construct # state of from removal of i3rab in a feminine noun/adj. for i in [0, 1]: if words[i].endswith(TAM) and trwords[i].endswith("t"): trwords[i] = trwords[i][0:-1] if words[i].endswith(ALIF + TAM) and not trwords[i].endswith("h"): trwords[i] += "h" if ar_translit.tr(words[0]) != trwords[0]: pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[0], words[0])) add_note("modified head w/manual translit") putp("1", "%s/%s" % (getp("1"), trwords[0])) else: pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[0], words[0])) add_note("modified head w/ignored manual translit") if ar_translit.tr(words[1]) != trwords[1]: pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[1], words[1])) add_note("modifier w/manual translit") putp("mod", "%s/%s" % (getp("mod"), trwords[1])) else: pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[1], words[1])) add_note("modifier 
w/ignored manual translit") else: # no space in head, not dealing with a modifier # If has link in it, just remove it if '[' in head or ']' in head or '|' in head: pagemsg("Headword template head %s has link in it" % (head)) add_note("removed links from head") head = remove_links(head) putp("1", head) # If starts with definite article, remove article from everything, # including transliterations, and set state=def if starts_with_al(head): pagemsg("Headword template head %s starts with definite article" % (head)) add_note("definite lemma") head = remove_al(head) putp("1", head) putp("state", "def") # Also remove al- from remaining head and pl params def check_for_al(param): param = remove_links(param) value = getparam(headword_template, param) if value: if '[' in value or ']' in value or '|' in value: pagemsg("Param %s value %s has link in it" % (param, value)) add_note("removed links from %s" % param) value = remove_links(value) putp(param, remove_al(value)) params_to_check = ["pl", "sing", "coll", "pauc", "f", "fpl"] for param in params_to_check: check_for_al(param) for i in xrange(2, 10): check_for_al("head%s" % i) for param in params_to_check: check_for_al("%s%s" % (param, i)) # Also remove al- from transliteration def check_for_al_tr(param): value = getparam(headword_template, param) if value: putp(param, remove_al_tr(value)) check_for_al("tr") for param in params_to_check: check_for_al("%str" % param) for i in xrange(2, 10): check_for_al("tr%s" % i) for param in params_to_check: check_for_al("%s%str" % (param, i)) elif is_proper: if head.endswith(ALIF): pagemsg(u"Headword template head %s ends in -ā" % (head)) putp("state", "ind-def") else: pagemsg(u"WARNING: Headword template head %s is indefinite proper noun, not ending in -ā, skipping" % (head)) continue if head.endswith(UN): pagemsg("Headword template head %s ends with explicit i3rab (UN)" % (head)) add_note("head has explicit i3rab (UN)") # We don't continue here because we handle this case below elif 
head.endswith(U):
                pagemsg("Headword template head %s ends with explicit i3rab (U)" % (head))
                add_note("head has explicit i3rab (U)")
                # We don't continue here because we don't need to handle this case

            # NOTE(review): this copy of the file has lost its indentation.
            # The nesting in this span is reconstructed from the control flow;
            # the absolute depth and the dedent points near the end are a
            # guess -- confirm against the real layout.

            # Now check if the lemma is plural
            # NOTE(review): re.match() only matches at the start of the
            # string, so despite the \b...\b pattern this fires only when
            # param 2 begins with the word "p"; a value such as "m-p" would
            # not match. Confirm whether re.search() was intended.
            if re.match(r"\bp\b", getp("2")):
                pagemsg("Headword template head %s is plural" % (head))
                add_note("plural lemma")
                if getp("tr"):
                    # FIXME (doesn't occur though)
                    pagemsg("WARNING: Headword template head %s has manual translit and is plural, skipping" % (head))
                    continue
                # Move the lemma into pl= and mark the singular as "-".
                putp("pl", getp("1"))
                putp("1", "-")

        # Now fetch the parameters from the headword template, removing
        # any that we want to remove, removing the i3rab -UN ending, and
        # adding any specified manual translit as a / annotation.
        def param_should_be_removed(param):
            """Return True if `param` must not be copied to the declension template.

            A param is dropped when it is sc=Arab, when its name ends in "tr",
            when its name equals an entry of `removeparams`, or when an
            all-lowercase-letters entry of `removeparams` matches the name
            followed by optional digits (e.g. entry "pl" also drops "pl2").
            """
            name = unicode(param.name)
            if name == "sc" and unicode(param.value) == "Arab":
                return True
            if name.endswith("tr"):
                return True
            for remove in removeparams:
                if name == remove:
                    return True
                # Only purely alphabetic entries are interpolated into the
                # pattern, so no regex escaping is needed here.
                if re.match("^[a-z]+$", remove) and re.match("^%s([0-9]+)?$" % remove, name):
                    return True
            return False

        def remove_i3rab(param):
            """Return the text of `param` with a trailing UN removed.

            When the text ends in UN, the removal is logged through pgmsg()
            and noted through add_note(). Otherwise the text is returned
            unchanged.
            """
            text = unicode(param)
            if text.endswith(UN):
                pgmsg("Removing i3rab from %s: %s" % (text, unicode(headword_template)))
                add_note("removing i3rab")
            return re.sub(UN + "$", "", text)

        def trparam(name):
            """Return the name of the translit param paired with param `name`.

            "1" maps to "tr"; a name starting with "head" has "head" replaced
            by "tr" (e.g. "head2" -> "tr2"); any other name gets "tr"
            appended (e.g. "pl" -> "pltr").
            """
            if name == "1":
                return "tr"
            elif name.startswith("head"):
                return name.replace("head", "tr")
            else:
                return name + "tr"

        def process_param(param):
            """Return the declension-template text for one headword param.

            Starts from remove_i3rab(param), rewrites a trailing "=+" to
            "=sp" and a trailing "=-" to "=?", and, when `headspace` is
            false, appends "/<translit>" if the matching translit param
            (see trparam) is set on `headword_template`.
            """
            arabic = remove_i3rab(param)
            # Value of + is used in ar-nisba, ar-noun-nisba, ar-adj-in
            # to signal the strong plural.
            if arabic.endswith("=+"):
                newarabic = re.sub(r"=\+$", "=sp", arabic)
                pgmsg("Converting %s to %s: %s" % (arabic, newarabic, unicode(headword_template)))
                arabic = newarabic
            # Value of - is used in ar-adj-in to signal an unknown
            # feminine plural.
            if arabic.endswith("=-"):
                newarabic = re.sub(r"=-$", "=?", arabic)
                pgmsg("Converting %s to %s: %s" % (arabic, newarabic, unicode(headword_template)))
                arabic = newarabic
            # Don't process translit in modifier constructions, where the
            # translit is also processed.
            if not headspace:
                tr = getparam(headword_template, trparam(unicode(param.name)))
                if tr:
                    return arabic + "/" + tr
            return arabic

        # Join every surviving param, after processing, into the argument
        # string of the declension template.
        params = '|'.join([process_param(param) for param in headword_template.params if not param_should_be_removed(param)])
        # For templates that automatically supply the masculine plural,
        # supply it here, too if not overridden.
        if tempname in ["ar-nisba", "ar-noun-nisba", "ar-adj-sound", "ar-adj-an"] and not getp("pl"):
            params += '|pl=sp'

        # Separate off any [[Category: Foo]] declarators, insert before them
        m = re.match(r"^(.*?\n+)((\[\[[A-Za-z0-9_\-]+:[^\]]+\]\]\n*)*)$", subsections[j], re.S)
        if m:
            body = m.group(1)
            tail = m.group(2)
        else:
            body = subsections[j]
            tail = ""
        # Make sure there are two trailing newlines
        if body.endswith("\n\n"):
            pass
        elif body.endswith("\n"):
            body += "\n"
        else:
            body += "\n\n"
        # Reuse the preceding subsection text, with `pos` replaced by
        # "=Declension=", as the new header, followed by the declension
        # template call.
        body += (subsections[j - 1].replace(pos, "=Declension=") + "{{%s|%s}}\n\n" % (decltempname, params))
        subsections[j] = body + tail
        # Build the per-template edit summary; fall back to pagename/translit
        # when the link-stripped original head is empty.
        comment = "added declension for %s %s" % (tempname, remove_links(orighead) or "%s/%s" % (pagename, getp("tr")))
        note = ', '.join(notes)
        if note:
            comment = "%s (%s)" % (comment, note)
        comments.append(comment)
    # NOTE(review): dedent points from here on are inferred (per-section,
    # then per-page) -- confirm against the real layout.
    sections[seci] = ''.join(subsections) + sectail
newtext = pagehead + ''.join(sections) + pagetail
comment = '; '.join(comments)
# Invariant: there is an edit summary exactly when the page text changed.
assert((not comment) == (newtext == page.text))
if newtext != page.text:
    if verbose:
        msg("Replacing [[%s]] with [[%s]]" % (page.text, newtext))
    page.text = newtext
    msg("For page %s, comment = %s" % (pagename, comment))
    if save:
        page.save(comment = comment)