def process_page(page, index, parsed):
    """Rewrite class 7 ``ru-conj``/``ru-conj-old`` templates on one page.

    For every such template whose param 2 starts with "7", params 1-5 are
    recomputed and re-added in order, followed by the template's remaining
    named params. Numeric params other than 1-5, and the named params
    ``lang``, ``nocat`` and ``tr``, are not put back.

    The incoming ``parsed`` argument is ignored: the page is parsed again
    with ``blib.parse``.

    Returns a ``(new_page_text, notes)`` tuple, with one note per template
    whose text actually changed.

    Raises AssertionError if a template has a param 6, or if param 5 is
    already set where this script wants to set it.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    parsed = blib.parse(page)
    notes = []
    conj_templates = ["ru-conj", "ru-conj-old"]
    dropped_names = ["lang", "nocat", "tr"]

    for t in parsed.filter_templates():
        before = unicode(t)
        if unicode(t.name) in conj_templates:
            arg1 = getparam(t, "1")
            arg2 = getparam(t, "2")
            if not arg2.startswith("7"):
                continue
            arg3, arg4, arg5 = [getparam(t, num) for num in ("3", "4", "5")]
            assert not getparam(t, "6")

            if arg2.startswith("7b"):
                has_yo = u"ё" in arg4 or u"ѣ̈" in arg4
                if re.search(u"[еѣ]сти́(сь)?$", arg3) and not has_yo:
                    assert not arg5
                    arg5 = u"ёе"
                arg4 = rulib.make_unstressed_ru(arg4)
            if re.search(u"(л[еѣ]́?зть|с[еѣ]́?сть|обокра́сть)(ся)?$", arg3):
                arg5 = ""

            # Remember the named params worth keeping before wiping the
            # template.
            kept_named = []
            for param in t.params:
                key = unicode(param.name)
                if re.search(r"^[0-9]+$", key) or key in dropped_names:
                    continue
                kept_named.append((key, param.value))

            # Wipe everything, then rebuild: numbered params first, named
            # params after them.
            del t.params[:]
            for position, value in (("1", arg1), ("2", arg2),
                                    ("3", arg3), ("4", arg4)):
                t.add(position, value)
            if arg5:
                t.add("5", arg5)
            for key, value in kept_named:
                t.add(key, value)

        after = unicode(t)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))
            notes.append(
                "rewrite class 7 verb to correspond to module changes")

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Rewrite class 8b ``ru-conj``/``ru-conj-old`` templates on one page.

    For every such template whose param 2 starts with "8b", param 4 is
    passed through ``rulib.make_unstressed_ru`` and the template is rebuilt
    with params 1-5 first, followed by its remaining named params. Numeric
    params other than 1-5, and the named params ``lang``, ``nocat`` and
    ``tr``, are not put back. Warnings are emitted via ``errmsg`` when the
    template carries ``past_m`` or a past adverbial participle override.

    The incoming ``parsed`` argument is ignored: the page is parsed again
    with ``blib.parse``.

    Returns a ``(new_page_text, notes)`` tuple, with one note per template
    whose text actually changed.

    Raises AssertionError if a template has a param 6.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    parsed = blib.parse(page)
    notes = []
    conj_templates = ["ru-conj", "ru-conj-old"]
    dropped_names = ["lang", "nocat", "tr"]

    for t in parsed.filter_templates():
        before = unicode(t)
        if unicode(t.name) in conj_templates:
            arg1 = getparam(t, "1")
            arg2 = getparam(t, "2")
            if not arg2.startswith("8b"):
                continue
            arg3, arg4, arg5 = [getparam(t, num) for num in ("3", "4", "5")]
            assert not getparam(t, "6")

            # Overrides are only reported, not modified.
            past_m = getparam(t, "past_m")
            if past_m:
                errmsg("WARNING: Has past_m=%s" % past_m)
            pap = getparam(t, "pap") or getparam(t, "past_adv_part")
            if pap:
                errmsg("WARNING: Has pap=%s" % pap)
            pap2 = getparam(t, "pap2") or getparam(t, "past_adv_part2")
            if pap2:
                errmsg("WARNING: Has pap2=%s" % pap2)

            arg4 = rulib.make_unstressed_ru(arg4)

            # Remember the named params worth keeping before wiping the
            # template.
            kept_named = []
            for param in t.params:
                key = unicode(param.name)
                if re.search(r"^[0-9]+$", key) or key in dropped_names:
                    continue
                kept_named.append((key, param.value))

            # Wipe everything, then rebuild: numbered params first, named
            # params after them.
            del t.params[:]
            for position, value in (("1", arg1), ("2", arg2),
                                    ("3", arg3), ("4", arg4)):
                t.add(position, value)
            if arg5:
                t.add("5", arg5)
            for key, value in kept_named:
                t.add(key, value)

        after = unicode(t)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))
            notes.append(
                "rewrite class 8b verb to correspond to module changes")

    return unicode(parsed), notes
def do_line(direc, aspect, suffixes):
    """Render a comma-separated verb spec as one wikitext bullet line.

    ``direc`` is either "-" (rendered as "* (no equivalent)") or a
    comma-separated list of entries. Empty entries are skipped. For each
    non-empty entry:

    * a trailing "]" is removed and remembered;
    * a trailing "-" marks a prefix: it is unstressed when ``aspect`` is
      "impf" and pasted onto ``suffixes[i]`` (``i`` being the entry's
      position in the list) with ``paste_verb``;
    * leading markers are stripped repeatedly: "+" adds ``|g=<aspect>`` to
      the link, while "(i)", "(n)", "(lc)" and "(d)" add the qualifiers
      iterative, nonstandard, low colloquial and dated;
    * a leading "[" (which requires the trailing "]") wraps the link in
      square brackets.

    Returns the bullet line as a string.
    """
    if direc == "-":
        return "* (no equivalent)"

    note_markers = (
        ("(i)", "iterative"),
        ("(n)", "nonstandard"),
        ("(lc)", "low colloquial"),
        ("(d)", "dated"),
    )

    rendered = []
    for position, entry in enumerate(direc.split(",")):
        if not entry:
            continue

        gender = ""
        labels = []

        endbracket = entry.endswith("]")
        if endbracket:
            entry = entry[:-1]

        if entry.endswith("-"):
            entry = entry[:-1]
            if aspect == "impf":
                entry = rulib.make_unstressed_ru(entry)
            entry = paste_verb(entry, suffixes[position])

        # Peel leading markers off one at a time until none is left.
        while True:
            if entry.startswith("+"):
                gender = "|g=%s" % aspect
                entry = entry[1:]
                continue
            for marker, label in note_markers:
                if entry.startswith(marker):
                    labels.append(label)
                    entry = entry[len(marker):]
                    break
            else:
                break

        qualifier = " {{i|%s}}" % ", ".join(labels) if labels else ""
        if entry.startswith("["):
            entry = entry[1:]
            assert endbracket
            template = "[{{l|ru|%s%s}}]%s"
        else:
            template = "{{l|ru|%s%s}}%s"
        rendered.append(template % (entry, gender, qualifier))

    return "* " + ", ".join(rendered)
def form_ppp(conjtype, pagetitle, args):
    """Predict a verb's past passive participle from its conjugation data.

    Args:
        conjtype: Conjugation type string (e.g. "7b"). Only the leading
            digits are used; a value without leading digits yields None.
        pagetitle: Title of the verb's page. Its ending (ать, ять, еть,
            чь, ть) is inspected and rewritten, so it is presumably the
            unaccented infinitive.
        args: Mapping from form names (``pres_1sg``, ``futr_1sg``,
            ``pres_3sg``, ``futr_3sg``, ``past_m``) to form strings. A
            value may hold several comma-separated alternatives, each
            optionally followed by a "//"-introduced tail; only the first
            alternative, without that tail, is used.

    Returns:
        The predicted participle passed through
        ``rulib.make_unstressed_ru``, or None when no prediction can be
        made (non-numeric type, or the needed form is missing, "-" or a
        periphrastic ``бу́ду ...`` future).

    Raises:
        AssertionError: if the numeric class is not one this function
            knows, or a 1sg/3sg form does not end as expected.
    """
    def first_entry(forms):
        # First comma-separated alternative, minus any "//..." tail.
        forms = re.sub(",.*", "", forms)
        return re.sub("//.*", "", forms)

    def first_present(*keys):
        # Value of the first key that exists in args (even if empty).
        for key in keys:
            if key in args:
                return args[key]
        return None

    def predict():
        leading_digits = re.match("[0-9]+", conjtype)
        if not leading_digits:
            return None
        classnum = int(leading_digits.group(0))

        if pagetitle.endswith((u"ать", u"ять")) and classnum != 14:
            return re.sub(u"ть$", u"нный", pagetitle)
        if pagetitle.endswith(u"еть") and classnum == 1:
            return re.sub(u"ть$", u"нный", pagetitle)

        if classnum in [4, 5]:
            sg1 = first_present("pres_1sg", "futr_1sg")
            if not sg1 or sg1 == "-" or sg1.startswith(u"бу́ду "):
                return None
            sg1 = first_entry(sg1)
            assert re.search(u"[ую]́?$", sg1)
            return re.sub(u"[ую]́?$", u"енный", sg1)

        if classnum in [7, 8]:
            # A missing or "-" 3sg form means there is nothing to build
            # on; report "no prediction" like the 1sg and past_m branches
            # do instead of raising KeyError.
            sg3 = first_present("pres_3sg", "futr_3sg")
            if not sg3 or sg3 == "-":
                return None
            sg3 = first_entry(sg3)
            assert re.search(u"[её]́?т$", sg3)
            return re.sub(u"[её]́?т$", u"енный", sg3)

        if classnum in [3, 10]:
            if pagetitle.endswith(u"чь"):
                # Anchored so that only the ending is rewritten.
                return re.sub(u"чь$", u"гнутый", pagetitle)
            return re.sub(u"ть$", u"тый", pagetitle)

        assert classnum in [9, 11, 12, 14, 15, 16]
        if "past_m" not in args:
            # occurs with e.g. impersonal verbs e.g. спереть
            return None
        pastm = first_entry(args["past_m"])
        # Strip a final "л" explicitly. A regex that can match the empty
        # string (u"л?$") substitutes twice on Python 3.7+, once for the
        # "л" and once for the empty match at the end.
        if pastm.endswith(u"л"):
            pastm = pastm[:-1]
        return pastm + u"тый"

    predicted = predict()
    if predicted:
        return rulib.make_unstressed_ru(predicted)
    return None
def process_page(page, index, parsed):
    """Fold the ``past_m`` override of class 8 ``ru-conj`` templates into
    the numbered params.

    Only ``ru-conj`` templates whose param 2 starts with "8a" or "8b" are
    touched; templates having any param whose value is "or" are skipped
    with a warning. When ``past_m`` is set it is removed, and then:

    * if it equals param 3, nothing more is done;
    * if param 2 starts with "8b" but not "8b/" and param 3 is the
      unstressed version of ``past_m``, ``past_m`` replaces param 3;
    * otherwise ``past_m`` is stored as param 5.

    The incoming ``parsed`` argument is ignored: the page is parsed again
    with ``blib.parse``.

    Returns a ``(new_page_text, notes)`` tuple.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    for t in parsed.filter_templates():
        before = unicode(t)
        conj_class = getparam(t, "2")
        is_class_8 = (unicode(t.name) == "ru-conj"
                      and conj_class.startswith(("8a", "8b")))
        if is_class_8:
            if any(unicode(param.value) == "or" for param in t.params):
                pagemsg("WARNING: Skipping multi-arg conjugation: %s"
                        % unicode(t))
                continue

            past_m = getparam(t, "past_m")
            if past_m:
                rmparam(t, "past_m")
                stem = getparam(t, "3")
                plain_8b = (conj_class.startswith("8b")
                            and not conj_class.startswith("8b/"))
                if stem == past_m:
                    pagemsg("Stem %s and past_m same" % stem)
                    notes.append("remove redundant past_m %s" % past_m)
                elif plain_8b and rulib.make_unstressed_ru(past_m) == stem:
                    pagemsg(
                        "Class 8b/b and stem %s is unstressed version of past_m %s, replacing stem with past_m"
                        % (stem, past_m))
                    t.add("3", past_m)
                    notes.append("moving past_m %s to arg 3" % past_m)
                else:
                    pagemsg(
                        "Stem %s and past_m %s are different, putting past_m in param 5"
                        % (stem, past_m))
                    t.add("5", past_m)
                    notes.append("moving past_m %s to arg 5" % past_m)

        after = unicode(t)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Disabled page processor for ``ru-conj-7a``/``ru-conj-7b`` templates.

    As written, the function only logs a warning that the script no longer
    applies and returns None. Everything after the bare ``return`` is
    unreachable and is kept as the starting point for a future fix-up.

    The unreachable part inspects the past-tense overrides (``past_m``,
    ``past_f``, ``past_n``, ``past_pl``) of each template, compares them
    with forms derived from param 4, and only reports what it would do; it
    never modifies the template.
    """
    pagetitle = unicode(page.title())
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ---- Unreachable from here on (see docstring). ----
    pagemsg("Processing")

    # Would shadow the ``parsed`` argument with a fresh parse of the page.
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        origt = unicode(t)
        if unicode(t.name) in ["ru-conj-7a", "ru-conj-7b"]:
            past_stem = getparam(t, "4")
            # Truthy when the stem's last character is one of the listed
            # vowels or a combining acute accent.
            vowel_end = re.search(u"[аэыоуяеиёю́]$", past_stem)
            past_m = getparam(t, "past_m")
            past_f = getparam(t, "past_f")
            past_n = getparam(t, "past_n")
            past_pl = getparam(t, "past_pl")
            if past_m or past_f or past_n or past_pl:
                upast_stem = rulib.make_unstressed_ru(past_stem)
                # Expected forms: the masculine keeps the stem as given
                # (plus "л" after a vowel); the other three attach a
                # stressed ending to the unstressed stem.
                expected_past_m = past_stem + (u"л" if vowel_end else "")
                expected_past_f = upast_stem + u"ла́"
                expected_past_n = upast_stem + u"ло́"
                expected_past_pl = upast_stem + u"ли́"
                # past_m may be absent; the other three overrides must all
                # be present and equal to the expected forms.
                if ((not past_m or expected_past_m == past_m) and
                        expected_past_f == past_f and
                        expected_past_n == past_n and
                        expected_past_pl == past_pl):
                    msg("Would remove past overrides and add arg5=b")
                else:
                    msg("WARNING: Remaining past overrides: past_m=%s, past_f=%s, past_n=%s, past_pl=%s, expected_past_m=%s, expected_past_f=%s, expected_past_n=%s, expected_past_pl=%s" %
                        (past_m, past_f, past_n, past_pl, expected_past_m,
                         expected_past_f, expected_past_n, expected_past_pl))
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
def form_ppp(verbtype, pagetitle, args):
    """Predict a verb's past passive participle from its conjugation data.

    Args:
        verbtype: Conjugation type string (e.g. "7b"). Only the leading
            digits are used; a value without leading digits yields None.
        pagetitle: Title of the verb's page. Its ending (ать, ять, еть,
            чь, ть) is inspected and rewritten, so it is presumably the
            unaccented infinitive.
        args: Mapping from form names (``pres_1sg``, ``futr_1sg``,
            ``pres_3sg``, ``futr_3sg``, ``past_m``) to form strings. A
            value may hold several comma-separated alternatives, each
            optionally followed by a "//"-introduced tail; only the first
            alternative, without that tail, is used.

    Returns:
        The predicted participle passed through
        ``rulib.make_unstressed_ru``, or None when no prediction can be
        made (non-numeric type, or the needed form is missing, "-" or a
        periphrastic ``бу́ду ...`` future).

    Raises:
        AssertionError: if the numeric class is not one this function
            knows, or a 1sg/3sg form does not end as expected.
    """
    def first_entry(forms):
        # First comma-separated alternative, minus any "//..." tail.
        forms = re.sub(",.*", "", forms)
        return re.sub("//.*", "", forms)

    def first_present(*keys):
        # Value of the first key that exists in args (even if empty).
        for key in keys:
            if key in args:
                return args[key]
        return None

    def predict():
        leading_digits = re.match("[0-9]+", verbtype)
        if not leading_digits:
            return None
        classnum = int(leading_digits.group(0))

        if pagetitle.endswith((u"ать", u"ять")) and classnum != 14:
            return re.sub(u"ть$", u"нный", pagetitle)
        if pagetitle.endswith(u"еть") and classnum == 1:
            return re.sub(u"ть$", u"нный", pagetitle)

        if classnum in [4, 5]:
            # Missing forms used to raise KeyError, and a periphrastic
            # future ("бу́ду ...") used to trip the assertion below; both
            # now mean "no prediction".
            sg1 = first_present("pres_1sg", "futr_1sg")
            if not sg1 or sg1 == "-" or sg1.startswith(u"бу́ду "):
                return None
            sg1 = first_entry(sg1)
            assert re.search(u"[ую]́?$", sg1)
            return re.sub(u"[ую]́?$", u"енный", sg1)

        if classnum in [7, 8]:
            sg3 = first_present("pres_3sg", "futr_3sg")
            if not sg3 or sg3 == "-":
                return None
            sg3 = first_entry(sg3)
            assert re.search(u"[её]́?т$", sg3)
            return re.sub(u"[её]́?т$", u"енный", sg3)

        if classnum in [3, 10]:
            if pagetitle.endswith(u"чь"):
                # Without this branch a title in -чь came back unchanged,
                # because the u"ть$" substitution below does not match it.
                return re.sub(u"чь$", u"гнутый", pagetitle)
            return re.sub(u"ть$", u"тый", pagetitle)

        assert classnum in [9, 11, 12, 14, 15, 16]
        if "past_m" not in args:
            # No masculine past to build on, so no prediction.
            return None
        pastm = first_entry(args["past_m"])
        # Strip a final "л" explicitly. A regex that can match the empty
        # string (u"л?$") substitutes twice on Python 3.7+, once for the
        # "л" and once for the empty match at the end.
        if pastm.endswith(u"л"):
            pastm = pastm[:-1]
        return pastm + u"тый"

    predicted = predict()
    if predicted:
        return rulib.make_unstressed_ru(predicted)
    return None
def process_page(page, index, parsed):
    """Normalize the stress of param 3 in ``ru-conj``/``ru-conj-old``
    templates on one page.

    For every such template, param 3 is passed through
    ``rulib.make_unstressed_ru`` and a final ``нуть``/``нуться`` is then
    rewritten with the accent on ``у́``. The function does not check the
    conjugation class itself; judging by the edit note it is presumably
    run only on class 3c verbs -- confirm against the caller.

    The incoming ``parsed`` argument is ignored: the page is parsed again
    with ``blib.parse``.

    Returns a ``(new_page_text, notes)`` tuple. A note is recorded only
    for templates whose text actually changed, so untouched templates do
    not produce edit-summary entries.

    Raises AssertionError if a matching template has a param 4.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    # NOTE(review): ``text`` is unused; kept in case reading page.text
    # matters to the caller's page object -- confirm before removing.
    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        origt = unicode(t)
        if unicode(t.name) in ["ru-conj", "ru-conj-old"]:
            assert not getparam(t, "4")
            inf = rulib.make_unstressed_ru(getparam(t, "3"))
            # "\\1" puts back the optional reflexive "ся".
            inf = re.sub(u"нуть((ся)?)$", u"ну́ть\\1", inf)
            t.add("3", inf)
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("Remove stray accent from 3c infinitive")

    return unicode(parsed), notes
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Print the short-form adjectives known to Wiktionary ordered by frequency:
# first those found in the frequency list (in frequency order), then the
# remaining ones in the order of the Wiktionary file. Output is UTF-8.

import argparse
import codecs
from collections import OrderedDict

import rulib

parser = argparse.ArgumentParser(
    description="Output short adjectives in Wiktionary, ordered by frequency.")
# Both files are needed; without them the script used to fail later with an
# opaque error from codecs.open(None, ...).
parser.add_argument(
    "--freq-adjs", required=True,
    help=u"""Adjectives ordered by frequency, without accents or ё.""")
parser.add_argument(
    "--wiktionary-short-adjs", required=True,
    help=u"""Adjectives in Wiktionary with short forms, in alphabetical order. Should be accented and with ё.""")
args = parser.parse_args()

# Insertion-ordered set of the Wiktionary adjectives, each passed through
# rulib.make_unstressed_ru so that it can be looked up by a frequency-list
# line.
with codecs.open(args.wiktionary_short_adjs, "r", "utf-8") as short_file:
    short_adjs = OrderedDict(
        (rulib.make_unstressed_ru(x.strip()), True) for x in short_file)

with codecs.open(args.freq_adjs, "r", "utf-8") as freq_file:
    for line in freq_file:
        line = line.strip()
        if line in short_adjs:
            # Single-argument print(...) is equivalent to the Python 2
            # print statement this script was written with.
            print(line.encode("utf-8"))
            del short_adjs[line]

# Whatever was not in the frequency list comes last.
for line in short_adjs:
    print(line.encode("utf-8"))
def process_page(page, index, do_fix):
    """Check the past passive participles of conjugation templates on one
    page and, when ``do_fix`` is true, remove the ones judged bad.

    For each ``ru-conj``/``ru-conj-old`` template the forms are generated
    by expanding the matching ``ru-generate-verb-forms`` call. Each
    participle is flagged when it does not end in an expected ending, does
    not start with the same letter as the page title, has an ending that
    disagrees with the title's ending, or differs from what ``form_ppp``
    predicts.

    NOTE(review): no ``return`` statement is visible in this block, and
    ``verbose`` is read from an enclosing scope that is not shown.
    """
    pagetitle = unicode(page.title())
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))
    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname in ["ru-conj", "ru-conj-old"]:
            # Templates with an "or" value carry several conjugations in
            # one call; they are not handled here.
            if [x for x in t.params if unicode(x.value) == "or"]:
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
                continue
            conjtype = getparam(t, "2")
            # Turn the conjugation call into the form-generating call with
            # the same arguments.
            if tname == "ru-conj":
                tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
            else:
                tempcall = re.sub(r"\{\{ru-conj-old", "{{ru-generate-verb-forms|old=y", unicode(t))
            result = expand_text(tempcall)
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                continue
            args = blib.split_generate_args(result)
            # NOTE(review): args[base] is indexed directly, so both keys
            # are assumed to be present in the generated forms -- confirm.
            for base in ["past_pasv_part", "ppp"]:
                forms_to_remove = []
                if args[base] == "-":
                    continue
                for form in re.split(",", args[base]):
                    origform = form
                    # Drop any "//..." tail before checking the form.
                    form = re.sub("//.*", "", form)
                    fix_form = False
                    if not re.search(ur"([аяеё]́?нный|тый)$", form):
                        pagemsg("WARNING: Past passive participle doesn't end correctly: %s" % form)
                        fix_form = True
                    unstressed_page = rulib.make_unstressed_ru(pagetitle)
                    unstressed_form = rulib.make_unstressed_ru(form)
                    # ``warned`` suppresses the rule-based check below once
                    # a more specific problem has been reported.
                    warned = False
                    if unstressed_form[0] != unstressed_page[0]:
                        pagemsg("WARNING: Past passive participle doesn't begin with same letter, probably for wrong aspect: %s"
                                % form)
                        warned = True
                        fix_form = True
                    if form.endswith(u"нный"):
                        # The expected ending follows the title's ending.
                        if pagetitle.endswith(u"ать"):
                            good_ending = u"анный"
                        elif pagetitle.endswith(u"ять"):
                            good_ending = u"янный"
                        else:
                            good_ending = u"енный"
                        if not unstressed_form.endswith(good_ending):
                            pagemsg("WARNING: Past passive participle doesn't end right, probably for wrong aspect: %s"
                                    % form)
                            warned = True
                            fix_form = True
                    if not warned:
                        correct_form = form_ppp(conjtype, pagetitle, args)
                        if correct_form and unstressed_form != correct_form:
                            pagemsg("WARNING: Past passive participle not formed according to rule, probably wrong: found %s, expected %s"
                                    % (unstressed_form, correct_form))
                            fix_form = True
                    if fix_form:
                        forms_to_remove.append(origform)
                if forms_to_remove and do_fix:
                    # Collect the current override values from base,
                    # base2 ... base9.
                    curvals = []
                    for i in ["", "2", "3", "4", "5", "6", "7", "8", "9"]:
                        val = getparam(t, base + i)
                        if val:
                            curvals.append(val)
                    newvals = [x for x in curvals if x not in forms_to_remove]
                    # Each flagged form should have matched exactly one
                    # override value.
                    if len(curvals) - len(newvals) != len(forms_to_remove):
                        pagemsg("WARNING: Something wrong, couldn't remove all PPP forms %s" % ",".join(forms_to_remove))
                    curindex = 1
                    origt = unicode(t)
                    # Write the surviving values back into consecutive
                    # slots, then delete the slots left over.
                    for newval in newvals:
                        t.add(base + ("" if curindex == 1 else str(curindex)), newval)
                        curindex += 1
                    for i in xrange(curindex, 10):
                        rmparam(t, base + ("" if i == 1 else str(i)))
                    pagemsg("Replacing %s with %s" % (origt, unicode(t)))
                    notes.append("removed bad past pasv part(s) %s"
                                 % ",".join(forms_to_remove))
def paste_verb(prefix, suffix):
    """Concatenate ``prefix`` and ``suffix`` into one verb.

    When ``rulib.is_stressed(prefix)`` is true, the suffix is first passed
    through ``rulib.make_unstressed_ru``. The joined result is returned
    after ``rulib.remove_monosyllabic_accents``.
    """
    tail = rulib.make_unstressed_ru(suffix) if rulib.is_stressed(prefix) else suffix
    return rulib.remove_monosyllabic_accents(prefix + tail)
render_groups(groups) groups = [] group = [] pfsuffixes = None impfsuffixes = None elif line == "-": if group: groups.append(group) group = [] elif " " not in line: # A single prefix; combine with previous suffixes. # If it starts with a + (indicating include the apsect), that applies # only to the perfective verb. See лететь.der for good examples. group.append((combine_prefix(line, pfsuffixes, "pf"), combine_prefix( rulib.make_unstressed_ru(line).replace("+", ""), impfsuffixes, "impf"))) elif re.search(r" \+$", line): # Something like "об +" or "+об +". This indicates that the imperfective # (and maybe the perfective) should include the aspect. See лететь.der # for good examples. pf, impf = re.split(r"\s+", line) assert impf == "+" group.append((combine_prefix(pf, pfsuffixes, "pf"), combine_prefix("+" + rulib.make_unstressed_ru(pf), impfsuffixes, "impf"))) elif "!" in line: # Something like "об !" or "+об !" or "! об" or "! +об". This indicates # that one of the two is missing and the other should combine with # previous suffixes, maybe with the aspect included (see лететь.der for # good examples of this).
def process_page(index, page, save, verbose, nouns, adjectives):
    """Report derived words of a verb page that could get an etymology.

    For each ``ru-conj`` template on a non-reflexive verb page, the past
    passive participle predicted by ``form_ppp`` is turned into candidate
    spellings of a verbal noun, an agent noun and a verbal adjective.
    Candidates found in ``nouns``/``adjectives`` are looked up with
    ``find_noun``/``find_adj`` and a line describing the proposed
    etymology is emitted with ``msg``. Nothing on the page is modified and
    nothing is returned; ``save`` is not used in this block.
    """
    pagetitle = unicode(page.title())
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))
    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    # Titles ending in ся/сь are treated as reflexive verbs and skipped.
    if re.search(u"с[яь]$", pagetitle):
        pagemsg("Skipping reflexive verb")
        return

    text = unicode(page.text)
    parsed = blib.parse(page)
    for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname == "ru-conj":
            # Templates with an "or" value carry several conjugations in
            # one call; they are not handled here.
            if [x for x in t.params if unicode(x.value) == "or"]:
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
                continue
            conjtype = getparam(t, "2")
            # Turn the conjugation call into the form-generating call with
            # the same arguments.
            tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
            result = expand_text(tempcall)
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
                continue
            args = blib.split_generate_args(result)
            if "infinitive" not in args: # e.g. обнимать
                pagemsg("WARNING: No infinitive")
                continue
            infinitive = args["infinitive"]
            if "," in infinitive:
                pagemsg("WARNING: Infinitive has multiple forms: %s" % infinitive)
                continue
            if "//" in infinitive:
                pagemsg("WARNING: Infinitive has translit: %s" % infinitive)
                continue
            ppp = form_ppp(conjtype, pagetitle, args)
            if not ppp:
                continue
            # Derive the candidate spellings, and the suffix each one ends
            # in, from the participle's ending.
            if ppp.endswith(u"тый"):
                verbal_noun = re.sub(u"тый$", u"тие", ppp)
                verbal_noun_suffix = u"тие"
                verbal_adj = re.sub(u"тый$", u"тельный", ppp)
                verbal_adj_suffix = u"тельный"
            elif ppp.endswith(u"ённый"):
                verbal_noun = re.sub(u"ённый$", u"ение", ppp)
                verbal_noun_suffix = u"ение"
                verbal_adj = re.sub(u"ённый$", u"ительный", ppp)
                verbal_adj_suffix = u"ительный"
            elif ppp.endswith(u"енный"):
                verbal_noun = re.sub(u"енный$", u"ение", ppp)
                verbal_noun_suffix = u"ение"
                verbal_adj = re.sub(u"енный$", u"ительный", ppp)
                verbal_adj_suffix = u"ительный"
            else:
                assert ppp.endswith(u"анный") or ppp.endswith(u"янный")
                verbal_noun = re.sub(u"нный$", u"ние", ppp)
                verbal_adj = re.sub(u"нный$", u"тельный", ppp)
                # The character before "нный" (а or я, per the assert
                # above) becomes the first character of both suffixes.
                m = re.search(u"(.)нный$", ppp)
                suffix_start = m.group(1)
                verbal_noun_suffix = suffix_start + u"ние"
                verbal_adj_suffix = suffix_start + u"тельный"
            # The agent noun is the verbal adjective without its final
            # "ный".
            agent_noun = re.sub(u"ный$", "", verbal_adj)
            agent_noun_suffix = re.sub(u"ный$", "", verbal_adj_suffix)
            # Variants of the suffixes with an acute accent on their first
            # vowel.
            stressed_verbal_noun_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_noun_suffix)
            stressed_verbal_adj_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_adj_suffix)
            stressed_agent_noun_suffix = re.sub(u"ный$", "", stressed_verbal_adj_suffix)
            # Build the left-hand part of the reported etymology. Classes
            # 7 and 8 give the accentless infinitive plus an "+alt1="
            # display stem taken from the template; the other classes use
            # the infinitive itself.
            if conjtype.startswith("7"):
                stem = getparam(t, "4")
                if infinitive.endswith(u"ть"):
                    stem = stem.replace(u"ё", u"е́")
                else:
                    stem = rulib.make_unstressed_ru(stem)
                stem = rulib.remove_accents(infinitive) + "+alt1=" + stem + "-"
            elif conjtype.startswith("8"):
                stem = rulib.remove_accents(infinitive) + "+alt1=" + getparam(t, "3").replace(u"ё", u"е́") + "-"
            else:
                stem = rulib.remove_monosyllabic_accents(infinitive)

            if verbal_noun in nouns:
                stressed_noun = find_noun(verbal_noun, pagemsg, errandpagemsg, expand_text)
                # Judging by the messages below, a falsy result means the
                # noun could not be resolved and -1 means it already has
                # an etymology.
                if not stressed_noun:
                    msg("%s no-etym FIXME" % verbal_noun)
                elif stressed_noun == -1:
                    pagemsg("Would add etym for %s but already has one" % verbal_noun)
                else:
                    # Report the accented suffix only when the looked-up
                    # noun actually ends in it.
                    if stressed_noun.endswith(stressed_verbal_noun_suffix):
                        suffix = stressed_verbal_noun_suffix
                    else:
                        suffix = verbal_noun_suffix
                    msg("%s %s+-%s no-etym verbal-noun" % (verbal_noun, stem, suffix))

            if agent_noun in nouns:
                stressed_noun = find_noun(agent_noun, pagemsg, errandpagemsg, expand_text)
                if stressed_noun == -1:
                    pagemsg("Would add etym for %s but already has one" % agent_noun)
                else:
                    msg(u"%s %s+-тель no-etym agent-noun" % (agent_noun, stem))

            if verbal_adj in adjectives:
                stressed_adj = find_adj(verbal_adj, pagemsg, errandpagemsg, expand_text)
                if stressed_adj == -1:
                    pagemsg("Would add etym for %s but already has one" % verbal_adj)
                else:
                    msg(u"%s %s+-тельный no-etym verbal-adj" % (verbal_adj, stem))