def combine_prefix(prefix, suffix):
    """Join a prefix and a suffix into a wikitext list item linking the verb.

    When ``ru.is_stressed(prefix)`` is true, the suffix is first passed
    through ``ru.make_unstressed`` before being appended; otherwise it is
    appended unchanged.  The joined word is then passed through
    ``ru.remove_monosyllabic_accents``.

    Returns:
        A string of the form ``* {{l|ru|<verb>}}``.
    """
    tail = ru.make_unstressed(suffix) if ru.is_stressed(prefix) else suffix
    joined = ru.remove_monosyllabic_accents(prefix + tail)
    return "* {{l|ru|" + joined + "}}"
def process_page(index, page, save, verbose):
    """Report past-tense overrides in ``ru-conj-7a``/``ru-conj-7b`` templates.

    NOTE: the script is disabled.  The function logs a warning and returns
    ``None`` immediately; everything after the early ``return`` is
    unreachable and is kept only as a starting point for anyone who revives
    the script.

    Args:
        index: Page counter, used as a prefix in log messages.
        page: Page object; only ``title()`` is read on the live path.
        save: Consulted only by the unreachable part (save vs. dry run).
        verbose: Consulted only by the unreachable part (extra logging).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("WARNING: Script no longer applies and would need fixing up")
    return

    # ------------------------------------------------------------------
    # Unreachable below this point (see the docstring).
    # ------------------------------------------------------------------
    pagemsg("Processing")

    old_text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    for t in parsed.filter_templates():
        before = unicode(t)
        if unicode(t.name) in ["ru-conj-7a", "ru-conj-7b"]:
            past_stem = getparam(t, "4")
            vowel_end = re.search(u"[аэыоуяеиёю́]$", past_stem)
            past_m = getparam(t, "past_m")
            past_f = getparam(t, "past_f")
            past_n = getparam(t, "past_n")
            past_pl = getparam(t, "past_pl")
            if past_m or past_f or past_n or past_pl:
                bare_stem = ru.make_unstressed(past_stem)
                expected_past_m = past_stem + (u"л" if vowel_end else "")
                expected_past_f = bare_stem + u"ла́"
                expected_past_n = bare_stem + u"ло́"
                expected_past_pl = bare_stem + u"ли́"
                # The masculine override may be absent; the other three must
                # match the ending-stressed forms exactly.
                masc_ok = not past_m or expected_past_m == past_m
                if (masc_ok and
                        expected_past_f == past_f and
                        expected_past_n == past_n and
                        expected_past_pl == past_pl):
                    msg("Would remove past overrides and add arg5=b")
                else:
                    msg("WARNING: Remaining past overrides: past_m=%s, past_f=%s, past_n=%s, past_pl=%s, expected_past_m=%s, expected_past_f=%s, expected_past_n=%s, expected_past_pl=%s" % (
                        past_m, past_f, past_n, past_pl,
                        expected_past_m, expected_past_f,
                        expected_past_n, expected_past_pl))
        after = unicode(t)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    new_text = unicode(parsed)
    if new_text == old_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (old_text, new_text))
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
    else:
        pagemsg("Saving with comment = %s" % comment)
        page.text = new_text
        page.save(comment=comment)
def process_page(index, page, save, verbose):
    """Fold the ``past_m`` override of class-8 ``ru-conj`` templates into
    a numbered argument.

    For every ``ru-conj`` template whose argument 2 starts with ``8a`` or
    ``8b`` and which carries a ``past_m`` parameter, ``past_m`` is removed
    and then:

    * dropped entirely if it equals argument 3;
    * moved into argument 3 if argument 2 starts with ``8b`` (but not
      ``8b/``) and argument 3 equals ``ru.make_unstressed(past_m)``;
    * moved into argument 5 otherwise.

    Templates with any parameter whose value is ``or`` are skipped with a
    warning.

    Args:
        index: Page counter, used as a prefix in log messages.
        page: Page object; ``title()`` and ``text`` are read, and ``text``
            is assigned and ``save()`` called when ``save`` is true.
        save: If true, write the page; otherwise only log what would happen.
        verbose: If true, log the full old and new page text on change.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    old_text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    for t in parsed.filter_templates():
        before = unicode(t)
        conj_class = getparam(t, "2")
        if unicode(t.name) == "ru-conj" and re.search(r"^8[ab]", conj_class):
            if any(unicode(p.value) == "or" for p in t.params):
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
                continue
            past_m = getparam(t, "past_m")
            if past_m:
                rmparam(t, "past_m")
                stem = getparam(t, "3")
                if stem == past_m:
                    pagemsg("Stem %s and past_m same" % stem)
                    notes.append("remove redundant past_m %s" % past_m)
                elif (conj_class.startswith("8b") and
                        not conj_class.startswith("8b/") and
                        ru.make_unstressed(past_m) == stem):
                    pagemsg("Class 8b/b and stem %s is unstressed version of past_m %s, replacing stem with past_m" % (
                        stem, past_m))
                    t.add("3", past_m)
                    notes.append("moving past_m %s to arg 3" % past_m)
                else:
                    pagemsg("Stem %s and past_m %s are different, putting past_m in param 5" % (
                        stem, past_m))
                    t.add("5", past_m)
                    notes.append("moving past_m %s to arg 5" % past_m)
        after = unicode(t)
        if before != after:
            pagemsg("Replaced %s with %s" % (before, after))

    new_text = unicode(parsed)
    if new_text == old_text:
        return

    if verbose:
        pagemsg("Replacing <%s> with <%s>" % (old_text, new_text))
    # Any textual change must have been recorded in notes.
    assert notes
    comment = "; ".join(notes)
    if not save:
        pagemsg("Would save with comment = %s" % comment)
    else:
        pagemsg("Saving with comment = %s" % comment)
        page.text = new_text
        page.save(comment=comment)
def form_ppp(verbtype, pagetitle, args):
    """Build the rule-based past passive participle for a verb.

    Args:
        verbtype: Conjugation type string.  Only its leading digits are
            used; if it does not start with a digit, None is returned.
        pagetitle: The infinitive (page title).
        args: Mapping of form names to form strings.  Depending on the
            class, ``pres_1sg``/``futr_1sg``, ``pres_3sg``/``futr_3sg`` or
            ``past_m`` is read.  A form string may hold several
            comma-separated alternatives and a ``//...`` suffix; only the
            first alternative, without the suffix, is used.

    Returns:
        The participle passed through ``rulib.make_unstressed``, or None
        when no participle can be derived (non-numeric type, or the needed
        present/future form is empty or ``"-"``).

    Raises:
        AssertionError: if the source form does not have the expected
            ending, or the numeric class is not one of the handled ones.
        KeyError: if none of the required form names is present in ``args``.
    """
    def first_entry(forms):
        # Drop everything from the first comma on, then any "//..." suffix.
        forms = re.sub(",.*", "", forms)
        return re.sub("//.*", "", forms)

    def form_ppp_1(verbtype, pagetitle, args):
        if not re.search("^[0-9]+", verbtype):
            return None
        verbtype = int(re.sub("^([0-9]+).*", r"\1", verbtype))

        # -ать/-ять verbs (except class 14) and class-1 -еть verbs: -ть -> -нный.
        if pagetitle.endswith((u"ать", u"ять")) and verbtype != 14:
            return re.sub(u"ть$", u"нный", pagetitle)
        if pagetitle.endswith(u"еть") and verbtype == 1:
            return re.sub(u"ть$", u"нный", pagetitle)

        if verbtype in [4, 5]:
            # Derived from the 1sg present (or future) form.
            sg1 = args["pres_1sg"] if "pres_1sg" in args else args["futr_1sg"]
            if not sg1 or sg1 == "-":
                return None
            sg1 = first_entry(sg1)
            assert re.search(u"[ую]́?$", sg1)
            return re.sub(u"[ую]́?$", u"енный", sg1)

        if verbtype in [7, 8]:
            # Derived from the 3sg present (or future) form.
            sg3 = args["pres_3sg"] if "pres_3sg" in args else args["futr_3sg"]
            # Same guard as for classes 4/5: a missing form means "no
            # participle derivable", not an assertion failure.
            if not sg3 or sg3 == "-":
                return None
            sg3 = first_entry(sg3)
            assert re.search(u"[её]́?т$", sg3)
            return re.sub(u"[её]́?т$", u"енный", sg3)

        if verbtype in [3, 10]:
            return re.sub(u"ть$", u"тый", pagetitle)

        assert verbtype in [9, 11, 12, 14, 15, 16]
        pastm = first_entry(args["past_m"])
        # The pattern can match both a final "л" and the empty string right
        # after it; count=1 guarantees a single substitution regardless of
        # how the running Python version treats adjacent empty matches.
        return re.sub(u"л?$", u"тый", pastm, count=1)

    retval = form_ppp_1(verbtype, pagetitle, args)
    if retval:
        return rulib.make_unstressed(retval)
    return None
def process_page(page, index, parsed):
    """Normalize the arguments of ru-conj templates on one page.

    For each matching template, the argument sets returned by
    ``split_ru_conj_args`` are rewritten in place.  If argument 1 already
    ends in an infinitive ending it is left alone apart from passing it
    through the monosyllabic-accent removers; otherwise an infinitive is
    assembled from the existing arguments according to the conjugation
    type.  When anything changed, the forms generated by
    ``{{ru-generate-verb-forms}}`` for the old and new arguments are compared
    (except on pages whose title starts with ``User:Benwing2/``), and the
    template is only rewritten when they agree.

    Args:
        page: Page object; ``title()`` and ``text`` are read.
        index: Page counter, used as a prefix in log messages.
        parsed: Ignored; immediately overwritten by ``blib.parse(page)``.

    Returns:
        A pair ``(unicode(parsed), notes)``: the possibly modified page text
        and the list of change notes.

    Reads the module-level ``args`` (for ``args.verbose``).
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errpagemsg(txt):
        # Same message goes to both msg() and errmsg().
        msg("Page %s %s: %s" % (index, pagetitle, txt))
        errmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        origt = unicode(t)
        if tname(t) in [
            "ru-conj", "ru-conj-old", "User:Benwing2/ru-conj",
            "User:Benwing2/ru-conj-old"
        ] or tname(t) == "temp" and getparam(t, "1") == "ru-conj":
            verb_type, arg_sets = split_ru_conj_args(t, tname(t) == "temp")
            refl = "refl" in verb_type
            # Snapshot used later to detect changes and to regenerate the
            # original forms for comparison.
            orig_arg_sets = copy.deepcopy(arg_sets)
            rm_pres_stem = False

            ##### First, modify arg_sets according to normalized params

            for arg_set in arg_sets:
                # This complex spec matches 3°a, 3oa, 4a1a, 6c1a,
                # 1a6a, 6a1as13, 6a1as14, etc.
                m = re.search(u"^([0-9]+[°o0-9abc]*[abc]s?1?[34]?)", arg_set[0])
                if not m:
                    m = re.search(
                        u"^(irreg-?[абцдеѣфгчийклмнопярстувшхызёюжэщьъ%-]*)",
                        arg_set[0])
                if not m:
                    errpagemsg("Unrecognized conjugation type: %s" % arg_set[0])
                    continue
                # Treat an ASCII "o" in the type as the degree sign.
                conj_type = m.group(1).replace("o", u"°")
                inf, tr = rulib.split_russian_tr(arg_set[1])
                # new_style: argument 1 already ends in an infinitive ending.
                if refl:
                    new_style = re.search(u"([тч]ься|ти́?сь)$", inf)
                else:
                    new_style = re.search(
                        u"([тч]ь|ти́?)$" if conj_type.startswith("7") or
                        conj_type.startswith("irreg") else u"[тч]ь$", inf)
                if new_style:
                    if arg_set[0].startswith("irreg-"):
                        # Reduce "irreg-XXX" (keeping any "/..." tail) to
                        # plain "irreg".
                        arg_set[0] = re.sub("^irreg-.*?(/.*|$)", r"irreg\1",
                                            arg_set[0])
                    arg_set[1] = rulib.paste_russian_tr(
                        rulib.remove_monosyllabic_accents(inf),
                        rulib.remove_tr_monosyllabic_accents(tr))
                else:
                    # Old style: build the infinitive for this conjugation
                    # type.  Only the types starting with 1, 2 or 4 are
                    # allowed to carry a transliteration here.
                    if not re.search("^[124]", conj_type):
                        assert not tr
                    if conj_type in ["1a", "2a", "2b"]:
                        inf += u"ть"
                        if tr:
                            tr += u"tʹ"
                    elif conj_type in ["3a", u"3°a"]:
                        inf += u"нуть"
                    elif conj_type in ["3b", u"3c"]:
                        inf += u"у́ть"
                    elif conj_type == "4a":
                        inf += u"ить"
                        if tr:
                            tr += u"itʹ"
                    elif conj_type in ["4b", "4c"]:
                        inf, tr = rulib.make_unstressed(
                            inf, rulib.decompose(tr))
                        inf += u"ить"
                        if tr:
                            tr += u"ítʹ"
                    elif conj_type == "4a1a":
                        inf = re.sub(u"[ая]$", "", inf) + u"ить"
                        if tr:
                            tr = re.sub("j?a$", "", tr) + u"itʹ"
                    elif conj_type == "5a":
                        inf = arg_set[2] + u"ть" if arg_set[2] else arg_set[1] + u"еть"
                        # Keep argument 1 as an explicit present stem
                        # (argument 2) only when it differs from the stem
                        # obtained by stripping the ending from the
                        # infinitive.
                        normal_pres_stem = re.sub(u"[еая]ть$", "", inf)
                        if normal_pres_stem == arg_set[1]:
                            arg_set[2] = ""
                        else:
                            arg_set[2] = arg_set[1]
                    elif conj_type == "5b":
                        inf = arg_set[2] + u"ть"
                        normal_pres_stem = re.sub(u"[еая]́ть$", "", inf)
                        if normal_pres_stem == arg_set[1]:
                            arg_set[2] = ""
                        else:
                            arg_set[2] = arg_set[1]
                    elif conj_type == "5c":
                        inf = arg_set[2] + u"ть"
                        normal_pres_stem = rulib.make_ending_stressed_ru(
                            re.sub(u"[еая]́ть$", "", inf))
                        if normal_pres_stem == arg_set[1]:
                            arg_set[2] = ""
                        else:
                            arg_set[2] = arg_set[1]
                    elif re.search(u"^6°?a", conj_type) or conj_type == "1a6a":
                        assert not arg_set[3]
                        if arg_set[2]:
                            inf = arg_set[2] + u"ть"
                            arg_set[2] = ""
                            normal_pres_stem = rulib.make_ending_stressed_ru(
                                re.sub(u"а́ть$", "", inf))
                            assert arg_set[1] == normal_pres_stem
                        elif is_vowel_stem(inf):
                            inf += u"ять"
                        else:
                            inf += u"ать"
                        # A named pres_stem= parameter is moved into
                        # argument 2; rm_pres_stem is passed on to
                        # paste_arg_sets below.
                        if getparam(t, "pres_stem"):
                            arg_set[2] = getparam(t, "pres_stem")
                            rm_pres_stem = True
                    elif re.search(u"^6°?b", conj_type):
                        if is_vowel_stem(inf):
                            inf += u"я́ть"
                        else:
                            inf += u"а́ть"
                        # arg_set[2] (present stem) remains
                    elif re.search(u"^6°?c", conj_type):
                        inf = rulib.make_unstressed_once_ru(inf) + u"а́ть"
                    elif conj_type in ["7a", "7b"]:
                        pass  # nothing needed to do
                    elif conj_type in ["8a", "8b"]:
                        # Argument 2 is used as the infinitive; the old
                        # argument 1 moves into argument 2.
                        inf = arg_set[2]
                        arg_set[2] = arg_set[1]
                    elif conj_type == "9a":
                        inf += u"еть"
                        # arg_set[2] (present stem) remains
                    elif conj_type == "9b":
                        inf = rulib.make_unstressed_once_ru(inf) + u"е́ть"
                        # arg_set[2] (present stem) remains
                        # arg_set[3] (optional past participle stem) remains
                    elif conj_type == "10a":
                        inf += u"оть"
                    elif conj_type == "10c":
                        inf += u"ть"
                        if rulib.make_unstressed_once_ru(arg_set[2]) == re.sub(
                                u"о́$", "", arg_set[1]):
                            arg_set[2] = ""
                    elif conj_type == "11a":
                        inf += u"ить"
                    elif conj_type == "11b":
                        inf += u"и́ть"
                        if arg_set[2] == arg_set[1]:
                            arg_set[2] = ""
                    elif conj_type == "12a":
                        inf += u"ть"
                        if arg_set[2] == arg_set[1]:
                            arg_set[2] = ""
                    elif conj_type == "12b":
                        inf += u"ть"
                        if rulib.make_ending_stressed_ru(
                                arg_set[2]) == arg_set[1]:
                            arg_set[2] = ""
                    elif conj_type == "13b":
                        inf += u"ть"
                        assert re.sub(u"ва́ть$", "", inf) == arg_set[2]
                        arg_set[2] = ""
                    elif conj_type in ["14a", "14b", "14c"]:
                        inf += u"ть"
                        # arg_set[2] (present stem) remains
                    elif conj_type in ["15a", "16a", "16b"]:
                        inf += u"ть"
                    elif conj_type == u"irreg-минуть":
                        inf = u"мину́ть"
                    elif conj_type == u"irreg-живописать-миновать":
                        inf += u"ть"
                        arg_set[2] = ""
                    elif conj_type == u"irreg-слыхать-видать":
                        inf += u"ть"
                    elif conj_type == u"irreg-стелить-стлать":
                        inf = arg_set[2] + inf + u"ть"
                        arg_set[2] = ""
                        arg_set[3] = ""
                    elif conj_type == u"irreg-ссать-сцать":
                        assert arg_set[2] == re.sub(u"а́$", "", inf)
                        inf = arg_set[3] + inf + u"ть"
                        arg_set[2] = ""
                        arg_set[3] = ""
                    elif conj_type in [
                        u"irreg-сыпать", u"irreg-ехать", u"irreg-ѣхать"
                    ]:
                        # Argument 1 is prepended to the verb named in the
                        # type; the stress helper is skipped when
                        # argument 1 is the accented "вы́".
                        infstem = re.sub("^irreg-", "", conj_type)
                        if arg_set[1] != u"вы́":
                            infstem = rulib.make_beginning_stressed_ru(infstem)
                        inf = arg_set[1] + infstem
                    elif conj_type == u"irreg-обязывать":
                        if arg_set[1] == u"вы́":
                            inf = u"вы́обязывать"
                        else:
                            inf = arg_set[1] + u"обя́зывать"
                    elif conj_type == u"irreg-зиждиться":
                        if arg_set[1] == u"вы́":
                            inf = u"вы́зиждить"
                        else:
                            inf = arg_set[1] + u"зи́ждить"
                    elif conj_type == u"irreg-идти":
                        if not arg_set[1]:
                            inf = u"идти́"
                        elif arg_set[1] == u"вы́":
                            inf = u"вы́йти"
                        else:
                            inf = arg_set[1] + u"йти́"
                    elif re.search("^irreg-", conj_type):
                        # Any other irregular type: argument 1 plus the verb
                        # named in the type.
                        infstem = re.sub("^irreg-", "", conj_type)
                        if arg_set[1] != u"вы́":
                            infstem = rulib.make_ending_stressed_ru(infstem)
                        inf = arg_set[1] + infstem
                    else:
                        error("Unknown conjugation type " + conj_type)
                    if inf:
                        if refl:
                            # Append the reflexive suffix matching the
                            # infinitive ending.
                            if re.search(u"[тч]ь$", inf):
                                inf += u"ся"
                                if tr:
                                    tr += "sja"
                            else:
                                assert re.search(u"и́?$", inf)
                                inf += u"сь"
                                if tr:
                                    tr += u"sʹ"
                        arg_set[1] = rulib.paste_russian_tr(
                            rulib.remove_monosyllabic_accents(inf),
                            rulib.remove_tr_monosyllabic_accents(tr))

            ##### If something changed ...

            if orig_arg_sets != arg_sets or rm_pres_stem:

                ##### ... compare the forms generated by the original and new
                ##### arguments and make sure they're the same.

                if not pagetitle.startswith("User:Benwing2/"):
                    # 1. Generate and expand the appropriate call to
                    #    {{ru-generate-verb-forms}} for the original arguments.
                    orig_args = paste_arg_sets(orig_arg_sets, t, verb_type,
                                               rm_pres_stem=False, as_string=True)
                    orig_tempcall = "{{ru-generate-verb-forms|%s%s}}" % (
                        "|".join(orig_args),
                        "|old=1" if tname(t).endswith("ru-conj-old") else "")
                    orig_result = expand_text(orig_tempcall)
                    if not orig_result:
                        errpagemsg(
                            "WARNING: Error expanding original template %s" %
                            orig_tempcall)
                        continue
                    orig_forms = blib.split_generate_args(orig_result)

                    # 2. Generate and expand the appropriate call to
                    #    {{ru-generate-verb-forms}} for the new arguments.
                    new_args = paste_arg_sets(arg_sets, t, verb_type,
                                              rm_pres_stem, as_string=True)
                    new_tempcall = "{{ru-generate-verb-forms|%s%s}}" % (
                        "|".join(new_args),
                        "|old=1" if tname(t).endswith("ru-conj-old") else "")
                    new_result = expand_text(new_tempcall)
                    if not new_result:
                        errpagemsg("WARNING: Error expanding new template %s" %
                                   new_tempcall)
                        continue
                    new_forms = blib.split_generate_args(new_result)

                    # 3. Compare each form and accumulate a list of mismatches.
                    all_keys = set(orig_forms.keys()) | set(new_forms.keys())

                    def sort_numbers_first(key):
                        # Zero-pad purely numeric keys to five digits.
                        if re.search("^[0-9]+$", key):
                            return "%05d" % int(key)
                        return key

                    all_keys = sorted(list(all_keys), key=sort_numbers_first)
                    mismatches = []
                    for key in all_keys:
                        origval = orig_forms.get(key, "<<missing>>")
                        newval = new_forms.get(key, "<<missing>>")
                        if origval != newval:
                            mismatches.append("%s: old=%s new=%s" %
                                              (key, origval, newval))

                    # 4. If mismatches, output them and don't change anything.
                    if mismatches:
                        errpagemsg(
                            "WARNING: Mismatch comparing old %s to new %s: %s" %
                            (orig_tempcall, new_tempcall, " || ".join(mismatches)))
                        continue

                # 5. If no mismatches, modify the template to contain the new args.
                new_params = paste_arg_sets(arg_sets, t, verb_type, rm_pres_stem,
                                            as_string=False,
                                            is_temp=tname(t) == "temp")
                del t.params[:]
                if tname(t) == "temp":
                    t.add("1", "ru-conj")
                for name, value in new_params:
                    t.add(name, value)

                # 6. Build up the save comment.
                orig_changed_params = paste_arg_sets(orig_arg_sets, t, verb_type,
                                                     rm_pres_stem=False,
                                                     as_string=True,
                                                     change_only=True)
                new_changed_params = paste_arg_sets(arg_sets, t, verb_type,
                                                    rm_pres_stem, as_string=True,
                                                    change_only=True)
                notes.append("ru-conj: normalized %s to %s" %
                             ("|".join(orig_changed_params),
                              "|".join(new_changed_params)))

        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
# End of table; other tables may follow if group: groups.append(group) if groups: render_groups(groups) groups = [] group = [] pfsuffix = None impfsuffix = None elif line == "-": if group: groups.append(group) group = [] elif " " not in line: group.append((combine_prefix(line, pfsuffix), combine_prefix(ru.make_unstressed(line), impfsuffix))) elif "!" in line: pf, impf = re.split(r"\s+", line) assert pf == "!" or impf == "!" if pf == "!": group.append(("* (no equivalent)", combine_prefix(ru.make_unstressed(impf), impfsuffix))) else: group.append((combine_prefix(pf, pfsuffix), "* (no equivalent)")) else: pf, impf = re.split(r"\s+", line) if pf.startswith("-") and impf.startswith("-"): pfsuffix = re.sub("^-", "", pf) impfsuffix = re.sub("^-", "", impf) continue def do_line(direc, aspect): links = []
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Output short adjectives in Wiktionary, ordered by frequency.

Reads two UTF-8 files, one entry per line:

* ``--wiktionary-short-adjs``: each line is stripped and passed through
  ``ru.make_unstressed`` to form a lookup key.
* ``--freq-adjs``: each stripped line is looked up among those keys.

Entries found in the frequency list are printed first, in frequency-list
order; the remaining Wiktionary entries follow in their original file order.
"""

import argparse
import codecs
from collections import OrderedDict

import rulib as ru

parser = argparse.ArgumentParser(description="Output short adjectives in Wiktionary, ordered by frequency.")
# Both files are needed for the script to do anything, so make argparse
# report a missing option instead of failing later inside codecs.open().
parser.add_argument("--freq-adjs", required=True,
    help=u"""Adjectives ordered by frequency, without accents or ё.""")
parser.add_argument("--wiktionary-short-adjs", required=True,
    help=u"""Adjectives in Wiktionary with short forms, in alphabetical order. Should be accented and with ё.""")
args = parser.parse_args()

# An OrderedDict keeps the Wiktionary file order for the leftovers printed
# at the end; the value is a dummy.
with codecs.open(args.wiktionary_short_adjs, "r", "utf-8") as wikt_file:
    short_adjs = OrderedDict(
        (ru.make_unstressed(x.strip()), True) for x in wikt_file)

with codecs.open(args.freq_adjs, "r", "utf-8") as freq_file:
    for line in freq_file:
        line = line.strip()
        if line in short_adjs:
            print(line.encode("utf-8"))
            # Remove so the entry is not printed again below (and so a
            # duplicate in the frequency list is printed only once).
            del short_adjs[line]

# Whatever is left did not occur in the frequency list.
for line in short_adjs:
    print(line.encode("utf-8"))
def process_page(index, page, save, verbose, fix_pages): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) def expand_text(tempcall): return blib.expand_text(tempcall, pagetitle, pagemsg, verbose) pagemsg("Processing") text = unicode(page.text) parsed = blib.parse(page) notes = [] for t in parsed.filter_templates(): tname = unicode(t.name) if tname in ["ru-conj", "ru-conj-old"]: if [x for x in t.params if unicode(x.value) == "or"]: pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t)) continue conjtype = getparam(t, "2") if tname == "ru-conj": tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t)) else: tempcall = re.sub(r"\{\{ru-conj-old", "{{ru-generate-verb-forms|old=y", unicode(t)) result = expand_text(tempcall) if not result: pagemsg("WARNING: Error generating forms, skipping") continue args = rulib.split_generate_args(result) for base in ["past_pasv_part", "ppp"]: forms_to_remove = [] if args[base] == "-": continue for form in re.split(",", args[base]): origform = form form = re.sub("//.*", "", form) fix_form = False if not re.search(ur"([аяеё]́?нный|тый)$", form): pagemsg("WARNING: Past passive participle doesn't end correctly: %s" % form) fix_form = True unstressed_page = rulib.make_unstressed(pagetitle) unstressed_form = rulib.make_unstressed(form) warned = False if unstressed_form[0] != unstressed_page[0]: pagemsg("WARNING: Past passive participle doesn't begin with same letter, probably for wrong aspect: %s" % form) warned = True fix_form = True if form.endswith(u"нный"): if pagetitle.endswith(u"ать"): good_ending = u"анный" elif pagetitle.endswith(u"ять"): good_ending = u"янный" else: good_ending = u"енный" if not unstressed_form.endswith(good_ending): pagemsg("WARNING: Past passive participle doesn't end right, probably for wrong aspect: %s" % form) warned = True fix_form = True if not warned: correct_form = form_ppp(conjtype, pagetitle, args) if correct_form and 
unstressed_form != correct_form: pagemsg("WARNING: Past passive participle not formed according to rule, probably wrong: found %s, expected %s" % (unstressed_form, correct_form)) fix_form = True if fix_form: forms_to_remove.append(origform) if forms_to_remove and pagetitle in fix_pages: curvals = [] for i in ["", "2", "3", "4", "5", "6", "7", "8", "9"]: val = getparam(t, base + i) if val: curvals.append(val) newvals = [x for x in curvals if x not in forms_to_remove] if len(curvals) - len(newvals) != len(forms_to_remove): pagemsg("WARNING: Something wrong, couldn't remove all PPP forms %s" % ",".join(forms_to_remove)) curindex = 1 origt = unicode(t) for newval in newvals: t.add(base + ("" if curindex == 1 else str(curindex)), newval) curindex += 1 for i in xrange(curindex, 10): rmparam(t, base + ("" if i == 1 else str(i))) pagemsg("Replacing %s with %s" % (origt, unicode(t))) notes.append("removed bad past pasv part(s) %s" % ",".join(forms_to_remove))