コード例 #1
# Rewrite class-7 {{ru-conj}} / {{ru-conj-old}} templates on one page.
#
# page   -- page object; page.title() is read for log lines and the object is
#           passed to blib.parse().
# index  -- value printed next to the page title in every log line.
# parsed -- incoming value is never read; the name is rebound to
#           blib.parse(page) below.
#
# Returns the tuple (unicode(parsed), notes).
#
# NOTE(review): this listing does not parse as written. Several lines appear
# to have been lost (each spot is marked inline); restore them from the
# upstream script before running.
def process_page(page, index, parsed):
    pagetitle = unicode(page.title())

    # Log helper: prefixes every message with the index and the page title.
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))


    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        # Snapshot of the template text, compared with the result further down.
        origt = unicode(t)
        if unicode(t.name) in ["ru-conj", "ru-conj-old"]:
            param1 = getparam(t, "1")
            param2 = getparam(t, "2")
            # NOTE(review): the `if` below has no body. Presumably a statement
            # that skips templates whose arg 2 does not start with "7" was
            # lost -- confirm against upstream.
            if not param2.startswith("7"):
            param3 = getparam(t, "3")
            param4 = getparam(t, "4")
            param5 = getparam(t, "5")
            # A non-empty sixth positional argument is not handled here.
            assert not getparam(t, "6")
            if param2.startswith("7b"):
                # NOTE(review): re.search() below receives only param3; the
                # pattern argument looks like it was lost -- confirm.
                if re.search(
                        param3) and u"ё" not in param4 and u"ѣ̈" not in param4:
                    assert not param5
                    param5 = u"ёе"
                # For 7b, arg 4 is passed through rulib.make_unstressed_ru.
                param4 = rulib.make_unstressed_ru(param4)
            # When arg 3 ends in one of the listed verbs (optionally followed
            # by "ся"), arg 5 is cleared.
            if re.search(u"(л[еѣ]́?зть|с[еѣ]́?сть|обокра́сть)(ся)?$", param3):
                param5 = ""
            # Fetch non-numbered params.
            non_numbered_params = []
            for param in t.params:
                pname = unicode(param.name)
                # NOTE(review): the list literal below is never closed; the
                # line carrying "]:" appears to be missing -- confirm.
                if not re.search(r"^[0-9]+$", pname) and pname not in [
                        "lang", "nocat", "tr"
                    non_numbered_params.append((pname, param.value))
            # Erase all params.
            del t.params[:]
            # Put back numbered params.
            t.add("1", param1)
            t.add("2", param2)
            t.add("3", param3)
            t.add("4", param4)
            if param5:
                t.add("5", param5)
            # Put back non-numbered params.
            for name, value in non_numbered_params:
                t.add(name, value)
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            # NOTE(review): the string below is dangling; presumably it was the
            # argument of a lost `notes.append(` line -- confirm.
                "rewrite class 7 verb to correspond to module changes")

    return unicode(parsed), notes
コード例 #2
def process_page(page, index, parsed):
  """Normalize class-8b ``ru-conj`` / ``ru-conj-old`` templates on one page.

  For each such template whose argument 2 starts with ``"8b"``, argument 4 is
  passed through ``rulib.make_unstressed_ru`` and the parameter list is
  rebuilt: arguments 1-4 (plus 5 when non-empty) first, followed by the
  remaining named parameters in their original order. Any other purely
  numeric parameter and the named parameters ``lang``, ``nocat`` and ``tr``
  are not put back.

  Args:
    page: Page object; ``page.title()`` is used in log lines and the object
      is passed to ``blib.parse``.
    index: Value printed next to the page title in log lines.
    parsed: Ignored; the page is re-parsed with ``blib.parse(page)``. Kept so
      the signature stays unchanged.

  Returns:
    Tuple ``(new_text, notes)``: the text of the re-parsed page after the
    rewrite, and one note string per template that changed.

  Raises:
    AssertionError: if a class-8b template has a non-empty argument 6.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj", "ru-conj-old"]:
      param1 = getparam(t, "1")
      param2 = getparam(t, "2")
      if not param2.startswith("8b"):
        # Only class 8b is rewritten; every other template is left untouched.
        # (This guard previously had no body, which was a syntax error.)
        continue
      param3 = getparam(t, "3")
      param4 = getparam(t, "4")
      param5 = getparam(t, "5")
      assert not getparam(t, "6")
      # Past-tense / past-adverbial-participle overrides are only reported;
      # they are carried over unchanged with the other named parameters.
      if getparam(t, "past_m"):
        errmsg("WARNING: Has past_m=%s" % getparam(t, "past_m"))
      pap = getparam(t, "pap") or getparam(t, "past_adv_part")
      if pap:
        errmsg("WARNING: Has pap=%s" % pap)
      pap2 = getparam(t, "pap2") or getparam(t, "past_adv_part2")
      if pap2:
        errmsg("WARNING: Has pap2=%s" % pap2)
      param4 = rulib.make_unstressed_ru(param4)
      # Fetch non-numbered params.
      non_numbered_params = []
      for param in t.params:
        pname = unicode(param.name)
        if not re.search(r"^[0-9]+$", pname) and pname not in ["lang", "nocat", "tr"]:
          non_numbered_params.append((pname, param.value))
      # Erase all params.
      del t.params[:]
      # Put back numbered params.
      t.add("1", param1)
      t.add("2", param2)
      t.add("3", param3)
      t.add("4", param4)
      if param5:
        t.add("5", param5)
      # Put back non-numbered params.
      for name, value in non_numbered_params:
        t.add(name, value)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))
      notes.append("rewrite class 8b verb to correspond to module changes")

  return unicode(parsed), notes
コード例 #3
 # Build one "* ..." wikitext list line of {{l|ru|...}} links from a
 # comma-separated verb spec.
 #
 # direc    -- comma-separated verb entries, or "-" for "no equivalent".
 # aspect   -- aspect string; compared with "impf" and interpolated into the
 #             "|g=" fragment when an entry starts with "+".
 # suffixes -- indexable by entry position; suffixes[index] is pasted onto an
 #             entry that ends in "-" (via paste_verb).
 #
 # NOTE(review): this listing does not parse as written and several lines
 # appear to be lost (marked inline). In particular the `for` loop and the
 # final `return` are indented under the early `return`, which suggests an
 # `else:` was dropped -- confirm against upstream.
 def do_line(direc, aspect, suffixes):
     links = []
     if direc == "-":
         return "* (no equivalent)"
         for index, verb in enumerate(re.split(",", direc)):
             gender = ""
             notes = []
             if verb:
                 # A trailing "]" is stripped here and checked again below,
                 # once the matching leading "[" has been found.
                 endbracket = False
                 if verb.endswith("]"):
                     endbracket = True
                     verb = verb[:-1]
                 # A trailing "-" marks a prefix to be combined with the
                 # suffix at the same position.
                 if verb.endswith("-"):
                     verb = verb[:-1]
                     if aspect == "impf":
                         verb = rulib.make_unstressed_ru(verb)
                     verb = paste_verb(verb, suffixes[index])
                 # Strip leading markers one at a time.
                 # NOTE(review): no branch leaves this loop as written;
                 # presumably a final `else: break` was lost -- confirm.
                 while True:
                     if verb.startswith("+"):
                         gender = "|g=%s" % aspect
                         verb = re.sub(r"^\+", "", verb)
                     elif verb.startswith("(i)"):
                         verb = re.sub(r"^\(i\)", "", verb)
                     elif verb.startswith("(n)"):
                         verb = re.sub(r"^\(n\)", "", verb)
                     elif verb.startswith("(lc)"):
                         notes.append("low colloquial")
                         verb = re.sub(r"^\(lc\)", "", verb)
                     elif verb.startswith("(d)"):
                         verb = re.sub(r"^\(d\)", "", verb)
                 if verb.startswith("["):
                     verb = verb[1:]
                     assert endbracket
                     # NOTE(review): the two format expressions below are
                     # dangling; presumably each was the argument of a lost
                     # `links.append(` line, the second under a lost `else:`
                     # -- confirm.
                         "[{{l|ru|%s%s}}]%s" %
                         (verb, gender, notes
                          and " {{i|%s}}" % ", ".join(notes) or ""))
                         "{{l|ru|%s%s}}%s" %
                         (verb, gender, notes
                          and " {{i|%s}}" % ", ".join(notes) or ""))
         return "* " + ", ".join(links)
コード例 #4
def form_ppp(conjtype, pagetitle, args):
  """Predict the past passive participle of a verb, with stress removed.

  Args:
    conjtype: Conjugation-type string. Only its leading run of digits (the
      class number) is used.
    pagetitle: The infinitive as it appears in the page title.
    args: Mapping of form names to form strings. The keys read here are
      ``pres_1sg``, ``futr_1sg``, ``pres_3sg``, ``futr_3sg`` and ``past_m``.
      A value may hold several comma-separated forms, each optionally
      followed by ``//`` and further text; only the first form, without the
      ``//`` part, is used.

  Returns:
    The predicted participle passed through ``rulib.make_unstressed_ru``, or
    ``None`` when no prediction is made: ``conjtype`` has no leading digits,
    the 1sg form (classes 4/5) is missing, empty, ``"-"`` or periphrastic, or
    ``past_m`` (classes 9, 11, 12, 14, 15, 16) is missing.

  Raises:
    AssertionError: if the class number is not one of those handled, or the
      1sg/3sg form does not have the expected ending.
    KeyError: for classes 7 and 8 when neither ``pres_3sg`` nor ``futr_3sg``
      is in ``args``.
  """
  def form_ppp_1(conjtype, pagetitle, args):
    # Returns the participle with any stress marks still in place, or None.
    def first_entry(forms):
      # Keep the first comma-separated form and drop any "//..." tail.
      forms = re.sub(",.*", "", forms)
      return re.sub("//.*", "", forms)
    if not re.search("^[0-9]+", conjtype):
      return None
    conjtype = int(re.sub("^([0-9]+).*", r"\1", conjtype))
    # Infinitives in -ать/-ять (any class but 14) and class-1 infinitives in
    # -еть: replace the final -ть with -нный.
    if ((pagetitle.endswith(u"ать") or pagetitle.endswith(u"ять")) and
        conjtype != 14):
      return re.sub(u"ть$", u"нный", pagetitle)
    if pagetitle.endswith(u"еть") and conjtype == 1:
      return re.sub(u"ть$", u"нный", pagetitle)
    if conjtype in [4, 5]:
      # Built from the 1sg form. Fall back to None when neither key is
      # present so the check below returns None instead of raising.
      sg1 = (
        args["pres_1sg"] if "pres_1sg" in args else
        args["futr_1sg"] if "futr_1sg" in args else
        None)
      if not sg1 or sg1 == "-" or sg1.startswith(u"бу́ду "):
        return None
      sg1 = first_entry(sg1)
      assert re.search(u"[ую]́?$", sg1)
      return re.sub(u"[ую]́?$", u"енный", sg1)
    if conjtype in [7, 8]:
      # Built from the 3sg form.
      sg3 = args["pres_3sg"] if "pres_3sg" in args else args["futr_3sg"]
      sg3 = first_entry(sg3)
      assert re.search(u"[её]́?т$", sg3)
      return re.sub(u"[её]́?т$", u"енный", sg3)
    if conjtype in [3, 10]:
      if pagetitle.endswith(u"чь"):
        return re.sub(u"чь", u"гнутый", pagetitle)
      return re.sub(u"ть$", u"тый", pagetitle)
    assert conjtype in [9, 11, 12, 14, 15, 16]
    if "past_m" not in args: # occurs with e.g. impersonal verbs e.g. спереть
      return None
    # Built from the masculine past: an optional final -л becomes -тый.
    pastm = first_entry(args["past_m"])
    return re.sub(u"л?$", u"тый", pastm)

  retval = form_ppp_1(conjtype, pagetitle, args)
  if retval:
    return rulib.make_unstressed_ru(retval)
  return None
コード例 #5
# Fold a `past_m` override of a class-8a/8b {{ru-conj}} template into its
# numbered arguments.
#
# page   -- page object; page.title() and page.text are read and the object
#           is passed to blib.parse().
# index  -- value printed next to the page title in every log line.
# parsed -- incoming value is never read; the name is rebound to
#           blib.parse(page) below.
#
# Returns the tuple (unicode(parsed), notes).
#
# NOTE(review): this listing does not parse as written. Several lines appear
# to be lost (marked inline); restore them from upstream before running.
def process_page(page, index, parsed):
    pagetitle = unicode(page.title())

    # Log helper: prefixes every message with the index and the page title.
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))


    # `text` is not read again in the visible code.
    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        origt = unicode(t)
        param2 = getparam(t, "2")
        if unicode(t.name) in ["ru-conj"] and re.search(r"^8[ab]", param2):
            # A parameter whose value is exactly "or" marks a multi-part
            # conjugation, which is only warned about.
            # NOTE(review): the pagemsg( call below is never closed;
            # presumably its argument and a following skip statement were
            # lost -- confirm.
            if [x for x in t.params if unicode(x.value) == "or"]:
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" %
            past_m = getparam(t, "past_m")
            if past_m:
                rmparam(t, "past_m")
                stem = getparam(t, "3")
                if stem == past_m:
                    # The override repeats arg 3, so it is simply dropped.
                    pagemsg("Stem %s and past_m same" % stem)
                    notes.append("remove redundant past_m %s" % past_m)
                elif (param2.startswith("8b") and not param2.startswith("8b/")
                      and rulib.make_unstressed_ru(past_m) == stem):
                    # NOTE(review): the format string below is dangling;
                    # presumably it was the argument of a lost `pagemsg(`
                    # line -- confirm.
                        "Class 8b/b and stem %s is unstressed version of past_m %s, replacing stem with past_m"
                        % (stem, past_m))
                    t.add("3", past_m)
                    notes.append("moving past_m %s to arg 3" % past_m)
                    # NOTE(review): the message below describes a different
                    # case from the `elif` above; presumably an `else:` and a
                    # `pagemsg(` line were lost here -- confirm.
                        "Stem %s and past_m %s are different, putting past_m in param 5"
                        % (stem, past_m))
                    t.add("5", past_m)
                    notes.append("moving past_m %s to arg 5" % past_m)
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
コード例 #6
def process_page(page, index, parsed):
  """Report whether past-tense overrides of class-7 templates are redundant.

  For every ``ru-conj-7a`` / ``ru-conj-7b`` template that has at least one of
  the overrides ``past_m``, ``past_f``, ``past_n``, ``past_pl``, the expected
  forms are built from argument 4 and compared with the overrides. If they
  all agree (``past_m`` may also be absent), a "would remove" message is
  logged; otherwise a warning listing the actual and expected forms is
  logged. The visible code only reports; it does not modify the templates.

  Args:
    page: Page object; ``page.title()`` is used in log lines and the object
      is passed to ``blib.parse``.
    index: Value printed next to the page title in log lines.
    parsed: Ignored; the page is re-parsed with ``blib.parse(page)``.

  Returns:
    Tuple ``(unicode(parsed), notes)``; ``notes`` is never appended to here.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")

  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj-7a", "ru-conj-7b"]:
      past_stem = getparam(t, "4")
      # Truthy when the past stem ends in one of the listed vowel letters or
      # in the combining acute accent.
      vowel_end = re.search(u"[аэыоуяеиёю́]$", past_stem)
      past_m = getparam(t, "past_m")
      past_f = getparam(t, "past_f")
      past_n = getparam(t, "past_n")
      past_pl = getparam(t, "past_pl")
      if past_m or past_f or past_n or past_pl:
        upast_stem = rulib.make_unstressed_ru(past_stem)
        # Masculine keeps the stem as given, adding "л" only after a vowel;
        # the other three use the unstressed stem plus a stressed ending.
        expected_past_m = past_stem + (u"л" if vowel_end else "")
        expected_past_f = upast_stem + u"ла́"
        expected_past_n = upast_stem + u"ло́"
        expected_past_pl = upast_stem + u"ли́"
        if ((not past_m or expected_past_m == past_m) and
            expected_past_f == past_f and
            expected_past_n == past_n and
            expected_past_pl == past_pl):
          msg("Would remove past overrides and add arg5=b")
        else:
          # Previously this warning was also emitted in the matching case,
          # contradicting the "would remove" message just above it.
          msg("WARNING: Remaining past overrides: past_m=%s, past_f=%s, past_n=%s, past_pl=%s, expected_past_m=%s, expected_past_f=%s, expected_past_n=%s, expected_past_pl=%s" %
              (past_m, past_f, past_n, past_pl, expected_past_m, expected_past_f, expected_past_n, expected_past_pl))
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
コード例 #7
ファイル: infer_ppp.py プロジェクト: benwing2/RuNounChanges
def form_ppp(verbtype, pagetitle, args):
    """Predict the past passive participle of a verb, with stress removed.

    ``verbtype`` is a conjugation-type string of which only the leading
    digits (the class number) matter; ``pagetitle`` is the infinitive;
    ``args`` maps form names (``pres_1sg``, ``futr_1sg``, ``pres_3sg``,
    ``futr_3sg``, ``past_m``) to form strings.

    Returns the predicted form passed through ``rulib.make_unstressed_ru``,
    or ``None`` when ``verbtype`` has no leading digits or the class-4/5 1sg
    form is empty or ``"-"``. A form that the rule needs but ``args`` lacks
    raises ``KeyError``; an unexpected class number or ending trips an
    ``assert``.
    """
    def head_form(forms):
        # First comma-separated alternative, minus any "//..." tail.
        return re.sub("//.*", "", re.sub(",.*", "", forms))

    def swap_ending(pattern, replacement, form):
        # The form must actually carry the ending that is being replaced.
        assert re.search(pattern, form)
        return re.sub(pattern, replacement, form)

    def stressed_ppp(typestring):
        if re.search("^[0-9]+", typestring) is None:
            return None
        classnum = int(re.sub("^([0-9]+).*", r"\1", typestring))

        # -ать/-ять outside class 14, and -еть in class 1: -ть becomes -нный.
        if pagetitle.endswith((u"ать", u"ять")) and classnum != 14:
            return re.sub(u"ть$", u"нный", pagetitle)
        if pagetitle.endswith(u"еть") and classnum == 1:
            return re.sub(u"ть$", u"нный", pagetitle)

        if classnum in (4, 5):
            # Built from the 1sg form, present if given, else future.
            first_sg = args["pres_1sg" if "pres_1sg" in args else "futr_1sg"]
            if not first_sg or first_sg == "-":
                return None
            return swap_ending(u"[ую]́?$", u"енный", head_form(first_sg))

        if classnum in (7, 8):
            # Built from the 3sg form, present if given, else future.
            third_sg = args["pres_3sg" if "pres_3sg" in args else "futr_3sg"]
            return swap_ending(u"[её]́?т$", u"енный", head_form(third_sg))

        if classnum in (3, 10):
            return re.sub(u"ть$", u"тый", pagetitle)

        # Remaining classes build on the masculine past: optional -л -> -тый.
        assert classnum in (9, 11, 12, 14, 15, 16)
        return re.sub(u"л?$", u"тый", head_form(args["past_m"]))

    stressed = stressed_ppp(verbtype)
    return rulib.make_unstressed_ru(stressed) if stressed else None
コード例 #8
ファイル: fix_3c.py プロジェクト: benwing2/RuNounChanges
def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))


  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj", "ru-conj-old"]:
      assert not getparam(t, "4")
      inf = getparam(t, "3")
      inf = rulib.make_unstressed_ru(inf)
      inf = re.sub(u"нуть((ся)?)$", ur"ну́ть\1", inf)
      t.add("3", inf)
      notes.append("Remove stray accent from 3c infinitive")
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return unicode(parsed), notes
コード例 #9
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse, codecs
import rulib
from collections import OrderedDict

# Command-line setup.
# NOTE(review): the two `help=` lines below are dangling keyword arguments;
# the calls that own them are missing from this listing. Presumably they were
# parser.add_argument(...) calls defining the `freq_adjs` and
# `wiktionary_short_adjs` attributes read further down -- confirm the exact
# option spelling against upstream.
parser = argparse.ArgumentParser(
    description="Output short adjectives in Wiktionary, ordered by frequency.")
    help=u"""Adjectives ordered by frequency, without accents or ё.""")
    help=u"""Adjectives in Wiktionary with short forms, in alphabetical order.
Should be accented and with ё.""")
args = parser.parse_args()

# Ordered set of the Wiktionary adjectives: each line of the file is stripped
# and passed through rulib.make_unstressed_ru, keeping file order.
short_adjs = OrderedDict(
    (rulib.make_unstressed_ru(x.strip()), True)
    for x in codecs.open(args.wiktionary_short_adjs, "r", "utf-8"))
# Print, in the order of the frequency file, every line that is also in
# short_adjs, removing it from short_adjs once printed.
for line in codecs.open(args.freq_adjs, "r", "utf-8"):
    line = line.strip()
    if line in short_adjs:
        print line.encode("utf-8")
        del short_adjs[line]
# Whatever is left was not matched by the frequency file; print it in the
# original order of the Wiktionary list.
for line in short_adjs:
    print line.encode("utf-8")
コード例 #10
# Check the past passive participle (PPP) forms generated for each
# {{ru-conj}} / {{ru-conj-old}} template on a page and, when do_fix is true,
# rebuild the PPP override parameters without the forms collected in
# forms_to_remove.
#
# page   -- page object; page.title() and page.text are read and the object
#           is passed to blib.parse().
# index  -- value printed next to the page title in every log line.
# do_fix -- when false, the template is left as it is.
#
# NOTE(review): this listing does not parse as written and stops inside an
# unclosed call, so the function's return value is not visible. Many lines
# appear to be lost (marked inline); restore them from upstream.
def process_page(page, index, do_fix):
    pagetitle = unicode(page.title())

    # Log helper: prefixes every message with the index and the page title.
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    # `verbose` is not defined in this function; presumably a module-level
    # name -- confirm.
    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)


    # `text` is not read again in the visible code.
    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname in ["ru-conj", "ru-conj-old"]:
            # NOTE(review): the pagemsg( call below is never closed;
            # presumably its argument and a skip statement were lost.
            if [x for x in t.params if unicode(x.value) == "or"]:
                pagemsg("WARNING: Skipping multi-arg conjugation: %s" %
            conjtype = getparam(t, "2")
            # Turn the conjugation template call into the matching
            # {{ru-generate-verb-forms}} call.
            # NOTE(review): the first re.sub( below is never closed;
            # presumably its last argument and an `else:` were lost.
            if tname == "ru-conj":
                tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms",
                tempcall = re.sub(r"\{\{ru-conj-old",
                                  "{{ru-generate-verb-forms|old=y", unicode(t))
            result = expand_text(tempcall)
            # NOTE(review): the message says "skipping" but nothing skips;
            # presumably a `continue` was lost -- confirm.
            if not result:
                pagemsg("WARNING: Error generating forms, skipping")
            args = blib.split_generate_args(result)
            for base in ["past_pasv_part", "ppp"]:
                forms_to_remove = []
                # NOTE(review): the `if` below has no body -- confirm what
                # should happen for "-".
                if args[base] == "-":
                for form in re.split(",", args[base]):
                    origform = form
                    # Drop any "//..." tail before checking the form.
                    form = re.sub("//.*", "", form)
                    fix_form = False
                    # NOTE(review): here and in the checks below, the warning
                    # string is dangling; presumably each was the argument of
                    # a lost `pagemsg(` line -- confirm.
                    if not re.search(ur"([аяеё]́?нный|тый)$", form):
                            "WARNING: Past passive participle doesn't end correctly: %s"
                            % form)
                        fix_form = True
                    unstressed_page = rulib.make_unstressed_ru(pagetitle)
                    unstressed_form = rulib.make_unstressed_ru(form)
                    warned = False
                    # First letters of the form and the page title are
                    # compared after both went through make_unstressed_ru.
                    if unstressed_form[0] != unstressed_page[0]:
                            "WARNING: Past passive participle doesn't begin with same letter, probably for wrong aspect: %s"
                            % form)
                        warned = True
                        fix_form = True
                    if form.endswith(u"нный"):
                        # Expected ending depends on the infinitive ending.
                        # NOTE(review): the third assignment overwrites the
                        # "янный" case as written; presumably an `else:` was
                        # lost -- confirm.
                        if pagetitle.endswith(u"ать"):
                            good_ending = u"анный"
                        elif pagetitle.endswith(u"ять"):
                            good_ending = u"янный"
                            good_ending = u"енный"
                        if not unstressed_form.endswith(good_ending):
                                "WARNING: Past passive participle doesn't end right, probably for wrong aspect: %s"
                                % form)
                            warned = True
                            fix_form = True
                    # Only compare with the rule-based form when none of the
                    # begin/end checks above has already set `warned`.
                    if not warned:
                        correct_form = form_ppp(conjtype, pagetitle, args)
                        if correct_form and unstressed_form != correct_form:
                                "WARNING: Past passive participle not formed according to rule, probably wrong: found %s, expected %s"
                                % (unstressed_form, correct_form))
                            fix_form = True
                    # NOTE(review): the `if` below has no body; presumably it
                    # recorded the form in forms_to_remove -- confirm.
                    if fix_form:
                if forms_to_remove and do_fix:
                    # Read the current values of base, base2 ... base9.
                    # NOTE(review): `if val:` below has no body; presumably
                    # it appended val to curvals -- confirm.
                    curvals = []
                    for i in ["", "2", "3", "4", "5", "6", "7", "8", "9"]:
                        val = getparam(t, base + i)
                        if val:
                    newvals = [x for x in curvals if x not in forms_to_remove]
                    if len(curvals) - len(newvals) != len(forms_to_remove):
                            "WARNING: Something wrong, couldn't remove all PPP forms %s"
                            % ",".join(forms_to_remove))
                    # Write the kept values back into consecutive slots, then
                    # remove the slots that are no longer needed.
                    # NOTE(review): the t.add( call below is never closed;
                    # presumably its value argument was lost.
                    curindex = 1
                    origt = unicode(t)
                    for newval in newvals:
                        t.add(base + ("" if curindex == 1 else str(curindex)),
                        curindex += 1
                    for i in xrange(curindex, 10):
                        rmparam(t, base + ("" if i == 1 else str(i)))
                    pagemsg("Replacing %s with %s" % (origt, unicode(t)))
                    # NOTE(review): the listing ends inside this call.
                    notes.append("removed bad past pasv part(s) %s" %
コード例 #11
def paste_verb(prefix, suffix):
    """Join a verb prefix and suffix into a single word.

    When ``rulib.is_stressed(prefix)`` is true, the suffix is first passed
    through ``rulib.make_unstressed_ru``; otherwise it is appended unchanged.
    Previously the first case was computed and then unconditionally
    overwritten by the plain concatenation, so it never took effect.

    Args:
        prefix: Leading part of the verb.
        suffix: Trailing part of the verb.

    Returns:
        The joined word after ``rulib.remove_monosyllabic_accents``.
    """
    if rulib.is_stressed(prefix):
        suffix = rulib.make_unstressed_ru(suffix)
    return rulib.remove_monosyllabic_accents(prefix + suffix)
コード例 #12
     groups = []
     group = []
     pfsuffixes = None
     impfsuffixes = None
 elif line == "-":
     if group:
     group = []
 elif " " not in line:
     # A single prefix; combine with previous suffixes.
     # If it starts with a + (indicating include the aspect), that applies
     # only to the perfective verb. See лететь.der for good examples.
     group.append((combine_prefix(line, pfsuffixes, "pf"),
                       rulib.make_unstressed_ru(line).replace("+", ""),
                       impfsuffixes, "impf")))
 elif re.search(r" \+$", line):
     # Something like "об +" or "+об +". This indicates that the imperfective
     # (and maybe the perfective) should include the aspect. See лететь.der
     # for good examples.
     pf, impf = re.split(r"\s+", line)
     assert impf == "+"
     group.append((combine_prefix(pf, pfsuffixes, "pf"),
                   combine_prefix("+" + rulib.make_unstressed_ru(pf),
                                  impfsuffixes, "impf")))
 elif "!" in line:
     # Something like "об !" or "+об !" or "! об" or "! +об". This indicates
     # that one of the two is missing and the other should combine with
     # previous suffixes, maybe with the aspect included (see лететь.der for
     # good examples of this).
コード例 #13
# For each {{ru-conj}} template on a verb page, derive the candidate verbal
# noun, agent noun and verbal adjective from the predicted past passive
# participle (form_ppp) and log an etymology line for each candidate found in
# `nouns` / `adjectives`.
#
# index      -- value printed next to the page title in every log line.
# page       -- page object; page.title() and page.text are read and the
#               object is passed to blib.parse().
# save       -- not read in the visible code.
# verbose    -- passed through to blib.expand_text().
# nouns      -- container tested with `in` for the derived noun spellings.
# adjectives -- container tested with `in` for the derived adjective spelling.
#
# No return statement is visible in this listing.
#
# NOTE(review): this listing does not parse as written. Several lines appear
# to be lost (marked inline); restore them from upstream before running.
def process_page(index, page, save, verbose, nouns, adjectives):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)


  # NOTE(review): the message says "Skipping" but execution continues;
  # presumably a `return` was lost -- confirm.
  if re.search(u"с[яь]$", pagetitle):
    pagemsg("Skipping reflexive verb")

  # `text` is not read again in the visible code.
  text = unicode(page.text)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-conj":
      # NOTE(review): each warning below is followed by normal processing as
      # written; presumably a skip statement was lost after some or all of
      # them -- confirm.
      if [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
      conjtype = getparam(t, "2")
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
      args = blib.split_generate_args(result)
      if "infinitive" not in args: # e.g. обнимать
        pagemsg("WARNING: No infinitive")
      infinitive = args["infinitive"]
      if "," in infinitive:
        pagemsg("WARNING: Infinitive has multiple forms: %s" % infinitive)
      if "//" in infinitive:
        pagemsg("WARNING: Infinitive has translit: %s" % infinitive)
      ppp = form_ppp(conjtype, pagetitle, args)
      # NOTE(review): the `if` below has no body; presumably a skip statement
      # for the no-participle case was lost -- confirm.
      if not ppp:
      # Derive the verbal noun and verbal adjective (and the suffix each one
      # ends in) from the participle ending.
      if ppp.endswith(u"тый"):
        verbal_noun = re.sub(u"тый$", u"тие", ppp)
        verbal_noun_suffix = u"тие"
        verbal_adj = re.sub(u"тый$", u"тельный", ppp)
        verbal_adj_suffix = u"тельный"
      elif ppp.endswith(u"ённый"):
        verbal_noun = re.sub(u"ённый$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"ённый$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      elif ppp.endswith(u"енный"):
        verbal_noun = re.sub(u"енный$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"енный$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
        # NOTE(review): the assert below cannot hold inside the "енный"
        # branch; presumably an `else:` was lost before it -- confirm.
        assert ppp.endswith(u"анный") or ppp.endswith(u"янный")
        verbal_noun = re.sub(u"нный$", u"ние", ppp)
        verbal_adj = re.sub(u"нный$", u"тельный", ppp)
        # The letter before "нный" becomes the first letter of both suffixes.
        m = re.search(u"(.)нный$", ppp)
        suffix_start = m.group(1)
        verbal_noun_suffix = suffix_start + u"ние"
        verbal_adj_suffix = suffix_start + u"тельный"
      # The agent noun is the verbal adjective without its final "ный".
      agent_noun = re.sub(u"ный$", "", verbal_adj)
      agent_noun_suffix = re.sub(u"ный$", "", verbal_adj_suffix)
      # Stressed variants: an acute accent is inserted after an initial
      # а/я/е/и of the suffix.
      stressed_verbal_noun_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_noun_suffix)
      stressed_verbal_adj_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_adj_suffix)
      stressed_agent_noun_suffix = re.sub(u"ный$", "", stressed_verbal_adj_suffix)
      # Build the stem text used in the logged etymology lines.
      # NOTE(review): in this chain each branch overwrites the value it has
      # just computed; presumably `else:` lines were lost -- confirm.
      if conjtype.startswith("7"):
        stem = getparam(t, "4")
        if infinitive.endswith(u"ть"):
          stem = stem.replace(u"ё", u"е́")
          stem = rulib.make_unstressed_ru(stem)
        stem = rulib.remove_accents(infinitive) + "+alt1=" + stem + "-"
      elif conjtype.startswith("8"):
        stem = rulib.remove_accents(infinitive) + "+alt1=" + getparam(t, "3").replace(u"ё", u"е́") + "-"
        stem = rulib.remove_monosyllabic_accents(infinitive)

      # find_noun / find_adj results are compared with -1 and tested for
      # falsiness here; what those values mean is defined by those helpers.
      # NOTE(review): in the three sections below the "already has one"
      # message is followed directly by the no-etym output; presumably
      # `else:` lines were lost -- confirm.
      if verbal_noun in nouns:
        stressed_noun = find_noun(verbal_noun, pagemsg, errandpagemsg, expand_text)
        if not stressed_noun:
          msg("%s no-etym FIXME" % verbal_noun)
        elif stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_noun)
          if stressed_noun.endswith(stressed_verbal_noun_suffix):
            suffix = stressed_verbal_noun_suffix
            suffix = verbal_noun_suffix
          msg("%s %s+-%s no-etym verbal-noun" % (verbal_noun, stem, suffix))

      if agent_noun in nouns:
        stressed_noun = find_noun(agent_noun, pagemsg, errandpagemsg, expand_text)
        if stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % agent_noun)
          msg(u"%s %s+-тель no-etym agent-noun" % (agent_noun, stem))

      if verbal_adj in adjectives:
        stressed_adj = find_adj(verbal_adj, pagemsg, errandpagemsg, expand_text)
        if stressed_adj == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_adj)
          msg(u"%s %s+-тельный no-etym verbal-adj" % (verbal_adj, stem))