def combine_prefix(prefix, suffix):
  if ru.is_stressed(prefix):
    verb = prefix + ru.make_unstressed(suffix)
  else:
    verb = prefix + suffix
  verb = ru.remove_monosyllabic_accents(verb)
  return "* {{l|ru|" + verb + "}}"
def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  notes = []

  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") in [
        "verb", "verbs", "adjective", "adjectives"]:
      pos = getparam(t, "2")
      if pos in ["verb", "verbs"]:
        newtn = "bg-verb"
      else:
        newtn = "bg-adj"
      params = []
      for param in t.params:
        pname = unicode(param.name).strip()
        pval = unicode(param.value).strip()
        showkey = param.showkey
        if (pname not in ["1", "2", "head", "g"] or
            pname == "g" and (newtn != "bg-adj" or pval != "m")):
          pagemsg("WARNING: head|bg|%s with extra param %s=%s: %s" % (pos, pname, pval, origt))
          break
      else: # no break
        rmparam(t, "1")
        rmparam(t, "2")
        rmparam(t, "g")
        head = getparam(t, "head")
        rmparam(t, "head")
        blib.set_template_name(t, newtn)
        t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
        notes.append("convert {{head|bg|%s}} into {{%s}}" % (pos, newtn))
    elif tn == "bg-verb" or tn == "bg-adj":
      if tn == "bg-adj":
        g = getparam(t, "g")
        if g and g != "m":
          pagemsg("WARNING: Saw g=%s in %s" % (g, origt))
          continue
        if t.has("g"):
          rmparam(t, "g")
          notes.append("remove g=%s from {{%s}}" % (g, tn))
      head = getparam(t, "head") or getparam(t, "1")
      rmparam(t, "head")
      rmparam(t, "1")
      a = getparam(t, "a") or getparam(t, "2")
      rmparam(t, "a")
      rmparam(t, "2")
      if a in ["impf-pf", "pf-impf", "dual", "ip", "both"]:
        a = "both"
      elif a and a not in ["impf", "pf"]:
        pagemsg("WARNING: Unrecognized aspect %s in %s" % (a, origt))
      params = []
      for param in t.params:
        pname = unicode(param.name).strip()
        pval = unicode(param.value).strip()
        showkey = param.showkey
        if not pval:
          continue
        params.append((pname, pval, showkey))
      # Erase all params.
      del t.params[:]
      # Put back new params.
      t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
      notes.append("move head= to 1= in {{%s}}" % tn)
      if a:
        t.add("2", a)
        notes.append("move a= to 2= in {{%s}}" % tn)
      for pname, pval, showkey in params:
        t.add(pname, pval, showkey=showkey, preserve_spacing=False)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return parsed, notes
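# Sketch of what the conversion above does (the example headwords are illustrative,
# not taken from a real page):
#   {{head|bg|verb|head=гово́ря}}        ->  {{bg-verb|гово́ря}}
#   {{head|bg|adjective|head=бял|g=m}}  ->  {{bg-adj|бял}}
# and for templates already using {{bg-verb}}/{{bg-adj}}, head= moves to 1= and
# a= (aspect) moves to 2=.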
Example #3
def process_page(page, index, parsed):
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errpagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))
        errmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        origt = unicode(t)
        if tname(t) in [
                "ru-conj", "ru-conj-old", "User:Benwing2/ru-conj",
                "User:Benwing2/ru-conj-old"
        ] or tname(t) == "temp" and getparam(t, "1") == "ru-conj":
            verb_type, arg_sets = split_ru_conj_args(t, tname(t) == "temp")
            refl = "refl" in verb_type
            orig_arg_sets = copy.deepcopy(arg_sets)
            rm_pres_stem = False

            ##### First, modify arg_sets according to normalized params
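            # Illustration of the normalization below (the input is assumed, not
            # taken from a real page): for conj type "1a" with arg_set[1] = u"занима́"
            # on a reflexive verb type, the stem is extended to занима́ть and then
            # to занима́ться, so arg_set[1] ends up holding the full infinitive.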

            for arg_set in arg_sets:
                # This complex spec matches 3°a, 3oa, 4a1a, 6c1a,
                # 1a6a, 6a1as13, 6a1as14, etc.
                m = re.search(u"^([0-9]+[°o0-9abc]*[abc]s?1?[34]?)",
                              arg_set[0])
                if not m:
                    m = re.search(
                        u"^(irreg-?[абцдеѣфгчийклмнопярстувшхызёюжэщьъ%-]*)",
                        arg_set[0])
                    if not m:
                        errpagemsg("Unrecognized conjugation type: %s" %
                                   arg_set[0])
                        continue
                conj_type = m.group(1).replace("o", u"°")
                inf, tr = rulib.split_russian_tr(arg_set[1])
                if refl:
                    new_style = re.search(u"([тч]ься|ти́?сь)$", inf)
                else:
                    new_style = re.search(
                        u"([тч]ь|ти́?)$" if conj_type.startswith("7")
                        or conj_type.startswith("irreg") else u"[тч]ь$", inf)
                if new_style:
                    if arg_set[0].startswith("irreg-"):
                        arg_set[0] = re.sub("^irreg-.*?(/.*|$)", r"irreg\1",
                                            arg_set[0])
                    arg_set[1] = rulib.paste_russian_tr(
                        rulib.remove_monosyllabic_accents(inf),
                        rulib.remove_tr_monosyllabic_accents(tr))
                else:
                    if not re.search("^[124]", conj_type):
                        assert not tr
                    if conj_type in ["1a", "2a", "2b"]:
                        inf += u"ть"
                        if tr:
                            tr += u"tʹ"
                    elif conj_type in ["3a", u"3°a"]:
                        inf += u"нуть"
                    elif conj_type in ["3b", u"3c"]:
                        inf += u"у́ть"
                    elif conj_type == "4a":
                        inf += u"ить"
                        if tr:
                            tr += u"itʹ"
                    elif conj_type in ["4b", "4c"]:
                        inf, tr = rulib.make_unstressed(
                            inf, rulib.decompose(tr))
                        inf += u"ить"
                        if tr:
                            tr += u"ítʹ"
                    elif conj_type == "4a1a":
                        inf = re.sub(u"[ая]$", "", inf) + u"ить"
                        if tr:
                            tr = re.sub("j?a$", "", tr) + u"itʹ"
                    elif conj_type == "5a":
                        inf = arg_set[2] + u"ть" if arg_set[
                            2] else arg_set[1] + u"еть"
                        normal_pres_stem = re.sub(u"[еая]ть$", "", inf)
                        if normal_pres_stem == arg_set[1]:
                            arg_set[2] = ""
                        else:
                            arg_set[2] = arg_set[1]
                    elif conj_type == "5b":
                        inf = arg_set[2] + u"ть"
                        normal_pres_stem = re.sub(u"[еая]́ть$", "", inf)
                        if normal_pres_stem == arg_set[1]:
                            arg_set[2] = ""
                        else:
                            arg_set[2] = arg_set[1]
                    elif conj_type == "5c":
                        inf = arg_set[2] + u"ть"
                        normal_pres_stem = rulib.make_ending_stressed_ru(
                            re.sub(u"[еая]́ть$", "", inf))
                        if normal_pres_stem == arg_set[1]:
                            arg_set[2] = ""
                        else:
                            arg_set[2] = arg_set[1]
                    elif re.search(u"^6°?a", conj_type) or conj_type == "1a6a":
                        assert not arg_set[3]
                        if arg_set[2]:
                            inf = arg_set[2] + u"ть"
                            arg_set[2] = ""
                            normal_pres_stem = rulib.make_ending_stressed_ru(
                                re.sub(u"а́ть$", "", inf))
                            assert arg_set[1] == normal_pres_stem
                        elif is_vowel_stem(inf):
                            inf += u"ять"
                        else:
                            inf += u"ать"
                        if getparam(t, "pres_stem"):
                            arg_set[2] = getparam(t, "pres_stem")
                            rm_pres_stem = True
                    elif re.search(u"^6°?b", conj_type):
                        if is_vowel_stem(inf):
                            inf += u"я́ть"
                        else:
                            inf += u"а́ть"
                        # arg_set[2] (present stem) remains
                    elif re.search(u"^6°?c", conj_type):
                        inf = rulib.make_unstressed_once_ru(inf) + u"а́ть"
                    elif conj_type in ["7a", "7b"]:
                        pass  # nothing to do
                    elif conj_type in ["8a", "8b"]:
                        inf = arg_set[2]
                        arg_set[2] = arg_set[1]
                    elif conj_type == "9a":
                        inf += u"еть"
                        # arg_set[2] (present stem) remains
                    elif conj_type == "9b":
                        inf = rulib.make_unstressed_once_ru(inf) + u"е́ть"
                        # arg_set[2] (present stem) remains
                        # arg_set[3] (optional past participle stem) remains
                    elif conj_type == "10a":
                        inf += u"оть"
                    elif conj_type == "10c":
                        inf += u"ть"
                        if rulib.make_unstressed_once_ru(arg_set[2]) == re.sub(
                                u"о́$", "", arg_set[1]):
                            arg_set[2] = ""
                    elif conj_type == "11a":
                        inf += u"ить"
                    elif conj_type == "11b":
                        inf += u"и́ть"
                        if arg_set[2] == arg_set[1]:
                            arg_set[2] = ""
                    elif conj_type == "12a":
                        inf += u"ть"
                        if arg_set[2] == arg_set[1]:
                            arg_set[2] = ""
                    elif conj_type == "12b":
                        inf += u"ть"
                        if rulib.make_ending_stressed_ru(
                                arg_set[2]) == arg_set[1]:
                            arg_set[2] = ""
                    elif conj_type == "13b":
                        inf += u"ть"
                        assert re.sub(u"ва́ть$", "", inf) == arg_set[2]
                        arg_set[2] = ""
                    elif conj_type in ["14a", "14b", "14c"]:
                        inf += u"ть"
                        # arg_set[2] (present stem) remains
                    elif conj_type in ["15a", "16a", "16b"]:
                        inf += u"ть"
                    elif conj_type == u"irreg-минуть":
                        inf = u"мину́ть"
                    elif conj_type == u"irreg-живописать-миновать":
                        inf += u"ть"
                        arg_set[2] = ""
                    elif conj_type == u"irreg-слыхать-видать":
                        inf += u"ть"
                    elif conj_type == u"irreg-стелить-стлать":
                        inf = arg_set[2] + inf + u"ть"
                        arg_set[2] = ""
                        arg_set[3] = ""
                    elif conj_type == u"irreg-ссать-сцать":
                        assert arg_set[2] == re.sub(u"а́$", "", inf)
                        inf = arg_set[3] + inf + u"ть"
                        arg_set[2] = ""
                        arg_set[3] = ""
                    elif conj_type in [
                            u"irreg-сыпать", u"irreg-ехать", u"irreg-ѣхать"
                    ]:
                        infstem = re.sub("^irreg-", "", conj_type)
                        if arg_set[1] != u"вы́":
                            infstem = rulib.make_beginning_stressed_ru(infstem)
                        inf = arg_set[1] + infstem
                    elif conj_type == u"irreg-обязывать":
                        if arg_set[1] == u"вы́":
                            inf = u"вы́обязывать"
                        else:
                            inf = arg_set[1] + u"обя́зывать"
                    elif conj_type == u"irreg-зиждиться":
                        if arg_set[1] == u"вы́":
                            inf = u"вы́зиждить"
                        else:
                            inf = arg_set[1] + u"зи́ждить"
                    elif conj_type == u"irreg-идти":
                        if not arg_set[1]:
                            inf = u"идти́"
                        elif arg_set[1] == u"вы́":
                            inf = u"вы́йти"
                        else:
                            inf = arg_set[1] + u"йти́"
                    elif re.search("^irreg-", conj_type):
                        infstem = re.sub("^irreg-", "", conj_type)
                        if arg_set[1] != u"вы́":
                            infstem = rulib.make_ending_stressed_ru(infstem)
                        inf = arg_set[1] + infstem
                    else:
                        error("Unknown conjugation type " + conj_type)
                    if inf:
                        if refl:
                            if re.search(u"[тч]ь$", inf):
                                inf += u"ся"
                                if tr:
                                    tr += "sja"
                            else:
                                assert re.search(u"и́?$", inf)
                                inf += u"сь"
                                if tr:
                                    tr += u"sʹ"
                        arg_set[1] = rulib.paste_russian_tr(
                            rulib.remove_monosyllabic_accents(inf),
                            rulib.remove_tr_monosyllabic_accents(tr))

            ##### If something changed ...

            if orig_arg_sets != arg_sets or rm_pres_stem:

                ##### ... compare the forms generated by the original and new
                ##### arguments and make sure they're the same.

                if not pagetitle.startswith("User:Benwing2/"):
                    # 1. Generate and expand the appropriate call to
                    #    {{ru-generate-verb-forms}} for the original arguments.

                    orig_args = paste_arg_sets(orig_arg_sets,
                                               t,
                                               verb_type,
                                               rm_pres_stem=False,
                                               as_string=True)
                    orig_tempcall = "{{ru-generate-verb-forms|%s%s}}" % (
                        "|".join(orig_args),
                        "|old=1" if tname(t).endswith("ru-conj-old") else "")
                    orig_result = expand_text(orig_tempcall)
                    if not orig_result:
                        errpagemsg(
                            "WARNING: Error expanding original template %s" %
                            orig_tempcall)
                        continue
                    orig_forms = blib.split_generate_args(orig_result)

                    # 2. Generate and expand the appropriate call to
                    #    {{ru-generate-verb-forms}} for the new arguments.

                    new_args = paste_arg_sets(arg_sets,
                                              t,
                                              verb_type,
                                              rm_pres_stem,
                                              as_string=True)
                    new_tempcall = "{{ru-generate-verb-forms|%s%s}}" % (
                        "|".join(new_args),
                        "|old=1" if tname(t).endswith("ru-conj-old") else "")
                    new_result = expand_text(new_tempcall)
                    if not new_result:
                        errpagemsg("WARNING: Error expanding new template %s" %
                                   new_tempcall)
                        continue
                    new_forms = blib.split_generate_args(new_result)

                    # 3. Compare each form and accumulate a list of mismatches.

                    all_keys = set(orig_forms.keys()) | set(new_forms.keys())

                    def sort_numbers_first(key):
                        if re.search("^[0-9]+$", key):
                            return "%05d" % int(key)
                        return key
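                    # e.g. sort_numbers_first("2") -> "00002", so purely numeric
                    # form keys sort in numeric order and ahead of alphabetic keys.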

                    all_keys = sorted(list(all_keys), key=sort_numbers_first)
                    mismatches = []
                    for key in all_keys:
                        origval = orig_forms.get(key, "<<missing>>")
                        newval = new_forms.get(key, "<<missing>>")
                        if origval != newval:
                            mismatches.append("%s: old=%s new=%s" %
                                              (key, origval, newval))

                    # 4. If mismatches, output them and don't change anything.

                    if mismatches:
                        errpagemsg(
                            "WARNING: Mismatch comparing old %s to new %s: %s"
                            % (orig_tempcall, new_tempcall,
                               " || ".join(mismatches)))
                        continue

                # 5. If no mismatches, modify the template to contain the new args.

                new_params = paste_arg_sets(arg_sets,
                                            t,
                                            verb_type,
                                            rm_pres_stem,
                                            as_string=False,
                                            is_temp=tname(t) == "temp")
                del t.params[:]
                if tname(t) == "temp":
                    t.add("1", "ru-conj")
                for name, value in new_params:
                    t.add(name, value)

                # 6. Build up the save comment.

                orig_changed_params = paste_arg_sets(orig_arg_sets,
                                                     t,
                                                     verb_type,
                                                     rm_pres_stem=False,
                                                     as_string=True,
                                                     change_only=True)
                new_changed_params = paste_arg_sets(arg_sets,
                                                    t,
                                                    verb_type,
                                                    rm_pres_stem,
                                                    as_string=True,
                                                    change_only=True)
                notes.append("ru-conj: normalized %s to %s" %
                             ("|".join(orig_changed_params),
                              "|".join(new_changed_params)))

            newt = unicode(t)
            if origt != newt:
                pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
Example #4
def remove_monosyllabic_accents(ru, tr):
    return rulib.remove_monosyllabic_accents(
        ru), rulib.remove_tr_monosyllabic_accents(tr)
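# Convenience wrapper applying the Russian-text and transliteration variants in one
# call; e.g. (hypothetical values) remove_monosyllabic_accents(u"сло́н", u"slón")
# would return (u"слон", u"slon"), assuming the underlying rulib functions strip
# accents from monosyllabic words as their names suggest.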
def process_page(index, page, save, verbose, adverbs, all_derived_lemmas):
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    # ending and whether final consonant is palatal
    endings = [
        (u"ывать", False),
        (u"ивать", False),
        (u"ать", False),
        (u"ять", True),
        (u"еть", True),
        (u"ить", True),
        (u"нуть", False),
        (u"ия", True),
        (u"ие", True),
        (u"я", True),
        (u"е", True),
        (u"ь", True),
        (u"и", True),
        (u"а", False),
        (u"о", False),
        (u"ы", False),
        (u"ый", False),
        (u"ий", True),
        (u"ой", False),
    ]
    stems = []
    for ending, is_palatal in endings:
        if pagetitle.endswith(ending):
            stem = re.sub(ending + "$", "", pagetitle)
            stems.append((stem, is_palatal))
    if not stems:
        stems.append((pagetitle, False))
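    # For example, a pagetitle like u"книга" matches only the (u"а", False) ending
    # and yields the single stem candidate (u"книг", False); a title matching
    # several endings gets one candidate per matching ending.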
    possible = []

    def append_possible(stem_to_try, suffix):
        possible.append((stem_to_try.lower() + suffix, suffix))

    # Try -ный/-ной, -ка, -ко
    for stem, palatal in stems:
        stems_to_try = []

        def frob(stem):
            stem = first_palatalization(stem)
            if stem.endswith(u"л"):
                stem += u"ь"
            if re.search("[" + rulib.vowel + "]$", stem):
                stem += u"й"
            return stem

        to_try_1 = frob(stem)
        to_try_2 = rulib.dereduce_stem(stem, False)
        if to_try_2:
            to_try_2 = frob(rulib.remove_accents(to_try_2))
        to_try_3 = rulib.dereduce_stem(stem, True)
        if to_try_3:
            to_try_3 = frob(rulib.remove_accents(to_try_3))
        stems_to_try.append(to_try_1)
        if to_try_2:
            stems_to_try.append(to_try_2)
        if to_try_3 and to_try_3 != to_try_2:
            stems_to_try.append(to_try_3)
        for stem_to_try in stems_to_try:
            append_possible(stem_to_try, u"ный")
            append_possible(stem_to_try, u"ной")
            append_possible(stem_to_try, u"ский")
            append_possible(stem_to_try, u"ской")
            append_possible(stem_to_try, u"ник")
            append_possible(stem_to_try, u"чик")
            append_possible(stem_to_try, u"щик")
            append_possible(stem_to_try, u"ка")
            append_possible(stem_to_try, u"ко")
            append_possible(stem_to_try, u"ство")
    # Try -овый/-евый/-ёвый/-овой/-евой, -ик, -ок/-ек/-ёк
    for stem, palatal in stems:
        stems_to_try = []
        stems_to_try.append(stem)
        reduced = rulib.reduce_stem(stem)
        if reduced:
            stems_to_try.append(reduced)
        for stem_to_try in stems_to_try:
            if stem_to_try.endswith(u"й"):
                stem_to_try = stem_to_try[:-1]
            append_possible(stem_to_try, u"овый")
            append_possible(stem_to_try, u"евый")
            append_possible(stem_to_try, u"ёвый")
            append_possible(stem_to_try, u"овой")
            append_possible(stem_to_try, u"евой")
            stem_to_try = first_palatalization(stem_to_try)
            append_possible(stem_to_try, u"еский")
            append_possible(stem_to_try, u"ический")
            append_possible(stem_to_try, u"ество")
            append_possible(stem_to_try, u"ик")
            append_possible(stem_to_try, u"ок")
            append_possible(stem_to_try, u"ек")
            append_possible(stem_to_try, u"ёк")
            append_possible(stem_to_try, u"ец")
    # If derived adverbs, try -о, -е, -и
    if adverbs:
        for stem, palatal in stems:
            stems_to_try = []
            stems_to_try.append(stem)
            for stem_to_try in stems_to_try:
                append_possible(stem_to_try, u"о")
                append_possible(stem_to_try, u"е")
                append_possible(stem_to_try, u"и")

    would_output = False
    for possible_derived, suffix in possible:
        if possible_derived in all_derived_lemmas:
            would_output = True
    if not would_output:
        return

    text = unicode(page.text)

    if rulib.check_for_alt_yo_terms(text, pagemsg):
        return

    base_lemmas = []

    for possible_derived, suffix in possible:
        if possible_derived in all_derived_lemmas:
            derived_section = blib.find_lang_section(possible_derived,
                                                     "Russian", pagemsg,
                                                     errandpagemsg)
            if not derived_section:
                errandpagemsg(
                    "WARNING: Couldn't find Russian section for derived term %s"
                    % possible_derived)
                continue
            if "==Etymology" in derived_section:
                pagemsg(
                    "Skipping derived term %s because it already has an etymology"
                    % possible_derived)
                continue
            derived_defns = rulib.find_defns(derived_section)
            if not derived_defns:
                errandpagemsg(
                    "WARNING: Couldn't find definitions for derived term %s" %
                    possible_derived)
                continue

            derived_parsed = blib.parse_text(derived_section)
            derived_lemmas = find_noun_lemmas(
                derived_parsed, possible_derived,
                errandpagemsg, lambda tempcall: blib.expand_text(
                    tempcall, possible_derived, pagemsg, verbose))
            for t in derived_parsed.filter_templates():
                if tname(t) in ["ru-adj", "ru-adv"]:
                    lemmas = blib.fetch_param_chain(t, "1", "head",
                                                    possible_derived)
                    trs = blib.fetch_param_chain(t, "tr", "tr")
                    if trs:
                        lemmas = [
                            "%s//%s" % (lemma, tr)
                            for lemma, tr in zip(lemmas, trs)
                        ]
                    for lemma in lemmas:
                        add_if_not(derived_lemmas, lemma)

            if not derived_lemmas:
                errandpagemsg("WARNING: No derived term lemmas for %s" %
                              possible_derived)
                return

            if not base_lemmas:
                base_parsed = blib.parse_text(text)
                base_lemmas = find_noun_lemmas(base_parsed, pagetitle,
                                               errandpagemsg, expand_text)

                for t in base_parsed.filter_templates():
                    if tname(t) in ["ru-verb", "ru-adj"]:
                        lemmas = blib.fetch_param_chain(
                            t, "1", "head", pagetitle)
                        trs = blib.fetch_param_chain(t, "tr", "tr")
                        if trs:
                            lemmas = [
                                "%s//%s" % (lemma, tr)
                                for lemma, tr in zip(lemmas, trs)
                            ]
                        for lemma in lemmas:
                            add_if_not(base_lemmas, lemma)

                if not base_lemmas:
                    errandpagemsg("WARNING: No base lemmas")
                    return

                base_lemmas = [
                    rulib.remove_monosyllabic_accents(x) for x in base_lemmas
                ]

                warnings = []
                if len(base_lemmas) > 1:
                    warnings.append("multiple-lemmas")
                if any("//" in lemma for lemma in base_lemmas):
                    warnings.append("translit-in-lemma")

                base_section = blib.find_lang_section_from_text(
                    text, "Russian", pagemsg)
                if not base_section:
                    errandpagemsg(
                        "WARNING: Couldn't find Russian section for base")
                    return

                base_defns = rulib.find_defns(base_section)
                if not base_defns:
                    errandpagemsg(
                        "WARNING: Couldn't find definitions for base")
                    return

            def concat_defns(defns):
                return ";".join(defns).replace("_", r"\u").replace(" ", "_")

            suffixes_with_stress = []
            for suf in [
                    suffix,
                    rulib.make_beginning_stressed_ru(suffix),
                    rulib.make_ending_stressed_ru(suffix)
            ]:
                for derived_lemma in derived_lemmas:
                    if derived_lemma.endswith(suf):
                        add_if_not(suffixes_with_stress, suf)
            msg("%s %s+-%s%s no-etym possible-suffixed %s //// %s" %
                (",".join(derived_lemmas), ",".join(base_lemmas),
                 ",".join(suffixes_with_stress),
                 " WARNING:%s" % ",".join(warnings) if warnings else "",
                 concat_defns(base_defns), concat_defns(derived_defns)))
def paste_verb(prefix, suffix):
    if rulib.is_stressed(prefix):
        verb = prefix + rulib.make_unstressed_ru(suffix)
    else:
        verb = prefix + suffix
    return rulib.remove_monosyllabic_accents(verb)
Example #7
def process_page(index, page, save, verbose, nouns, adjectives):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  if re.search(u"с[яь]$", pagetitle):
    pagemsg("Skipping reflexive verb")
    return

  text = unicode(page.text)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-conj":
      if [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      conjtype = getparam(t, "2")
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      args = blib.split_generate_args(result)
      if "infinitive" not in args: # e.g. обнимать
        pagemsg("WARNING: No infinitive")
        continue
      infinitive = args["infinitive"]
      if "," in infinitive:
        pagemsg("WARNING: Infinitive has multiple forms: %s" % infinitive)
        continue
      if "//" in infinitive:
        pagemsg("WARNING: Infinitive has translit: %s" % infinitive)
        continue
      ppp = form_ppp(conjtype, pagetitle, args)
      if not ppp:
        continue
      if ppp.endswith(u"тый"):
        verbal_noun = re.sub(u"тый$", u"тие", ppp)
        verbal_noun_suffix = u"тие"
        verbal_adj = re.sub(u"тый$", u"тельный", ppp)
        verbal_adj_suffix = u"тельный"
      elif ppp.endswith(u"ённый"):
        verbal_noun = re.sub(u"ённый$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"ённый$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      elif ppp.endswith(u"енный"):
        verbal_noun = re.sub(u"енный$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"енный$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      else:
        assert ppp.endswith(u"анный") or ppp.endswith(u"янный")
        verbal_noun = re.sub(u"нный$", u"ние", ppp)
        verbal_adj = re.sub(u"нный$", u"тельный", ppp)
        m = re.search(u"(.)нный$", ppp)
        suffix_start = m.group(1)
        verbal_noun_suffix = suffix_start + u"ние"
        verbal_adj_suffix = suffix_start + u"тельный"
      agent_noun = re.sub(u"ный$", "", verbal_adj)
      agent_noun_suffix = re.sub(u"ный$", "", verbal_adj_suffix)
      stressed_verbal_noun_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_noun_suffix)
      stressed_verbal_adj_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_adj_suffix)
      stressed_agent_noun_suffix = re.sub(u"ный$", "", stressed_verbal_adj_suffix)
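      # Worked example of the candidate forms computed above (assuming form_ppp
      # returned this participle): ppp = u"напи́санный" gives the candidate verbal
      # noun напи́сание, verbal adjective напи́сательный and agent noun напи́сатель;
      # these are only lookup candidates and are checked against the nouns/adjectives
      # lists below before anything is output.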
      if conjtype.startswith("7"):
        stem = getparam(t, "4")
        if infinitive.endswith(u"ть"):
          stem = stem.replace(u"ё", u"е́")
        else:
          stem = rulib.make_unstressed_ru(stem)
        stem = rulib.remove_accents(infinitive) + "+alt1=" + stem + "-"
      elif conjtype.startswith("8"):
        stem = rulib.remove_accents(infinitive) + "+alt1=" + getparam(t, "3").replace(u"ё", u"е́") + "-"
      else:
        stem = rulib.remove_monosyllabic_accents(infinitive)

      if verbal_noun in nouns:
        stressed_noun = find_noun(verbal_noun, pagemsg, errandpagemsg, expand_text)
        if not stressed_noun:
          msg("%s no-etym FIXME" % verbal_noun)
        elif stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_noun)
        else:
          if stressed_noun.endswith(stressed_verbal_noun_suffix):
            suffix = stressed_verbal_noun_suffix
          else:
            suffix = verbal_noun_suffix
          msg("%s %s+-%s no-etym verbal-noun" % (verbal_noun, stem, suffix))

      if agent_noun in nouns:
        stressed_noun = find_noun(agent_noun, pagemsg, errandpagemsg, expand_text)
        if stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % agent_noun)
        else:
          msg(u"%s %s+-тель no-etym agent-noun" % (agent_noun, stem))

      if verbal_adj in adjectives:
        stressed_adj = find_adj(verbal_adj, pagemsg, errandpagemsg, expand_text)
        if stressed_adj == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_adj)
        else:
          msg(u"%s %s+-тельный no-etym verbal-adj" % (verbal_adj, stem))
Example #8
def process_line(index, line, add_passive_of, override_etym, save, verbose):
  def error(text):
    errmsg("ERROR: Processing line: %s" % line)
    errmsg("ERROR: %s" % text)
    assert False

  def check_stress(word):
    word = re.sub(r"//.*", "", word)  # strip any manual transliteration after "//"
    if word.startswith("-") or word.endswith("-"):
      # Allow unstressed prefix (e.g. разо-) and unstressed suffix (e.g. -овать)
      return
    if rulib.needs_accents(word, split_dash=True):
      error("Word %s missing an accent" % word)

  # Skip lines consisting entirely of comments
  if line.startswith("#"):
    return
  if line.startswith("!"):
    override_etym = True
    line = line[1:]
  # If the second element (the etymology) begins with raw:, allow spaces in the remainder to be
  # included as part of the second element.
  els = do_split(r"\s+", line, 1)
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  if not els[1].startswith("raw:"):
    els = do_split(r"\s+", line)
  # Replace _ with space and \u
  els = [el.replace("_", " ").replace(r"\u", "_") for el in els]
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  accented_term = els[0]
  term = rulib.remove_accents(accented_term)
  etym = els[1]
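  # A hypothetical input line such as "перечита́ть пере-+чита́ть" has the accented
  # term as its first field and the etymology spec as its second; "_" in a field
  # stands for a space and r"\u" for a literal underscore.  The "+"-splitting
  # branch below would turn this spec into "{{affix|ru|пере-|чита́ть}}".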

  pagetitle = term

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  # Handle etymology
  adjformtext = ""
  if etym == "?":
    error("Etymology consists of bare question mark")
  elif etym == "-":
    etymtext = "===Etymology===\n{{rfe|lang=ru}}\n\n"
  elif etym == "--":
    etymtext = ""
  elif re.search(r"^(part|adj|partadj)([fnp]):", etym):
    m = re.search(r"^(part|adj|partadj)([fnp]):(.*)", etym)
    forms = {"f":["nom|f|s"], "n":["nom|n|s", "acc|n|s"], "p":["nom|p", "in|acc|p"]}
    infleclines = ["# {{inflection of|lang=ru|%s||%s}}" %
        (m.group(3), form) for form in forms[m.group(2)]]
    if m.group(1) in ["adj", "partadj"]:
      adjinfltext = """===Adjective===
{{head|ru|adjective form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      adjinfltext = ""
    if m.group(1) in ["part", "partadj"]:
      partinfltext = """===Participle===
{{head|ru|participle form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      partinfltext = ""
    adjformtext = partinfltext + adjinfltext
    etymtext = ""
  else:
    if etym.startswith("acr:"):
      _, fullexpr, meaning = do_split(":", etym)
      etymtext = "{{ru-etym acronym of|%s||%s}}." % (fullexpr, meaning)
    elif etym.startswith("deverb:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "Deverbal from {{m|ru|%s}}." % sourceterm
    elif etym.startswith("back:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "{{back-form|lang=ru|%s}}" % sourceterm
    elif etym.startswith("raw:"):
      etymtext = re.sub(", *", ", ", re.sub("^raw:", "", etym))
    elif ":" in etym and "+" not in etym:
      if etym.startswith("?"):
        prefix = "Perhaps borrowed from "
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately borrowed from "
        etym = re.sub(r"^<<", "", etym)
      else:
        prefix = "Borrowed from "
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if not m:
        error("Bad etymology form: %s" % etym)
      etymtext = "%s{{bor|ru|%s|%s}}." % (prefix, m.group(1), m.group(2))
    else:
      prefix = ""
      suffix = ""
      if etym.startswith("?"):
        prefix = "Perhaps from "
        suffix = "."
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately from "
        suffix = "."
        etym = re.sub(r"^<<", "", etym)
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if m:
        langtext = "|lang1=%s" % m.group(1)
        etym = m.group(2)
      else:
        langtext = ""
      etymtext = "%s{{affix|ru|%s%s}}%s" % (prefix,
          "|".join(do_split(r"\+", re.sub(", *", ", ", etym))), langtext,
          suffix)
    etymbody = etymtext + "\n\n"
    etymtext = "===Etymology===\n" + etymbody

  if not etymtext:
    pagemsg("No etymology text, skipping")

  # Load page
  page = pywikibot.Page(site, pagetitle)

  if not blib.try_repeatedly(lambda: page.exists(), pagemsg,
      "check page existence"):
    pagemsg("Page doesn't exist, can't add etymology")
    return
    
  pagemsg("Adding etymology")
  notes = []
  pagetext = unicode(page.text)

  # Split into sections
  splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
  # Extract off pagehead and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Go through each section in turn, looking for existing Russian section
  for i in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[i], re.M)
    if not m:
      pagemsg("Can't find language name in text: [[%s]]" % (sections[i]))
    elif m.group(1) == "Russian":
      if override_etym:
        subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)

        replaced_etym = False
        for j in xrange(2, len(subsections), 2):
          if "==Etymology==" in subsections[j - 1] or "==Etymology 1==" in subsections[j - 1]:
            subsections[j] = etymbody
            replaced_etym = True
            break

        if replaced_etym:
          sections[i] = "".join(subsections)
          newtext = "".join(sections)
          notes.append("replace Etymology section in Russian lemma with manually specified etymology")
          break

      if "==Etymology==" in sections[i] or "==Etymology 1==" in sections[i]:
        errandpagemsg("WARNING: Already found etymology, skipping")
        return

      subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)
          
      insert_before = 1
      if "===Alternative forms===" in subsections[insert_before]:
        insert_before += 2

      subsections[insert_before] = etymtext + subsections[insert_before]
      sections[i] = "".join(subsections)
      if add_passive_of:
        active_term = rulib.remove_monosyllabic_accents(
          re.sub(u"с[яь]$", "", accented_term))
        sections[i] = re.sub(r"(^(#.*\n)+)",
          r"\1# {{passive of|lang=ru|%s}}\n" % active_term,
          sections[i], 1, re.M)

      newtext = pagehead + "".join(sections)
      notes.append("add (manually specified) Etymology section to Russian lemma")
      break
  else:
    errandpagemsg("WARNING: Can't find Russian section, skipping")
    return

  if newtext != pagetext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (pagetext, newtext))
    assert notes
    comment = "; ".join(group_notes(notes))
    if save:
      blib.safe_page_save(page, comment, errandpagemsg)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  notes = []

  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") in [
        "noun", "nouns", "proper noun", "proper nouns"]:
      pos = getparam(t, "2")
      params = []
      for param in t.params:
        pname = unicode(param.name).strip()
        pval = unicode(param.value).strip()
        showkey = param.showkey
        if (pname not in ["1", "2", "head", "g", "g2", "g3", "3", "4", "5", "6", "7", "8", "9", "10"] or
            pname == "3" and pval not in ["masculine", "feminine"] or
            pname in ["5", "7", "9"] and pval != "or"):
          pagemsg("WARNING: head|bg|%s with extra param %s=%s: %s" % (pos, pname, pval, origt))
          break
      else: # no break
        rmparam(t, "1")
        rmparam(t, "2")
        m = []
        f = []
        head = getparam(t, "head")
        rmparam(t, "head")
        genders = []
        def process_gender(g):
          if g in ["m", "f", "n", "m-p", "f-p", "n-p", "p"]:
            genders.append(g)
          else:
            pagemsg("WARNING: Unrecognized gender '%s'" % g)
        g = getparam(t, "g")
        if g:
          process_gender(g)
        rmparam(t, "g")
        g2 = getparam(t, "g2")
        if g2:
          process_gender(g2)
        rmparam(t, "g2")
        g3 = getparam(t, "g3")
        if g3:
          process_gender(g3)
        rmparam(t, "g3")
        def handle_mf(array):
          array.append(getparam(t, "4"))
          rmparam(t, "3")
          rmparam(t, "4")
          i = 5
          while getparam(t, str(i)) == "or":
            array.append(getparam(t, str(i + 1)))
            rmparam(t, str(i))
            rmparam(t, str(i + 1))
            i += 2
        if getparam(t, "3") == "masculine":
          handle_mf(m)
        if getparam(t, "3") == "feminine":
          handle_mf(f)
        if pos in ["noun", "nouns"]:
          newtn = "bg-noun"
        else:
          newtn = "bg-proper noun"
        blib.set_template_name(t, newtn)
        t.add("1", head or pagetitle)
        blib.set_param_chain(t, genders, "2", "g")
        if m:
          blib.set_param_chain(t, m, "m", "m")
        if f:
          blib.set_param_chain(t, f, "f", "f")
        notes.append("convert {{head|bg|%s}} into {{%s}}" % (pos, newtn))
    elif tn in ["bg-noun", "bg-proper noun"]:
      g = None
      cur1 = getparam(t, "1")
      if cur1 in ["m", "f"]:
        g = cur1
      elif re.search("[a-zA-Z]", cur1):
        pagemsg("WARNING: Saw Latin in 1=%s in %s" % (cur1, origt))
        continue
      head = getparam(t, "head") or getparam(t, "sg")
      rmparam(t, "head")
      rmparam(t, "sg")
      genders = []
      def process_gender(g):
        if g in ["m", "f", "n", "m-p", "f-p", "n-p", "p"]:
          genders.append(g)
        elif g in ["mf", "fm"]:
          genders.append("m")
          genders.append("f")
        elif g in ["mn", "nm"]:
          genders.append("m")
          genders.append("n")
        elif g in ["fn", "nf"]:
          genders.append("f")
          genders.append("n")
        elif g in ["mfn", "fmn", "mnf", "nmf", "fnm", "nfm"]:
          genders.append("m")
          genders.append("f")
          genders.append("n")
        else:
          pagemsg("WARNING: Unrecognized gender '%s'" % g)
      if g:
        process_gender(g)
        rmparam(t, "1")
      g = getparam(t, "2")
      if g:
        process_gender(g)
      g = getparam(t, "g")
      if g:
        process_gender(g)
      rmparam(t, "g")
      g2 = getparam(t, "g2")
      if g2:
        process_gender(g2)
      rmparam(t, "g2")
      g3 = getparam(t, "g3")
      if g3:
        process_gender(g3)
      rmparam(t, "g3")
      params = []
      for param in t.params:
        pname = unicode(param.name).strip()
        pval = unicode(param.value).strip()
        showkey = param.showkey
        if not pval:
          continue
        params.append((pname, pval, showkey))
      # Erase all params.
      del t.params[:]
      # Put back new params.
      t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
      blib.set_param_chain(t, genders, "2", "g")
      for pname, pval, showkey in params:
        t.add(pname, pval, showkey=showkey, preserve_spacing=False)
      if origt != unicode(t):
        notes.append("move head=/sg= to 1=, g= to 2= in {{%s}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return parsed, notes
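# Sketch of the noun conversion above (the example headword is illustrative):
#   {{head|bg|noun|head=кни́га|g=f}}  ->  {{bg-noun|кни́га|f}}
# with any "masculine"/"feminine" counterpart forms moved into m=/f=, and, for
# existing {{bg-noun}}/{{bg-proper noun}}, head=/sg= moved to 1= and g= to 2=.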
def process_page(index, page, lemmas):
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    pagetext = unicode(page.text)

    section = blib.find_lang_section_from_text(pagetext, "Russian", pagemsg)
    if not section:
        errandpagemsg("WARNING: Couldn't find Russian section")
        return

    if "==Etymology" in section:
        return
    if rulib.check_for_alt_yo_terms(section, pagemsg):
        return
    parsed = blib.parse_text(section)
    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-participle of"]:
            pagemsg("Skipping participle")
            return
    saw_verb = False
    saw_passive = False
    saw_bad_passive = False
    for t in parsed.filter_templates():
        if unicode(t.name) in ["passive of", "passive form of"]:
            saw_passive = True
    if not saw_passive and ("passive of" in section
                            or "passive form of" in section):
        saw_bad_passive = True
    splits = []
    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-verb":
            saw_verb = True
            saw_paired_verb = False
            printed_msg = False
            heads = blib.fetch_param_chain(t, "1", "head") or [pagetitle]
            refl = heads[0].endswith(u"ся") or heads[0].endswith(u"сь")
            if refl:
                m = re.search(u"^(.*)(с[яь])$", heads[0])
                assert m
                transverb_no_passive = (False if
                                        (saw_passive or saw_bad_passive) else
                                        is_transitive_verb(
                                            rulib.remove_accents(m.group(1)),
                                            pagemsg, errandpagemsg))
                if (saw_passive or saw_bad_passive or transverb_no_passive):
                    splits.append(
                        (heads, [m.group(1)
                                 ], "%s+-%s" % (m.group(1), m.group(2)),
                         "active-passive%s%s" %
                         (saw_bad_passive and " (saw-bad-passive)" or "",
                          transverb_no_passive and " (missing-passive-decl)"
                          or "")))
                    continue
            if getparam(t, "2").startswith("impf"):
                pfs = blib.fetch_param_chain(t, "pf", "pf")
                for otheraspect in pfs:
                    if heads[0][0:2] == otheraspect[0:2]:
                        saw_paired_verb = True
                if saw_paired_verb:
                    splits.append((heads, pfs, ",".join(pfs), "paired-impf"))
                    printed_msg = True
            if getparam(t, "2").startswith("pf"):
                prefixes = [
                    u"взъ", u"вз", u"вс", u"возъ", u"воз", u"вос", u"вы́",
                    u"въ", u"в", u"до", u"за", u"изъ", u"из", u"ис", u"на",
                    u"объ", u"об", u"отъ", u"от", u"о", u"пере", u"подъ",
                    u"под", u"по", u"предъ", u"пред", u"пре", u"при", u"про",
                    u"разъ", u"раз", u"рас", u"съ", u"с", u"у"
                ]
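                # For instance, for a perfective head like u"написа́ть" the prefix
                # u"на" matches, the base u"писа́ть" is looked up in lemmas, and a
                # split "на-+писа́ть" with comment "strip-prefix" is recorded (an
                # illustration of the loop below, not output from a real run).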
                for break_reflexives in [False, True]:
                    head = heads[0]
                    if break_reflexives:
                        if not head.endswith(u"ся") and not head.endswith(
                                u"сь"):
                            break
                        reflsuf = "+-" + head[-2:]  # fetch reflexive suffix
                        head = head[:-2]  # drop reflexive suffix
                    else:
                        reflsuf = ""
                    for prefix in prefixes:
                        m = re.match("^(%s)(.*)$" % prefix, head)
                        if m:
                            base = rulib.remove_monosyllabic_accents(
                                re.sub(u"^ы", u"и", m.group(2)))
                            if rulib.remove_accents(base) in lemmas:
                                base_to_do = base
                            elif rulib.remove_accents("-" + base) in lemmas:
                                base_to_do = "-" + base
                            else:
                                base_to_do = None
                            if base_to_do:
                                prefix = prefix.replace(u"ъ", "")
                                if m.group(1) == u"вы́":
                                    need_accent = "-NEED-ACCENT"
                                else:
                                    need_accent = ""
                                splits.append((
                                    heads, [base_to_do], "%s-+%s%s%s" %
                                    (prefix, base_to_do, reflsuf, need_accent),
                                    "strip-prefix"))
                                printed_msg = True
            if not printed_msg:
                msg("%s no-etym misc" % ",".join(heads))
    for derived_terms, base_terms, analysis, comment in splits:
        warnings = []
        base_terms_no_accent = []
        for term in base_terms:
            term = rulib.remove_accents(term)
            if term not in base_terms_no_accent:
                base_terms_no_accent.append(term)
        if len(base_terms_no_accent) > 1:
            errandpagemsg(
                "WARNING: Multiple base pages %s for base lemmas %s" %
                (",".join(base_terms_no_accent), ",".join(base_terms)))
            continue
        if base_terms_no_accent[0] not in lemmas:
            continue
        derived_defns = rulib.find_defns(section)
        if not derived_defns:
            errandpagemsg(
                "WARNING: Couldn't find definitions for derived term %s" %
                ",".join(derived_terms))
            continue
        base_section = blib.find_lang_section(base_terms_no_accent[0],
                                              "Russian", pagemsg,
                                              errandpagemsg)
        if not base_section:
            errandpagemsg(
                "WARNING: Couldn't find Russian section for base term %s" %
                base_terms_no_accent[0])
            continue
        base_defns = rulib.find_defns(base_section)
        if not base_defns:
            errandpagemsg(
                "WARNING: Couldn't find definitions for base term %s" %
                ",".join(base_terms))
            continue

        def concat_defns(defns):
            return ";".join(defns).replace("_", r"\u").replace(" ", "_")

        msg("%s %s%s no-etym %s %s //// %s" %
            (",".join(derived_terms), analysis,
             " WARNING:%s" % ",".join(warnings) if warnings else "", comment,
             concat_defns(base_defns), concat_defns(derived_defns)))
    if not saw_verb:
        msg("%s no-etym misc" % pagetitle)