Ejemplo n.º 1
0
     corverb for corverb in corverbs
     if corverb.startswith("*"))
 if isrefl:
     refverb = re.sub(u"с[ья]$", "",
                      verb) + gender_arg
     correfverbs = []
     for corverb in corverbs:
         correfverbs.append(
             "%s|g=%s" %
             (re.sub(u"с[ья]$", "",
                     re.sub(r"^\*", "", corverb)),
              "impf" if headword_aspect == "pf"
              or corverb.startswith("*") else "pf"))
 else:
     refverb = (re.search(u"и́?$", verb) and verb +
                u"сь" or rulib.try_to_stress(verb) +
                u"ся") + gender_arg
     correfverbs = []
     for corverb in corverbs:
         impf_override = corverb.startswith("*")
         corverb = re.sub(r"^\*", "", corverb)
         correfverbs.append("%s|g=%s" % (
             (re.search(u"и́?$", corverb)
              and corverb + u"сь" or
              rulib.try_to_stress(corverb) + u"ся"),
             "impf" if headword_aspect == "pf"
             or impf_override else "pf"))
 if headword_aspect == "pf" or corverb_impf_override:
     refverbs = correfverbs + [refverb]
 else:
     refverbs = [refverb] + correfverbs
Ejemplo n.º 2
0
def try_to_stress(form):
    """Run rulib.try_to_stress() on FORM, passing any translit through as-is.

    FORM is either a bare string or ``TEXT//TRANSLIT``. In the second case
    only the part before the first ``//`` is handed to
    rulib.try_to_stress(); the part after it is re-attached unchanged.
    """
    if "//" not in form:
        return rulib.try_to_stress(form)
    # FIXME: This should stress the translit as well
    text, translit = re.search("^(.*?)//(.*)$", form).groups()
    return rulib.try_to_stress(text) + "//" + translit
Ejemplo n.º 3
0
def convert_zdecl_to_ru_noun_table(decl_z_template, subpagetitle, pagemsg,
    headword_template=None):
  """Build a {{ru-noun-table}} template from a {{ru-decl-noun-z}} template.

  Reads positional params 1-4 of DECL_Z_TEMPLATE (lemma, gender-animacy spec,
  stress pattern, special marks) plus its named params, and adds the
  corresponding params to a fresh {{ru-noun-table}} template. Gender, stress
  and lemma are left out when the checks below judge them to be defaults.
  Progress and warnings are reported through PAGEMSG.

  Args:
    decl_z_template: the z-decl template object; read with getparam() and
      stringified with unicode().
    subpagetitle: page title; if it equals the lemma after both go through
      ru.try_to_stress(), the lemma is omitted from the result.
    pagemsg: callable taking a single message string.
    headword_template: optional headword template; when given, it is passed
      to extract_headword_anim_spec() to cross-check/derive animacy.

  Returns:
    The new template object, or None if the gender/animacy spec is not of
    the form [mfn]-(an|in|inan) or the lemma is empty.
  """
  zdecl = unicode(decl_z_template)
  # Scratch copy: every param we consume is removed from it, so whatever is
  # left at the end is reported as extraneous.
  zdeclcopy = blib.parse_text(zdecl).filter_templates()[0]
  decl_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
  # {{ru-decl-noun-z|ёж|m-inan|b}}
  def getp(param):
    # Mark PARAM as consumed in the scratch copy, then return its stripped
    # value from the original template.
    rmparam(zdeclcopy, param)
    return getparam(decl_z_template, param).strip()
  zlemma = getp("1")
  zgender_anim = getp("2")
  zstress = getp("3")
  # Each ё in the special-marks param is rewritten as ;ё.
  zspecial = re.sub(u"ё", u";ё", getp("4"))
  m = re.search(r"^([mfn])-(an|in|inan)$", zgender_anim)
  if not m:
    pagemsg("WARNING: Unable to recognize z-decl gender/anim spec, skipping: %s" %
        zgender_anim)
    return None
  zgender, zanim = m.groups()

  if not zlemma:
    pagemsg("WARNING: Empty lemma, skipping: %s" % zdecl)
    return None

  # Remove unnecessary gender
  # Gender is always kept for lemmas ending in и/ы or ь, for neuters ending
  # in а/я, and for masculines ending in а/я whose special marks contain (1).
  need_gender = (re.search(u"[иы]́?$", zlemma) or
      zgender == "n" and re.search(u"[яа]́?$", zlemma) or
      zgender == "m" and re.search(u"[яа]́?$", zlemma) and "(1)" in zspecial or
      zlemma.endswith(u"ь"))
  if not need_gender:
    # Otherwise guess the gender from the ending (о/е/ё -> n, а/я -> f,
    # anything else -> m) and keep the explicit gender only if it differs.
    normal_gender = (re.search(u"[оеё]́?$", zlemma) and "n" or
        re.search(u"[ая]́?$", zlemma) and "f" or "m")
    if normal_gender != zgender:
      pagemsg("WARNING: Gender mismatch, normal gender=%s, explicit gender=%s, keeping gender" %
          (normal_gender, zgender))
      need_gender = True
  if need_gender:
    pagemsg("Preserving gender in z-decl: %s" % zdecl)
    # The gender letter is prepended to the special-marks string.
    zspecial = zgender + zspecial
  else:
    pagemsg("Not preserving gender in z-decl: %s" % zdecl)

  # Remove unnecessary stress
  stressed_lemma = ru.try_to_stress(zlemma)
  def check_defstress(defstr, reason):
    # Log when the explicit stress equals the default DEFSTR; always return
    # DEFSTR so the caller can record it as the default.
    if defstr == zstress:
      pagemsg("Removing stress %s as default because %s: stressed_lemma=%s, template=%s" %
          (defstr, reason, stressed_lemma, zdecl))
    return defstr
  if ru.is_nonsyllabic(stressed_lemma):
    default_stress = check_defstress("b", "nonsyllabic lemma")
  elif re.search(u"([аяоеыи]́|ё́?)$", stressed_lemma):
    default_stress = check_defstress("b", "ending-accented lemma")
  # No need for special-casing for ёнок or а́нин, as they are considered
  # accent a by ru-decl-noun-z
  else:
    default_stress = check_defstress("a", "stem-accented lemma")
  if default_stress == zstress:
    zstress = ""
  else:
    pagemsg("Not removing stress %s: %s" % (zstress, zdecl))

  # Remove unnecessary lemma
  if ru.try_to_stress(subpagetitle) == stressed_lemma:
    pagemsg(u"Removing lemma %s because identical to subpagetitle %s (modulo monosyllabic stress differences): %s" %
        (zlemma, subpagetitle, zdecl))
    zlemma = ""

  # Positional layout of the result: [stress,] lemma, special. When the
  # stress is omitted, lemma and special shift down by one position.
  if zstress:
    decl_template.add("1", zstress)
    offset = 1
  else:
    offset = 0
  decl_template.add(str(1 + offset), zlemma)
  decl_template.add(str(2 + offset), zspecial)
  # Strip trailing empty positional params, working backwards from 3 and
  # stopping at the first non-empty one.
  if not getparam(decl_template, "3"):
    rmparam(decl_template, "3")
    if not getparam(decl_template, "2"):
      rmparam(decl_template, "2")
      if not getparam(decl_template, "1"):
        rmparam(decl_template, "1")

  # Falsy (None) when no headword template was supplied.
  headword_anim_spec = headword_template and extract_headword_anim_spec(headword_template)
  def anim_mismatch(zdecl_an, allowed_headword_ans):
    # Warn if a headword-derived animacy exists and is not one of the values
    # compatible with the z-decl animacy ZDECL_AN.
    if headword_anim_spec and headword_anim_spec not in allowed_headword_ans:
      pagemsg("WARNING: z-decl anim %s disagrees with headword-derived %s (%s allowed): zdecl=%s, headword=%s" %
          (zdecl_an, headword_anim_spec, ",".join(allowed_headword_ans),
            zdecl, unicode(headword_template)))

  if zanim == "an":
    anim_mismatch(zanim, ["an"])
    pagemsg("Preserving z-decl -an as a=an: %s" % zdecl)
    decl_template.add("a", "an")
  elif zanim == "inan":
    anim_mismatch(zanim, ["ai", "ia"])
    # Use the headword's ai/ia value when available; otherwise fall back to
    # a=bi and warn.
    if headword_anim_spec in ["ai", "ia"]:
      pagemsg("Converting z-decl -inan to a=%s: %s" %
          (headword_anim_spec, zdecl))
      decl_template.add("a", headword_anim_spec)
    else:
      pagemsg("WARNING: Unable to convert z-decl -inan to a=ai or a=ia, preserving as a=bi: zdecl=%s, headword=%s" %
          (zdecl, unicode(headword_template or "(no headword)")))
      decl_template.add("a", "bi")
  else:
    # The regex above only admits an/in/inan, so this must be "in"; no a=
    # param is added for it.
    assert(zanim == "in")
    anim_mismatch(zanim, ["in"])
    pagemsg("Dropping z-decl -in as default: %s" % zdecl)

  znum = getp("n")
  if znum:
    if znum == "pl":
      pagemsg("WARNING: Found n=pl in z-decl, should convert manually to plural lemma: %s" %
          zdecl)
    pagemsg("Preserving z-decl n=%s: %s" % (znum, zdecl))
    decl_template.add("n", znum)

  # Per-case override params copied across; prp_* are renamed to pre_*.
  preserve_params = [
    'nom_sg', 'gen_sg', 'dat_sg', 'acc_sg', 'ins_sg', 'prp_sg',
    'nom_pl', 'gen_pl', 'dat_pl', 'acc_pl', 'ins_pl', 'prp_pl',
    'voc'
  ]
  renamed_params = {'prp_sg':'pre_sg', 'prp_pl':'pre_pl'}

  for param in preserve_params:
    val = getp(param)
    if not val:
      continue
    newval = fixup_link(val)
    # Split on commas (dropping surrounding whitespace), unwrap any item that
    # is entirely a pipe-less [[...]] link, and rejoin with bare commas.
    newvals = re.split(r"\s*,\s*", newval)
    newvals = [re.sub(r"^\[\[([^\[\]|]*)\]\]$", r"\1", x) for x in newvals]
    newval= ",".join(newvals)
    newparam = renamed_params.get(param, param)
    pagemsg("Preserving z-decl override %s=%s%s%s: %s" % (
      newparam, newval,
      "" if newparam == param else "; renamed from %s" % param,
      "" if newval == val else "; canonicalized from %s=%s" % (param, val),
      zdecl))
    decl_template.add(newparam, newval)
  loc = getp("loc")
  if loc:
    # Any loc value other than exactly в or на is mapped to "в/на +".
    if loc == u"в":
      newloc = u"в +"
    elif loc == u"на":
      newloc = u"на +"
    else:
      newloc = u"в/на +"
    pagemsg("Preserving z-decl locative loc=%s (canonicalized from loc=%s): %s" %
        (newloc, loc, zdecl))
    decl_template.add("loc", newloc)
  par = getp("par")
  if par:
    # Any non-empty par value becomes "+".
    newpar="+"
    pagemsg("Preserving z-decl partitive par=%s (canonicalized from par=%s): %s" %
        (newpar, par, zdecl))
    decl_template.add('par', newpar)
  notes = getp("note")
  if notes:
    pagemsg("WARNING: Found z-decl note=<%s>, converting to notes= but probably needs fixing up with footnote symbol and pltail or similar: %s" %
        (notes, zdecl))
    decl_template.add('notes', notes)

  # Anything still present in the scratch copy was never consumed by getp().
  if zdeclcopy.params:
    pagemsg("WARNING: Extraneous params in z-decl: %s" % unicode(zdeclcopy))

  #pagemsg("Replacing z-decl %s with regular decl %s" %
  #    (zdecl, unicode(decl_template)))
  return decl_template
Ejemplo n.º 4
0
def convert_zdecl_to_ru_noun_table(decl_z_template,
                                   subpagetitle,
                                   pagemsg,
                                   headword_template=None):
    """Build a {{ru-noun-table}} template from a {{ru-decl-noun-z}} template.

    Reads positional params 1-4 of DECL_Z_TEMPLATE (lemma, gender-animacy
    spec, stress pattern, special marks) plus its named params, and adds the
    corresponding params to a fresh {{ru-noun-table}} template. Gender,
    stress and lemma are left out when the checks below judge them to be
    defaults. Progress and warnings are reported through PAGEMSG.

    Args:
        decl_z_template: the z-decl template object; read with getparam()
            and stringified with unicode().
        subpagetitle: page title; if it equals the lemma after both go
            through rulib.try_to_stress(), the lemma is omitted from the
            result.
        pagemsg: callable taking a single message string.
        headword_template: optional headword template; when given, it is
            passed to extract_headword_anim_spec() to cross-check/derive
            animacy.

    Returns:
        The new template object, or None if the gender/animacy spec is not
        of the form [mfn]-(an|in|inan) or the lemma is empty.
    """
    zdecl = unicode(decl_z_template)
    # Scratch copy: every param we consume is removed from it, so whatever
    # is left at the end is reported as extraneous.
    zdeclcopy = blib.parse_text(zdecl).filter_templates()[0]
    decl_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]

    # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
    # {{ru-decl-noun-z|ёж|m-inan|b}}
    def getp(param):
        # Mark PARAM as consumed in the scratch copy, then return its
        # stripped value from the original template.
        rmparam(zdeclcopy, param)
        return getparam(decl_z_template, param).strip()

    zlemma = getp("1")
    zgender_anim = getp("2")
    zstress = getp("3")
    # Each ё in the special-marks param is rewritten as ;ё.
    zspecial = re.sub(u"ё", u";ё", getp("4"))
    m = re.search(r"^([mfn])-(an|in|inan)$", zgender_anim)
    if not m:
        pagemsg(
            "WARNING: Unable to recognize z-decl gender/anim spec, skipping: %s"
            % zgender_anim)
        return None
    zgender, zanim = m.groups()

    if not zlemma:
        pagemsg("WARNING: Empty lemma, skipping: %s" % zdecl)
        return None

    # Remove unnecessary gender
    # Gender is always kept for lemmas ending in и/ы or ь, for neuters
    # ending in а/я, and for masculines ending in а/я whose special marks
    # contain (1).
    need_gender = (re.search(u"[иы]́?$", zlemma)
                   or zgender == "n" and re.search(u"[яа]́?$", zlemma)
                   or zgender == "m" and re.search(u"[яа]́?$", zlemma)
                   and "(1)" in zspecial or zlemma.endswith(u"ь"))
    if not need_gender:
        # Otherwise guess the gender from the ending (о/е/ё -> n, а/я -> f,
        # anything else -> m) and keep the explicit gender only if it
        # differs.
        normal_gender = (re.search(u"[оеё]́?$", zlemma) and "n"
                         or re.search(u"[ая]́?$", zlemma) and "f" or "m")
        if normal_gender != zgender:
            pagemsg(
                "WARNING: Gender mismatch, normal gender=%s, explicit gender=%s, keeping gender"
                % (normal_gender, zgender))
            need_gender = True
    if need_gender:
        pagemsg("Preserving gender in z-decl: %s" % zdecl)
        # The gender letter is prepended to the special-marks string.
        zspecial = zgender + zspecial
    else:
        pagemsg("Not preserving gender in z-decl: %s" % zdecl)

    # Remove unnecessary stress
    stressed_lemma = rulib.try_to_stress(zlemma)

    def check_defstress(defstr, reason):
        # Log when the explicit stress equals the default DEFSTR; always
        # return DEFSTR so the caller can record it as the default.
        if defstr == zstress:
            pagemsg(
                "Removing stress %s as default because %s: stressed_lemma=%s, template=%s"
                % (defstr, reason, stressed_lemma, zdecl))
        return defstr

    if rulib.is_nonsyllabic(stressed_lemma):
        default_stress = check_defstress("b", "nonsyllabic lemma")
    elif re.search(u"([аяоеыи]́|ё́?)$", stressed_lemma):
        default_stress = check_defstress("b", "ending-accented lemma")
    # No need for special-casing for ёнок or а́нин, as they are considered
    # accent a by ru-decl-noun-z
    else:
        default_stress = check_defstress("a", "stem-accented lemma")
    if default_stress == zstress:
        zstress = ""
    else:
        pagemsg("Not removing stress %s: %s" % (zstress, zdecl))

    # Remove unnecessary lemma
    if rulib.try_to_stress(subpagetitle) == stressed_lemma:
        pagemsg(
            u"Removing lemma %s because identical to subpagetitle %s (modulo monosyllabic stress differences): %s"
            % (zlemma, subpagetitle, zdecl))
        zlemma = ""

    # Positional layout of the result: [stress,] lemma, special. When the
    # stress is omitted, lemma and special shift down by one position.
    if zstress:
        decl_template.add("1", zstress)
        offset = 1
    else:
        offset = 0
    decl_template.add(str(1 + offset), zlemma)
    decl_template.add(str(2 + offset), zspecial)
    # Strip trailing empty positional params, working backwards from 3 and
    # stopping at the first non-empty one.
    if not getparam(decl_template, "3"):
        rmparam(decl_template, "3")
        if not getparam(decl_template, "2"):
            rmparam(decl_template, "2")
            if not getparam(decl_template, "1"):
                rmparam(decl_template, "1")

    # Falsy (None) when no headword template was supplied.
    headword_anim_spec = headword_template and extract_headword_anim_spec(
        headword_template)

    def anim_mismatch(zdecl_an, allowed_headword_ans):
        # Warn if a headword-derived animacy exists and is not one of the
        # values compatible with the z-decl animacy ZDECL_AN.
        if headword_anim_spec and headword_anim_spec not in allowed_headword_ans:
            pagemsg(
                "WARNING: z-decl anim %s disagrees with headword-derived %s (%s allowed): zdecl=%s, headword=%s"
                %
                (zdecl_an, headword_anim_spec, ",".join(allowed_headword_ans),
                 zdecl, unicode(headword_template)))

    if zanim == "an":
        anim_mismatch(zanim, ["an"])
        pagemsg("Preserving z-decl -an as a=an: %s" % zdecl)
        decl_template.add("a", "an")
    elif zanim == "inan":
        anim_mismatch(zanim, ["ai", "ia"])
        # Use the headword's ai/ia value when available; otherwise fall back
        # to a=bi and warn.
        if headword_anim_spec in ["ai", "ia"]:
            pagemsg("Converting z-decl -inan to a=%s: %s" %
                    (headword_anim_spec, zdecl))
            decl_template.add("a", headword_anim_spec)
        else:
            pagemsg(
                "WARNING: Unable to convert z-decl -inan to a=ai or a=ia, preserving as a=bi: zdecl=%s, headword=%s"
                % (zdecl, unicode(headword_template or "(no headword)")))
            decl_template.add("a", "bi")
    else:
        # The regex above only admits an/in/inan, so this must be "in"; no
        # a= param is added for it.
        assert (zanim == "in")
        anim_mismatch(zanim, ["in"])
        pagemsg("Dropping z-decl -in as default: %s" % zdecl)

    znum = getp("n")
    if znum:
        if znum == "pl":
            pagemsg(
                "WARNING: Found n=pl in z-decl, should convert manually to plural lemma: %s"
                % zdecl)
        pagemsg("Preserving z-decl n=%s: %s" % (znum, zdecl))
        decl_template.add("n", znum)

    # Per-case override params copied across; prp_* are renamed to pre_*.
    preserve_params = [
        'nom_sg', 'gen_sg', 'dat_sg', 'acc_sg', 'ins_sg', 'prp_sg', 'nom_pl',
        'gen_pl', 'dat_pl', 'acc_pl', 'ins_pl', 'prp_pl', 'voc'
    ]
    renamed_params = {'prp_sg': 'pre_sg', 'prp_pl': 'pre_pl'}

    for param in preserve_params:
        val = getp(param)
        if not val:
            continue
        newval = fixup_link(val)
        # Split on commas (dropping surrounding whitespace), unwrap any item
        # that is entirely a pipe-less [[...]] link, and rejoin with bare
        # commas.
        newvals = re.split(r"\s*,\s*", newval)
        newvals = [re.sub(r"^\[\[([^\[\]|]*)\]\]$", r"\1", x) for x in newvals]
        newval = ",".join(newvals)
        newparam = renamed_params.get(param, param)
        pagemsg("Preserving z-decl override %s=%s%s%s: %s" %
                (newparam, newval,
                 "" if newparam == param else "; renamed from %s" % param,
                 "" if newval == val else "; canonicalized from %s=%s" %
                 (param, val), zdecl))
        decl_template.add(newparam, newval)
    loc = getp("loc")
    if loc:
        # Any loc value other than exactly в or на is mapped to "в/на +".
        if loc == u"в":
            newloc = u"в +"
        elif loc == u"на":
            newloc = u"на +"
        else:
            newloc = u"в/на +"
        pagemsg(
            "Preserving z-decl locative loc=%s (canonicalized from loc=%s): %s"
            % (newloc, loc, zdecl))
        decl_template.add("loc", newloc)
    par = getp("par")
    if par:
        # Any non-empty par value becomes "+".
        newpar = "+"
        pagemsg(
            "Preserving z-decl partitive par=%s (canonicalized from par=%s): %s"
            % (newpar, par, zdecl))
        decl_template.add('par', newpar)
    notes = getp("note")
    if notes:
        pagemsg(
            "WARNING: Found z-decl note=<%s>, converting to notes= but probably needs fixing up with footnote symbol and pltail or similar: %s"
            % (notes, zdecl))
        decl_template.add('notes', notes)

    # Anything still present in the scratch copy was never consumed by
    # getp().
    if zdeclcopy.params:
        pagemsg("WARNING: Extraneous params in z-decl: %s" %
                unicode(zdeclcopy))

    #pagemsg("Replacing z-decl %s with regular decl %s" %
    #    (zdecl, unicode(decl_template)))
    return decl_template
Ejemplo n.º 5
0
def try_to_stress(form):
  """Run ru.try_to_stress() on FORM, passing any translit through as-is.

  FORM is either a bare string or ``TEXT//TRANSLIT``. In the second case
  only the part before the first ``//`` is handed to ru.try_to_stress();
  the part after it is re-attached unchanged.
  """
  if "//" not in form:
    return ru.try_to_stress(form)
  # FIXME: This should stress the translit as well
  text, translit = re.search("^(.*?)//(.*)$", form).groups()
  return ru.try_to_stress(text) + "//" + translit