Example #1
0
def process_page_section(index, page, section, verbose):
  """Convert the old-style noun headword in SECTION to its "+" variant.

  INDEX and PAGE are used for log messages (and PAGE for the existence
  check); SECTION is the wikitext to process; VERBOSE turns on extra logging
  and is passed through to blib.expand_text().

  The section is only converted when it contains exactly one ru-noun-table
  template, at most one ru-noun-old template and exactly one ru-noun or
  ru-proper noun headword template. Along the way, a manual translit in the
  headword's tr= is copied into the declension lemma as LEMMA//TR, and
  a=b/bi/bian/both in the declension is changed to a=ia or a=ai.

  Returns None when the section must be skipped. Otherwise returns the tuple
  (TEXT, RU_NOUN_CHANGED, RU_PROPER_NOUN_CHANGED, BIAN_REPLACED,
  FROBBED_MANUAL_TRANSLIT) where TEXT is the (possibly modified) section
  text, the next three items are 0/1 counters and the last is a list holding
  the manual translit that was transferred to a declension template, if any.
  When there is no ru-noun-table or no headword template, the text is
  returned unchanged with zero counters and an empty list.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    # Nothing to convert; report the section unchanged.
    return unicode(parsed), 0, 0, 0, []

  # A "+" headword is already present, so there is nothing left to convert.
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, []

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  frobbed_manual_translit = []
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  # Retrieve headword translit and maybe transfer to decl
  headword_tr = getparam(headword_template, "tr")
  if headword_tr:
    if verbose:
      pagemsg("Found headword manual translit tr=%s" % headword_tr)
    if "," in headword_tr:
      pagemsg("WARNING: Comma in headword manual translit, skipping: %s" %
          headword_tr)
      return None
    # Punt if multi-arg-set, can't handle yet
    for decl_template in decl_templates:
      for param in decl_template.params:
        # Only positional (unnamed) params can be arg-set separators.
        if not param.showkey:
          val = unicode(param.value)
          if val == "or":
            pagemsg("WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
          if val == "-" or val == "_" or val.startswith("join:"):
            pagemsg("WARNING: Manual translit and multi-word templates, can't handle, skipping: %s" % unicode(decl_template))
            return None
      for i in xrange(2, 10):
        if getparam(headword_template, "tr%s" % i):
          pagemsg("WARNING: Headword template has translit param tr%s, can't handle, skipping: %s" % (
            i, unicode(headword_template)))
          return None
      # The lemma is in param 2 when param 1 holds the stress pattern.
      if runoun.arg1_is_stress(getparam(decl_template, "1")):
        lemma_arg = "2"
      else:
        lemma_arg = "1"
      lemmaval = getparam(decl_template, lemma_arg)
      if not lemmaval:
        lemmaval = subpagetitle
      if "//" in lemmaval:
        # Decl lemma already carries LEMMA//TRANSLIT; it must agree with the
        # headword's translit.
        m = re.search("^(.*?)//(.*)$", lemmaval)
        if m.group(2) != headword_tr:
          pagemsg("WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping" % (
            lemmaval, headword_tr))
          return None
        else:
          pagemsg("Already found manual translit in decl template %s" %
              lemmaval)
      else:
        lemmaval += "//" + headword_tr
        orig_decl_template = unicode(decl_template)
        decl_template.add(lemma_arg, lemmaval)
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))
        frobbed_manual_translit = [headword_tr]

  genders = blib.fetch_param_chain(headword_template, "2", "g")

  bian_replaced = 0

  # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
  # headword template
  for decl_template in decl_templates:
    if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
      # Index of the first gender spec mentioning "in" / "an", or -1 if none.
      saw_in = -1
      saw_an = -1
      for i,g in enumerate(genders):
        if re.search(r"\bin\b", g) and saw_in < 0:
          saw_in = i
        if re.search(r"\ban\b", g) and saw_an < 0:
          saw_an = i
      if saw_in >= 0 and saw_an >= 0:
        orig_decl_template = unicode(decl_template)
        if saw_in < saw_an:
          pagemsg("Replacing a=bi with a=ia in decl template")
          decl_template.add("a", "ia")
          bian_replaced = 1
        else:
          pagemsg("Replacing a=bi with a=ai in decl template")
          decl_template.add("a", "ai")
          bian_replaced = 1
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))

  generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
      unicode(noun_table_template))
  generate_result = expand_text(generate_template)
  if not generate_result:
    pagemsg("WARNING: Error generating noun args, skipping")
    return None
  args = ru.split_generate_args(generate_result)

  genders = runoun.check_old_noun_headword_forms(headword_template, args,
      subpagetitle, pagemsg)
  if genders is None:
    return None

  new_params = []
  for param in noun_table_template.params:
    new_params.append((param.name, param.value))

  orig_headword_template = unicode(headword_template)
  params_to_preserve = runoun.fix_old_headword_params(headword_template,
      new_params, genders, pagemsg)
  if params_to_preserve is None:
    return None

  if unicode(headword_template.name) == "ru-proper noun":
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(headword_template, "n"):
      pagemsg("Adding n=both to headword tempate")
      headword_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      # Append ndef=sg to the outer template only. A plain str.replace() on
      # "}}" would also rewrite the closing braces of any template nested
      # inside a parameter value.
      generate_template_with_ndef = re.sub(r"\}\}$", "|ndef=sg}}",
          generate_template)
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        else:
          pagemsg("Removing n=sg from headword tempate")
          rmparam(headword_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  headword_template.params.extend(params_to_preserve)
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  if unicode(headword_template.name) == "ru-noun":
    headword_template.name = "ru-noun+"
    ru_noun_changed = 1
  else:
    headword_template.name = "ru-proper noun+"
    ru_proper_noun_changed = 1

  pagemsg("Replacing headword %s with %s" % (orig_headword_template, unicode(headword_template)))

  return unicode(parsed), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit
Example #2
0
def process_page(page, index, parsed):
    global args
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    origtext = page.text
    parsed = blib.parse_text(origtext)

    # Find the declension arguments for LEMMA and inflected form INFL,
    # the WORDINDth word in the expression. Return value is a tuple of
    # four items: a list of (NAME, VALUE) tuples for the arguments, whether
    # the word is an adjective, the value of n= (if given), and the value
    # of a= (if given).
    def find_decl_args(lemma, infl, wordind):
        declpage = pywikibot.Page(site, lemma)
        if rulib.remove_accents(infl) == lemma:
            wordlink = "[[%s]]" % infl
        else:
            wordlink = "[[%s|%s]]" % (lemma, infl)

        if not declpage.exists():
            if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
                pagemsg(
                    "WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return [("1", wordlink), ("2", "+")], True, None, None
            else:
                pagemsg(
                    "WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
        parsed = blib.parse_text(declpage.text)
        decl_templates = []
        headword_templates = []
        decl_z_templates = []
        for t in parsed.filter_templates():
            tname = unicode(t.name)
            if tname in ["ru-noun-table", "ru-decl-adj"]:
                pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
                decl_templates.append(t)
            if tname in ["ru-noun", "ru-proper noun"]:
                pagemsg("find_decl_args: Found headword template: %s" %
                        unicode(t))
                headword_templates.append(t)
            if tname in ["ru-decl-noun-z"]:
                pagemsg("find_decl_args: Found z-decl template: %s" %
                        unicode(t))
                decl_z_templates.append(t)

        if not decl_templates:
            if decl_z_templates:
                # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
                # {{ru-decl-noun-z|ёж|m-inan|b}}
                if len(decl_z_templates) > 1:
                    pagemsg(
                        "WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None
                else:
                    decl_z_template = decl_z_templates[0]
                    headword_template = None
                    pagemsg("find_decl_args: Using z-decl template: %s" %
                            unicode(decl_z_template))
                    if len(headword_templates) == 0:
                        pagemsg(
                            "WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    elif len(headword_templates) > 1:
                        pagemsg(
                            "WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    else:
                        headword_template = headword_templates[0]
                        pagemsg(
                            "find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s"
                            %
                            (wordind, lemma, infl, unicode(headword_template),
                             unicode(decl_z_template)))
                    decl_template = runounlib.convert_zdecl_to_ru_noun_table(
                        decl_z_template,
                        subpagetitle,
                        pagemsg,
                        headword_template=headword_template)
                    decl_templates = [decl_template]

            elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
                    x for x in headword_templates if getparam(x, "3") == "-"
            ]:
                return [("1", wordlink), ("2", "$")], False, None, None
            else:
                pagemsg(
                    "WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        if len(decl_templates) == 1:
            decl_template = decl_templates[0]
        else:
            # Multiple decl templates
            for t in decl_templates:
                if unicode(t.name) == "ru-decl-adj" and re.search(
                        u"(ий|ый|ой)$", lemma):
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    decl_template = t
                    break
            else:
                if lemma in use_given_decl:
                    overriding_decl = use_given_decl[lemma]
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                        % (wordind, overriding_decl, lemma, infl))
                    decl_template = blib.parse_text(
                        overriding_decl).filter_templates()[0]
                elif pagetitle in use_given_page_decl:
                    overriding_decl = use_given_page_decl[pagetitle].get(
                        lemma, None)
                    if not overriding_decl:
                        pagemsg(
                            "WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return
                    else:
                        pagemsg(
                            "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                            % (wordind, overriding_decl, lemma, infl))
                        decl_template = blib.parse_text(
                            overriding_decl).filter_templates()[0]
                else:
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None

        pagemsg("find_decl_args: Using decl template: %s" %
                unicode(decl_template))
        if unicode(decl_template.name) == "ru-decl-adj":
            if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
                return [("1", wordlink), ("2", u"+ь")], True, None, None
            else:
                return [("1", wordlink), ("2", "+")], True, None, None

        # ru-noun-table
        assert unicode(decl_template.name) == "ru-noun-table"

        # Split out the arg sets in the declension and check the
        # lemma of each one, taking care to handle cases where there is no lemma
        # (it would default to the page name).

        highest_numbered_param = 0
        for p in decl_template.params:
            pname = unicode(p.name)
            if re.search("^[0-9]+$", pname):
                highest_numbered_param = max(highest_numbered_param,
                                             int(pname))

        # Now gather the numbered arguments into arg sets. Code taken from
        # ru-noun.lua.
        offset = 0
        arg_sets = []
        arg_set = []
        for i in xrange(1, highest_numbered_param + 2):
            end_arg_set = False
            val = getparam(decl_template, str(i))
            if i == highest_numbered_param + 1:
                end_arg_set = True
            elif val == "_" or val == "-" or re.search("^join:", val):
                pagemsg(
                    "WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
            elif val == "or":
                end_arg_set = True

            if end_arg_set:
                arg_sets.append(arg_set)
                arg_set = []
                offset = i
            else:
                arg_set.append(val)

        canon_infl = rulib.remove_accents(infl).lower()
        canon_lemma = lemma.lower()
        ispl = False
        need_sc1 = False
        found_gender = None
        if canon_infl != canon_lemma:
            for sgend, plend, gender, is_sc1 in pl_data:
                if sgend:
                    check_sgend = sgend
                else:
                    check_sgend = consonant_re
                if re.search(check_sgend + "$",
                             canon_lemma) and canon_infl == re.sub(
                                 sgend + "$", plend, canon_lemma):
                    ispl = True
                    found_gender = gender
                    need_sc1 = is_sc1
                    break
            else:
                pagemsg(
                    "WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        # Substitute the wordlink for any lemmas in the declension.
        # If plural, also add gender and verify special case (1) as necessary.
        # Concatenate all the numbered params, substituting the wordlink into
        # the lemma as necessary.
        numbered_params = []
        for arg_set in arg_sets:
            lemma_arg = 0
            if len(arg_set) > 0 and runounlib.arg1_is_stress(arg_set[0]):
                lemma_arg = 1
            if len(arg_set) <= lemma_arg:
                arg_set.append("")
            arglemma = arg_set[lemma_arg]
            manualtr = ""
            if "//" in arglemma:
                arglemma, manualtr = re.search("^(.*?)(//.*?)$",
                                               arglemma).groups()
            if (not arglemma or arglemma.lower() == infl.lower()
                    or rulib.is_monosyllabic(infl)
                    and rulib.remove_accents(arglemma).lower()
                    == rulib.remove_accents(infl).lower() or ispl and
                    rulib.remove_accents(arglemma).lower() == lemma.lower()):
                arg_set[lemma_arg] = wordlink + manualtr
            else:
                pagemsg(
                    "WARNING: Can't sub word link %s into decl lemma %s%s" %
                    (wordlink, arg_set[lemma_arg], ispl and ", skipping"
                     or ""))
                if ispl:
                    return None

            if ispl:
                # Add the gender
                if len(arg_set) <= lemma_arg + 1:
                    arg_set.append("")
                declarg = arg_set[lemma_arg + 1]

                # First, sub in gender
                m = re.search("(3f|[mfn])", declarg)
                if found_gender == "mf":
                    if not m:
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s"
                            % (wordinfl, lemma, infl))
                        return None
                    decl_gender = m.group(1)
                    if decl_gender == "n":
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordinfl, lemma, infl))
                        return None
                    elif decl_gender in ["m", "3f"]:
                        pagemsg(
                            u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                            % (decl_gender, wordind, lemma, infl))
                    else:
                        assert gender == "f"
                        pagemsg(
                            u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        declarg = re.sub("f", "3f", declarg, 1)
                else:
                    if m:
                        decl_gender = m.group(1)
                        if decl_gender == found_gender:
                            pagemsg(
                                "Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                                % (found_gender, wordind, lemma, infl))
                        else:
                            pagemsg(
                                "WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s"
                                % (decl_gender, wordind, found_gender, lemma,
                                   infl))
                            declarg = re.sub("(3f|[mfn])", found_gender,
                                             declarg, 1)
                    else:
                        pagemsg(
                            "No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s"
                            % (wordind, found_gender, lemma, infl))
                        declarg = found_gender + declarg

                # Now check special case 1
                if need_sc1 != ("(1)" in declarg):
                    if need_sc1:
                        pagemsg(
                            "WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None
                    else:
                        pagemsg(
                            "WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None

                arg_set[lemma_arg + 1] = declarg

            if numbered_params:
                numbered_params.append("or")
            numbered_params.extend(arg_set)

        # Now gather all params, including named ones.
        params = []
        params.extend(
            (str(i + 1), val)
            for i, val in zip(xrange(len(numbered_params)), numbered_params))
        num = None
        anim = None
        for p in decl_template.params:
            pname = unicode(p.name)
            val = unicode(p.value)
            if pname == "a":
                anim = val
            elif pname == "n":
                num = val
            elif pname == "notes":
                params.append((pname, val))
            elif pname == "title":
                pagemsg(
                    "WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s"
                    % (wordind, lemma, infl, val))
            elif re.search("^[0-9]+$", pname):
                pass
            else:
                keepparam = True
                if pname == "loc":
                    if pagetitle in keep_locative:
                        pagemsg(
                            "Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    else:
                        pagemsg(
                            "WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                        keepparam = False
                if pname == "par":
                    pagemsg(
                        "WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if pname == "voc":
                    pagemsg(
                        "WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if keepparam:
                    if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U):
                        pagemsg(
                            u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    pname += str(wordind)
                    params.append((pname, val))
Example #3
0
def process_page(index, page, save, verbose):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  origtext = page.text
  parsed = blib.parse_text(origtext)

  # Find the declension arguments for LEMMA and inflected form INFL,
  # the WORDINDth word in the expression. Return value is a tuple of
  # four items: a list of (NAME, VALUE) tuples for the arguments, whether
  # the word is an adjective, the value of n= (if given), and the value
  # of a= (if given).
  def find_decl_args(lemma, infl, wordind):
    declpage = pywikibot.Page(site, lemma)
    if ru.remove_accents(infl) == lemma:
      wordlink = "[[%s]]" % infl
    else:
      wordlink = "[[%s|%s]]" % (lemma, infl)

    if not declpage.exists():
      if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
        pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return [("1", wordlink), ("2", "+")], True, None, None
      else:
        pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
    parsed = blib.parse_text(declpage.text)
    decl_templates = []
    headword_templates = []
    decl_z_templates = []
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      if tname in ["ru-noun-table", "ru-decl-adj"]:
        pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
        decl_templates.append(t)
      if tname in ["ru-noun", "ru-proper noun"]:
        pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
        headword_templates.append(t)
      if tname in ["ru-decl-noun-z"]:
        pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
        decl_z_templates.append(t)

    if not decl_templates:
      if decl_z_templates:
        # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
        # {{ru-decl-noun-z|ёж|m-inan|b}}
        if len(decl_z_templates) > 1:
          pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          return None
        else:
          decl_z_template = decl_z_templates[0]
          headword_template = None
          pagemsg("find_decl_args: Using z-decl template: %s" %
              unicode(decl_z_template))
          if len(headword_templates) == 0:
            pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          elif len(headword_templates) > 1:
            pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          else:
            headword_template = headword_templates[0]
            pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" %
                (wordind, lemma, infl, unicode(headword_template),
                  unicode(decl_z_template)))
          decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template,
              subpagetitle, pagemsg, headword_template=headword_template)
          decl_templates = [decl_template]

      elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
        x for x in headword_templates if getparam(x, "3") == "-"]:
        return [("1", wordlink), ("2", "$")], False, None, None
      else:
        pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    if len(decl_templates) == 1:
      decl_template = decl_templates[0]
    else:
      # Multiple decl templates
      for t in decl_templates:
        if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma):
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          decl_template = t
          break
      else:
        if lemma in use_given_decl:
          overriding_decl = use_given_decl[lemma]
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
              (wordind, overriding_decl, lemma, infl))
          decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        elif pagetitle in use_given_page_decl:
          overriding_decl = use_given_page_decl[pagetitle].get(lemma, None)
          if not overriding_decl:
            pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
            return
          else:
            pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
                (wordind, overriding_decl, lemma, infl))
            decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        else:
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
          return None

    pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
    if unicode(decl_template.name) == "ru-decl-adj":
      if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
        return [("1", wordlink), ("2", u"+ь")], True, None, None
      else:
        return [("1", wordlink), ("2", "+")], True, None, None

    # ru-noun-table
    assert unicode(decl_template.name) == "ru-noun-table"

    # Split out the arg sets in the declension and check the
    # lemma of each one, taking care to handle cases where there is no lemma
    # (it would default to the page name).

    highest_numbered_param = 0
    for p in decl_template.params:
      pname = unicode(p.name)
      if re.search("^[0-9]+$", pname):
        highest_numbered_param = max(highest_numbered_param, int(pname))

    # Now gather the numbered arguments into arg sets. Code taken from
    # ru-noun.lua.
    offset = 0
    arg_sets = []
    arg_set = []
    for i in xrange(1, highest_numbered_param + 2):
      end_arg_set = False
      val = getparam(decl_template, str(i))
      if i == highest_numbered_param + 1:
        end_arg_set = True
      elif val == "_" or val == "-" or re.search("^join:", val):
        pagemsg("WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
      elif val == "or":
        end_arg_set = True

      if end_arg_set:
        arg_sets.append(arg_set)
        arg_set = []
        offset = i
      else:
        arg_set.append(val)

    canon_infl = ru.remove_accents(infl).lower()
    canon_lemma = lemma.lower()
    ispl = False
    need_sc1 = False
    found_gender = None
    if canon_infl != canon_lemma:
      for sgend, plend, gender, is_sc1 in pl_data:
        if sgend:
          check_sgend = sgend
        else:
          check_sgend = consonant_re
        if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(sgend + "$", plend, canon_lemma):
          ispl = True
          found_gender = gender
          need_sc1 = is_sc1
          break
      else:
        pagemsg("WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    # Substitute the wordlink for any lemmas in the declension.
    # If plural, also add gender and verify special case (1) as necessary.
    # Concatenate all the numbered params, substituting the wordlink into
    # the lemma as necessary.
    numbered_params = []
    for arg_set in arg_sets:
      lemma_arg = 0
      if len(arg_set) > 0 and runoun.arg1_is_stress(arg_set[0]):
        lemma_arg = 1
      if len(arg_set) <= lemma_arg:
        arg_set.append("")
      arglemma = arg_set[lemma_arg]
      manualtr = ""
      if "//" in arglemma:
        arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups()
      if (not arglemma or arglemma.lower() == infl.lower() or
          ru.is_monosyllabic(infl) and ru.remove_accents(arglemma).lower() ==
          ru.remove_accents(infl).lower() or
          ispl and ru.remove_accents(arglemma).lower() == lemma.lower()
          ):
        arg_set[lemma_arg] = wordlink + manualtr
      else:
        pagemsg("WARNING: Can't sub word link %s into decl lemma %s%s" % (
          wordlink, arg_set[lemma_arg], ispl and ", skipping" or ""))
        if ispl:
          return None

      if ispl:
        # Add the gender
        if len(arg_set) <= lemma_arg + 1:
          arg_set.append("")
        declarg = arg_set[lemma_arg + 1]

        # First, sub in gender
        m = re.search("(3f|[mfn])", declarg)
        if found_gender == "mf":
          if not m:
            pagemsg(u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          decl_gender = m.group(1)
          if decl_gender == "n":
            pagemsg(u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          elif decl_gender in ["m", "3f"]:
            pagemsg(u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                (decl_gender, wordind, lemma, infl))
          else:
            assert gender == "f"
            pagemsg(u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s" %
                (wordind, lemma, infl))
            declarg = re.sub("f", "3f", declarg, 1)
        else:
          if m:
            decl_gender = m.group(1)
            if decl_gender == found_gender:
              pagemsg("Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                  (found_gender, wordind, lemma, infl))
            else:
              pagemsg("WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s" %
                  (decl_gender, wordind, found_gender, lemma, infl))
              declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1)
          else:
            pagemsg("No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s" %
                (wordind, found_gender, lemma, infl))
            declarg = found_gender + declarg

        # Now check special case 1
        if need_sc1 != ("(1)" in declarg):
          if need_sc1:
            pagemsg("WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s" % (
              wordind, declarg, lemma, infl))
            return None
          else:
            pagemsg("WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s" % (
              wordind, declarg, lemma, infl))
            return None

        arg_set[lemma_arg + 1] = declarg

      if numbered_params:
        numbered_params.append("or")
      numbered_params.extend(arg_set)

    # Now gather all params, including named ones.
    params = []
    params.extend((str(i+1), val) for i, val in zip(xrange(len(numbered_params)), numbered_params))
    num = None
    anim = None
    for p in decl_template.params:
      pname = unicode(p.name)
      val = unicode(p.value)
      if pname == "a":
        anim = val
      elif pname == "n":
        num = val
      elif pname == "notes":
        params.append((pname, val))
      elif pname == "title":
        pagemsg("WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s" %
            (wordind, lemma, infl, val))
      elif re.search("^[0-9]+$", pname):
        pass
      else:
        keepparam = True
        if pname == "loc":
          if pagetitle in keep_locative:
            pagemsg("Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          else:
            pagemsg("WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
            keepparam = False
        if pname == "par":
          pagemsg("WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          keepparam = False
        if pname == "voc":
          pagemsg("WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          keepparam = False
        if keepparam:
          if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U):
            pagemsg(u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s" %
                (wordind, val, lemma, infl))
          pname += str(wordind)
          params.append((pname, val))
Example #4
0
def process_page_section(index, page, section, verbose):
    """Convert an old-style noun headword in SECTION to ru-noun+/ru-proper noun+.

    Looks for exactly one ru-noun / ru-proper noun headword template together
    with a ru-noun-table and/or ru-noun-old declension template, copies the
    declension parameters into the headword and renames the headword
    template to its "+" variant.

    index -- page index; only used as a prefix in log messages.
    page -- page object; title() and exists() are called on it.
    section -- wikitext of the section to process.
    verbose -- enables extra log output and is forwarded to
        blib.expand_text().

    Returns None when the section is skipped (a message is logged first).
    Otherwise returns a 5-tuple: the (possibly modified) section text,
    1/0 for whether a ru-noun headword was converted, 1/0 for whether a
    ru-proper noun headword was converted, 1/0 for whether a=bi was
    replaced in a declension template, and a list holding the manual
    translit transferred to the declension (empty if none). When there is
    no declension template or no headword template, the text is returned
    with zero counts and an empty list.
    """
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
        return None

    parsed = blib.parse_text(section)

    noun_table_templates = []
    noun_old_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-decl-noun-see":
            pagemsg("Found ru-decl-noun-see, skipping")
            return None

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-noun-table":
            noun_table_templates.append(t)
        if unicode(t.name) == "ru-noun-old":
            noun_old_templates.append(t)

    if len(noun_table_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
        return None
    if len(noun_old_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
        return None
    # Nothing to do when the section has no declension template at all.
    if not noun_table_templates and not noun_old_templates:
        return unicode(parsed), 0, 0, 0, []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
            pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
            return None

    headword_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
            headword_templates.append(t)

    if len(headword_templates) > 1:
        pagemsg("WARNING: Found multiple headword templates, skipping")
        return None
    if len(headword_templates) < 1:
        return unicode(parsed), 0, 0, 0, []

    noun_table_template = noun_table_templates[0] if len(
        noun_table_templates) == 1 else None
    noun_old_template = noun_old_templates[0] if len(
        noun_old_templates) == 1 else None
    # When only ru-noun-old is present, treat it as the main declension
    # template so the code below always has a noun_table_template.
    if noun_old_template and not noun_table_template:
        noun_table_template = noun_old_template
        noun_old_template = None
    headword_template = headword_templates[0]
    frobbed_manual_translit = []
    decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

    if verbose:
        pagemsg("Found headword template: %s" % unicode(headword_template))
        pagemsg("Found decl template: %s" % unicode(noun_table_template))
        if noun_old_template:
            pagemsg("Found old decl template: %s" % unicode(noun_old_template))

    # Retrieve headword translit and maybe transfer to decl
    headword_tr = getparam(headword_template, "tr")
    if headword_tr:
        if verbose:
            pagemsg("Found headword manual translit tr=%s" % headword_tr)
        if "," in headword_tr:
            pagemsg(
                "WARNING: Comma in headword manual translit, skipping: %s" %
                headword_tr)
            return None
        # Punt if multi-arg-set, can't handle yet
        for decl_template in decl_templates:
            for param in decl_template.params:
                # Only positional (unnamed) params can be arg-set or word
                # separators.
                if not param.showkey:
                    val = unicode(param.value)
                    if val == "or":
                        pagemsg(
                            "WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s"
                            % unicode(decl_template))
                        return None
                    if val == "-" or val == "_" or val.startswith("join:"):
                        pagemsg(
                            "WARNING: Manual translit and multi-word templates, can't handle, skipping: %s"
                            % unicode(decl_template))
                        return None
            # Additional translits tr2..tr9 on the headword aren't supported.
            for i in xrange(2, 10):
                if getparam(headword_template, "tr%s" % i):
                    pagemsg(
                        "WARNING: Headword template has translit param tr%s, can't handle, skipping: %s"
                        % (i, unicode(headword_template)))
                    return None
            # The lemma is param 2 when param 1 is a stress pattern.
            if runounlib.arg1_is_stress(getparam(decl_template, "1")):
                lemma_arg = "2"
            else:
                lemma_arg = "1"
            lemmaval = getparam(decl_template, lemma_arg)
            # An empty lemma defaults to the (sub)page title.
            if not lemmaval:
                lemmaval = subpagetitle
            if "//" in lemmaval:
                # The decl already carries a translit; it must agree with
                # the headword's.
                m = re.search("^(.*?)//(.*)$", lemmaval)
                if m.group(2) != headword_tr:
                    pagemsg(
                        "WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping"
                        % (lemmaval, headword_tr))
                    return None
                else:
                    pagemsg(
                        "Already found manual translit in decl template %s" %
                        lemmaval)
            else:
                lemmaval += "//" + headword_tr
                orig_decl_template = unicode(decl_template)
                decl_template.add(lemma_arg, lemmaval)
                pagemsg("Replacing decl %s with %s" %
                        (orig_decl_template, unicode(decl_template)))
                frobbed_manual_translit = [headword_tr]

    genders = blib.fetch_param_chain(headword_template, "2", "g")

    bian_replaced = 0

    # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
    # headword template
    for decl_template in decl_templates:
        if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
            # Indices of the first inanimate and first animate gender spec,
            # or -1 when none was seen.
            saw_in = -1
            saw_an = -1
            for i, g in enumerate(genders):
                if re.search(r"\bin\b", g) and saw_in < 0:
                    saw_in = i
                if re.search(r"\ban\b", g) and saw_an < 0:
                    saw_an = i
            if saw_in >= 0 and saw_an >= 0:
                orig_decl_template = unicode(decl_template)
                if saw_in < saw_an:
                    pagemsg("Replacing a=bi with a=ia in decl template")
                    decl_template.add("a", "ia")
                    bian_replaced = 1
                else:
                    pagemsg("Replacing a=bi with a=ai in decl template")
                    decl_template.add("a", "ai")
                    bian_replaced = 1
                pagemsg("Replacing decl %s with %s" %
                        (orig_decl_template, unicode(decl_template)))

    # Turn the declension template call into a ru-generate-noun-args call
    # (adding old=1 for ru-noun-old) and expand it to get the noun args.
    generate_template = re.sub(
        r"^\{\{ru-noun-old", "{{ru-generate-noun-args|old=1",
        re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
               unicode(noun_table_template)))
    generate_result = expand_text(generate_template)
    if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
    args = blib.split_generate_args(generate_result)

    # A None result from the helper means "skip this section".
    genders = runounlib.check_old_noun_headword_forms(headword_template, args,
                                                      subpagetitle, pagemsg)
    if genders is None:
        return None

    new_params = []
    for param in noun_table_template.params:
        new_params.append((param.name, param.value))

    orig_headword_template = unicode(headword_template)
    params_to_preserve = runounlib.fix_old_headword_params(
        headword_template, new_params, genders, pagemsg)
    if params_to_preserve is None:
        return None

    if unicode(headword_template.name) == "ru-proper noun":
        # If proper noun and n is both then we need to add n=both because
        # proper noun+ defaults to n=sg
        if args["n"] == "b" and not getparam(headword_template, "n"):
            pagemsg("Adding n=both to headword tempate")
            headword_template.add("n", "both")
        # Correspondingly, if n is sg then we can usually remove n=sg;
        # but we need to check that the number is actually sg with n=sg
        # removed because of the possibility of plurale tantum lemmas
        if args["n"] == "s":
            # Regenerate the args with any n=s... removed and ndef=sg added,
            # to see what number results without the explicit n=.
            generate_template_with_ndef = generate_template.replace(
                "}}", "|ndef=sg}}")
            generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
                                                 generate_template_with_ndef)
            generate_result = expand_text(generate_template_with_ndef)
            if not generate_result:
                pagemsg("WARNING: Error generating noun args, skipping")
                return None
            ndef_args = blib.split_generate_args(generate_result)
            if ndef_args["n"] == "s":
                existing_n = getparam(headword_template, "n")
                if existing_n and not re.search(r"^s", existing_n):
                    pagemsg(
                        "WARNING: Something wrong: Found n=%s, not singular" %
                        existing_n)
                else:
                    pagemsg("Removing n=sg from headword tempate")
                    rmparam(headword_template, "n")
            else:
                pagemsg(
                    "WARNING: Unable to remove n= from headword template because n=%s"
                    % ndef_args["n"])

    # Re-attach the preserved params and rename the headword template to
    # its "+" variant.
    headword_template.params.extend(params_to_preserve)
    ru_noun_changed = 0
    ru_proper_noun_changed = 0
    if unicode(headword_template.name) == "ru-noun":
        headword_template.name = "ru-noun+"
        ru_noun_changed = 1
    else:
        headword_template.name = "ru-proper noun+"
        ru_proper_noun_changed = 1
    if unicode(noun_table_template).startswith("{{ru-noun-old"):
        headword_template.add("old", "1")

    pagemsg("Replacing headword %s with %s" %
            (orig_headword_template, unicode(headword_template)))

    return unicode(
        parsed
    ), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit