def check_need_accent(text):
  """Return True if TEXT contains at least one word that still needs an
  accent mark: a word (after stripping wiki links) that carries neither a
  combining acute (U+0301) nor the letter ё, and is not monosyllabic."""
  for raw_word in re.split(" +", text):
    word = blib.remove_links(raw_word)
    already_accented = u"\u0301" in word or u"ё" in word
    if not already_accented and not ru.is_monosyllabic(word):
      return True
  return False
# Example #2
# 0
def process_page(index, page, save, verbose):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  origtext = page.text
  parsed = blib.parse_text(origtext)

  # Find the declension arguments for LEMMA and inflected form INFL,
  # the WORDINDth word in the expression. Return value is a tuple of
  # four items: a list of (NAME, VALUE) tuples for the arguments, whether
  # the word is an adjective, the value of n= (if given), and the value
  # of a= (if given).
  def find_decl_args(lemma, infl, wordind):
    declpage = pywikibot.Page(site, lemma)
    if ru.remove_accents(infl) == lemma:
      wordlink = "[[%s]]" % infl
    else:
      wordlink = "[[%s|%s]]" % (lemma, infl)

    if not declpage.exists():
      if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
        pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return [("1", wordlink), ("2", "+")], True, None, None
      else:
        pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
    parsed = blib.parse_text(declpage.text)
    decl_templates = []
    headword_templates = []
    decl_z_templates = []
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      if tname in ["ru-noun-table", "ru-decl-adj"]:
        pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
        decl_templates.append(t)
      if tname in ["ru-noun", "ru-proper noun"]:
        pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
        headword_templates.append(t)
      if tname in ["ru-decl-noun-z"]:
        pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
        decl_z_templates.append(t)

    if not decl_templates:
      if decl_z_templates:
        # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
        # {{ru-decl-noun-z|ёж|m-inan|b}}
        if len(decl_z_templates) > 1:
          pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          return None
        else:
          decl_z_template = decl_z_templates[0]
          headword_template = None
          pagemsg("find_decl_args: Using z-decl template: %s" %
              unicode(decl_z_template))
          if len(headword_templates) == 0:
            pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          elif len(headword_templates) > 1:
            pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" %
                (wordind, lemma, infl, unicode(decl_z_template)))
          else:
            headword_template = headword_templates[0]
            pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" %
                (wordind, lemma, infl, unicode(headword_template),
                  unicode(decl_z_template)))
          decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template,
              subpagetitle, pagemsg, headword_template=headword_template)
          decl_templates = [decl_template]

      elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
        x for x in headword_templates if getparam(x, "3") == "-"]:
        return [("1", wordlink), ("2", "$")], False, None, None
      else:
        pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    if len(decl_templates) == 1:
      decl_template = decl_templates[0]
    else:
      # Multiple decl templates
      for t in decl_templates:
        if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma):
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
          decl_template = t
          break
      else:
        if lemma in use_given_decl:
          overriding_decl = use_given_decl[lemma]
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
              (wordind, overriding_decl, lemma, infl))
          decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        elif pagetitle in use_given_page_decl:
          overriding_decl = use_given_page_decl[pagetitle].get(lemma, None)
          if not overriding_decl:
            pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
            return
          else:
            pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" %
                (wordind, overriding_decl, lemma, infl))
            decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
        else:
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" %
              (wordind, lemma, infl))
          return None

    pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
    if unicode(decl_template.name) == "ru-decl-adj":
      if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
        return [("1", wordlink), ("2", u"+ь")], True, None, None
      else:
        return [("1", wordlink), ("2", "+")], True, None, None

    # ru-noun-table
    assert unicode(decl_template.name) == "ru-noun-table"

    # Split out the arg sets in the declension and check the
    # lemma of each one, taking care to handle cases where there is no lemma
    # (it would default to the page name).

    highest_numbered_param = 0
    for p in decl_template.params:
      pname = unicode(p.name)
      if re.search("^[0-9]+$", pname):
        highest_numbered_param = max(highest_numbered_param, int(pname))

    # Now gather the numbered arguments into arg sets. Code taken from
    # ru-noun.lua.
    offset = 0
    arg_sets = []
    arg_set = []
    for i in xrange(1, highest_numbered_param + 2):
      end_arg_set = False
      val = getparam(decl_template, str(i))
      if i == highest_numbered_param + 1:
        end_arg_set = True
      elif val == "_" or val == "-" or re.search("^join:", val):
        pagemsg("WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None
      elif val == "or":
        end_arg_set = True

      if end_arg_set:
        arg_sets.append(arg_set)
        arg_set = []
        offset = i
      else:
        arg_set.append(val)

    canon_infl = ru.remove_accents(infl).lower()
    canon_lemma = lemma.lower()
    ispl = False
    need_sc1 = False
    found_gender = None
    if canon_infl != canon_lemma:
      for sgend, plend, gender, is_sc1 in pl_data:
        if sgend:
          check_sgend = sgend
        else:
          check_sgend = consonant_re
        if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(sgend + "$", plend, canon_lemma):
          ispl = True
          found_gender = gender
          need_sc1 = is_sc1
          break
      else:
        pagemsg("WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s" %
            (wordind, lemma, infl))
        return None

    # Substitute the wordlink for any lemmas in the declension.
    # If plural, also add gender and verify special case (1) as necessary.
    # Concatenate all the numbered params, substituting the wordlink into
    # the lemma as necessary.
    numbered_params = []
    for arg_set in arg_sets:
      lemma_arg = 0
      if len(arg_set) > 0 and runoun.arg1_is_stress(arg_set[0]):
        lemma_arg = 1
      if len(arg_set) <= lemma_arg:
        arg_set.append("")
      arglemma = arg_set[lemma_arg]
      manualtr = ""
      if "//" in arglemma:
        arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups()
      if (not arglemma or arglemma.lower() == infl.lower() or
          ru.is_monosyllabic(infl) and ru.remove_accents(arglemma).lower() ==
          ru.remove_accents(infl).lower() or
          ispl and ru.remove_accents(arglemma).lower() == lemma.lower()
          ):
        arg_set[lemma_arg] = wordlink + manualtr
      else:
        pagemsg("WARNING: Can't sub word link %s into decl lemma %s%s" % (
          wordlink, arg_set[lemma_arg], ispl and ", skipping" or ""))
        if ispl:
          return None

      if ispl:
        # Add the gender
        if len(arg_set) <= lemma_arg + 1:
          arg_set.append("")
        declarg = arg_set[lemma_arg + 1]

        # First, sub in gender
        m = re.search("(3f|[mfn])", declarg)
        if found_gender == "mf":
          if not m:
            pagemsg(u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          decl_gender = m.group(1)
          if decl_gender == "n":
            pagemsg(u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s" %
                (wordinfl, lemma, infl))
            return None
          elif decl_gender in ["m", "3f"]:
            pagemsg(u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                (decl_gender, wordind, lemma, infl))
          else:
            assert gender == "f"
            pagemsg(u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s" %
                (wordind, lemma, infl))
            declarg = re.sub("f", "3f", declarg, 1)
        else:
          if m:
            decl_gender = m.group(1)
            if decl_gender == found_gender:
              pagemsg("Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" %
                  (found_gender, wordind, lemma, infl))
            else:
              pagemsg("WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s" %
                  (decl_gender, wordind, found_gender, lemma, infl))
              declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1)
          else:
            pagemsg("No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s" %
                (wordind, found_gender, lemma, infl))
            declarg = found_gender + declarg

        # Now check special case 1
        if need_sc1 != ("(1)" in declarg):
          if need_sc1:
            pagemsg("WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s" % (
              wordind, declarg, lemma, infl))
            return None
          else:
            pagemsg("WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s" % (
              wordind, declarg, lemma, infl))
            return None

        arg_set[lemma_arg + 1] = declarg

      if numbered_params:
        numbered_params.append("or")
      numbered_params.extend(arg_set)

    # Now gather all params, including named ones.
    params = []
    params.extend((str(i+1), val) for i, val in zip(xrange(len(numbered_params)), numbered_params))
    num = None
    anim = None
    for p in decl_template.params:
      pname = unicode(p.name)
      val = unicode(p.value)
      if pname == "a":
        anim = val
      elif pname == "n":
        num = val
      elif pname == "notes":
        params.append((pname, val))
      elif pname == "title":
        pagemsg("WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s" %
            (wordind, lemma, infl, val))
      elif re.search("^[0-9]+$", pname):
        pass
      else:
        keepparam = True
        if pname == "loc":
          if pagetitle in keep_locative:
            pagemsg("Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          else:
            pagemsg("WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
            keepparam = False
        if pname == "par":
          pagemsg("WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          keepparam = False
        if pname == "voc":
          pagemsg("WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s" % (
            wordind, val, lemma, infl))
          keepparam = False
        if keepparam:
          if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U):
            pagemsg(u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s" %
                (wordind, val, lemma, infl))
          pname += str(wordind)
          params.append((pname, val))
# Example #3
# 0
def process_page(page, index, parsed):
    global args
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping")
        return

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    origtext = page.text
    parsed = blib.parse_text(origtext)

    # Find the declension arguments for LEMMA and inflected form INFL,
    # the WORDINDth word in the expression. Return value is a tuple of
    # four items: a list of (NAME, VALUE) tuples for the arguments, whether
    # the word is an adjective, the value of n= (if given), and the value
    # of a= (if given).
    def find_decl_args(lemma, infl, wordind):
        declpage = pywikibot.Page(site, lemma)
        if rulib.remove_accents(infl) == lemma:
            wordlink = "[[%s]]" % infl
        else:
            wordlink = "[[%s|%s]]" % (lemma, infl)

        if not declpage.exists():
            if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
                pagemsg(
                    "WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return [("1", wordlink), ("2", "+")], True, None, None
            else:
                pagemsg(
                    "WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
        parsed = blib.parse_text(declpage.text)
        decl_templates = []
        headword_templates = []
        decl_z_templates = []
        for t in parsed.filter_templates():
            tname = unicode(t.name)
            if tname in ["ru-noun-table", "ru-decl-adj"]:
                pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
                decl_templates.append(t)
            if tname in ["ru-noun", "ru-proper noun"]:
                pagemsg("find_decl_args: Found headword template: %s" %
                        unicode(t))
                headword_templates.append(t)
            if tname in ["ru-decl-noun-z"]:
                pagemsg("find_decl_args: Found z-decl template: %s" %
                        unicode(t))
                decl_z_templates.append(t)

        if not decl_templates:
            if decl_z_templates:
                # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
                # {{ru-decl-noun-z|ёж|m-inan|b}}
                if len(decl_z_templates) > 1:
                    pagemsg(
                        "WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None
                else:
                    decl_z_template = decl_z_templates[0]
                    headword_template = None
                    pagemsg("find_decl_args: Using z-decl template: %s" %
                            unicode(decl_z_template))
                    if len(headword_templates) == 0:
                        pagemsg(
                            "WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    elif len(headword_templates) > 1:
                        pagemsg(
                            "WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s"
                            % (wordind, lemma, infl, unicode(decl_z_template)))
                    else:
                        headword_template = headword_templates[0]
                        pagemsg(
                            "find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s"
                            %
                            (wordind, lemma, infl, unicode(headword_template),
                             unicode(decl_z_template)))
                    decl_template = runounlib.convert_zdecl_to_ru_noun_table(
                        decl_z_template,
                        subpagetitle,
                        pagemsg,
                        headword_template=headword_template)
                    decl_templates = [decl_template]

            elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
                    x for x in headword_templates if getparam(x, "3") == "-"
            ]:
                return [("1", wordlink), ("2", "$")], False, None, None
            else:
                pagemsg(
                    "WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        if len(decl_templates) == 1:
            decl_template = decl_templates[0]
        else:
            # Multiple decl templates
            for t in decl_templates:
                if unicode(t.name) == "ru-decl-adj" and re.search(
                        u"(ий|ый|ой)$", lemma):
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    decl_template = t
                    break
            else:
                if lemma in use_given_decl:
                    overriding_decl = use_given_decl[lemma]
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                        % (wordind, overriding_decl, lemma, infl))
                    decl_template = blib.parse_text(
                        overriding_decl).filter_templates()[0]
                elif pagetitle in use_given_page_decl:
                    overriding_decl = use_given_page_decl[pagetitle].get(
                        lemma, None)
                    if not overriding_decl:
                        pagemsg(
                            "WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        return
                    else:
                        pagemsg(
                            "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s"
                            % (wordind, overriding_decl, lemma, infl))
                        decl_template = blib.parse_text(
                            overriding_decl).filter_templates()[0]
                else:
                    pagemsg(
                        "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s"
                        % (wordind, lemma, infl))
                    return None

        pagemsg("find_decl_args: Using decl template: %s" %
                unicode(decl_template))
        if unicode(decl_template.name) == "ru-decl-adj":
            if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
                return [("1", wordlink), ("2", u"+ь")], True, None, None
            else:
                return [("1", wordlink), ("2", "+")], True, None, None

        # ru-noun-table
        assert unicode(decl_template.name) == "ru-noun-table"

        # Split out the arg sets in the declension and check the
        # lemma of each one, taking care to handle cases where there is no lemma
        # (it would default to the page name).

        highest_numbered_param = 0
        for p in decl_template.params:
            pname = unicode(p.name)
            if re.search("^[0-9]+$", pname):
                highest_numbered_param = max(highest_numbered_param,
                                             int(pname))

        # Now gather the numbered arguments into arg sets. Code taken from
        # ru-noun.lua.
        offset = 0
        arg_sets = []
        arg_set = []
        for i in xrange(1, highest_numbered_param + 2):
            end_arg_set = False
            val = getparam(decl_template, str(i))
            if i == highest_numbered_param + 1:
                end_arg_set = True
            elif val == "_" or val == "-" or re.search("^join:", val):
                pagemsg(
                    "WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None
            elif val == "or":
                end_arg_set = True

            if end_arg_set:
                arg_sets.append(arg_set)
                arg_set = []
                offset = i
            else:
                arg_set.append(val)

        canon_infl = rulib.remove_accents(infl).lower()
        canon_lemma = lemma.lower()
        ispl = False
        need_sc1 = False
        found_gender = None
        if canon_infl != canon_lemma:
            for sgend, plend, gender, is_sc1 in pl_data:
                if sgend:
                    check_sgend = sgend
                else:
                    check_sgend = consonant_re
                if re.search(check_sgend + "$",
                             canon_lemma) and canon_infl == re.sub(
                                 sgend + "$", plend, canon_lemma):
                    ispl = True
                    found_gender = gender
                    need_sc1 = is_sc1
                    break
            else:
                pagemsg(
                    "WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s"
                    % (wordind, lemma, infl))
                return None

        # Substitute the wordlink for any lemmas in the declension.
        # If plural, also add gender and verify special case (1) as necessary.
        # Concatenate all the numbered params, substituting the wordlink into
        # the lemma as necessary.
        numbered_params = []
        for arg_set in arg_sets:
            lemma_arg = 0
            if len(arg_set) > 0 and runounlib.arg1_is_stress(arg_set[0]):
                lemma_arg = 1
            if len(arg_set) <= lemma_arg:
                arg_set.append("")
            arglemma = arg_set[lemma_arg]
            manualtr = ""
            if "//" in arglemma:
                arglemma, manualtr = re.search("^(.*?)(//.*?)$",
                                               arglemma).groups()
            if (not arglemma or arglemma.lower() == infl.lower()
                    or rulib.is_monosyllabic(infl)
                    and rulib.remove_accents(arglemma).lower()
                    == rulib.remove_accents(infl).lower() or ispl and
                    rulib.remove_accents(arglemma).lower() == lemma.lower()):
                arg_set[lemma_arg] = wordlink + manualtr
            else:
                pagemsg(
                    "WARNING: Can't sub word link %s into decl lemma %s%s" %
                    (wordlink, arg_set[lemma_arg], ispl and ", skipping"
                     or ""))
                if ispl:
                    return None

            if ispl:
                # Add the gender
                if len(arg_set) <= lemma_arg + 1:
                    arg_set.append("")
                declarg = arg_set[lemma_arg + 1]

                # First, sub in gender
                m = re.search("(3f|[mfn])", declarg)
                if found_gender == "mf":
                    if not m:
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s"
                            % (wordinfl, lemma, infl))
                        return None
                    decl_gender = m.group(1)
                    if decl_gender == "n":
                        pagemsg(
                            u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s"
                            % (wordinfl, lemma, infl))
                        return None
                    elif decl_gender in ["m", "3f"]:
                        pagemsg(
                            u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                            % (decl_gender, wordind, lemma, infl))
                    else:
                        assert gender == "f"
                        pagemsg(
                            u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s"
                            % (wordind, lemma, infl))
                        declarg = re.sub("f", "3f", declarg, 1)
                else:
                    if m:
                        decl_gender = m.group(1)
                        if decl_gender == found_gender:
                            pagemsg(
                                "Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s"
                                % (found_gender, wordind, lemma, infl))
                        else:
                            pagemsg(
                                "WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s"
                                % (decl_gender, wordind, found_gender, lemma,
                                   infl))
                            declarg = re.sub("(3f|[mfn])", found_gender,
                                             declarg, 1)
                    else:
                        pagemsg(
                            "No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s"
                            % (wordind, found_gender, lemma, infl))
                        declarg = found_gender + declarg

                # Now check special case 1
                if need_sc1 != ("(1)" in declarg):
                    if need_sc1:
                        pagemsg(
                            "WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None
                    else:
                        pagemsg(
                            "WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s"
                            % (wordind, declarg, lemma, infl))
                        return None

                arg_set[lemma_arg + 1] = declarg

            if numbered_params:
                numbered_params.append("or")
            numbered_params.extend(arg_set)

        # Now gather all params, including named ones.
        params = []
        params.extend(
            (str(i + 1), val)
            for i, val in zip(xrange(len(numbered_params)), numbered_params))
        num = None
        anim = None
        for p in decl_template.params:
            pname = unicode(p.name)
            val = unicode(p.value)
            if pname == "a":
                anim = val
            elif pname == "n":
                num = val
            elif pname == "notes":
                params.append((pname, val))
            elif pname == "title":
                pagemsg(
                    "WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s"
                    % (wordind, lemma, infl, val))
            elif re.search("^[0-9]+$", pname):
                pass
            else:
                keepparam = True
                if pname == "loc":
                    if pagetitle in keep_locative:
                        pagemsg(
                            "Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    else:
                        pagemsg(
                            "WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                        keepparam = False
                if pname == "par":
                    pagemsg(
                        "WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if pname == "voc":
                    pagemsg(
                        "WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s"
                        % (wordind, val, lemma, infl))
                    keepparam = False
                if keepparam:
                    if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U):
                        pagemsg(
                            u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s"
                            % (wordind, val, lemma, infl))
                    pname += str(wordind)
                    params.append((pname, val))
def find_accented_2(term, termtr, verbose, pagemsg):
  """Look up the accented (stressed) form of TERM by reading the Russian
  headwords on its Wiktionary page.

  TERM may carry a manual transliteration TERMTR. Returns a (term, translit)
  tuple: the single accented head found on the page, or the original
  (TERM, TERMTR) unchanged when no lookup is possible (links, brackets,
  already-accented, monosyllabic, missing page) or the lookup is ambiguous
  (multiple distinct heads). Page-lookup results are memoized in the global
  accented_cache, keyed by the accentless page name, unless
  global_disable_cache is set.

  NOTE(review): the `verbose` parameter is unused here; verbosity is
  controlled by the global `semi_verbose` instead -- confirm intended.
  """
  # Certain multisyllabic particles are deliberately left unaccented.
  if term in accentless_multisyllable:
    pagemsg("Not accenting unaccented multisyllabic particle %s" % term)
    return term, termtr
  # This can happen if e.g. we're passed "[[FOO|BAR]] BAZ"; we will reject it,
  # but it will then be word-split and handled correctly ("[[FOO|BAR]]" is
  # special-cased in find_accented_1()).
  if "|" in term:
    #pagemsg("Can't handle links with vertical bars: %s" % term)
    return term, termtr
  # This can happen if e.g. we're passed "[[FOO]] [[BAR]]"; we will reject it,
  # but it will then be word-split and handled correctly ("[[FOO]]" is
  # special-cased in find_accented_1()).
  if "[" in term or "]" in term:
    #pagemsg("Can't handle stray bracket in %s" % term)
    return term, termtr
  if "<" in term or ">" in term:
    pagemsg("Can't handle stray < or >: %s" % term)
    return term, termtr
  # A term containing an acute accent (U+0301) or ё is already accented.
  if u"\u0301" in term or u"ё" in term:
    pagemsg(u"Term has accent or ё, not looking up accents: %s" % term)
    return term, termtr
  # Monosyllabic terms don't need an accent mark.
  if ru.is_monosyllabic(term):
    pagemsg("Term is monosyllabic, not looking up accents: %s" % term)
    return term, termtr
  # The Wiktionary page name is the accentless form of the term.
  pagename = ru.remove_accents(term)
  # We can't use expand_text() from find_accented_1() because it has a
  # different value for PAGENAME, and the proper value is important in
  # expanding ru-noun+ and ru-proper noun+.
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagename, pagemsg, semi_verbose)

  # Look up the page
  if semi_verbose:
    pagemsg("find_accented: Finding heads on page %s" % pagename)

  cached_redirect = False
  global num_cache_lookups
  num_cache_lookups += 1
  if pagename in accented_cache:
    # Cache hit. Cached values are: None (page doesn't exist), the string
    # "redirect" (redirect page with no heads), or a (heads, saw_head) tuple.
    global num_cache_hits
    num_cache_hits += 1
    result = accented_cache[pagename]
    cached = True
    if result is None:
      if semi_verbose:
        pagemsg("find_accented: Page %s doesn't exist (cached)" % pagename)
      return term, termtr
    elif result == "redirect":
      cached_redirect = True
      heads = set()
      saw_head = False
    else:
      heads, saw_head = result
  else:
    # Cache miss: fetch the page and scan its headword templates.
    cached = False
    page = pywikibot.Page(site, pagename)
    try:
      if not page.exists():
        if semi_verbose:
          pagemsg("find_accented: Page %s doesn't exist" % pagename)
        if not global_disable_cache:
          accented_cache[pagename] = None
        return term, termtr
    except Exception as e:
      # Any error during the existence check is treated like a missing page
      # (and cached as such).
      pagemsg("WARNING: Error checking page existence: %s" % unicode(e))
      if not global_disable_cache:
        accented_cache[pagename] = None
      return term, termtr

    # Page exists, find the heads
    heads = set()
    def add(val, tr):
      # Record a head with wiki links stripped; skip if nothing remains.
      val_to_add = blib.remove_links(val)
      if val_to_add:
        heads.add((val_to_add, tr))
    saw_head = False
    for t in blib.parse(page).filter_templates():
      tname = unicode(t.name)
      if tname in ru_head_templates:
        saw_head = True
        if getparam(t, "1"):
          add(getparam(t, "1"), getparam(t, "tr"))
        elif getparam(t, "head"):
          add(getparam(t, "head"), getparam(t, "tr"))
      elif tname == "head" and getparam(t, "1") == "ru":
        saw_head = True
        add(getparam(t, "head"), getparam(t, "tr"))
      elif tname in ["ru-noun+", "ru-proper noun+"]:
        # These templates generate their lemma; expand the template to
        # retrieve it. The result may be several comma-separated lemmas,
        # each possibly paired with a transliteration.
        saw_head = True
        lemma = ru.fetch_noun_lemma(t, expand_text)
        lemmas = re.split(",", lemma)
        lemmas = [split_ru_tr(lemma) for lemma in lemmas]
        # Group lemmas by Russian, to group multiple translits
        lemmas = ru.group_translits(lemmas, pagemsg, expand_text)
        for val, tr in lemmas:
          add(val, tr)
      if saw_head:
        # Also pick up secondary heads head2=..head9= (with tr2=..tr9=)
        # from the current template.
        for i in xrange(2, 10):
          headn = getparam(t, "head" + str(i))
          if headn:
            add(headn, getparam(t, "tr" + str(i)))
    if not global_disable_cache:
      accented_cache[pagename] = (heads, saw_head)

  # We have the heads
  cached_msg = " (cached)" if cached else ""
  if len(heads) == 0:
    if not saw_head:
      # `page` is only bound on the non-cached path; the `not cached` guard
      # short-circuits before page.text is touched.
      if cached_redirect:
        pagemsg("Redirect without heads (cached)")
      elif not cached and re.match("#redirect", page.text, re.I):
        if not global_disable_cache:
          accented_cache[pagename] = "redirect"
        pagemsg("Redirect without heads")
      else:
        pagemsg("WARNING: Can't find any heads: %s%s" % (pagename, cached_msg))
    return term, termtr
  if len(heads) > 1:
    # Ambiguous: multiple distinct heads; don't guess between them.
    pagemsg("WARNING: Found multiple heads for %s%s: %s" % (pagename, cached_msg, ",".join("%s%s" % (ru, "//%s" % tr if tr else "") for ru, tr in heads)))
    return term, termtr
  newterm, newtr = list(heads)[0]
  if semi_verbose:
    pagemsg("find_accented: Found head %s%s%s" % (newterm, "//%s" % newtr if newtr else "", cached_msg))
  # Drop trailing !/? from the head when the original term lacks it and the
  # two otherwise match apart from accents.
  if re.search("[!?]$", newterm) and not re.search("[!?]$", term):
    newterm_wo_punc = re.sub("[!?]$", "", newterm)
    if ru.remove_accents(newterm_wo_punc) == ru.remove_accents(term):
      pagemsg("Removing punctuation from %s when matching against %s" % (
        newterm, term))
      newterm = newterm_wo_punc
  # Sanity check: the accented head should differ from the input term only
  # in accent marks; warn (but still return it) otherwise.
  if ru.remove_accents(newterm) != ru.remove_accents(term):
    pagemsg("WARNING: Accented term %s differs from %s in more than just accents%s" % (
      newterm, term, cached_msg))
  return newterm, newtr
Example #5
0
def process_page(page, index, parsed):
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    override_pos = pages_pos.get(pagetitle, None)
    if override_pos:
        del pages_pos[pagetitle]

    if ":" in pagetitle:
        pagemsg("WARNING: Colon in page title, skipping page")
        return

    titlewords = split_words(pagetitle, True)
    saw_e = False
    for word in titlewords:
        if word.endswith(u"е") and not rulib.is_monosyllabic(word):
            saw_e = True
            break
    if not saw_e:
        pagemsg(u"No possible final unstressed -е in page title, skipping")
        return

    #if (" " in pagetitle or "-" in pagetitle) and not override_pos:
    #  pagemsg(u"WARNING: Space or hyphen in page title and probable final unstressed -е, not sure how to handle yet")
    #  return

    text = unicode(page.text)
    notes = []

    foundrussian = False
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

    for j in xrange(2, len(sections), 2):
        if sections[j - 1] == "==Russian==\n":
            if foundrussian:
                pagemsg(
                    "WARNING: Found multiple Russian sections, skipping page")
                return
            foundrussian = True

            subsections = re.split(
                "(^===(?:Etymology|Pronunciation) [0-9]+===\n)", sections[j],
                0, re.M)
            # If no separate etymology sections, add extra stuff at the beginning
            # to fit the pattern
            if len(subsections) == 1:
                subsections = ["", ""] + subsections

            subsections_with_ru_ipa_to_fix = set()
            subsections_with_ru_ipa = set()
            for k in xrange(0, len(subsections), 2):
                for t in blib.parse_text(subsections[k]).filter_templates():
                    if unicode(t.name) == "ru-IPA":
                        subsections_with_ru_ipa.add(k)
                        if getparam(t, "pos"):
                            pagemsg(
                                "Already has pos=, skipping template in section %s: %s"
                                % (k // 2, unicode(t)))
                        else:
                            phon = (getparam(t, "phon") or getparam(t, "1")
                                    or pagetitle).lower()
                            phonwords = split_words(phon, True)
                            if len(phonwords) != len(titlewords):
                                pagemsg(
                                    "WARNING: #Words (%s) in phon=%s not same as #words (%s) in title"
                                    % ((len(phonwords) + 1) // 2, phon,
                                       (len(titlewords) + 1) // 2))
                                for i in xrange(0, len(phonwords), 2):
                                    phonword = phonwords[i]
                                    wordno = i // 2 + 1
                                    if rulib.is_monosyllabic(phonword):
                                        pagemsg(
                                            "Skipping monosyllabic pronun %s (#%s) in section %s: %s"
                                            % (phonword, wordno, k // 2,
                                               unicode(t)))
                                    elif not phonword.endswith(u"е"):
                                        pagemsg(
                                            u"Skipping pronun word %s (#%s) in section %s because doesn't end in -е"
                                            % (phonword, wordno, k // 2))
                                    else:
                                        pagemsg(
                                            "Found template that will be modified due to phonword %s (#%s) in section %s: %s"
                                            % (phonword, wordno, k // 2,
                                               unicode(t)))
                                        subsections_with_ru_ipa_to_fix.add(k)
                            else:
                                for i in xrange(0, len(phonwords), 2):
                                    titleword = titlewords[i]
                                    phonword = phonwords[i]
                                    wordno = i // 2 + 1
                                    if rulib.is_monosyllabic(phonword):
                                        pagemsg(
                                            "Skipping monosyllabic pronun %s (#%s) in section %s: %s"
                                            % (phonword, wordno, k // 2,
                                               unicode(t)))
                                    elif not titleword.endswith(u"е"):
                                        pagemsg(
                                            u"Skipping title word %s (#%s) in section %s because doesn't end in -е"
                                            % (titleword, wordno, k // 2))
                                    elif re.search(
                                            u"([еия]|цы|е̂|[кгхцшжщч]а)" +
                                            rulib.DOTABOVE + "?$", phonword):
                                        pagemsg(
                                            "Found template that will be modified due to phonword %s, titleword %s (#%s) in section %s: %s"
                                            % (phonword, titleword, wordno,
                                               k // 2, unicode(t)))
                                        subsections_with_ru_ipa_to_fix.add(k)
                                    elif not re.search(
                                            u"[еэѐ][" + rulib.AC + rulib.GR +
                                            rulib.CFLEX + rulib.DUBGR + "]?$",
                                            phonword):
                                        pagemsg(
                                            u"WARNING: ru-IPA pronunciation word %s (#%s) doesn't end in [еэия] or е̂ or hard sibilant + [ыа] when corresponding titleword %s ends in -е, something wrong in section %s: %s"
                                            % (phonword, wordno, titleword,
                                               k // 2, unicode(t)))
                                    else:
                                        pagemsg(
                                            u"Pronun word %s (#%s) with final -э or stressed vowel, ignoring in section %s: %s"
                                            % (phonword, wordno, k // 2,
                                               unicode(t)))

            if not subsections_with_ru_ipa:
                pagemsg("No ru-IPA on page, skipping page")
                return
            if not subsections_with_ru_ipa_to_fix:
                pagemsg("No fixable ru-IPA on page, skipping page")
                return

            # If saw ru-IPA covering multiple etym sections, make sure we don't
            # also have pronuns inside the etym sections, and then treat as one
            # single section for the purposes of finding POS's
            if 0 in subsections_with_ru_ipa:
                if len(subsections_with_ru_ipa) > 1:
                    pagemsg(
                        "WARNING: Saw ru-IPA in section 0 (covering multiple etym or pronun sections) and also inside etym/pronun section(s) %s; skipping page"
                        %
                        (",".join(k // 2
                                  for k in subsections_with_ru_ipa if k > 0)))
                    return
                subsections = ["", "", "".join(subsections)]
                subsections_with_ru_ipa_to_fix = {2}

            for k in subsections_with_ru_ipa_to_fix:
                pagemsg("Fixing section %s" % (k // 2))
                parsed = blib.parse_text(subsections[k])

                if override_pos:
                    pos = override_pos
                else:
                    pos = set()
                    is_lemma = set()
                    lemma = set()
                    saw_acc = False
                    saw_noun_form = False
                    for t in parsed.filter_templates():

                        def getp(param):
                            return getparam(t, param)

                        tname = unicode(t.name)
                        if tname in ["ru-noun", "ru-proper noun"]:
                            if getparam(t, "2") == "-":
                                pagemsg("Found invariable noun: %s" %
                                        unicode(t))
                                pos.add("inv")
                            else:
                                pagemsg("Found declined noun: %s" % unicode(t))
                                pos.add("n")
                            is_lemma.add(True)
                        elif tname in ["ru-noun+", "ru-proper noun+"]:
                            for param in t.params:
                                if re.search("^[0-9]+$", unicode(
                                        param.name)) and "+" in unicode(
                                            param.value):
                                    pagemsg(
                                        "Found declined adjectival noun, treating as adjective: %s"
                                        % unicode(t))
                                    pos.add("a")
                                    break
                            else:
                                pagemsg("Found declined noun: %s" % unicode(t))
                                pos.add("n")
                            is_lemma.add(True)
                        elif tname == "comparative of" and getp(
                                "lang") == "ru":
                            pagemsg("Found comparative: %s" % unicode(t))
                            pos.add("com")
                            is_lemma.add(False)
                        elif tname == "ru-adv":
                            pagemsg("Found adverb: %s" % unicode(t))
                            pos.add("adv")
                            is_lemma.add(True)
                        elif tname == "ru-adj":
                            pagemsg("Found adjective: %s" % unicode(t))
                            pos.add("a")
                            is_lemma.add(True)
                        elif tname == "ru-noun form":
                            pagemsg("Found noun form: %s" % unicode(t))
                            saw_noun_form = True
                            is_lemma.add(False)
                        elif tname == "head" and getp("1") == "ru":
                            if getp("2") == "verb form":
                                pagemsg("Found verb form: %s" % unicode(t))
                                pos.add("v")
                                is_lemma.add(False)
                            elif getp("2") in [
                                    "adjective form", "participle form"
                            ]:
                                pagemsg("Found adjective form: %s" %
                                        unicode(t))
                                pos.add("a")
                                is_lemma.add(False)
                            elif getp("2") == "noun form":
                                pagemsg("Found noun form: %s" % unicode(t))
                                saw_noun_form = True
                                is_lemma.add(False)
                            elif getp("2") == "pronoun form":
                                pagemsg("Found pronoun form: %s" % unicode(t))
                                pos.add("pro")
                                is_lemma.add(False)
                            elif getp("2") == "preposition":
                                pagemsg("Found preposition: %s" % unicode(t))
                                pos.add("p")
                                is_lemma.add(True)
                            elif getp("2") == "numeral":
                                pagemsg("Found numeral: %s" % unicode(t))
                                pos.add("num")
                                is_lemma.add(True)
                            elif getp("2") == "pronoun":
                                pagemsg("Found pronoun: %s" % unicode(t))
                                pos.add("pro")
                                is_lemma.add(True)
                        elif tname == "inflection of" and getp("lang") == "ru":
                            is_lemma.add(False)
                            lemma.add(rulib.remove_accents(getp("1")))
                            if saw_noun_form:
                                inflection_groups = []
                                inflection_group = []
                                for param in t.params:
                                    if param.name in ["1", "2"]:
                                        continue
                                    val = unicode(param.value)
                                    if val == ";":
                                        if inflection_group:
                                            inflection_groups.append(
                                                inflection_group)
                                            inflection_group = []
                                    else:
                                        inflection_group.append(val)
                                if inflection_group:
                                    inflection_groups.append(inflection_group)
                                for igroup in inflection_groups:
                                    igroup = set(igroup)
                                    is_plural = not not ({"p", "plural"}
                                                         & igroup)
                                    if is_plural and ({"nom", "nominative"}
                                                      & igroup):
                                        pagemsg(
                                            "Found nominative plural case inflection: %s"
                                            % unicode(t))
                                        pos.add("nnp")
                                    elif {"acc", "accusative"} & igroup:
                                        # We use "n" for misc cases, but skip accusative for now,
                                        # adding "n" later if we haven't seen nnp to avoid problems
                                        # below with the check for multiple pos's (nom pl and acc pl
                                        # are frequently the same)
                                        saw_acc = True
                                    elif not is_plural and (
                                        {"pre", "prep", "prepositional"}
                                            & igroup):
                                        pagemsg(
                                            "Found prepositional singular case inflection: %s"
                                            % unicode(t))
                                        pos.add("pre")
                                    elif not is_plural and ({"dat", "dative"}
                                                            & igroup):
                                        pagemsg(
                                            "Found dative singular case inflection: %s"
                                            % unicode(t))
                                        pos.add("dat")
                                    elif not is_plural and (
                                        {"loc", "locative"} & igroup):
                                        pagemsg(
                                            "Found locative singular case inflection: %s"
                                            % unicode(t))
                                        pos.add("dat")
                                    elif not is_plural and (
                                        {"voc", "vocative"} & igroup):
                                        pagemsg(
                                            "Found vocative case inflection: %s"
                                            % unicode(t))
                                        pos.add("voc")
                                    else:
                                        pos.add("n")
                        elif tname == "prepositional singular of" and getp(
                                "lang") == "ru":
                            pagemsg(
                                "Found prepositional singular case inflection: %s"
                                % unicode(t))
                            pos.add("pre")
                            is_lemma.add(False)
                            lemma.add(getp("1"))
                        elif tname == "dative singular of" and getp(
                                "lang") == "ru":
                            pagemsg(
                                "Found dative singular case inflection: %s" %
                                unicode(t))
                            pos.add("dat")
                            is_lemma.add(False)
                            lemma.add(getp("1"))
                        elif tname == "vocative singular of" and getp(
                                "lang") == "ru":
                            pagemsg("Found vocative case inflection: %s" %
                                    unicode(t))
                            pos.add("voc")
                            is_lemma.add(False)
                            lemma.add(getp("1"))

                    if saw_acc and "nnp" not in pos:
                        pos.add("n")
                    if "dat" in pos and "pre" in pos:
                        pagemsg("Removing pos=dat because pos=pre is found")
                        pos.remove("dat")
                    if "com" in pos:
                        if "a" in pos:
                            pagemsg("Removing pos=a because pos=com is found")
                            pos.remove("a")
                        if "adv" in pos:
                            pagemsg(
                                "Removing pos=adv because pos=com is found")
                            pos.remove("adv")
                    if "a" in pos and "nnp" in pos:
                        pagemsg("Removing pos=nnp because pos=a is found")
                        pos.remove("nnp")
                    if not pos:
                        pagemsg(
                            "WARNING: Can't locate any parts of speech, skipping section"
                        )
                        continue
                    if len(pos) > 1:
                        pagemsg(
                            "WARNING: Found multiple parts of speech, skipping section: %s"
                            % ",".join(pos))
                        continue
                    pos = list(pos)[0]

                    # If multiword term or potential adjectival term, can't trust
                    # the part of speech coming from the above process
                    if (" " in pagetitle or "-" in pagetitle
                            or re.search(u"[ыиео]́?е$", pagetitle)):
                        if not is_lemma:
                            pagemsg(
                                "WARNING: Can't determine whether lemma or not, skipping section"
                            )
                            continue
                        if len(is_lemma) > 1:
                            pagemsg(
                                "WARNING: Found both lemma and non-lemma parts of speech, skipping section"
                            )
                            continue
                        is_lemma = list(is_lemma)[0]
                        if (" " in pagetitle or "-" in pagetitle) and is_lemma:
                            pagemsg(
                                u"WARNING: Space or hyphen in lemma page title and probable final unstressed -e, not sure how to handle yet, skipping section"
                            )
                            continue
                        # If is_lemma, we are a single-word adjective and will be handled
                        # correctly by the above code
                        if not is_lemma:
                            if not lemma:
                                pagemsg(
                                    "WARNING: Non-lemma form and can't determine lemma, skipping section"
                                )
                                continue
                            if len(lemma) > 1:
                                pagemsg(
                                    "WARNING: Found inflections of multiple lemmas, skipping section: %s"
                                    % ",".join(lemma))
                                continue
                            lemma = list(lemma)[0]
                            retval = find_noun_word_types(lemma, pagemsg)
                            if not retval:
                                continue
                            word_types, seen_pos_specs = retval
                            words = split_words(pagetitle, False)
                            assert len(words) == len(word_types)
                            modified_word_types = []
                            need_to_continue = False
                            # FIXME: Should we be using phonetic version of lemma?
                            for wordno, (word, ty) in enumerate(
                                    zip(words, word_types)):
                                if word.endswith(
                                        u"е"
                                ) and not rulib.is_monosyllabic(word):
                                    if ty == "inv":
                                        if len(seen_pos_specs) > 1:
                                            pagemsg(
                                                u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma has ambiguous pos= params (%s), not sure what to do, skipping section"
                                                % (pagetitle, word,
                                                   ",".join(seen_pos_specs)))
                                            need_to_continue = True
                                            break
                                        elif not seen_pos_specs:
                                            pagemsg(
                                                u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma has no pos= params, not sure what to do, skipping section"
                                                % (pagetitle, word))
                                            need_to_continue = True
                                            break
                                        else:
                                            seen_pos_spec = list(
                                                seen_pos_specs)[0]
                                            seen_poses = re.split(
                                                "/", seen_pos_spec)
                                            if len(seen_poses) == 1:
                                                ty = seen_poses[0]
                                            elif len(words) != len(seen_poses):
                                                pagemsg(
                                                    u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma param pos=%s has wrong number of parts of speech, not sure what to do, skipping section"
                                                    % (pagetitle, word,
                                                       seen_pos_spec))
                                                need_to_continue = True
                                                break
                                            else:
                                                ty = seen_poses[wordno]
                                                if not ty:
                                                    pagemsg(
                                                        "WARNING: Something wrong with retrieved pos= value from lemma, has blank value"
                                                    )
                                                    need_to_continue = True
                                                    break
                                    if ty == "decln":
                                        modified_word_types.append(pos)
                                    else:
                                        modified_word_types.append(ty)
                                else:
                                    modified_word_types.append("")
                            if need_to_continue:
                                continue
                            non_blank_distinct_mwt = set(
                                x for x in modified_word_types if x)
                            if len(non_blank_distinct_mwt) == 0:
                                pagemsg(
                                    "WARNING: Something wrong, pos= would end up blank"
                                )
                            elif len(non_blank_distinct_mwt) == 1:
                                pos = list(non_blank_distinct_mwt)[0]
                            else:
                                pos = "/".join(modified_word_types)

                # Check whether there's a pronunciation with final -е for a given
                # word. There are some entries that have multiple pronunciations,
                # one with final -е and one with something else, e.g. final -и,
                # and we want to leave those alone with a warning.
                # saw_final_e maps word index (even indices of the split word
                # list) -> True when any ru-IPA template in this section has
                # that word already ending in -е.
                saw_final_e = {}
                for t in parsed.filter_templates():
                    if unicode(t.name) == "ru-IPA":
                        # Pronunciation lookup order: phon= param, then 1=,
                        # then fall back to the page title itself.
                        param = "phon"
                        phon = getparam(t, param)
                        if not phon:
                            param = "1"
                            phon = getparam(t, "1")
                            if not phon:
                                param = "pagetitle"
                                phon = pagetitle
                        if getparam(t, "pos"):
                            pass  # Already output msg
                        else:
                            phonwords = split_words(phon, True)
                            # Even indices hold the words themselves; odd
                            # indices presumably hold the separators between
                            # them (TODO confirm against split_words).
                            for i in xrange(0, len(phonwords), 2):
                                if re.search(u"е$", phonwords[i]):
                                    saw_final_e[i] = True

                # Now modify the templates.
                for t in parsed.filter_templates():
                    if unicode(t.name) == "ru-IPA":
                        # Same phon=/1=/pagetitle fallback as the detection
                        # pass above; remember which param supplied the value
                        # so the modified pronunciation is written back to it.
                        param = "phon"
                        phon = getparam(t, param)
                        if not phon:
                            param = "1"
                            phon = getparam(t, "1")
                            if not phon:
                                param = "pagetitle"
                                phon = pagetitle
                        # Snapshot the template text for the change log.
                        origt = unicode(t)
                        if getparam(t, "pos"):
                            pass  # Already output msg
                        else:
                            phonwords = split_words(phon, True)
                            # If the pronunciation and the page title split
                            # into different word counts, we can't pair them
                            # up and must skip canonicalization below.
                            mismatched_phon_title = len(phonwords) != len(
                                titlewords)
                            for i in xrange(0, len(phonwords), 2):
                                # titleword is False when counts mismatch;
                                # guarded before use by the elif chain below.
                                titleword = not mismatched_phon_title and titlewords[
                                    i]
                                phonword = phonwords[i]
                                lphonword = phonword.lower()
                                # 1-based word number, for log messages only.
                                wordno = i // 2 + 1

                                # Canonicalize a word whose title spelling ends
                                # in -е but whose phonetic respelling ends in
                                # -и/-я/-ы/-а/-е̂: rewrite that final vowel to
                                # -е (pos= now conveys the pronunciation).
                                if rulib.is_monosyllabic(phonword):
                                    pass  # Already output msg
                                elif mismatched_phon_title:
                                    pass  # Can't canonicalize template
                                elif not titleword.endswith(u"е"):
                                    pass  # Already output msg
                                elif re.search(
                                        u"([еия]|цы|е̂|[кгхцшжщч]а)" +
                                        rulib.DOTABOVE + "?$", lphonword):
                                    # Found a template to modify
                                    if re.search(u"е" + rulib.DOTABOVE + "?$",
                                                 lphonword):
                                        pass  # No need to canonicalize
                                    else:
                                        # If another pronunciation of this same
                                        # word already ends in -е, leave this
                                        # one alone (see saw_final_e above).
                                        if saw_final_e.get(i, False):
                                            pagemsg(
                                                u"WARNING: Found another pronunciation with final -е, skipping: phon=%s (word #%s)"
                                                % (phonword, wordno))
                                            continue
                                        # The branches below only log and
                                        # record a changelog note; the actual
                                        # substitution happens once, after the
                                        # chain.
                                        if re.search(
                                                u"и" + rulib.DOTABOVE + "?$",
                                                lphonword):
                                            pagemsg(
                                                u"phon=%s (word #%s) ends in -и, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(
                                                u"unstressed -и -> -е")
                                        elif re.search(u"е̂$", lphonword):
                                            # Make this a warning because we're not sure this is correct
                                            pagemsg(
                                                u"WARNING: phon=%s (word #%s) ends in -е̂, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(u"-е̂ -> -е")
                                        elif re.search(
                                                u"я" + rulib.DOTABOVE + "?$",
                                                lphonword):
                                            pagemsg(
                                                u"phon=%s (word #%s) ends in -я, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(
                                                u"unstressed -я -> -е")
                                        elif re.search(
                                                u"цы" + rulib.DOTABOVE + "?$",
                                                lphonword):
                                            pagemsg(
                                                u"phon=%s (word #%s) ends in ц + -ы, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(
                                                u"unstressed -ы after ц -> -е")
                                        elif re.search(
                                                u"[кгхцшжщч]а" +
                                                rulib.DOTABOVE + "?$",
                                                lphonword):
                                            pagemsg(
                                                u"phon=%s (word #%s) ends in unpaired cons + -а, will modify to -е in section %s: %s"
                                                % (phonword, wordno, k // 2,
                                                   unicode(t)))
                                            notes.append(
                                                u"unstressed -а after unpaired cons -> -е"
                                            )
                                        else:
                                            assert False, "Something wrong, strange ending, logic not correct: section %s, phon=%s (word #%s)" % (
                                                k // 2, phonword, wordno)
                                        # Replace the final vowel with -е (or
                                        # -Е if capitalized), preserving any
                                        # trailing dot-above diacritic.
                                        newphonword = re.sub(
                                            u"(?:[ияыа]|е̂)(" +
                                            rulib.DOTABOVE + "?)$", ur"е\1",
                                            phonword)
                                        newphonword = re.sub(
                                            u"(?:[ИЯЫА]|Е̂)(" +
                                            rulib.DOTABOVE + "?)$", ur"Е\1",
                                            newphonword)
                                        pagemsg(
                                            "Modified phon=%s (word #%s) to %s in section %s: %s"
                                            % (phonword, wordno, newphonword,
                                               k // 2, unicode(t)))
                                        phonwords[i] = newphonword
                            # Reassemble the pronunciation from the (possibly
                            # modified) word list and write it back.
                            newphon = "".join(phonwords)
                            if newphon != phon:
                                # The pagetitle fallback should never need
                                # canonicalizing; if it does, something is
                                # badly wrong upstream, so fail loudly.
                                assert param != "pagetitle", u"Something wrong, page title should not have -и or similar that needs modification: section %s, phon=%s, newphon=%s" % (
                                    k // 2, phon, newphon)
                                if pos in ["voc", "inv", "pro"]:
                                    pagemsg(
                                        u"WARNING: pos=%s may be unstable or inconsistent in handling final -е, please check change of phon=%s to %s in section %s: %s"
                                        % (pos, phon, newphon, k // 2,
                                           unicode(t)))
                                pagemsg(
                                    "Modified phon=%s to %s in section %s: %s"
                                    % (phon, newphon, k // 2, unicode(t)))
                                if pos == "none":
                                    pagemsg(
                                        "WARNING: pos=none, should not occur, not modifying phon=%s to %s in section %s: %s"
                                        % (phon, newphon, k // 2, unicode(t)))
                                else:
                                    t.add(param, newphon)

                            # Set pos= on the template even when the
                            # pronunciation itself did not change; "none"
                            # signals a problem and is skipped with a warning.
                            if pos == "none":
                                pagemsg(
                                    "WARNING: pos=none, should not occur, not setting pos= in section %s: %s"
                                    % (k // 2, unicode(t)))
                            else:
                                t.add("pos", pos)
                                notes.append(
                                    "added pos=%s%s" %
                                    (pos, override_pos and " (override)"
                                     or ""))
                                pagemsg(
                                    "Replaced %s with %s in section %s%s" %
                                    (origt, unicode(t), k // 2, override_pos
                                     and " (using override)" or ""))
                # Serialize the modified parse tree back into this subsection.
                subsections[k] = unicode(parsed)
            sections[j] = "".join(subsections)

    # Reassemble the full page text from the (possibly modified) sections.
    new_text = "".join(sections)

    def fmt_key_val(key, val):
        if val == 1:
            return "%s" % key
        else:
            return "%s (%s)" % (key, val)

    if new_text != text:
        # Every textual change above must have recorded at least one note.
        assert notes
        # Group identical notes together and append the number of such identical
        # notes if > 1, putting 'added pos=X' notes before others, so we get e.g.
        # "added pos=n (2); added pos=a; unstressed -и -> -е (2)" from five
        # original notes.
        # 1. Count items in notes[] and return a key-value list in descending order
        notescount = Counter(notes).most_common()
        # 2. Extract 'added pos=X' items; we put them first; note, descending order
        #    of # of times each note has been seen is maintained
        added_pos = [(x, y) for x, y in notescount
                     if x.startswith("added pos=")]
        # 3. Extract other items
        not_added_pos = [(x, y) for x, y in notescount
                         if not x.startswith("added pos=")]
        # 4. Recreate notes for 'added pos=X', then others
        notes = [fmt_key_val(x, y) for x, y in added_pos]
        notes.extend([fmt_key_val(x, y) for x, y in not_added_pos])

        # NOTE(review): falls through (implicitly returning None) when nothing
        # changed; callers presumably treat None as "no edit" — confirm.
        return new_text, notes