def process_page(index, page, save, verbose):
  """Remove redundant informal comparatives from {{ru-adj}} on one page.

  For every {{ru-adj}} template, the 2=/comp= parameter chain is scanned in
  order. A value matching the regex "е́?й$" (informal comparative) is dropped
  when its regular counterpart (final "й" replaced by "е") has already been
  seen earlier in the same chain; otherwise it is kept and a warning is
  logged.

  index: page index, used only in log messages.
  page: page object; title() and .text are read, and .text/.save() are used
    when saving.
  save: if true, write the modified text back; otherwise only log what would
    be saved.
  verbose: if true, log the full old and new page text when they differ.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "ru-adj":
      comps = blib.fetch_param_chain(t, "2", "comp")
      newcomps = []
      for comp in comps:
        if re.search(u"е́?й$", comp):
          # u"\\1е" is the same replacement string as the former ur"\1е".
          regcomp = re.sub(u"(е́?)й$", u"\\1е", comp)
          if regcomp in newcomps:
            pagemsg("Skipping informal form %s" % comp)
            notes.append("remove informal comparative %s" % comp)
          else:
            # The form is interpolated here; previously the literal "%s"
            # was logged because the format argument was missing.
            pagemsg("WARNING: Found informal form %s without corresponding regular form" % comp)
            newcomps.append(comp)
        else:
          newcomps.append(comp)
      if comps != newcomps:
        blib.set_param_chain(t, newcomps, "2", "comp")
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Internal invariant: the text only changes when a comparative was
    # removed, and every removal records a note.
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
Beispiel #2
0
def process_page(page, index, parsed):
  """Migrate {{#invoke:form of}} calls to {{#invoke:form of/templates}}.

  Rewrites applied to each template in `parsed`:
    * `#invoke:form of/templates` with 1=template_tags gets
      1=tagged_form_of_t;
    * `#invoke:form of` with 1=form_of_t or 1=alt_form_of_t is renamed to
      `#invoke:form of/templates`, and any ignorelist= values are folded
      into the ignore= chain with a ":list" suffix;
    * for 1=alt_form_of_t additionally: text= is moved to 2=, nocap= and
      nodot= are removed (withcap=1 / withdot=1 is added when the respective
      one is absent), and 1= becomes form_of_t.

  Returns a pair (page text after the rewrites, list of change notes).
  """
  title = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, title, txt))

  pagemsg("Processing")
  notes = []

  for t in parsed.filter_templates():
    before = unicode(t)
    tn = tname(t)
    if tn == "#invoke:form of/templates":
      if getparam(t, "1") == "template_tags":
        t.add("1", "tagged_form_of_t")
        notes.append("Rewrite {{#invoke:form of/templates|template_tags}} with {{#invoke:form of/templates|tagged_form_of_t}}")
    elif tn == "#invoke:form of":
      # 1= is not modified until the very end of this branch, so it can be
      # read once up front.
      func = getparam(t, "1")
      if func in ("form_of_t", "alt_form_of_t"):
        listed = blib.fetch_param_chain(t, "ignorelist", "ignorelist")
        if listed:
          ignore = blib.fetch_param_chain(t, "ignore", "ignore")
          ignore.extend(il + ":list" for il in listed)
          blib.set_param_chain(t, ignore, "ignore", "ignore", before="ignorelist")
          blib.remove_param_chain(t, "ignorelist", "ignorelist")
        blib.set_template_name(t, "#invoke:form of/templates")
        notes.append("Rewrite {{#invoke:form of|%s}} with {{#invoke:form of/templates|form_of_t}}"  % func)
      if func == "alt_form_of_t":
        t.add("2", getparam(t, "text"), before="text")
        rmparam(t, "text")
        # A negative flag is dropped if present; otherwise the positive
        # counterpart is switched on.
        for negated, positive in (("nocap", "withcap"), ("nodot", "withdot")):
          if t.has(negated):
            rmparam(t, negated)
          else:
            t.add(positive, "1")
        t.add("1", "form_of_t")

    after = unicode(t)
    if after != before:
      pagemsg("Replaced <%s> with <%s>" % (before, after))

  return unicode(parsed), notes
 def combine_verbs(m):
   """Regex-substitution callback merging two adjacent verb conjugations.

   `m` is a match whose group 1 and group 3 hold the text of the two
   conjugation templates and whose group 2 holds any text between them.
   Returns the text of the merged first template (its numbered params,
   then "or", then the second template's numbered params minus its first
   one). Returns the matched text unchanged when merging is refused:
   intervening text, non-numeric params, a missing verb type, or differing
   verb types. Uses `pagemsg`, `notes` and `fetch_numbered_params` from
   the enclosing scope.
   """
   matched = m.group(0)
   verb1, between, verb2 = m.group(1), m.group(2), m.group(3)
   if between:
     pagemsg("WARNING: Would combine verbs but found text '%s' needing to go into a note, skipping: %s and %s" %
         (between, verb1, verb2))
     return matched
   t1 = blib.parse_text(verb1).filter_templates()[0]
   t2 = blib.parse_text(verb2).filter_templates()[0]
   for tmpl in (t1, t2):
     # Only purely positional templates can be concatenated.
     if any(not re.search("^[0-9]+$", unicode(p.name)) for p in tmpl.params):
       pagemsg("Verb conjugation has non-numeric args, skipping: %s" %
           unicode(tmpl))
       return matched
   params = fetch_numbered_params(t1)
   params.append("or")
   second_params = fetch_numbered_params(t2)
   if len(second_params) < 2:
     pagemsg("WARNING: Something wrong, no verb type in ru-conj: %s" %
         unicode(t2))
     return matched
   if getparam(t1, "1") != getparam(t2, "1"):
     pagemsg("WARNING: Can't combine verbs of different verb types: %s and %s" %
         (verb1, verb2))
     return matched
   # The second template's first param is dropped: it equals the first
   # template's, as checked just above.
   params.extend(second_params[1:])
   blib.set_param_chain(t1, params, "1", "")
   first_params = (getparam(t1, "1"), getparam(t2, "1"))
   combined = unicode(t1)
   pagemsg("Combining verb conjugations %s and %s" % first_params)
   pagemsg("Replaced %s with %s" % (matched.replace("\n", r"\n"), combined))
   notes.append("combined verb conjugations %s and %s" % first_params)
   return combined
Beispiel #4
0
def process_page(page, index, parsed):
    """Remove redundant informal comparatives from {{ru-adj}} on one page.

    For every {{ru-adj}} template, the 2=/comp= parameter chain is scanned in
    order. A value matching the regex "е́?й$" (informal comparative) is
    dropped when its regular counterpart (final "й" replaced by "е") has
    already been seen earlier in the same chain; otherwise it is kept and a
    warning is logged.

    page: page object; title() is read and the page is re-parsed with
        blib.parse().
    index: page index, used only in log messages.
    parsed: ignored; it is immediately replaced by blib.parse(page).

    Returns a pair (page text after the edits, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    # NOTE(review): this shadows the `parsed` argument supplied by the
    # caller; kept as in the original.
    parsed = blib.parse(page)
    notes = []
    for t in parsed.filter_templates():
        origt = unicode(t)
        if unicode(t.name) == "ru-adj":
            comps = blib.fetch_param_chain(t, "2", "comp")
            newcomps = []
            for comp in comps:
                if re.search(u"е́?й$", comp):
                    # u"\\1е" is the same replacement string as the former
                    # ur"\1е".
                    regcomp = re.sub(u"(е́?)й$", u"\\1е", comp)
                    if regcomp in newcomps:
                        pagemsg("Skipping informal form %s" % comp)
                        notes.append("remove informal comparative %s" % comp)
                    else:
                        # The form is interpolated here; previously the
                        # literal "%s" was logged because the format
                        # argument was missing.
                        pagemsg(
                            "WARNING: Found informal form %s without corresponding regular form"
                            % comp)
                        newcomps.append(comp)
                else:
                    newcomps.append(comp)
            if comps != newcomps:
                blib.set_param_chain(t, newcomps, "2", "comp")
        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes
 def handle_multiform(firstparam, restparam, form, formtr, declparam=None):
   """Store a multi-valued form (and its transliteration) on template `t`.

   Closure over `t`, `declt`, `origt`, `origdeclt`, `split_form` and
   `pagemsg` from the enclosing scope.

   firstparam, restparam: names passed to blib.set_param_chain() for the
     first and the subsequent values of the chain.
   form: raw headword value, split with split_form(); may be empty.
   formtr: transliteration to store in "<restparam>tr" ("tr" when
     restparam is "head"); may be empty.
   declparam: if given, the declension-template param to cross-check
     against; "-" stands for the single form "-". When `form` is empty the
     declension forms are used instead; when both exist and differ as sets,
     a warning is logged and the headword forms win.
   """
   if form:
     form = split_form(form)
   if declparam:
     if declparam == "-":
       declforms = ["-"]
     else:
       declforms = split_form(getparam(declt, declparam))
     if not form:
       form = declforms
     elif set(form) != set(declforms):
       pagemsg("WARNING: For %s=, headword form(s) %s disagree with decl form(s) %s: headword=%s, decl=%s" %
           (restparam, ",".join(form), ",".join(declforms), origt, origdeclt))
   if form:
     blib.set_param_chain(t, form, firstparam, restparam)
   if formtr:
     trparam = ("" if restparam == "head" else restparam) + "tr"
     if not form:
       # Report the actual transliteration param name; the original passed
       # the string literal "trparam" here.
       pagemsg("WARNING: Saw %s=%s but no %s=: %s" %
           (trparam, formtr, restparam, origt))
     elif len(form) > 1:
       pagemsg("WARNING: Saw %s=%s and multiple %ss %s: %s" %
           (trparam, formtr, restparam, ",".join(form), origt))
     # The transliteration is stored even when a warning was issued above.
     t.add(trparam, formtr)
 def handle_mf(mf, mf_full, make_mf):
     """Replace explicit `mf`= / `mf`pl= values on `t` with '+' indicators.

     Closure over `t`, `lemma`, `notes`, `pagemsg`, `all_specials` and
     `make_plural` from the enclosing scope; every rewrite appends a note to
     `notes`. Nothing is done when the `mf`= chain is empty or already holds
     a value starting with "+".

     mf: parameter name prefix; the chains `mf` and `mf` + "pl" are read.
     mf_full: human-readable name of the form, used in notes and messages.
     make_mf: callable, invoked as make_mf(lemma) for the default form and
       make_mf(lemma, special) for a special form (None means no form).

     Steps, in order:
       1. If the only value equals the default form and the explicit plurals
          are absent, equal to, or a proper subset of the default plurals,
          set `mf`=+ and remove the plural chain.
       2. Otherwise, if the value list equals [make_mf(lemma, special)] for
          some special, set `mf`=+<special> and remove the plural chain,
          unless explicit plurals are given that disagree with
          make_plural(special_mf, special).
       3. Otherwise, replace individual default values with "+", and remove
          or replace the explicit plural chain when it matches the default
          or a special plural of the given forms.
     """
     mfs = blib.fetch_param_chain(t, mf, mf)
     mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl")
     if mfs and not any(x.startswith("+") for x in mfs):
         defmf = make_mf(lemma)
         if set(mfs) == {defmf}:
             # Guard against a missing plural, as the other make_plural()
             # calls in this function already do.
             defpls = make_plural(defmf) or []
             ok = False
             if not mfpls or set(mfpls) == set(defpls):
                 ok = True
             elif set(mfpls) < set(defpls):
                 # "%spl" (was "%pl", an invalid conversion that raised
                 # ValueError whenever this branch ran).
                 pagemsg(
                     "WARNING: %spl=%s subset of default=%s, allowing"
                     % (mf, ",".join(mfpls), ",".join(defpls)))
                 ok = True
             if ok:
                 notes.append(
                     "replace %s=%s with '+' in {{es-noun}}" %
                     (mf, ",".join(mfs)))
                 blib.set_param_chain(t, ["+"], mf, mf)
                 blib.remove_param_chain(t, mf + "pl", mf + "pl")
                 return
         actual_special = None
         for special in all_specials:
             special_mf = make_mf(lemma, special)
             if special_mf is None:
                 continue
             if mfs == [special_mf]:
                 pagemsg("Found special=%s with special_mf=%s" %
                         (special, special_mf))
                 actual_special = special
                 break
         if actual_special:
             # special_mf still holds the form that matched, because the
             # loop above exits via `break` when actual_special is set.
             if not mfpls:
                 pagemsg(
                     "WARNING: Explicit %s=%s matches special=%s but no %s plural"
                     % (mf, ",".join(mfs), actual_special, mf_full))
             else:
                 special_mfpl = make_plural(special_mf,
                                            actual_special)
                 if special_mfpl:
                     if len(special_mfpl) > 1 and set(mfpls) < set(
                             special_mfpl):
                         pagemsg(
                             "WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing"
                             % (mf, ",".join(mfs), actual_special,
                                mf, ",".join(mfpls), mf,
                                ",".join(special_mfpl)))
                     elif set(mfpls) == set(special_mfpl):
                         pagemsg(
                             "Found %s=%s and special=%s, %spls=%s matches special_%spl"
                             % (mf, ",".join(mfs), actual_special,
                                mf, ",".join(mfpls), mf))
                     else:
                         pagemsg(
                             "WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s"
                             % (mf, ",".join(mfs), actual_special,
                                mf, ",".join(mfpls), mf,
                                ",".join(special_mfpl)))
                         # Plurals disagree: fall back to step 3 below.
                         actual_special = None
             if actual_special:
                 notes.append(
                     "replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural"
                     % (mf_full, ",".join(mfs), actual_special,
                        mf_full))
                 blib.set_param_chain(t, ["+%s" % actual_special],
                                      mf, mf)
                 blib.remove_param_chain(t, mf + "pl", mf + "pl")
         if not actual_special:
             defmf = make_mf(lemma)
             mfs_with_def = ["+" if x == defmf else x for x in mfs]
             if mfs_with_def != mfs:
                 notes.append(
                     "replace default %s %s with '+' in {{es-noun}}"
                     % (mf_full, defmf))
                 blib.set_param_chain(t, mfs_with_def, mf, mf)
             if mfpls:
                 # Default plurals of all explicit forms, flattened.
                 defpl = [
                     x for y in mfs for x in (make_plural(y) or [])
                 ]
                 ok = False
                 if set(defpl) == set(mfpls):
                     ok = True
                 elif len(defpl) > 1 and set(mfpls) < set(defpl):
                     pagemsg(
                         "WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing"
                         % (mf, ",".join(mfs), mf, ",".join(mfpls),
                            ",".join(defpl)))
                     ok = True
                 if ok:
                     pagemsg(
                         "Found %s=%s, %spl=%s matches default pl" %
                         (mf, ",".join(mfs), mf, ",".join(mfpls)))
                     notes.append(
                         "remove redundant explicit %s plural %s in {{es-noun}}"
                         % (mf_full, ",".join(mfpls)))
                     blib.remove_param_chain(
                         t, mf + "pl", mf + "pl")
                 else:
                     # No `break`: every matching special is applied in
                     # turn, so the last match ends up in the template.
                     for special in all_specials:
                         defpl = [
                             x for y in mfs for x in (
                                 make_plural(y, special) or [])
                         ]
                         if set(defpl) == set(mfpls):
                             pagemsg(
                                 "Found %s=%s, %spl=%s matches special=%s"
                                 % (mf, ",".join(mfs), mf,
                                    ",".join(mfpls), special))
                             notes.append(
                                 "replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}"
                                 % (mf_full, ",".join(mfpls),
                                    special))
                             blib.set_param_chain(
                                 t, ["+%s" % special], mf + "pl",
                                 mf + "pl")
def process_page_section(index, page, section, verbose):
  """Synchronize the noun headword template of a section with its declension.

  The section must contain exactly one {{ru-noun-table}} and exactly one
  {{ru-noun+}} / {{ru-proper noun+}}; the headword's params are then
  rebuilt from the declension template's params (keeping the headword's own
  g=/m=/f= chains and notrcat=), or, when the headword merely adds links,
  the headword's params are copied onto the declension template instead.

  index: page index, used only in log messages.
  page: page object; title() and exists() are called.
  section: text of the section to process.
  verbose: passed on to blib.expand_text() and enables extra logging.

  Returns None when the section is skipped (missing page, unsupported or
  ambiguous templates, template expansion failure, or links that would be
  lost). Otherwise returns a 5-tuple
  (section text, ru_noun_table_cleaned, ru_noun_table_link_copied,
  ru_noun_changed, ru_proper_noun_changed) where the last four are 0/1
  flags; all four are 0 when there was nothing to compare.
  """
  pagetitle = unicode(page.title())
  # Title with everything up to the last colon stripped; only referenced by
  # the commented-out code further down.
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  # A {{ru-decl-noun-see}} anywhere in the section disables processing.
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    # No declension table: return the section unchanged with all flags 0.
    return unicode(parsed), 0, 0, 0, 0

  # Headwords without the trailing "+" are not handled here.
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, 0

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  # NOTE(review): decl_templates is not used anywhere below in this function.
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  # Snapshots used at the end to detect and report changes.
  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)

  # Headword-only params (g*, m*, f*, notrcat) are remembered so they can be
  # re-added after the headword is rebuilt from the declension params.
  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  filtered_headword_params = []
  for param in headword_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
      pass
    else:
      filtered_headword_params.append((param.name, param.value))
  # Copy of the headword without the headword-only params, always named
  # ru-noun+ so its text can be compared with the rewritten declension.
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for name, value in filtered_headword_params:
    filtered_headword_template.add(name, value)

  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0

  # Strip any g*/m*/f* params from the declension template by rebuilding
  # its param list in place.
  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
          unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1

  # Working copy of the declension params; n= may be adjusted on it below
  # without touching the declension template itself.
  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)

  # For proper nouns, expand the declension through ru-generate-noun-args to
  # learn its number, since the n= param may need adjusting (see below).
  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
        unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = ru.split_generate_args(generate_result)

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      # Re-expand with ndef=sg appended and any n=s... param removed.
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  # Compare the would-be headword (declension params under the name
  # ru-noun+) with the existing headword minus its headword-only params.
  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
      unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        # The link copy takes precedence over the "cleaned" flag.
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
            % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
          % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True

  # `lemmas` is not defined in this function; presumably a module-level
  # collection of page titles restricting which pages get rewritten (an
  # empty value allows all) -- TODO confirm against the rest of the file.
  if change_existing_headword and (not lemmas or pagetitle in lemmas):
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    # Restore the headword-only params saved earlier.
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)
    
  #genders = runoun.check_old_noun_headword_forms(headword_template, args,
  #    subpagetitle, pagemsg)
  #if genders == None:
  #  return None

  #new_params = []
  #for param in noun_table_template.params:
  #  new_params.append((param.name, param.value))

  #params_to_preserve = runoun.fix_old_headword_params(headword_template,
  #    new_params, genders, pagemsg)
  #if params_to_preserve == None:
  #  return None

  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
      new_noun_table_template))

  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (orig_headword_template,
      new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
def process_text_on_page(index, pagetitle, text):
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    if old_adj_template not in text and "es-noun" not in text:
        return

    if ":" in pagetitle:
        pagemsg("Skipping non-mainspace title")
        return

    pagemsg("Processing")

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "es-noun" and args.remove_redundant_noun_args:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)
            if not getparam(t, "2") and (getparam(t, "pl2")
                                         or getparam(t, "pl3")):
                pagemsg("WARNING: Saw pl2= or pl3= without 2=: %s" %
                        unicode(t))
                continue
            g = getparam(t, "1")
            ms = blib.fetch_param_chain(t, "m", "m")
            space_in_m = False
            for m in ms:
                if " " in m:
                    space_in_m = True
            mpls = blib.fetch_param_chain(t, "mpl", "mpl")
            if space_in_m and not mpls and not g.endswith("-p"):
                pagemsg(
                    "WARNING: Space in m=%s and old default noun algorithm applying"
                    % ",".join(ms))
            fs = blib.fetch_param_chain(t, "f", "f")
            space_in_f = False
            for f in fs:
                if " " in f:
                    space_in_f = True
            fpls = blib.fetch_param_chain(t, "fpl", "fpl")
            if space_in_f and not fpls and not g.endswith("-p"):
                pagemsg(
                    "WARNING: Space in f=%s and old default noun algorithm applying"
                    % ",".join(fs))
            pls = blib.fetch_param_chain(t, "2", "pl")
            if not pls and not g.endswith("-p"):
                if " " in lemma:
                    pagemsg(
                        "WARNING: Space in headword and old default noun algorithm applying"
                    )
                continue
            pls_with_def = []
            defpl = make_plural(lemma)
            if not defpl:
                continue
            if len(defpl) > 1:
                if set(pls) == set(defpl):
                    pls_with_def = ["+"]
                elif set(pls) < set(defpl):
                    pagemsg(
                        "WARNING: pls=%s subset of defpls=%s, replacing with default"
                        % (",".join(pls), ",".join(defpl)))
                    pls_with_def = ["+"]
                else:
                    pls_with_def = pls
            else:
                for pl in pls:
                    if pl == defpl[0]:
                        pls_with_def.append("+")
                    else:
                        pls_with_def.append(pl)

            actual_special = None
            for special in all_specials:
                special_pl = make_plural(lemma, special)
                if special_pl is None:
                    continue
                if len(special_pl) > 1 and set(pls) < set(special_pl):
                    pagemsg(
                        "WARNING: for special=%s, pls=%s subset of special_pl=%s, allowing"
                        % (special, ",".join(pls), ",".join(special_pl)))
                    actual_special = special
                    break
                if set(pls) == set(special_pl):
                    pagemsg("Found special=%s with special_pl=%s" %
                            (special, ",".join(special_pl)))
                    actual_special = special
                    break

            if pls_with_def == ["+"]:
                notes.append("remove redundant plural%s %s from {{es-noun}}" %
                             ("s" if len(pls) > 1 else "", ",".join(pls)))
                blib.remove_param_chain(t, "2", "pl")
            elif actual_special:
                notes.append("replace plural%s %s with +%s in {{es-noun}}" %
                             ("s" if len(pls) > 1 else "", ",".join(pls),
                              actual_special))
                blib.set_param_chain(t, ["+" + actual_special], "2", "pl")
            elif pls_with_def != pls:
                notes.append(
                    "replace default plural %s with '+' in {{es-noun}}" %
                    ",".join(defpl))
                blib.set_param_chain(t, pls_with_def, "2", "pl")

            def handle_mf(mf, mf_full, make_mf):
                """Replace explicit `mf`= / `mf`pl= values on `t` with '+'.

                Closure over `t`, `lemma`, `notes` and `pagemsg` from the
                enclosing scope; every rewrite appends a note to `notes`.
                Nothing is done when the `mf`= chain is empty or already
                holds a value starting with "+".

                mf: parameter name prefix; the chains `mf` and `mf` + "pl"
                    are read.
                mf_full: human-readable name of the form, used in notes
                    and messages.
                make_mf: callable, invoked as make_mf(lemma) for the
                    default form and make_mf(lemma, special) for a special
                    form (None means no form).

                Steps, in order:
                  1. If the only value equals the default form and the
                     explicit plurals are absent, equal to, or a proper
                     subset of the default plurals, set `mf`=+ and remove
                     the plural chain.
                  2. Otherwise, if the value list equals
                     [make_mf(lemma, special)] for some special, set
                     `mf`=+<special> and remove the plural chain, unless
                     explicit plurals are given that disagree with
                     make_plural(special_mf, special).
                  3. Otherwise, replace individual default values with
                     "+", and remove or replace the explicit plural chain
                     when it matches the default or a special plural of
                     the given forms.
                """
                mfs = blib.fetch_param_chain(t, mf, mf)
                mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl")
                if mfs and not any(x.startswith("+") for x in mfs):
                    defmf = make_mf(lemma)
                    if set(mfs) == {defmf}:
                        # Guard against a missing plural, as the other
                        # make_plural() calls in this function already do.
                        defpls = make_plural(defmf) or []
                        ok = False
                        if not mfpls or set(mfpls) == set(defpls):
                            ok = True
                        elif set(mfpls) < set(defpls):
                            # "%spl" (was "%pl", an invalid conversion that
                            # raised ValueError whenever this branch ran).
                            pagemsg(
                                "WARNING: %spl=%s subset of default=%s, allowing"
                                % (mf, ",".join(mfpls), ",".join(defpls)))
                            ok = True
                        if ok:
                            notes.append(
                                "replace %s=%s with '+' in {{es-noun}}" %
                                (mf, ",".join(mfs)))
                            blib.set_param_chain(t, ["+"], mf, mf)
                            blib.remove_param_chain(t, mf + "pl", mf + "pl")
                            return
                    actual_special = None
                    for special in all_specials:
                        special_mf = make_mf(lemma, special)
                        if special_mf is None:
                            continue
                        if mfs == [special_mf]:
                            pagemsg("Found special=%s with special_mf=%s" %
                                    (special, special_mf))
                            actual_special = special
                            break
                    if actual_special:
                        # special_mf still holds the form that matched,
                        # because the loop above exits via `break` when
                        # actual_special is set.
                        if not mfpls:
                            pagemsg(
                                "WARNING: Explicit %s=%s matches special=%s but no %s plural"
                                % (mf, ",".join(mfs), actual_special, mf_full))
                        else:
                            special_mfpl = make_plural(special_mf,
                                                       actual_special)
                            if special_mfpl:
                                if len(special_mfpl) > 1 and set(mfpls) < set(
                                        special_mfpl):
                                    pagemsg(
                                        "WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing"
                                        % (mf, ",".join(mfs), actual_special,
                                           mf, ",".join(mfpls), mf,
                                           ",".join(special_mfpl)))
                                elif set(mfpls) == set(special_mfpl):
                                    pagemsg(
                                        "Found %s=%s and special=%s, %spls=%s matches special_%spl"
                                        % (mf, ",".join(mfs), actual_special,
                                           mf, ",".join(mfpls), mf))
                                else:
                                    pagemsg(
                                        "WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s"
                                        % (mf, ",".join(mfs), actual_special,
                                           mf, ",".join(mfpls), mf,
                                           ",".join(special_mfpl)))
                                    # Plurals disagree: fall back to step 3.
                                    actual_special = None
                        if actual_special:
                            notes.append(
                                "replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural"
                                % (mf_full, ",".join(mfs), actual_special,
                                   mf_full))
                            blib.set_param_chain(t, ["+%s" % actual_special],
                                                 mf, mf)
                            blib.remove_param_chain(t, mf + "pl", mf + "pl")
                    if not actual_special:
                        defmf = make_mf(lemma)
                        mfs_with_def = ["+" if x == defmf else x for x in mfs]
                        if mfs_with_def != mfs:
                            notes.append(
                                "replace default %s %s with '+' in {{es-noun}}"
                                % (mf_full, defmf))
                            blib.set_param_chain(t, mfs_with_def, mf, mf)
                        if mfpls:
                            # Default plurals of all explicit forms,
                            # flattened.
                            defpl = [
                                x for y in mfs for x in (make_plural(y) or [])
                            ]
                            ok = False
                            if set(defpl) == set(mfpls):
                                ok = True
                            elif len(defpl) > 1 and set(mfpls) < set(defpl):
                                pagemsg(
                                    "WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing"
                                    % (mf, ",".join(mfs), mf, ",".join(mfpls),
                                       ",".join(defpl)))
                                ok = True
                            if ok:
                                pagemsg(
                                    "Found %s=%s, %spl=%s matches default pl" %
                                    (mf, ",".join(mfs), mf, ",".join(mfpls)))
                                notes.append(
                                    "remove redundant explicit %s plural %s in {{es-noun}}"
                                    % (mf_full, ",".join(mfpls)))
                                blib.remove_param_chain(
                                    t, mf + "pl", mf + "pl")
                            else:
                                # No `break`: every matching special is
                                # applied in turn, so the last match ends
                                # up in the template.
                                for special in all_specials:
                                    defpl = [
                                        x for y in mfs for x in (
                                            make_plural(y, special) or [])
                                    ]
                                    if set(defpl) == set(mfpls):
                                        pagemsg(
                                            "Found %s=%s, %spl=%s matches special=%s"
                                            % (mf, ",".join(mfs), mf,
                                               ",".join(mfpls), special))
                                        notes.append(
                                            "replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}"
                                            % (mf_full, ",".join(mfpls),
                                               special))
                                        blib.set_param_chain(
                                            t, ["+%s" % special], mf + "pl",
                                            mf + "pl")

            handle_mf("f", "feminine", make_feminine)
            handle_mf("m", "masculine", make_masculine)

            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))

        if tn == "es-noun" and args.make_multiword_plural_explicit:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)

            def expand_text(tempcall):
                return blib.expand_text(tempcall, pagetitle, pagemsg,
                                        args.verbose)

            if " " in lemma and not getparam(t, "2"):
                g = getparam(t, "1")
                if not g.endswith("-p"):
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|%s|true}}" %
                        (lemma, g))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to multiword noun, make_plural_noun returned an empty string"
                        )
                        continue
                    plurals = explicit_pl.split(",")
                    blib.set_param_chain(t, plurals, "2", "pl")
                    notes.append("add explicit plural to multiword noun")
            ms = blib.fetch_param_chain(t, "m", "m")
            space_in_m = False
            for m in ms:
                if " " in m:
                    space_in_m = True
            mpls = blib.fetch_param_chain(t, "mpl", "mpl")
            if space_in_m and not mpls:
                mpls = []
                for m in ms:
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|m|true}}" %
                        (blib.remove_links(m)))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to m=%s, make_plural_noun returned an empty string"
                            % m)
                        continue
                    this_mpls = explicit_pl.split(",")
                    mpls.extend(this_mpls)
                blib.set_param_chain(t, mpls, "mpl", "mpl")
                notes.append("add explicit plural to m=%s" % ",".join(ms))
            fs = blib.fetch_param_chain(t, "f", "f")
            fpls = blib.fetch_param_chain(t, "fpl", "fpl")
            space_in_f = False
            for f in fs:
                if " " in f:
                    space_in_f = True
            fpls = blib.fetch_param_chain(t, "fpl", "fpl")
            if space_in_f and not fpls:
                fpls = []
                for f in fs:
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|f|true}}" %
                        (blib.remove_links(f)))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to f=%s, make_plural_noun returned an empty string"
                            % f)
                        continue
                    this_fpls = explicit_pl.split(",")
                    fpls.extend(this_fpls)
                blib.set_param_chain(t, fpls, "fpl", "fpl")
                notes.append("add explicit plural to f=%s" % ",".join(fs))
            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))

        if tn == old_adj_template:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)
            deff = make_feminine(pagetitle)
            defmpl = make_plural(pagetitle)
            fs = []
            fullfs = []
            f = getparam(t, "f") or pagetitle
            fullfs.append(f)
            if f == deff:
                f = "+"
            elif f == lemma:
                f = "#"
            fs.append(f)
            f2 = getparam(t, "f2")
            if f2:
                fullfs.append(f2)
                if f2 == deff:
                    f2 == "+"
                fs.append(f2)
            mpls = []
            mpl = getparam(t, "mpl") or getparam(t, "pl") or pagetitle + "s"
            mpls.append(mpl)
            mpl2 = getparam(t, "mpl2") or getparam(t, "pl2")
            if mpl2:
                mpls.append(mpl2)
            fullmpls = mpls
            # should really check for subsequence but it never occurs
            if set(mpls) == set(defmpl):
                mpls = ["+"]
            elif set(mpls) < set(defmpl):
                pagemsg(
                    "WARNING: mpls=%s subset of defmpl=%s, replacing with default"
                    % (",".join(mpls), ",".join(defmpl)))
                mpls = ["+"]
            mpls = ["#" if x == lemma else x for x in mpls]
            deffpl = [x for f in fullfs for x in make_plural(f)]
            fpls = []
            fpl = getparam(t, "fpl") or getparam(
                t, "pl") or (getparam(t, "f") or pagetitle) + "s"
            fpls.append(fpl)
            fpl2 = getparam(t, "fpl2") or getparam(t, "pl2")
            if fpl2:
                fpls.append(fpl2)
            fullfpls = fpls
            # should really check for subsequence but it never occurs
            if set(fpls) == set(deffpl):
                fpls = ["+"]
            elif set(fpls) < set(deffpl):
                pagemsg(
                    "WARNING: fpls=%s subset of deffpl=%s, replacing with default"
                    % (",".join(fpls), ",".join(deffpl)))
                fpls = ["+"]
            fpls = ["#" if x == lemma else x for x in fpls]
            actual_special = None
            for special in all_specials:
                deff = make_feminine(pagetitle, special)
                if deff is None:
                    continue
                defmpl = make_plural(pagetitle, special)
                deffpl = make_plural(deff, special)
                deff = [deff]
                if fullfs == deff and fullmpls == defmpl and fullfpls == deffpl:
                    actual_special = special
                    break

            head = getparam(t, "head")

            must_continue = False
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                if pn == "1" and pv in ["m", "mf"]:
                    pagemsg("WARNING: Extraneous param %s=%s in %s, ignoring" %
                            (pn, pv, unicode(t)))
                    continue
                if pn not in [
                        "head", "f", "f2", "pl", "pl2", "mpl", "mpl2", "fpl",
                        "fpl2"
                ]:
                    pagemsg("WARNING: Saw unrecognized param %s=%s in %s" %
                            (pn, pv, unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue

            del t.params[:]
            if head:
                t.add("head", head)
            if fullfs == [pagetitle] and fullmpls == [
                    pagetitle
            ] and fullfpls == [pagetitle]:
                blib.set_template_name(t, "es-adj-inv")
            else:
                blib.set_template_name(t, "es-adj")
                if actual_special:
                    t.add("sp", actual_special)
                else:
                    if fs != ["+"]:
                        blib.set_param_chain(t, fs, "f", "f")

                    if mpls == fpls and ("+" not in mpls or defmpl == deffpl):
                        # masc and fem pl the same
                        if mpls != ["+"]:
                            blib.set_param_chain(t, mpls, "pl", "pl")
                    else:
                        if mpls != ["+"]:
                            blib.set_param_chain(t, mpls, "mpl", "mpl")
                        if fpls != ["+"]:
                            blib.set_param_chain(t, fpls, "fpl", "fpl")

            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                notes.append("convert {{%s}} to new {{%s}} format" %
                             (old_adj_template, tname(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))

    return unicode(parsed), notes
Beispiel #9
0
def process_text_on_page(index, pagetitle, text):
    """Simplify the respellings given to {{it-IPA}} templates in `text`.

    Each of the numbered params 1= through 10= is examined. A respelling is
    replaced with "+" when it equals the page title or when its phonemic
    output (per Module:it-pronunciation's to_phonemic_bot) equals that of the
    page title; otherwise, if it contains exactly one accented vowel and that
    vowel alone yields the same phonemic output, it is replaced with just
    that vowel. Duplicates produced by this normalization are collapsed.

    Args:
        index: Page index, used only in log messages.
        pagetitle: Title of the page being processed.
        text: Wikitext of the page.

    Returns:
        None if `text` contains no "it-IPA"; otherwise a tuple of the new
        page text and a list of change notes.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def getpron(pron):
        return expand_text("{{#invoke:it-pronunciation|to_phonemic_bot|%s}}" %
                           pron)

    notes = []

    # Cheap substring test to avoid parsing pages that can't be affected.
    if "it-IPA" not in text:
        return

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn in ["it-IPA"]:
            pagemsg("Saw %s" % unicode(t))
            # Phonemic form of the page title; computed lazily, at most once
            # per template.
            default_pron_phonemic = None
            prons = []
            for i in range(1, 11):
                pron = getparam(t, str(i))
                if pron:
                    prons.append(pron)
            if not prons:
                # No respelling means the default pronunciation. (This was
                # previously written `prons == ["+"]`, a comparison with no
                # effect.)
                prons = ["+"]
            defaulted_prons = []

            def add(prn):
                # Order-preserving de-duplication.
                if prn not in defaulted_prons:
                    defaulted_prons.append(prn)

            for pron in prons:
                if pron == "+" or pron == pagetitle:
                    add("+")
                elif len(pron) == 1:  # vowel only
                    add(pron)
                else:  # full pronun
                    pron_phonemic = None
                    if default_pron_phonemic is None:
                        default_pron_phonemic = getpron(pagetitle)
                    if default_pron_phonemic:
                        pron_phonemic = getpron(pron)
                        if not pron_phonemic:
                            # Couldn't expand the respelling; keep it as-is.
                            add(pron)
                            continue
                        if default_pron_phonemic == pron_phonemic:
                            pron = "+"
                    if pron != "+":
                        if pron_phonemic is None:
                            pron_phonemic = getpron(pron)
                        if not pron_phonemic:
                            add(pron)
                            continue
                        # Keep only the accented vowels; if there is exactly
                        # one, try it as a single-vowel spec.
                        single_vowel_spec = re.sub(u"[^àèéìòúù]", "", pron)
                        if len(single_vowel_spec) == 1:
                            single_vowel_pron_phonemic = getpron(
                                single_vowel_spec)
                            if single_vowel_pron_phonemic == pron_phonemic:
                                pron = single_vowel_spec
                    add(pron)
            if defaulted_prons == ["+"]:
                blib.remove_param_chain(t, "1", "")
                if unicode(t) != origt:
                    notes.append(
                        "remove redundant respelling(s) from {{it-IPA}}")
            else:
                blib.set_param_chain(t, defaulted_prons, "1", "")
                if unicode(t) != origt:
                    notes.append(
                        "replace default respelling(s) with single-vowel spec or '+' in {{it-IPA}}"
                    )
            if unicode(t) != origt:
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes
def process_page(page, index, parsed):
  """Normalize Bulgarian noun headword templates on `page`.

  Two kinds of template are rewritten:

  * {{head|bg|<noun POS>}} is renamed to {{bg-noun}} or {{bg-proper noun}}.
    The headword (head= or the page title) goes into 1=, the genders from
    g=/g2=/g3= are written with blib.set_param_chain(t, genders, "2", "g"),
    and a "masculine"/"feminine" equivalent list in 3=, 4=, "or", 6=, ...
    is moved to an m= or f= chain. Templates with any other param are left
    alone with a warning.
  * {{bg-noun}} / {{bg-proper noun}} get head=/sg= moved into 1= and the
    genders found in 1= (if "m" or "f"), 2=, g=, g2=, g3= rewritten through
    the same param chain; all other non-empty params are preserved.

  The incoming `parsed` argument is not used; the page text is re-parsed.

  Returns a pair of the parsed wikicode and a list of change notes.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  basic_genders = ["m", "f", "n", "m-p", "f-p", "n-p", "p"]
  # Run-together gender specs, accepted only on {{bg-noun}}-style templates.
  combined_genders = {
    "mf": ["m", "f"], "fm": ["m", "f"],
    "mn": ["m", "n"], "nm": ["m", "n"],
    "fn": ["f", "n"], "nf": ["f", "n"],
    "mfn": ["m", "f", "n"], "fmn": ["m", "f", "n"],
    "mnf": ["m", "f", "n"], "nmf": ["m", "f", "n"],
    "fnm": ["m", "f", "n"], "nfm": ["m", "f", "n"],
  }
  head_params = ["1", "2", "head", "g", "g2", "g3", "3", "4", "5", "6", "7",
    "8", "9", "10"]

  def add_gender(genders, spec, allow_combined):
    # Append the gender(s) denoted by `spec`, warning if unrecognized.
    if spec in basic_genders:
      genders.append(spec)
    elif allow_combined and spec in combined_genders:
      genders.extend(combined_genders[spec])
    else:
      pagemsg("WARNING: Unrecognized gender '%s'" % spec)

  def absorb_gender_params(tmpl, genders, allow_combined):
    # Collect g=, g2=, g3= in order, removing each from the template.
    for gparam in ["g", "g2", "g3"]:
      spec = getparam(tmpl, gparam)
      if spec:
        add_gender(genders, spec, allow_combined)
      rmparam(tmpl, gparam)

  def take_equivalents(tmpl, dest):
    # Move 4= and every following "or"-separated term into `dest`,
    # removing 3= and the consumed params.
    dest.append(getparam(tmpl, "4"))
    rmparam(tmpl, "3")
    rmparam(tmpl, "4")
    sep = 5
    while getparam(tmpl, str(sep)) == "or":
      dest.append(getparam(tmpl, str(sep + 1)))
      rmparam(tmpl, str(sep))
      rmparam(tmpl, str(sep + 1))
      sep += 2

  pagemsg("Processing")

  notes = []

  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") in [
        "noun", "nouns", "proper noun", "proper nouns"]:
      pos = getparam(t, "2")
      # Only convert when every param is one we know how to carry over.
      convertible = True
      for param in t.params:
        key = unicode(param.name).strip()
        val = unicode(param.value).strip()
        if key not in head_params:
          unexpected = True
        elif key == "3":
          unexpected = val not in ["masculine", "feminine"]
        elif key in ["5", "7", "9"]:
          unexpected = val != "or"
        else:
          unexpected = False
        if unexpected:
          pagemsg("WARNING: head|bg|%s with extra param %s=%s: %s" % (pos, key, val, origt))
          convertible = False
          break
      if convertible:
        rmparam(t, "1")
        rmparam(t, "2")
        head = getparam(t, "head")
        rmparam(t, "head")
        genders = []
        absorb_gender_params(t, genders, False)
        masc = []
        fem = []
        # 3= is re-read each time: once consumed it no longer matches.
        for label, dest in [("masculine", masc), ("feminine", fem)]:
          if getparam(t, "3") == label:
            take_equivalents(t, dest)
        newtn = "bg-noun" if pos in ["noun", "nouns"] else "bg-proper noun"
        blib.set_template_name(t, newtn)
        t.add("1", head or pagetitle)
        blib.set_param_chain(t, genders, "2", "g")
        if masc:
          blib.set_param_chain(t, masc, "m", "m")
        if fem:
          blib.set_param_chain(t, fem, "f", "f")
        notes.append("convert {{head|bg|%s}} into {{%s}}" % (pos, newtn))
    elif tn in ["bg-noun", "bg-proper noun"]:
      cur1 = getparam(t, "1")
      gender_in_1 = cur1 in ["m", "f"]
      if not gender_in_1 and re.search("[a-zA-Z]", cur1):
        pagemsg("WARNING: Saw Latin in 1=%s in %s" % (cur1, origt))
        continue
      head = getparam(t, "head") or getparam(t, "sg")
      rmparam(t, "head")
      rmparam(t, "sg")
      genders = []
      if gender_in_1:
        add_gender(genders, cur1, True)
        rmparam(t, "1")
      # NOTE(review): 2= is read here but, unlike g=/g2=/g3=, not removed, so
      # its original value is restored verbatim below -- confirm intended.
      cur2 = getparam(t, "2")
      if cur2:
        add_gender(genders, cur2, True)
      absorb_gender_params(t, genders, True)
      # Remember every remaining non-empty param so it can be put back.
      remaining = [
        (unicode(p.name).strip(), unicode(p.value).strip(), p.showkey)
        for p in t.params
      ]
      remaining = [entry for entry in remaining if entry[1]]
      # Erase all params.
      del t.params[:]
      # Put back new params.
      t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
      blib.set_param_chain(t, genders, "2", "g")
      for key, val, showkey in remaining:
        t.add(key, val, showkey=showkey, preserve_spacing=False)
      if origt != unicode(t):
        notes.append("move head=/sg= to 1=, g= to 2= in {{%s}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return parsed, notes
def process_page(page, index, adverb):
    """Record `adverb` in the adv= chain of the page's Latin adjective.

    The first {{la-adj}} on the page is modified; if there is none, the
    first {{la-part}} is used instead. If the chain already holds a value
    equal to `adverb` ignoring macrons, that value is replaced by `adverb`
    when the macrons differ and left alone otherwise; if no such value
    exists, `adverb` is appended.

    Args:
        page: Page object providing title() and text.
        index: Page index, used only in log messages.
        adverb: The adverb (with macrons) to record.

    Returns:
        (None, None) if the page has neither template; otherwise a tuple of
        the new page text and a list of change notes.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    pagemsg("Processing")

    text = unicode(page.text)

    parsed = blib.parse_text(text)
    adj_template = None
    part_template = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "la-adj":
            if adj_template:
                pagemsg(
                    "WARNING: Saw multiple adjective templates: %s and %s" %
                    (unicode(adj_template), unicode(t)))
            else:
                adj_template = t
        if tn == "la-part":
            if part_template:
                pagemsg(
                    "WARNING: Saw multiple participle templates: %s and %s" %
                    (unicode(part_template), unicode(t)))
            else:
                part_template = t
    if adj_template and part_template:
        pagemsg("Saw both %s and %s, modifying adjective" %
                (unicode(adj_template), unicode(part_template)))
    if adj_template:
        template_to_fix = adj_template
    elif part_template:
        template_to_fix = part_template
    else:
        pagemsg("WARNING: Didn't see adjective or participle template")
        return None, None
    existing_advs = blib.fetch_param_chain(template_to_fix, "adv", "adv")
    changed = False
    # The body previously referred to an undefined name `adv`; the value to
    # add is the `adverb` parameter.
    for i, existing in enumerate(existing_advs):
        if lalib.remove_macrons(existing) == lalib.remove_macrons(adverb):
            if existing != adverb:
                pagemsg("Updating macrons of %s -> %s in %s" %
                        (existing, adverb, unicode(template_to_fix)))
                existing_advs[i] = adverb
                changed = True
                # Report the old value captured before the overwrite, so the
                # note reads "old -> new" rather than "new -> new".
                notes.append("update macrons of adv=, changing %s -> %s" %
                             (existing, adverb))
            else:
                pagemsg("Already saw %s: %s" %
                        (adverb, unicode(template_to_fix)))
            break
    else:
        # no break
        existing_advs.append(adverb)
        changed = True
        notes.append("add adv %s to adjective" % adverb)
    if changed:
        origt = unicode(template_to_fix)
        blib.set_param_chain(template_to_fix, existing_advs, "adv", "adv")
        pagemsg("Replaced %s with %s" % (origt, unicode(template_to_fix)))

    return unicode(parsed), notes
def add_rel_adj_or_dim_to_noun_page(nounpage, index, new_adj_or_dims, param, desc):
  """Add terms to a param chain of a Russian noun headword and prune
  links to them from Derived/Related terms.

  The Russian section of `nounpage` must contain exactly one of
  {{ru-noun+}}, {{ru-proper noun+}}, {{ru-noun}}, {{ru-proper noun}}. Each
  term in `new_adj_or_dims` not already present is appended to that
  template's `param` chain. Then, in every "Derived terms" / "Related terms"
  subsection, {{l|ru|TERM}} / {{m|ru|TERM}} links for every term now in the
  chain are deleted, leftover commas and empty bullets are cleaned up, and a
  subsection left blank is removed together with its header. Finally, runs
  of three or more newlines are collapsed.

  Args:
    nounpage: Page object providing title() and text.
    index: Page index, used only in log messages.
    new_adj_or_dims: Terms to add.
    param: Base name of the headword param chain (also used for numbering).
    desc: Human-readable description of the terms, for messages and notes.

  Returns:
    None if the Russian section or a unique headword can't be found;
    otherwise a tuple of the new page text and a list of change notes.
  """
  notes = []
  pagetitle = unicode(nounpage.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  text = unicode(nounpage.text)
  retval = blib.find_modifiable_lang_section(text, "Russian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Russian section for noun of %s %s" % (
      desc, ",".join(new_adj_or_dims)))
    return
  sections, j, secbody, sectail, has_non_lang = retval
  parsed = blib.parse_text(secbody)
  head = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["ru-noun+", "ru-proper noun+", "ru-noun", "ru-proper noun"]:
      if head:
        pagemsg("WARNING: Saw multiple heads %s and %s for noun of %s %s, not modifying" %
            (unicode(head), unicode(t), desc, ",".join(new_adj_or_dims)))
        return
      head = t
  if not head:
    pagemsg("WARNING: Couldn't find head for noun of %s %s" % (desc, ",".join(new_adj_or_dims)))
    return
  orig_adjs_or_dims = blib.fetch_param_chain(head, param, param)
  # Work on a copy so the original can be compared against afterwards.
  adjs_or_dims = list(orig_adjs_or_dims)
  added_adjs_or_dims = []
  for adj_or_dim in new_adj_or_dims:
    if adj_or_dim in adjs_or_dims:
      pagemsg("Already saw %s %s in head %s" % (desc, adj_or_dim, unicode(head)))
    else:
      adjs_or_dims.append(adj_or_dim)
      added_adjs_or_dims.append(adj_or_dim)
  if adjs_or_dims != orig_adjs_or_dims:
    orighead = unicode(head)
    blib.set_param_chain(head, adjs_or_dims, param, param)
    pagemsg("Replaced %s with %s" % (orighead, unicode(head)))
    notes.append("add %s=%s to Russian noun" % (param, ",".join(added_adjs_or_dims)))
    secbody = unicode(parsed)
  # Splitting on a capturing group leaves headers at odd indices and the
  # body following each header at the next even index.
  subsecs = re.split("(^==.*==\n)", secbody, flags=re.M)
  for k in range(2, len(subsecs), 2):
    if "==Derived terms==" in subsecs[k - 1] or "==Related terms==" in subsecs[k - 1]:
      header = re.sub("=", "", subsecs[k - 1]).strip()
      for adj_or_dim in adjs_or_dims:
        def note_removed_text(m):
          # Only worth logging when the removed link carried extra params
          # (e.g. a gloss) that are being thrown away.
          if m.group(1):
            pagemsg("Removed '%s' term with gloss for noun of %s %s: %s" %
                (header, desc, adj_or_dim, m.group(0)))
          return ""
        # Escape the term so regex metacharacters in it are matched
        # literally rather than interpreted as pattern syntax.
        link_re = r"\{\{[lm]\|ru\|%s((?:\|[^{}\n]*)?)\}\}" % re.escape(adj_or_dim)
        newsubsecsk = re.sub(link_re, note_removed_text, subsecs[k])
        if newsubsecsk != subsecs[k]:
          notes.append("remove %s %s from %s" % (desc, adj_or_dim, header))
        subsecs[k] = newsubsecsk
        subsecs[k] = re.sub(", *,", ",", subsecs[k])
        # Repeat in case adjacent terms removed (unlikely though).
        subsecs[k] = re.sub(", *,", ",", subsecs[k])
        subsecs[k] = re.sub(" *, *$", "", subsecs[k], flags=re.M)
        subsecs[k] = re.sub(r"^\* *, *", "* ", subsecs[k], flags=re.M)
        subsecs[k] = re.sub(r"^\* *(\n|$)", "", subsecs[k], flags=re.M)
      if re.search(r"^\s*$", subsecs[k]):
        # Nothing left in the subsection; drop it and its header.
        subsecs[k] = ""
        subsecs[k - 1] = ""
  secbody = "".join(subsecs)
  secj = secbody + sectail
  newsecj = re.sub(r"\n\n\n+", "\n\n", secj)
  if newsecj != secj and not notes:
    notes.append("eliminate sequences of 3 or more newlines")
  secj = newsecj
  sections[j] = secj
  return "".join(sections), notes
def process_text_on_page(index, pagetitle, text):
    global args

    if pagetitle.startswith("Reconstruction:Latin/"):
        pagetitle = re.sub("^Reconstruction:Latin/", "*", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    if not args.stdin:
        pagemsg("Processing")

    # Greatly speed things up when --stdin by ignoring non-Latin pages
    if "==Latin==" not in text:
        return None, None

    retval = lalib.find_heads_and_defns(text, pagemsg)
    if retval is None:
        return None, None

    (sections, j, secbody, sectail, has_non_latin, subsections,
     parsed_subsections, headwords, pronun_sections, etym_sections) = retval

    for headword in headwords:
        ht = headword['head_template']
        tn = tname(ht)

        if tn == "la-noun-form" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(ht, "2") == "noun form":
            pos = "noun"
            tag_set_groups = lalib.noun_tag_groups
            possible_slots = lalib.la_noun_decl_overrides
            expected_headtemps = ["la-noun"]
            expected_infltemps = ["la-ndecl"]
        elif tn == "la-proper noun-form" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(ht, "2") == "proper noun form":
            pos = "pn"
            tag_set_groups = lalib.noun_tag_groups
            possible_slots = lalib.la_noun_decl_overrides
            expected_headtemps = ["la-proper noun"]
            expected_infltemps = ["la-ndecl"]
        #elif tn == "la-pronoun-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "pronoun form":
        #  pos = "pronoun"
        #  tag_set_groups = lalib.adj_tag_groups
        #  possible_slots = lalib.la_adj_decl_overrides
        #  expected_headtemp = ???
        elif tn == "la-verb-form" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(ht, "2") == "verb form":
            pos = "verb"
            tag_set_groups = lalib.verb_tag_groups
            possible_slots = lalib.la_verb_overrides
            expected_headtemps = ["la-verb"]
            expected_infltemps = ["la-conj"]
        elif tn == "la-adj-form" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(ht, "2") == "adjective form":
            pos = "adj"
            tag_set_groups = lalib.adj_tag_groups
            possible_slots = lalib.la_adj_decl_overrides
            expected_headtemps = ["la-adj", "la-adj-comp", "la-adj-sup"]
            expected_infltemps = ["la-adecl"]
        elif tn == "la-part-form" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(ht, "2") == "participle form":
            pos = "part"
            tag_set_groups = lalib.adj_tag_groups
            possible_slots = lalib.la_adj_decl_overrides
            expected_headtemps = ["la-part"]
            expected_infltemps = ["la-adecl"]
        #elif tn == "la-suffix-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "suffix form":
        #  pos = "suffix"
        elif tn == "la-num-form" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(ht, "2") == "numeral form":
            pos = "numadj"
            tag_set_groups = lalib.adj_tag_groups
            possible_slots = lalib.la_adj_decl_overrides
            expected_headtemps = ["la-num-adj"]
            expected_infltemps = ["la-adecl"]
        else:
            continue

        #
        # We have the following:
        #
        # 1. The non-lemma headword, with one or (potentially but unlikely) more
        #    than one headword form.
        # 2. Under the headword, multiple {{inflection of}} templates, each of
        #    which specifies a single lemma under which the non-lemma form
        #    belongs, and one or more corresponding tag sets.
        # 3. The lemma page corresponding to the lemma specified in an
        #    {{inflection of}} template may have one or more lemmas of the right
        #    part of speech. Each lemma specifies one or (potentially but
        #    unlikely) more than one lemma form. Some, all or none of the lemmas
        #    might match the lemma specified in the {{inflection of}} template
        #    in macrons (i.e. there's an exact match between the lemma in the
        #    {{inflection of}} template and one of the actual lemma forms of a
        #    lemma on the page).
        # 4. Under each lemma on the lemma page is one or more inflection
        #    templates specifying the inflections of the lemma. Each inflection
        #    template specifies the non-lemma form(s) (potentially more than one)
        #    for each slot.
        #
        # When looking up a given {{inflection of}} template, the ideal case is
        # that the specified lemma matches one of the actual lemmas, and all
        # corresponding specified non-lemma forms match the corresponding actual
        # non-lemma form(s) for all tag sets. (If there are multiple specified
        # non-lemma forms, they may match across inflection templates if there's
        # more than one, e.g. the first matches the first inflection template and
        # the second matches the second inflection template.)
        #
        # What if there are mismatches?
        #
        # 1. If the specified non-lemma forms are a subset of the actual
        #    non-lemma forms for a given {{inflection of}} template and lemma,
        #    this is still considered a match but we make a note of it (not a
        #    warning).
        # 2. If a single {{inflection of}} template has multiple tag sets in it
        #    and for some but not all tag sets the specified non-lemma forms
        #    match, we consider this a match but issue a warning. (In the future,
        #    we might consider removing the bad tag sets, conditioned on a
        #    separate command-line flag.)
        # 3. If the specified lemma of a given {{inflection of}} template
        #    doesn't match any actual lemmas, we look at all actual lemmas that
        #    match except for macrons and see if, for any of them, the specified
        #    non-lemma forms match the actual non-lemma forms per (1) and (2).
        #    If so, we gather the set of lemma forms for all such lemmas. If
        #    there's only one, we can update the specified lemma in the
        #    {{inflection of}} template (and issue a warning). If there are
        #    multiple, we issue a warning and don't update the specified lemma.
        # 4. We first loop through all {{inflection of}} templates for the given
        #    specified non-lemma forms and check for matches according to
        #    (1), (2) and (3). If some but not all templates match, we issue
        #    a warning and we're done with this non-lemma headword.
        # 5. If there are no matches per (4), we look for the set of actual forms
        #    that match all tag sets of all {{inflection of}} templates when
        #    ignoring macron differences. If there is such a non-empty set,
        #    we can update the specified non-lemma forms in the non-lemma
        #    headword (and issue a warning). When doing so, we may need to
        #    update the corresponding pronunciation template(s), according to
        #    logic still to be determined (FIXME), but similar to or identical to
        #    existing logic in clean_latin_long_vowels.py.
        # 6. If there are no matches per (5), we first look at the possible
        #    assignments of actual lemmas to each possible {{inflection of}}
        #    template (ignoring macron differences). If there's only one such
        #    assignment (i.e. each {{inflection of}} template can be assigned to
        #    only one actual lemma), then for that assignment, we find the
        #    actual forms that match the non-lemma pagename except in macrons and
        #    are common among all the sets of inflections, and update the
        #    specified non-lemma forms in the non-lemma headword using those
        #    forms (and issue a warning). When doing so, we may need to update
        #    the corresponding pronunciation template(s) as in (5). If there are
        #    no forms in common, issue a warning and do nothing.
        # 7. If there are multiple assignments of actual lemmas to
        #    {{inflection of}} templates, we loop over all possible assignments.
        #    For each assignment, we find the set of actual common non-lemma
        #    forms as in (6). If there is more than one assignment with a
        #    non-empty set of actual common non-lemma forms, or no assignment,
        #    we issue a warning and do nothing. Otherwise, we update the
        #    specified non-lemma forms in the non-lemma headword (and
        #    corresponding pronunciation template(s)) as in (6).

        headword_forms = lalib.la_get_headword_from_template(
            ht, pagetitle, pagemsg)
        matching_headword_forms = []
        for headword_form in headword_forms:
            if "[" in headword_form or "|" in headword_form:
                pagemsg(
                    "WARNING: Bracket or pipe symbol in non-lemma headword form, should not happen: %s"
                    % unicode(ht))
                headword_form = blib.remove_links(headword_form)
            if lalib.remove_macrons(headword_form) != pagetitle:
                pagemsg(
                    "WARNING: Bad headword form %s, doesn't match page title: %s"
                    % (headword_form, unicode(ht)))
            elif headword_form in matching_headword_forms:
                pagemsg("WARNING: Duplicate headword form %s: %s" %
                        (headword_form, unicode(ht)))
            else:
                matching_headword_forms.append(headword_form)
        headword_forms = matching_headword_forms

        for stage in [1, 2, 3]:

            def stagemsg(txt):
                # Log `txt` through pagemsg(), prefixed with the number of the
                # stage currently being run (`stage` is read at call time).
                prefixed = "Stage %s: %s" % (stage, txt)
                pagemsg(prefixed)

            def errandstagemsg(txt):
                # Same as stagemsg(), but routed through errandpagemsg()
                # instead of pagemsg().
                prefixed = "Stage %s: %s" % (stage, txt)
                errandpagemsg(prefixed)

            def yield_infl_of_templates_and_properties():
                # Generator over the templates in headword['infl_of_templates'].
                # For each usable template, yields the tuple
                #   (t, lemma_param, lemma, inflargs_sets, tag_sets)
                # where `lemma_param` is the number of the parameter holding
                # the lemma, `inflargs_sets` is the result of
                # lookup_inflection() on the macron-less lemma, and `tag_sets`
                # is the list of component tag sets built from the tag
                # parameters. Templates are skipped (with a warning) when the
                # language isn't "la", when the lemma contains "[" or "|", or
                # when lookup_inflection() returns None.
                for t in headword['infl_of_templates']:
                    # The language code is either in lang= (the lemma is then
                    # in 1=) or in 1= (the lemma is then in 2=).
                    lemma_param = 1
                    lang = getparam(t, "lang")
                    if not lang:
                        lemma_param = 2
                        lang = getparam(t, "1")
                    if lang != "la":
                        errandstagemsg(
                            "WARNING: In Latin section, found {{inflection of}} for different language %s: %s"
                            % (lang, unicode(t)))
                        continue

                    lemma = getparam(t, str(lemma_param))
                    lemma_has_link = "[" in lemma or "|" in lemma
                    if lemma_has_link:
                        stagemsg("WARNING: Link in lemma %s, skipping: %s" %
                                 (lemma, unicode(t)))
                        continue

                    macronless_lemma = lalib.remove_macrons(lemma)
                    inflargs_sets = lookup_inflection(
                        macronless_lemma, pos, expected_headtemps,
                        expected_infltemps, stagemsg, errandstagemsg)
                    if inflargs_sets is None:
                        stagemsg(
                            "WARNING: Lemma %s doesn't exist or has no %s heads"
                            % (lemma, pos))
                        continue

                    # Collect the non-empty values of the numbered params at
                    # position lemma_param + 2 or later; these are the tags.
                    first_tag_param = lemma_param + 2
                    tags = []
                    for param in t.params:
                        param_name = unicode(param.name).strip()
                        param_value = unicode(param.value).strip()
                        if not re.search("^[0-9]+$", param_name):
                            continue
                        if int(param_name) < first_tag_param:
                            continue
                        if param_value:
                            tags.append(param_value)

                    # Split the tags into tag sets (which may be multipart) and
                    # then split each multipart tag set into its component tag
                    # sets.
                    tag_sets = []
                    for maybe_multipart in lalib.split_tags_into_tag_sets(
                            tags):
                        tag_sets.extend(
                            lalib.split_multipart_tag_set(maybe_multipart))

                    yield t, lemma_param, lemma, inflargs_sets, tag_sets

            def merge_forms_for_slot(slot, this_inflargs):
                # Merge the forms of all inflection templates under the given
                # lemma headword.
                #
                # `this_inflargs` is an iterable of mappings from slot name to
                # a comma-separated string of forms; mappings lacking `slot`
                # are ignored, as are individual forms containing "[" or "|".
                #
                # Returns a 4-tuple of de-duplicated lists, in order of first
                # appearance:
                #   1. all valid forms for the slot;
                #   2. the same plus, for verbs, "syncopated" variants;
                #   3. the forms in (1) equal to `pagetitle` once macrons are
                #      removed;
                #   4. the forms in (2) equal to `pagetitle` once macrons are
                #      removed.
                #
                # NOTE: this function deliberately does not try to record
                # whether the slot was seen. An assignment here to a flag owned
                # by a caller would only create an unused local (this is
                # Python 2 code, so there is no `nonlocal`); callers that need
                # that information must check `slot in inflargs` themselves.
                all_valid_forms = []
                all_valid_forms_with_syncopated = []
                for inflargs in this_inflargs:
                    if slot not in inflargs:
                        continue
                    forms = inflargs[slot].split(",")
                    valid_forms = [
                        form for form in forms
                        if "[" not in form and "|" not in form
                    ]
                    for form in valid_forms:
                        if form not in all_valid_forms:
                            all_valid_forms.append(form)
                        if form not in all_valid_forms_with_syncopated:
                            all_valid_forms_with_syncopated.append(form)
                        if pos == "verb" and re.search(u"v[eiē]", form):
                            # The greedy, start-anchored pattern deletes the
                            # last "v" + vowel sequence matched by the
                            # character class, giving the syncopated variant.
                            syncopated_form = re.sub(u"^(.*)v[eiē]", r"\1",
                                                     form)
                            if syncopated_form not in all_valid_forms_with_syncopated:
                                all_valid_forms_with_syncopated.append(
                                    syncopated_form)
                all_matchable_forms = [
                    form for form in all_valid_forms
                    if lalib.remove_macrons(form) == pagetitle
                ]
                all_matchable_forms_with_syncopated = [
                    form for form in all_valid_forms_with_syncopated
                    if lalib.remove_macrons(form) == pagetitle
                ]
                return (all_valid_forms, all_valid_forms_with_syncopated,
                        all_matchable_forms,
                        all_matchable_forms_with_syncopated)

            if stage == 1:
                matched_infl_of_templates = False
                for t, lemma_param, lemma, inflargs_sets, tag_sets in yield_infl_of_templates_and_properties(
                ):

                    def check_for_tag_set_match(tag_set, allow_lemma_mismatch):
                        # Check whether the headword forms match the forms
                        # found for `tag_set` in the inflections of the lemma
                        # of the current {{inflection of}} template `t`.
                        #
                        # `tag_set` is converted to a slot; each
                        # (actual_lemmas, inflargs) group in `inflargs_sets`
                        # is considered only if one of its lemmas (links
                        # removed) equals `lemma` -- exactly, or ignoring
                        # macrons when `allow_lemma_mismatch` is true. For
                        # such a group, the headword forms match if they equal
                        # or are a subset of the matchable forms for the slot,
                        # with or without syncopated variants.
                        #
                        # Returns the list (without duplicates) of all entries
                        # of every `actual_lemmas` group that matched; the
                        # list is empty if the slot is unusable or nothing
                        # matched. Emits a warning if nothing matched and the
                        # slot was absent from the inflections of every
                        # lemma-matching group.
                        slot = lalib.tag_set_to_slot(tag_set, tag_set_groups,
                                                     stagemsg)
                        if slot is None:
                            # Already issued warning
                            return []
                        if slot not in possible_slots:
                            stagemsg(
                                "WARNING: Unrecognized slot %s from tag set: %s"
                                % (slot, unicode(t)))
                            return []
                        saw_slot_in_inflargs = False
                        matching_actual_lemmas = []
                        for actual_lemmas, this_inflargs in inflargs_sets:
                            saw_matching_lemma = False
                            for actual_lemma in actual_lemmas:
                                actual_lemma = blib.remove_links(actual_lemma)
                                if allow_lemma_mismatch:
                                    lemma_matches = (
                                        lalib.remove_macrons(lemma) ==
                                        lalib.remove_macrons(actual_lemma))
                                else:
                                    lemma_matches = lemma == actual_lemma
                                if lemma_matches:
                                    saw_matching_lemma = True
                            if not saw_matching_lemma:
                                continue

                            # Track here whether the slot exists in any
                            # inflection of a matching lemma. This flag must be
                            # set in this scope: an assignment made inside
                            # merge_forms_for_slot() cannot reach it (no
                            # `nonlocal` in Python 2), which would make the
                            # "didn't see slot" warnings below fire even when
                            # the slot is present.
                            if any(slot in inflargs
                                   for inflargs in this_inflargs):
                                saw_slot_in_inflargs = True

                            (all_valid_forms, all_valid_forms_with_syncopated,
                             all_matchable_forms,
                             all_matchable_forms_with_syncopated) = (
                                 merge_forms_for_slot(slot, this_inflargs))

                            matched_form = False
                            if set(headword_forms) == set(all_matchable_forms):
                                stagemsg(
                                    "Matched headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) %s)"
                                    % (",".join(headword_forms), slot, lemma,
                                       ",".join(all_valid_forms)))
                                matched_form = True
                            elif set(headword_forms) <= set(
                                    all_matchable_forms):
                                stagemsg(
                                    "Matched headword form(s) %s as subset of all matchable slot form(s) %s (slot %s, lemma %s, all valid slot forms(s) %s)"
                                    % (",".join(headword_forms),
                                       ",".join(all_matchable_forms), slot,
                                       lemma, ",".join(all_valid_forms)))
                                matched_form = True
                            elif set(headword_forms) == set(
                                    all_matchable_forms_with_syncopated):
                                stagemsg(
                                    "Matched syncopated headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)"
                                    %
                                    (",".join(headword_forms), slot, lemma,
                                     ",".join(all_valid_forms_with_syncopated))
                                )
                                matched_form = True
                            elif set(headword_forms) <= set(
                                    all_matchable_forms_with_syncopated):
                                stagemsg(
                                    "Matched syncopated headword form(s) %s as subset of all matchable slot form(s) + syncopation %s (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)"
                                    % (",".join(headword_forms), ",".join(
                                        all_matchable_forms_with_syncopated),
                                       slot, lemma, ",".join(
                                           all_valid_forms_with_syncopated)))
                                matched_form = True
                            if matched_form:
                                # Record every lemma of the matching group (as
                                # given, i.e. without removing links).
                                for actual_lemma in actual_lemmas:
                                    if actual_lemma not in matching_actual_lemmas:
                                        matching_actual_lemmas.append(
                                            actual_lemma)

                        if not matching_actual_lemmas:
                            if not saw_slot_in_inflargs:
                                if "pasv" in slot:
                                    stagemsg(
                                        "WARNING: For headword forms %s, didn't see passive slot %s in inflections of lemma %s, probably need to delete passive forms of verb"
                                        % (",".join(headword_forms), slot,
                                           lemma))
                                else:
                                    stagemsg(
                                        "WARNING: For headword forms %s, didn't see slot %s in inflections of lemma %s"
                                        % (",".join(headword_forms), slot,
                                           lemma))

                        return matching_actual_lemmas

                    saw_matching_lemma = False
                    for actual_lemmas, this_inflargs in inflargs_sets:
                        if lemma in [
                                blib.remove_links(x) for x in actual_lemmas
                        ]:
                            saw_matching_lemma = True
                            break

                    if saw_matching_lemma:
                        tag_set_matches = []
                        tag_set_mismatches = []
                        for tag_set in tag_sets:
                            matching_lemmas = check_for_tag_set_match(
                                tag_set, allow_lemma_mismatch=False)
                            if matching_lemmas:
                                tag_set_matches.append(tag_set)
                            else:
                                tag_set_mismatches.append(tag_set)
                        if len(tag_set_matches) > 0:
                            matched_infl_of_templates = True
                            if len(tag_set_mismatches) > 0:
                                stagemsg(
                                    "WARNING: Matched tag sets %s but not %s, counting as a match: %s"
                                    %
                                    (",".join("|".join(tag_set)
                                              for tag_set in tag_set_matches),
                                     ",".join(
                                         "|".join(tag_set)
                                         for tag_set in tag_set_mismatches),
                                     unicode(t)))
                        else:
                            stagemsg(
                                "WARNING: Couldn't match any tag sets: %s" %
                                unicode(t))

                    else:
                        stagemsg(
                            "WARNING: Couldn't match lemma %s among potential lemmas %s, trying without lemma matches: %s"
                            % (lemma, ",".join(
                                actual_lemma for actual_lemmas, this_inflargs
                                in inflargs_sets
                                for actual_lemma in actual_lemmas),
                               unicode(t)))
                        tag_set_matches = []
                        tag_set_mismatches = []
                        all_matching_lemmas = []
                        for tag_set in tag_sets:
                            matching_lemmas = check_for_tag_set_match(
                                tag_set, allow_lemma_mismatch=True)
                            if matching_lemmas:
                                tag_set_matches.append(tag_set)
                                for matching_lemma in matching_lemmas:
                                    if matching_lemma not in all_matching_lemmas:
                                        all_matching_lemmas.append(
                                            matching_lemma)
                            else:
                                tag_set_mismatches.append(tag_set)
                        if len(tag_set_matches) > 0:
                            matched_infl_of_templates = True
                            if len(all_matching_lemmas) == 1:
                                notes.append(
                                    "fix macrons in lemma of '%s' (stage 1): %s -> %s"
                                    %
                                    (tname(t), lemma, all_matching_lemmas[0]))
                                if len(tag_set_mismatches) > 0:
                                    stagemsg(
                                        "WARNING: Fixing macrons in lemma %s -> %s despite only some tag sets %s but not %s matching, counting as a match: %s"
                                        % (lemma, all_matching_lemmas[0],
                                           ",".join(
                                               "|".join(tag_set)
                                               for tag_set in tag_set_matches),
                                           ",".join("|".join(tag_set)
                                                    for tag_set in
                                                    tag_set_mismatches),
                                           unicode(t)))
                                else:
                                    stagemsg(
                                        "WARNING: Fixing macrons in lemma %s -> %s; all tag sets match: %s"
                                        % (lemma, all_matching_lemmas[0],
                                           unicode(t)))
                                origt = unicode(t)
                                t.add(str(lemma_param), all_matching_lemmas[0])
                                stagemsg("Replaced %s with %s" %
                                         (origt, unicode(t)))
                            else:
                                if len(tag_set_mismatches) > 0:
                                    stagemsg(
                                        "WARNING: Multiple possible lemmas %s match some tag sets %s but not %s, counting as a match but not updating lemma %s: %s"
                                        % (",".join(all_matching_lemmas),
                                           ",".join(
                                               "|".join(tag_set)
                                               for tag_set in tag_set_matches),
                                           ",".join("|".join(tag_set)
                                                    for tag_set in
                                                    tag_set_mismatches), lemma,
                                           unicode(t)))
                                else:
                                    stagemsg(
                                        "WARNING: Multiple possible lemmas %s match tag sets, with all tag sets matching, counting as a match but not updating lemma %s: %s"
                                        % (",".join(all_matching_lemmas),
                                           lemma, unicode(t)))
                        else:
                            stagemsg(
                                "WARNING: Couldn't match any tag sets even when allowing macron mismatches with lemma %s: %s"
                                % (lemma, unicode(t)))

                if matched_infl_of_templates:
                    break

            elif stage == 2:
                common_forms = None
                no_common_forms = False
                for t, lemma_param, lemma, inflargs_sets, tag_sets in yield_infl_of_templates_and_properties(
                ):
                    for tag_set in tag_sets:
                        slot = lalib.tag_set_to_slot(tag_set, tag_set_groups,
                                                     stagemsg)
                        if slot is None or slot not in possible_slots:
                            # Already issued warning
                            no_common_forms = True
                            break
                        this_tag_set_matching_forms = []
                        combined_this_inflargs = []
                        for actual_lemmas, this_inflargs in inflargs_sets:
                            for actual_lemma in actual_lemmas:
                                actual_lemma = blib.remove_links(actual_lemma)
                                if lemma == actual_lemma:
                                    combined_this_inflargs.extend(
                                        this_inflargs)
                                    break
                        if not combined_this_inflargs:
                            continue
                        (all_valid_forms, all_valid_forms_with_syncopated,
                         all_matchable_forms,
                         all_matchable_forms_with_syncopated) = (
                             merge_forms_for_slot(slot,
                                                  combined_this_inflargs))
                        for form in all_matchable_forms:
                            if form not in this_tag_set_matching_forms:
                                this_tag_set_matching_forms.append(form)
                        if common_forms is None:
                            common_forms = this_tag_set_matching_forms
                            if len(common_forms) == 0:
                                no_common_forms = True
                                break
                        else:
                            new_common_forms = []
                            for form in common_forms:
                                if form in this_tag_set_matching_forms:
                                    new_common_forms.append(form)
                            common_forms = new_common_forms
                            if len(common_forms) == 0:
                                no_common_forms = True
                                break
                    if no_common_forms:
                        break
                if no_common_forms or common_forms is None:
                    stagemsg(
                        "WARNING: No forms match pagetitle %s across all {{inflection of}} tags and tag sets, not changing headword form(s) but trying again allowing macron differences in lemmas: %s"
                        % (pagetitle, unicode(ht)))
                else:
                    notes.append(
                        "fix macrons in forms of '%s' (stage 2): %s -> %s" %
                        (tname(ht), ",".join(headword_forms),
                         ",".join(common_forms)))
                    oright = unicode(ht)
                    if tname(ht) == "head":
                        blib.set_param_chain(ht, common_forms, "head", "head")
                    else:
                        blib.set_param_chain(ht, common_forms, "1", "head")
                    stagemsg("Replaced %s with %s" % (oright, unicode(ht)))
                    if len(common_forms) > 1:
                        stagemsg(
                            "WARNING: FIXME: No support yet for pronunciation for multiple headword forms %s"
                            % ",".join(common_forms))
                    else:
                        assert len(common_forms) == 1
                        clean_latin_long_vowels.process_pronun_templates(
                            headword['pronun_section'], common_forms[0],
                            stagemsg, notes,
                            "fix macrons in pronun of '%%s' (stage 2): %s -> %s"
                            %
                            (",".join(headword_forms), ",".join(common_forms)))
                    break

            else:
                assert stage == 3
                multiple_assignments = False
                infl_of_assignments = []
                for t, lemma_param, lemma, inflargs_sets, tag_sets in yield_infl_of_templates_and_properties(
                ):
                    matching_lemmas = []
                    for actual_lemmas, this_inflargs in inflargs_sets:
                        for actual_lemma in actual_lemmas:
                            actual_lemma = blib.remove_links(actual_lemma)
                            if lalib.remove_macrons(
                                    lemma) == lalib.remove_macrons(
                                        actual_lemma):
                                if actual_lemma not in matching_lemmas:
                                    matching_lemmas.append(actual_lemma)
                    if len(matching_lemmas) > 1:
                        stagemsg(
                            "WARNING: Multiple actual lemmas %s match {{inflection of}} lemma %s, hence multiple assignments, doing things the hard way: %s"
                            % (",".join(matching_lemmas), lemma, unicode(t)))
                        multiple_assignments = True
                    infl_of_assignments.append(matching_lemmas)

                cur_assignment = None
                cur_common_forms = None
                for assignment in itertools.product(*infl_of_assignments):
                    common_forms = None
                    no_common_forms = False
                    for actual_lemma, (
                            t, lemma_param, lemma, inflargs_sets,
                            tag_sets) in zip(
                                assignment,
                                yield_infl_of_templates_and_properties()):
                        for tag_set in tag_sets:
                            slot = lalib.tag_set_to_slot(
                                tag_set, tag_set_groups, stagemsg)
                            if slot is None or slot not in possible_slots:
                                # Already issued warning
                                no_common_forms = True
                                break
                            this_tag_set_matching_forms = []
                            combined_this_inflargs = []
                            for actual_lemmas, this_inflargs in inflargs_sets:
                                if actual_lemma in actual_lemmas:
                                    combined_this_inflargs.extend(
                                        this_inflargs)
                                (all_valid_forms,
                                 all_valid_forms_with_syncopated,
                                 all_matchable_forms,
                                 all_matchable_forms_with_syncopated) = (
                                     merge_forms_for_slot(
                                         slot, combined_this_inflargs))
                                for form in all_matchable_forms:
                                    if form not in this_tag_set_matching_forms:
                                        this_tag_set_matching_forms.append(
                                            form)
                            if common_forms is None:
                                common_forms = this_tag_set_matching_forms
                                if len(common_forms) == 0:
                                    no_common_forms = True
                                    break
                            else:
                                new_common_forms = []
                                for form in common_forms:
                                    if form in this_tag_set_matching_forms:
                                        new_common_forms.append(form)
                                common_forms = new_common_forms
                                if len(common_forms) == 0:
                                    no_common_forms = True
                                    break
                        if no_common_forms:
                            break
                    if not no_common_forms and common_forms is not None:
                        if cur_assignment:
                            stagemsg(
                                "WARNING: Multiple assignments of lemmas have common forms, at least %s -> %s and %s -> %s, not changing: %s"
                                % (",".join(cur_assignment),
                                   ",".join(cur_common_forms),
                                   ",".join(assignment),
                                   ",".join(common_forms), unicode(ht)))
                        else:
                            cur_assignment = assignment
                            cur_common_forms = common_forms
                if cur_assignment is None:
                    stagemsg(
                        "WARNING: No forms match pagetitle %s across all {{inflection of}} tags and tag sets when allowing macron differences in lemmas, not changing headword form(s): %s"
                        % (pagetitle, unicode(ht)))
                else:
                    for actual_lemma, (
                            t, lemma_param, lemma, inflargs_sets,
                            tag_sets) in zip(
                                cur_assignment,
                                yield_infl_of_templates_and_properties()):
                        notes.append(
                            "fix macrons in lemma of '%s' (stage 3): %s -> %s"
                            % (tname(t), lemma, actual_lemma))
                        stagemsg(
                            "WARNING: found common forms %s, updating lemma %s to %s: %s"
                            % (",".join(cur_common_forms), lemma, actual_lemma,
                               unicode(t)))
                        origt = unicode(t)
                        t.add(str(lemma_param), actual_lemma)
                        stagemsg("Replaced %s with %s" % (origt, unicode(t)))
                    notes.append(
                        "fix macrons in forms of '%s' (stage 3): %s -> %s" %
                        (tname(ht), ",".join(headword_forms),
                         ",".join(cur_common_forms)))
                    oright = unicode(ht)
                    if tname(ht) == "head":
                        blib.set_param_chain(ht, cur_common_forms, "head",
                                             "head")
                    else:
                        blib.set_param_chain(ht, cur_common_forms, "1", "head")
                    stagemsg("Replaced %s with %s" % (oright, unicode(ht)))
                    if len(cur_common_forms) > 1:
                        stagemsg(
                            "WARNING: FIXME: No support yet for pronunciation for multiple headword forms %s"
                            % ",".join(cur_common_forms))
                    else:
                        assert len(cur_common_forms) == 1
                        clean_latin_long_vowels.process_pronun_templates(
                            headword['pronun_section'], cur_common_forms[0],
                            stagemsg, notes,
                            "fix macrons in pronun of '%%s' (stage 3): %s -> %s"
                            % (",".join(headword_forms),
                               ",".join(cur_common_forms)))
                    break

    secbody = "".join(unicode(x) for x in parsed_subsections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
Beispiel #14
0
def process_page(page, index, parsed):
  """Convert Bulgarian noun-form templates on one page to generic templates.

  Three passes are made over the templates in `parsed`:

  1. {{bg-noun-form}} is rewritten as {{head|bg|noun form}}, keeping head=
     (or using the page title when it needs no accent) and moving 3= to g=.
  2. {{bg-noun form of}} is rewritten as {{inflection of|bg|...}}. For it and
     for the other recognized Bulgarian form-of templates, the inflection is
     mapped to a slot, the matching accented form(s) are looked up with
     snarf_noun_accents_and_forms(), and they are stored in head= of the most
     recently seen {{head|bg|noun form}}.
  3. Templates listed in `template_to_infl_codes` are rewritten as
     {{inflection of}} with the corresponding inflection codes.

  Args:
    page: page object; only page.title() is used.
    index: page index, used in log messages.
    parsed: parsed page text; its templates are modified in place.

  Returns:
    Tuple of (new page text, list of change notes).
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  # Pass 1: {{bg-noun-form}} -> {{head|bg|noun form}}.
  for t in parsed.filter_templates():
    if tname(t) == "bg-noun-form":
      origt = unicode(t)
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "head"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      # Save head= and 3= (gender) before wiping the params, then re-add them
      # in the order expected by {{head}}.
      rmparam(t, "1")
      rmparam(t, "2")
      head = getparam(t, "head")
      rmparam(t, "head")
      g = getparam(t, "3")
      rmparam(t, "3")
      blib.set_template_name(t, "head")
      t.add("1", "bg")
      t.add("2", "noun form")
      if head:
        t.add("head", head)
      else:
        if bglib.needs_accents(pagetitle):
          pagemsg("WARNING: Can't add head= to {{bg-noun-form}} missing it because pagetitle is multisyllabic: %s" %
              unicode(t))
        else:
          t.add("head", pagetitle)
      if g:
        t.add("g", g)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("replace {{bg-noun-form}} with {{head|bg|noun form}}")

  # Pass 2: convert {{bg-noun form of}}, accent lemmas, and fill in head= of
  # the preceding head template.
  headt = None
  saw_infl_after_head = False
  saw_headt = False
  saw_inflt = False
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    # Slot name of the inflection expressed by this template; stays falsy when
    # the template is not a recognized form-of template.
    saw_infl = False
    # True when `lemma` and `forms` were already looked up for this template.
    already_fetched_forms = False
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") == "noun form":
      saw_headt = True
      if headt and not saw_infl_after_head:
        pagemsg("WARNING: Saw two head templates %s and %s without intervening inflection" % (
          unicode(headt), origt))
      saw_infl_after_head = False
      headt = t
    if tn == "bg-noun form of":
      saw_inflt = True
      if not headt:
        pagemsg("WARNING: Saw {{bg-noun form of}} without head template: %s" % origt)
        continue
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "noun"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      saw_infl_after_head = True
      noun = getparam(t, "noun")
      if not noun:
        pagemsg("WARNING: Didn't see noun=: %s" % origt)
        continue
      # Translate the spelled-out 2=/3=/1= values into inflection codes.
      infls = []
      param2 = getparam(t, "2")
      if param2 == "indefinite":
        infls.append("indef")
      elif param2 == "definite":
        infls.append("def")
      elif param2 == "vocative":
        infls.append("voc")
      elif param2:
        pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
        continue
      param3 = getparam(t, "3")
      if param3 == "subject":
        infls.append("sbjv")
      elif param3 == "object":
        infls.append("objv")
      elif param3:
        pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
        continue
      param1 = getparam(t, "1")
      if param1 == "singular":
        infls.append("s")
      elif param1 == "plural":
        infls.append("p")
      elif param1 == "count":
        infls.extend(["count", "form"])
      elif param1 == "vocative":
        infls.extend(["voc", "s"])
      else:
        pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
        continue
      blib.set_template_name(t, "inflection of")
      del t.params[:]
      t.add("1", "bg")
      lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
      if not lemma:
        pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
        t.add("2", noun)
      else:
        t.add("2", lemma)
      t.add("3", "")
      for i, infl in enumerate(infls):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{bg-noun form of}} to {{inflection of}}")
      tn = tname(t)
      # Must be stored in `saw_infl` (not a differently named variable) so
      # that the head-updating code below runs for this template.
      saw_infl = infls_to_slot(infls)
      already_fetched_forms = True
      if not saw_infl:
        pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "inflection of" and getparam(t, "1") == "bg":
      saw_inflt = True
      # Inflection codes start at 4= and run until the first empty param.
      infls = []
      i = 4
      while True:
        infl = getparam(t, str(i))
        if not infl:
          break
        infls.append(infl)
        i += 1
      saw_infl = infls_to_slot(infls)
      if not saw_infl:
        if "vnoun" in infls:
          pagemsg("Skipping verbal noun inflection %s: %s" % ("|".join(infls), origt))
        elif "part" in infls:
          pagemsg("Skipping participle inflection %s: %s" % ("|".join(infls), origt))
        else:
          pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "definite singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_sg"
    elif tn == "indefinite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "ind_pl"
    elif tn == "definite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_pl"
    elif tn == "vocative singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "voc_sg"
    if saw_infl:
      if not already_fetched_forms:
        noun = getparam(t, "2")
        lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
        if not lemma:
          pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
          continue
        t.add("2", lemma)
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("replace lemma with accented %s in {{%s}}" % (lemma, tn))
      elif not lemma:
        # The lookup above already failed (and was warned about), so there
        # are no forms from which to derive an accented head.
        continue
      if saw_infl == "def_sg":
        # "def_sg" does not distinguish subject from object form, so it is
        # only usable when both forms coincide.
        def_sub_sg = forms.get("def_sub_sg", None)
        def_obj_sg = forms.get("def_obj_sg", None)
        if def_sub_sg != def_obj_sg:
          pagemsg("WARNING: Inflection is def_sg but def_sub_sg %s != def_obj_sg %s" % (
            def_sub_sg, def_obj_sg))
          continue
        form = def_sub_sg
      else:
        form = forms.get(saw_infl, None)
      if not form:
        pagemsg("WARNING: Inflection is %s but couldn't find form among forms: %s" %
            (saw_infl, format_forms(forms)))
        continue
      # Keep only the variants that correspond to this page.
      form = form.split(",")
      filtered_form = [f for f in form if bglib.remove_accents(f) == pagetitle]
      if not filtered_form:
        pagemsg("WARNING: No forms among %s=%s match page title" % (saw_infl, ",".join(form)))
        continue
      form = filtered_form
      if not headt:
        # Only the {{bg-noun form of}} branch checks for a head template
        # itself; guard here so the other branches don't operate on None.
        pagemsg("WARNING: Saw inflection template without head template: %s" % origt)
        continue
      existing_form = blib.fetch_param_chain(headt, "head", "head")
      if existing_form:
        must_continue = False
        for f in existing_form:
          if bglib.remove_accents(f) != pagetitle:
            pagemsg("WARNING: Existing head %s doesn't match page title: %s" % (
              f, unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue
        needs_accents = [bglib.needs_accents(f) for f in existing_form]
        if any(needs_accents) and not all(needs_accents):
          pagemsg("WARNING: Some but not all existing heads missing accents: %s" %
              unicode(headt))
          continue
        if not any(needs_accents):
          # Existing heads are already accented; leave them alone, warning
          # only if they disagree with the looked-up forms.
          if existing_form != form:
            pagemsg("WARNING: For inflection %s, existing form(s) %s != new form(s) %s" % (
              saw_infl, ",".join(existing_form), ",".join(form)))
          continue
      origheadt = unicode(headt)
      blib.set_param_chain(headt, form, "head", "head")
      pagemsg("Replaced %s with %s" % (origheadt, unicode(headt)))
      notes.append("add accented form %s=%s to {{head|bg|noun form}}" % (saw_infl, ",".join(form)))

  if saw_headt and not saw_inflt:
    pagemsg("WARNING: Saw head template %s but no inflection template" % unicode(headt))

  # Pass 3: convert the remaining specialized form-of templates to
  # {{inflection of}}.
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in template_to_infl_codes and getparam(t, "1") == "bg":
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      infl_codes = template_to_infl_codes[tn]
      blib.set_template_name(t, "inflection of")
      t.add("3", "")
      for i, infl in enumerate(infl_codes):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{%s}} to {{inflection of}}" % tn)

  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Convert {{es-verb}} headword templates in `text` to the new format.

    For each {{es-verb}} template:

    * If `args.add_attn` is set and 1= is missing, attn=1 is added and the
      template is otherwise left alone.
    * Otherwise the lemma implied by 1=/2=/ref=/prep= is checked against
      head= (or the page title), default forms are obtained from
      get_def_forms(), principal parts equal to a default are replaced by
      "+"-style codes, fully redundant parts are dropped, and the template
      params are rebuilt either as a single 1= with an angle-bracket spec or
      as head=/pres=/pret=/part=.

    Args:
        index: page index, used in log messages.
        pagetitle: title of the page.
        text: wikitext of the page.

    Returns:
        None when the page is skipped (no "es-verb" in the text, or a title
        containing ":"); otherwise a tuple of (new text, list of change
        notes).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    if "es-verb" not in text:
        return

    if ":" in pagetitle:
        pagemsg("Skipping non-mainspace title")
        return

    pagemsg("Processing")

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():

        def getp(param):
            return getparam(t, param)

        tn = tname(t)
        if tn == "es-verb" and args.add_attn and not getp("1"):
            origt = unicode(t)
            # Only the first param (if any) is reported.
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                pagemsg("WARNING: No 1= but saw param %s=%s: %s" %
                        (pn, pv, unicode(t)))
                break
            t.add("attn", "1")
            notes.append("add attn=1 to verb with missing 1=")
            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))
            continue

        if tn == "es-verb":
            origt = unicode(t)
            lemma = getparam(t, "head") or pagetitle
            if " " in lemma:
                pagemsg("WARNING: Space in lemma")
            prep = getp("prep")
            # Lemma as implied by stem (1=), ending (2=), reflexive flag and
            # preposition; it must agree with head=/pagetitle.
            shouldlemma = getp("1") + getp("2") + (
                "se" if getp("ref") == "y" else
                "") + (" " + blib.remove_links(prep) if prep else "")
            if shouldlemma != blib.remove_links(lemma):
                pagemsg(
                    "WARNING: lemma=%s from 1/2/ref != lemma=%s from head or pagetitle: %s"
                    % (shouldlemma, blib.remove_links(lemma), unicode(t)))
                continue
            d = get_def_forms(lemma, prep, pagemsg)
            if not d:
                continue
            if getp("part2") and not getp("part"):
                pagemsg("WARNING: Saw part2= without part=: %s" % unicode(t))
                part = [d["part"], getp("part2")]
            else:
                part = blib.fetch_param_chain(t, "part")
            pres = blib.fetch_param_chain(t, "pres")
            pret = blib.fetch_param_chain(t, "pret")
            # Replace forms identical to a default with the matching code.
            part = ["+" if x == d["part"] else x for x in part]
            pret = ["+" if x == d["pret"] else x for x in pret]
            pres = [
                "+" if x == d["pres"] else
                "+ie" if x == d["pres_ie"] else "+ue" if x == d["pres_ue"] else
                "+i" if x == d["pres_i"] else u"+í" if x ==
                d["pres_iacc"] else u"+ú" if x == d["pres_uacc"] else x
                for x in pres
            ]

            # Validate the params BEFORE recording any change notes, so that
            # a template skipped here does not leave notes behind for an
            # edit that was never made.
            must_continue = False
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                if pn == "1" and pv in ["m", "mf"]:
                    pagemsg("WARNING: Extraneous param %s=%s in %s, ignoring" %
                            (pn, pv, unicode(t)))
                    continue
                if pn not in [
                        "head", "1", "2", "ref", "pres", "pret", "part",
                        "part2", "prep"
                ]:
                    pagemsg("WARNING: Saw unrecognized param %s=%s in %s" %
                            (pn, pv, unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue

            notes.append("convert {{es-verb}} to new format")
            if pres == ["+"]:
                notes.append("remove redundant present from {{es-verb}}")
                pres = []
            if pret == ["+"]:
                notes.append("remove redundant preterite from {{es-verb}}")
                pret = []
            if part == ["+"]:
                notes.append("remove redundant participle from {{es-verb}}")
                part = []
            for vowel_var in ["+ie", "+ue", "+i", u"+í", u"+ú"]:
                if vowel_var in pres:
                    notes.append(
                        "replace vowel-varying present with '%s' in {{es-verb}}"
                        % vowel_var)
            if "+" in part:
                notes.append(
                    "replace default participle with '+' in {{es-verb}}")

            # Read head= before all params are wiped below.
            head = getp("head")

            del t.params[:]

            def has_override(forms):
                return 1 if any(x and not x.startswith("+")
                                for x in forms) else 0

            num_overrides = has_override(pres) + has_override(
                pret) + has_override(part)

            # `and` binds tighter than `or`: use the angle-bracket form when
            # there is post-verb text, or when a reflexive/clitic verb has at
            # least two explicitly overridden principal parts.
            if d["post"] or (d["refl"] or d["clitic"]) and num_overrides >= 2:
                main_verb = d["full_verb"]
                if part:
                    angle_brackets = "<%s,%s,%s>" % (
                        ":".join(pres), ":".join(pret), ":".join(part))
                elif pret:
                    angle_brackets = "<%s,%s>" % (":".join(pres),
                                                  ":".join(pret))
                elif pres:
                    angle_brackets = "<%s>" % (":".join(pres))
                else:
                    angle_brackets = "<>"
                if angle_brackets == "<>":
                    # Nothing to specify; only preserve an explicit head.
                    if head:
                        t.add("head", head)
                else:
                    arg1 = "%s%s%s" % (main_verb, angle_brackets, d["post"]
                                       or "")
                    t.add("1", arg1)
            else:
                if head:
                    t.add("head", head)
                pres = [
                    make_verb_form_full(x,
                                        d["clitic"],
                                        d["refl"],
                                        "",
                                        is_part=False,
                                        do_link=True) for x in pres
                ]
                pret = [
                    make_verb_form_full(x,
                                        d["clitic"],
                                        d["refl"],
                                        "",
                                        is_part=False,
                                        do_link=True) for x in pret
                ]
                part = [
                    make_verb_form_full(x,
                                        d["clitic"],
                                        d["refl"],
                                        "",
                                        is_part=True,
                                        do_link=True) for x in part
                ]
                blib.set_param_chain(t, pres, "pres")
                blib.set_param_chain(t, pret, "pret")
                blib.set_param_chain(t, part, "part")

            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Replace explicit comparatives in {{ru-adj}} with an auto-comparative spec.

    When the page has exactly one {{ru-adj}} headword with a comparative in
    2= and exactly one {{ru-decl-adj}}, a comparative spec ("+" optionally
    followed by the short-form accent pattern) is derived from the
    declension template. The comparatives generated from that spec by the
    ru-headword module are compared with the explicit ones; if they are
    identical, the explicit comparatives are replaced by the spec.

    Args:
        page: page object; page.title() and page.text are read.
        index: page index, used in log messages.
        parsed: ignored; the page is re-parsed with blib.parse(page).

    Returns:
        None when processing is abandoned part-way (multiple or strange
        short declensions, or comparative generation failed); otherwise a
        tuple of (new page text, list of change notes).
    """
    global args
    verbose = args.verbose
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []
    hascomp = False
    headword_templates = []
    decl_templates = []
    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-adj":
            headword_templates.append(t)
            if getparam(t, "2"):
                hascomp = True
            elif getparam(t, "comp2") or getparam(t, "comp3") or getparam(
                    t, "comp4") or getparam(t, "comp5"):
                pagemsg("WARNING: Found compN= but no 2=: %s" % unicode(t))
        if unicode(t.name) == "ru-decl-adj":
            decl_templates.append(t)
    if hascomp:
        if len(headword_templates) > 1 or len(decl_templates) > 1:
            pagemsg(
                "WARNING: Found comparative and multiple headword or decl templates, can't proceed"
            )
        elif len(decl_templates) == 1 and not headword_templates:
            pagemsg(
                "WARNING: Strange, decl template but no headword template: %s"
                % unicode(decl_templates[0]))
        elif len(headword_templates) == 1 and not decl_templates:
            pagemsg(
                "WARNING: Strange, headword template but no decl template: %s"
                % unicode(headword_templates[0]))
        elif pagetitle.endswith(u"ся"):
            pagemsg(
                "WARNING: Comparative with reflexive adjective, not sure what to do: %s"
                % unicode(headword_templates[0]))
        else:
            head = getparam(decl_templates[0], "1")
            decl = getparam(decl_templates[0], "2")
            if decl == "-" or decl == "?" or not decl:
                pagemsg(
                    "WARNING: Found comparative with no short decl '%s': %s" %
                    (decl, getparam(headword_templates[0], "2")))
                compspec = "+"
            else:
                # Canonicalize the short-declension spec: drop "*", "(1)",
                # "(2)" and anything after a colon, then collapse duplicates
                # among the comma-separated alternatives.
                decl = re.sub(r"\*", "", decl)
                decl = re.sub(r"\([12]\)", "", decl)
                decl = set(re.sub(":.*", "", x) for x in re.split(",", decl))
                if len(decl) > 1:
                    # Both values must be passed as ONE tuple to the format
                    # operator; otherwise the formatting raises TypeError.
                    pagemsg(
                        "WARNING: Found multiple short declensions, not sure what to do: %s (reduced to %s)"
                        % (getparam(decl_templates[0], "2"), ",".join(decl)))
                    return
                decl = list(decl)[0]
                if not re.search("^[abc]'*$", decl):
                    pagemsg(
                        "WARNING: Strange canonicalized decl %s (orig %s), don't know what to do"
                        % (decl, getparam(decl_templates[0], "2")))
                    return
                # Pattern a (for titles not in -ой) and pattern b (for titles
                # in -ой) need no explicit accent spec.
                if (decl == "a" and not pagetitle.endswith(u"ой")
                        or decl == "b" and pagetitle.endswith(u"ой")):
                    compspec = "+"
                else:
                    compspec = "+" + decl
            comparatives = expand_text(
                "{{#invoke:ru-headword|generate_comparative|%s|%s}}" %
                (head, compspec))
            if not comparatives:
                # Already output warning
                return
            # Strip anything from "//" onwards in each generated comparative.
            comparatives = [
                re.sub("//.*", "", x) for x in re.split(",", comparatives)
            ]
            # De-duplicate while preserving order.
            unique_comparatives = []
            for comp in comparatives:
                if comp not in unique_comparatives:
                    unique_comparatives.append(comp)
            origt = unicode(headword_templates[0])
            # Collect the explicit comparatives from 2=, comp2=, comp3=, ...
            # stopping at the first missing one.
            existing_comparatives = []
            compparams = []
            i = 0
            while True:
                compparam = "2" if i == 0 else "comp" + str(i + 1)
                existing_comp = getparam(headword_templates[0], compparam)
                if not existing_comp:
                    break
                existing_comparatives.append(existing_comp)
                compparams.append(compparam)
                i += 1
            if "peri" in existing_comparatives:
                if len(existing_comparatives) > 1:
                    pagemsg(
                        "WARNING: 'peri' along with other explicit comparatives, not sure what to do: %s"
                        % ",".join(existing_comparatives))
            elif any(x.startswith("+") for x in existing_comparatives):
                if len(existing_comparatives) > 1:
                    pagemsg(
                        "WARNING: auto-comparative along with other explicit comparatives, not sure what to do: %s"
                        % ",".join(existing_comparatives))
            elif existing_comparatives != unique_comparatives:
                pagemsg(
                    "WARNING: Explicit comparative(s) %s not same as auto-generated %s"
                    % (",".join(existing_comparatives),
                       ",".join(unique_comparatives)))
            else:
                # Remove the superlatives first and re-add them afterwards so
                # they follow the new 2= in the template.
                superlatives = blib.fetch_param_chain(headword_templates[0],
                                                      "3", "sup")
                blib.remove_param_chain(headword_templates[0], "3", "sup")
                for compparam in compparams:
                    rmparam(headword_templates[0], compparam)
                headword_templates[0].add("2", compspec)
                blib.set_param_chain(headword_templates[0], superlatives, "3",
                                     "sup")
                pagemsg("Replaced %s with %s" %
                        (origt, unicode(headword_templates[0])))
                notes.append("replaced explicit comparative %s with %s" %
                             (",".join(existing_comparatives), compspec))

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Convert old-style noun headword templates to the decl-based style.

    Each {{LANG-noun}} / {{LANG-proper noun}} headword (LANG = args.lang) is
    paired with the following {{LANG-ndecl}} template. The gender and forms
    generated from the declension are compared with those given explicitly
    in the headword; when nothing disagrees, the headword's 1= is replaced
    by the declension spec and its gender, genitive, plural and genitive
    plural params are removed.

    Returns None when processing is abandoned (two headwords in a row, a
    declension without a headword, failed expansion, or mixed plural-only
    genders); otherwise a tuple of (new page text, list of change notes).
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []
    pagemsg("Processing")

    # State carried over from the most recent headword template.
    heads = None
    headt = None
    headtn = None
    gender_and_animacy = None
    genitives = None
    plurals = None
    for t in parsed.filter_templates():
        tn = tname(t)
        noun_tn = args.lang + "-noun"

        if tn in [noun_tn, args.lang + "-proper noun"]:
            if heads:
                pagemsg(
                    "WARNING: Encountered headword twice without declension: %s"
                    % unicode(t))
                return
            headt, headtn = t, tn
            heads = blib.fetch_param_chain(t, "1", "head")
            gender_and_animacy = blib.fetch_param_chain(t, "2", "g")
            genitives = blib.fetch_param_chain(t, "3", "gen")
            plurals = blib.fetch_param_chain(t, "4", "pl")
            genitive_plurals = blib.fetch_param_chain(t, "5", "genpl")
            continue

        if tn != args.lang + "-ndecl":
            continue

        if not heads:
            pagemsg("WARNING: Encountered decl without headword: %s" %
                    unicode(t))
            return
        # Feed the declension's arguments to the property-generating
        # template by swapping out the template name.
        generate_template = re.sub(
            r"^\{\{%s-ndecl\|" % args.lang,
            "{{User:Benwing2/%s-generate-prod-noun-props|" % args.lang,
            unicode(t))
        result = expand_text(generate_template)
        if not result:
            return
        new_forms = blib.split_generate_args(result)
        new_g = new_forms["g"].split(",")

        def generated(slot):
            # Generated forms for `slot`, or ["-"] when the slot is absent.
            return new_forms.get(slot, "-").split(",")

        def compare(old, new, stuff, nocanon=False):
            # An empty/missing old value never counts as a disagreement.
            if not old:
                return True
            if not nocanon:
                if args.lang == "uk":
                    remove_monosyllabic_accents = uk.remove_monosyllabic_stress
                else:
                    remove_monosyllabic_accents = be.remove_monosyllabic_accents
                old = [
                    remove_monosyllabic_accents(blib.remove_links(x))
                    for x in old
                ]
                new = [remove_monosyllabic_accents(x) for x in new]
            if set(old) == set(new):
                return True
            pagemsg(
                "WARNING: Old %ss %s disagree with new %ss %s: head=%s, decl=%s"
                % (stuff, ",".join(old), stuff, ",".join(new),
                   unicode(headt), unicode(t)))
            return False

        if not compare(gender_and_animacy, new_g, "gender", nocanon=True):
            heads = None
            continue

        plural_flags = [g.endswith("-p") for g in new_g]
        if len(set(plural_flags)) > 1:
            pagemsg(
                "WARNING: Mixture of plural-only and non-plural-only genders, can't process: %s"
                % unicode(t))
            return

        # (explicit headword values, generated slot, description), in the
        # order in which they are to be checked.
        if any(plural_flags):
            checks = [
                (heads, "nom_p", "nom pl"),
                (genitives, "gen_p", "gen pl"),
            ]
        else:
            checks = [
                (heads, "nom_s", "nom sg"),
                (genitives, "gen_s", "gen sg"),
            ]
            # 'uk/be-proper noun' headwords don't have nominative plural set
            if headtn == noun_tn:
                checks.append((plurals, "nom_p", "nom pl"))
                checks.append((genitive_plurals, "gen_p", "gen pl"))
        # all() over a generator stops at the first disagreement, so later
        # checks are neither evaluated nor logged.
        if not all(
                compare(old, generated(slot), desc)
                for old, slot, desc in checks):
            heads = None
            continue

        decl = getparam(t, "1")
        blib.set_param_chain(headt, [decl], "1", "head")
        for first, pref in [("2", "g"), ("3", "gen"), ("4", "pl"),
                            ("5", "genpl")]:
            blib.remove_param_chain(headt, first, pref)
        notes.append("convert {{%s}} to new style using decl %s" %
                     (unicode(headt.name), decl))
        heads = None
    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Rewrite old-style {{la-noun}}/{{la-proper noun}} templates in place.

    Each old-style headword (one that still carries head2=, 2=, 3=, 4= or
    decl=) is converted to a single ``lemma<decl>`` spec in 1=, provided the
    forms generated from that spec agree with the genitive given in the
    headword.  Templates that cannot be converted are logged and left alone.

    Args:
        page: page object; only ``page.title()`` is used here.
        index: page index, used as a prefix in log messages.
        parsed: parsed wikicode of the page; its templates are modified
            in place.

    Returns:
        A ``(new_text, notes)`` pair: the serialized wikicode and the list
        of change notes.
    """
    global args
    title = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, title, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, title, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, title, pagemsg, args.verbose)

    def third_decl_spec(lemma, genitives, desc):
        """Build the third-declension spec, or log a warning and return None."""
        if not genitives:
            pagemsg(
                "WARNING: No genitives with decl 3 lemma %s, skipping: %s"
                % (lemma, desc))
            return None
        if len(genitives) > 1:
            pagemsg(
                "WARNING: Multiple genitives %s with decl 3 lemma %s, skipping: %s"
                % (",".join(genitives), lemma, desc))
            return None
        gen = genitives[0]
        if gen.endswith("is"):
            stem = gen[:-2]
            # Only spell out the stem when it can't be inferred from the lemma.
            if lalib.infer_3rd_decl_stem(lemma) == stem:
                return "%s<3>" % lemma
            return "%s/%s<3>" % (lemma, stem)
        if gen.endswith("ium"):
            if lemma.endswith("ia"):
                return "%s<3.pl>" % lemma
            if lemma.endswith(u"ēs"):
                return "%s<3.I.pl>" % lemma
            pagemsg(
                "WARNING: Unrecognized lemma %s with decl 3 genitive -ium, skipping: %s"
                % (lemma, desc))
            return None
        if gen.endswith("um"):
            if lemma.endswith(("a", u"ēs")):
                return "%s<3.pl>" % lemma
            pagemsg(
                "WARNING: Unrecognized lemma %s with decl 3 genitive -um, skipping: %s"
                % (lemma, desc))
            return None
        pagemsg(
            "WARNING: Unrecognized genitive %s with decl 3 lemma %s, skipping: %s"
            % (gen, lemma, desc))
        return None

    pagemsg("Processing")

    notes = []

    # Spelled-out declension names as found in 4=/decl=, mapped to the
    # codes used inside the <...> spec.
    decl_codes = {
        "first": "1",
        "second": "2",
        "third": "3",
        "fourth": "4",
        "fifth": "5",
        "irregular": "irreg",
    }
    # Named params that are superseded by the new-style spec.
    superseded_name = re.compile("^(head|gen|g|decl)[0-9]*$")

    for t in parsed.filter_templates():
        if tname(t) not in ("la-noun", "la-proper noun"):
            continue

        origt = unicode(t)
        desc = "headword template <from> %s <to> %s <end>" % (origt, origt)

        if getparam(t, "indecl"):
            pagemsg("Skipping indeclinable noun: %s" % desc)
            continue
        # A template with none of the old-style params is already converted.
        if not any(getparam(t, old)
                   for old in ("head2", "2", "3", "4", "decl")):
            pagemsg("Skipping new-style template: %s" % desc)
            continue

        lemmas = blib.fetch_param_chain(t, ["1", "head", "head1"],
                                        "head") or [title]
        genitives = blib.fetch_param_chain(t, ["2", "gen", "gen1"], "gen")
        genders = blib.fetch_param_chain(t, ["3", "g", "g1"], "g")
        decls = blib.fetch_param_chain(t, ["4", "decl", "decl1"], "decl")

        if " " in lemmas[0]:
            pagemsg("WARNING: Space in lemma %s, skipping: %s" %
                    (lemmas[0], desc))
            continue
        if len(lemmas) > 1:
            pagemsg("WARNING: Multiple lemmas %s, skipping: %s" %
                    (",".join(lemmas), desc))
            continue
        lemma = lemmas[0]

        if not decls:
            pagemsg("WARNING: No declension, skipping: %s" % desc)
            continue
        if len(decls) > 1:
            pagemsg("WARNING: Multiple decls %s, skipping: %s" %
                    (",".join(decls), desc))
            continue
        decl_name = decls[0]
        if decl_name not in decl_codes:
            pagemsg("WARNING: Unrecognized declension %s, skipping: %s" %
                    (decl_name, desc))
            continue
        decl_type = decl_codes[decl_name]

        if decl_type == "3":
            param1 = third_decl_spec(lemma, genitives, desc)
            if param1 is None:
                continue
        elif decl_type == "irreg":
            pagemsg("WARNING: Can't handle irregular nouns, skipping: %s" %
                    desc)
            continue
        elif decl_type in ("1", "2", "4", "5"):
            param1 = "%s<%s>" % (lemma, decl_type)
        else:
            pagemsg(
                "WARNING: Something wrong, unrecognized decl_type %s, skipping: %s"
                % (decl_type, desc))
            continue

        # Generate the forms for the candidate spec and make sure they
        # agree with what the headword currently claims.
        noun_props = convert_la_headword_noun.new_generate_noun_forms(
            "{{la-ndecl|%s}}" % param1, errandpagemsg, expand_text,
            include_props=True)
        if noun_props is None:
            continue
        decl_gender = noun_props.get("g")
        forms_agree = convert_la_headword_noun.compare_headword_decl_forms(
            "genitive",
            genitives, ["gen_sg", "gen_pl"],
            noun_props,
            desc,
            pagemsg,
            adjust_for_missing_gen_forms=True,
            adjust_for_e_ae_gen=True,
            remove_headword_links=True)
        if not forms_agree:
            continue

        # The gender can be dropped only when the headword has exactly one
        # gender and it equals the one reported for the declension.
        need_explicit_gender = not (len(genders) == 1
                                    and genders[0] == decl_gender)
        if need_explicit_gender:
            if len(genders) > 1:
                pagemsg(
                    "WARNING: Saw multiple headword genders %s, please verify: %s"
                    % (",".join(genders), desc))
            elif (genders and genders[0].startswith("n") !=
                  (decl_gender == "n")):
                pagemsg(
                    "WARNING: Headword gender %s is neuter and decl gender %s isn't, or vice-versa, need to correct, skipping: %s"
                    % (genders[0], decl_gender, desc))
                continue

        # Remember every param that is not superseded by the new spec.
        kept_params = []
        for param in t.params:
            raw_name = unicode(param.name)
            key = raw_name.strip()
            if key in ("1", "2", "3", "4") or superseded_name.search(key):
                continue
            kept_params.append((raw_name, param.value, param.showkey))

        # Rebuild the template: spec first, then gender, then the rest.
        del t.params[:]
        t.add("1", param1)
        if need_explicit_gender:
            # Keep only the first character of each gender, de-duplicated
            # in order of first appearance.
            gender_initials = []
            for gender in genders:
                initial = gender[0]
                if initial not in gender_initials:
                    gender_initials.append(initial)
            blib.set_param_chain(t, gender_initials, "g", "g")
        for name, value, showkey in kept_params:
            t.add(name, value, showkey=showkey, preserve_spacing=False)

        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append(
            "convert {{la-noun}}/{{la-proper noun}} params to new style")

    return unicode(parsed), notes
  def process_noun_headt(t, declt=None):
    """Rewrite a noun headword template in place to the new param convention.

    Handles both generic {{head}} templates (renamed to "be-" + the value of
    2=) and existing language-specific headword templates.  All params are
    read, the template is emptied, and the params are written back in
    normalized form: head(s) in 1=, cleaned genders in 2=/g2=..., genitive
    in 3=, plural in 4=, plus m=, f= and collective= chains.

    Relies on `pagemsg`, `pagetitle` and `notes` from the enclosing scope.

    Args:
      t: the headword template to rewrite (modified in place).
      declt: optional declension template; when given, its forms are used
        to fill in missing headword forms and are cross-checked against
        the ones present.

    Returns:
      False if the template has an unrecognized param (nothing is changed),
      True otherwise.
    """
    origt = unicode(t)
    origdeclt = declt and unicode(declt) or "None"
    def getp(param):
      return getparam(t, param)
    if tname(t) == "head":
      pos = getp("2")
      head = getp("head")
      headtr = getp("tr")
      g = getp("g")
      g2 = getp("g2")
      g3 = getp("g3")
      anim = ""
      decl = ""
      gen = ""
      gentr = ""
      pl = ""
      pltr = ""
      f = ""
      ftr = ""
      m = ""
      mtr = ""
      collective = ""
      collectivetr = ""
      recognized_params = ["1", "2", "head", "tr", "g", "g2", "g3",
          # extra params to ignore
          "sc"]
    else:
      pos = getp("pos")
      head = getp("1") or getp("head") or getp("sg")
      headtr = getp("tr")
      g = getp("2") or getp("g")
      g2 = getp("g2")
      g3 = getp("g3")
      anim = getp("a")
      decl = getp("decl")
      gen = getp("gen") or getp("3")
      gentr = getp("gentr")
      pl = getp("pl") or getp("4")
      pltr = getp("pltr")
      f = getp("f")
      ftr = getp("ftr")
      m = getp("m")
      mtr = getp("mtr")
      collective = getp("collective")
      collectivetr = getp("collectivetr")
      recognized_params = ["pos", "1", "head", "sg", "tr", "2", "g", "g2", "g3",
          "a", "decl", "gen", "gentr", "3", "pl", "pltr", "4",
          "f", "ftr", "m", "mtr", "collective", "collectivetr",
          # extra params to ignore
          "sc"]

    # Refuse to touch a template carrying a param we don't know how to
    # carry over; everything is erased and rebuilt below.
    for param in t.params:
      pn = pname(param)
      if pn not in recognized_params:
        pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" %
            (pn, unicode(param.value), origt))
        return False

    def clean_gender(g):
      """Normalize a gender spec to GENDER-ANIMACY[-p], using a= as fallback."""
      gparts = g.split("-")
      realg = "?"
      realan = "?"
      realpl = ""
      for part in gparts:
        if part in ["m", "f", "n"]:
          realg = part
        elif part in ["an", "in"]:
          realan = part
        elif part == "p":
          realpl = part
        elif part != "?":
          pagemsg("WARNING: Encountered unrecognized gender part '%s' in gender '%s': %s" % (
            part, g, origt))
      an = anim
      if an in ["a", "an"]:
        an = "an"
      elif an in ["i", "in"]:
        an = "in"
      elif an:
        pagemsg("WARNING: Unrecognized animacy a=%s: %s" % (an, origt))
        an = "?"
      if realan != "?" and an and an != "?" and an != realan:
        pagemsg("WARNING: Animacy mismatch, anim %s in gender spec %s but a=%s: %s" % (
          realan, g, anim, origt))
      # Animacy inside the gender spec wins; a= is only used when the spec
      # has none.
      if realan == "?" and an:
        realan = an
      pl = ""
      if realpl:
        pl = "-%s" % realpl
      if realg == "?":
        pagemsg("WARNING: Unknown gender in gender spec %s: %s" % (g, origt))
      if realan == "?":
        pagemsg("WARNING: Unknown animacy in gender spec %s and a=%s: %s" % (g, anim, origt))
      if realg == "?" and realan == "?":
        return "?%s" % pl
      else:
        return "%s-%s%s" % (realg, realan, pl)

    if not g and not g2 and not g3:
      pagemsg("WARNING: No gender specified: %s" % origt)
      g = "?"
    genders = []
    if g:
      genders.append(clean_gender(g))
    if g2:
      genders.append(clean_gender(g2))
    if g3:
      genders.append(clean_gender(g3))

    if not head:
      head = pagetitle
    if decl and decl not in ["off", "no", "indeclinable"]:
      pagemsg("WARNING: Unrecognized value for decl=%s: %s" % (decl, origt))
      decl = ""
    if decl:
      # Indeclinable nouns are expressed as genitive "-" in the new style.
      if gen and gen != "-":
        pagemsg("WARNING: Indeclinable but gen=%s specified: %s" % (gen, origt))
      else:
        gen = "-"

    del t.params[:]
    if tname(t) == "head":
      blib.set_template_name(t, "be-" + pos)
    elif pos:
      t.add("pos", pos)

    def split_form(form):
      """Split a comma-separated form list, unlink, accent and drop "-" entries."""
      forms = re.split(r",\s*", form.strip())
      forms = [re.sub(r"^\[\[([^\[\]]*)\]\]$", r"\1", f) for f in forms]
      forms = [belib.add_accent_to_o(f) for f in forms]
      for f in forms:
        if "[[" in f:
          pagemsg("WARNING: Link in form %s: headword=%s, decl=%s" %
              (f, origt, origdeclt))
        if belib.needs_accents(f):
          pagemsg("WARNING: Form %s missing accents: headword=%s, decl=%s" %
              (f, origt, origdeclt))
      forms = [f for f in forms if f != "-"]
      return forms

    def handle_multiform(firstparam, restparam, form, formtr, declparam=None):
      """Write a (possibly multi-valued) form and its transliteration to `t`.

      `declparam` names the param of `declt` holding the corresponding
      form ("-" meaning the form doesn't exist); it supplies the value when
      the headword has none and is compared against it otherwise.
      """
      if form:
        form = split_form(form)
      if declparam:
        if declparam == "-":
          declforms = ["-"]
        else:
          declforms = split_form(getparam(declt, declparam))
        if not form:
          form = declforms
        elif set(form) != set(declforms):
          pagemsg("WARNING: For %s=, headword form(s) %s disagree with decl form(s) %s: headword=%s, decl=%s" %
              (restparam, ",".join(form), ",".join(declforms), origt, origdeclt))
      if form:
        blib.set_param_chain(t, form, firstparam, restparam)
      if formtr:
        trparam = ("" if restparam == "head" else restparam) + "tr"
        if not form:
          # Report the actual transliteration param name (previously the
          # literal text "trparam" was logged).
          pagemsg("WARNING: Saw %s=%s but no %s=: %s" %
              (trparam, formtr, restparam, origt))
        elif len(form) > 1:
          pagemsg("WARNING: Saw %s=%s and multiple %ss %s: %s" %
              (trparam, formtr, restparam, ",".join(form), origt))
        t.add(trparam, formtr)

    # Work out which declension-template params hold head, genitive and
    # plural; the layout differs by declension template name.
    decl_headparam = None
    decl_genparam = None
    decl_plparam = None
    if declt:
      decl_headparam = "1"
      tn = tname(declt)
      if tn == "be-decl-noun":
        decl_genparam = "3"
        decl_plparam = "2"
      elif tn == "be-decl-noun-unc":
        decl_genparam = "2"
        decl_plparam = "-"
      else:
        decl_genparam = "2"
      # A plural-only declension template should go with "-p" genders and
      # vice versa.
      if tn == "be-decl-noun-pl":
        for g in genders:
          if not g.endswith("-p"):
            pagemsg("WARNING: Mismatch between headword gender %s and decl template %s: %s" % (
              g, unicode(declt), origt))
      else:
        for g in genders:
          if g.endswith("-p"):
            pagemsg("WARNING: Mismatch between headword gender %s and decl template %s: %s" % (
              g, unicode(declt), origt))

    handle_multiform("1", "head", head, headtr, decl_headparam)
    blib.set_param_chain(t, genders, "2", "g")
    handle_multiform("3", "gen", gen, gentr, decl_genparam)
    # Keep 4= positional: insert an empty 3= when there is a headword plural
    # but no genitive was written.
    if not getp("3") and pl:
      t.add("3", "")
    handle_multiform("4", "pl", pl, pltr, decl_plparam)
    handle_multiform("m", "m", m, mtr)
    handle_multiform("f", "f", f, ftr)
    handle_multiform("collective", "collective", collective, collectivetr)

    if origt != unicode(t):
      notes.append("fix up {{%s}} to use new param convention" % tname(t))
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    return True
def process_page_section(index, page, section, verbose):
  """Make the {{ru-noun+}}/{{ru-proper noun+}} headword of one section agree
  with its {{ru-noun-table}} declension template.

  Args:
    index: page index, used as a prefix in log messages.
    page: page object; `page.title()` and `page.exists()` are used.
    section: wikitext of the section to process.
    verbose: if true, log the templates found and pass verbosity on to
      template expansion.

  Returns:
    None when the section must be skipped (missing page, ambiguous or
    unsupported templates, expansion failure, links that would be lost).
    Otherwise a 5-tuple `(new_text, ru_noun_table_cleaned,
    ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed)`
    where the last four are 0/1 flags describing what was changed.
  """
  pagetitle = unicode(page.title())
  # NOTE(review): subpagetitle is only referenced by the commented-out code
  # near the end of this function.
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None

  parsed = blib.parse_text(section)

  noun_table_templates = []
  noun_old_templates = []

  # A section using {{ru-decl-noun-see}} is skipped entirely.
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None

  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)

  # Only sections with exactly one declension table (and at most one
  # old-style table) are handled.
  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    # Nothing to do: return the text unchanged with all counters zero.
    return unicode(parsed), 0, 0, 0, 0

  # Old-style headwords (without the trailing "+") are not handled here.
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None

  headword_templates = []

  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      headword_templates.append(t)

  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, 0

  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  # NOTE(review): decl_templates is not used again within this function.
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  # Snapshots taken before any mutation, for change detection at the end.
  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)

  # Headword-only params (g=, m=, f= chains and notrcat=) are saved so they
  # can be restored if the headword is rebuilt from the declension params.
  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  filtered_headword_params = []
  for param in headword_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
      pass
    else:
      filtered_headword_params.append((param.name, param.value))
  # A scratch {{ru-noun+}} holding the headword minus the headword-only
  # params; it is compared textually against the declension params below.
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for name, value in filtered_headword_params:
    filtered_headword_template.add(name, value)

  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0

  # Strip g=/m=/f= params from the declension table by rebuilding its
  # param list in place.
  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
          unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1

  # Scratch copy of the declension params; n= may be adjusted on it without
  # touching the real declension table.
  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)

  # If proper noun and n is both then we need to add n=both because
  # proper noun+ defaults to n=sg
  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
        unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = blib.split_generate_args(generate_result)

    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      # Re-expand with ndef=sg added and any explicit n=s... removed to see
      # what number results.
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = blib.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])

  # The headword text that the declension params would produce.
  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
      unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      # The headword has links and the declension doesn't: never overwrite
      # the headword (links would be lost); copy headword -> declension if
      # that is the only difference, otherwise give up.
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
            % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
          % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True

  if change_existing_headword:
    # Rebuild the headword from the (possibly n=-adjusted) declension
    # params, then restore the headword-only params saved earlier.
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)
    
  #genders = runounlib.check_old_noun_headword_forms(headword_template, args,
  #    subpagetitle, pagemsg)
  #if genders == None:
  #  return None

  #new_params = []
  #for param in noun_table_template.params:
  #  new_params.append((param.name, param.value))

  #params_to_preserve = runounlib.fix_old_headword_params(headword_template,
  #    new_params, genders, pagemsg)
  #if params_to_preserve == None:
  #  return None

  # Compare against the initial snapshots to report what actually changed.
  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
      new_noun_table_template))

  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (orig_headword_template,
      new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
Beispiel #21
0
def process_text_on_page(index, pagetitle, text):
    """Normalize the pronunciation arguments of {{it-IPA}} templates in text.

    For every {{it-IPA}} template the respellings (or the page title when
    none is given) are rewritten: a stress mark is added when missing, bare
    ``z``/``zz`` is converted to ``ts``/``dz`` (``tts``/``ddz``) according
    to voiced= and the surrounding context, the ligatures ʦ/ʣ are expanded,
    ``-iddzàre`` is turned back into ``-izzare`` and acute stress on a/i/u
    is changed to grave.  voiced= is then removed.

    Args:
        index: page index, used as a prefix in log messages.
        pagetitle: title of the page; also the default respelling.
        text: wikitext of the page.

    Returns:
        None when the page is skipped (no "it-IPA" in the text, or a colon
        in the title); otherwise a ``(new_text, notes)`` pair.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []

    if "it-IPA" not in text:
        return

    if ":" in pagetitle:
        pagemsg("Skipping non-mainspace title")
        return

    pagemsg("Processing")

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)

        if tn == "it-IPA":
            pagemsg("Saw %s" % unicode(t))
            if getparam(t, "voiced2"):
                pagemsg("WARNING: Can't yet handle voiced2=%s" %
                        getparam(t, "voiced2"))
                continue
            specified_pronuns = blib.fetch_param_chain(t, "1", "")
            pronuns = specified_pronuns or [pagetitle]
            frobbed_pronuns = []
            # Notes are buffered per template and only committed once the
            # template has really been rewritten; otherwise notes from a
            # template abandoned below would leak into the edit summary.
            template_notes = []
            must_continue = False
            for ipa in pronuns:
                # Work in NFD so stress marks are separate combining chars.
                ipa = unicodedata.normalize("NFD", ipa)
                if AC not in ipa and GR not in ipa:
                    vowel_count = len([x for x in ipa if x in vowel])
                    if vowel_count == 1:
                        pagemsg("WARNING: Single-vowel word")
                    if vowel_count > 1:
                        # Put the stress on the second-to-last vowel; e/o
                        # (and ɛ/ɔ) get AC, the other vowels GR.
                        new_ipa = re.sub(
                            "(" + vowel_c + ")(" + not_vowel_c + "*[iyu]?" +
                            vowel_c + not_vowel_c + "*)$",
                            lambda m: m.group(1) +
                            (AC if m.group(1) in u"eoɛɔ" else GR) + m.group(2),
                            ipa)
                        if new_ipa == ipa:
                            pagemsg("WARNING: Unable to add stress: %s" % ipa)
                        else:
                            template_notes.append(
                                unicodedata.normalize(
                                    "NFC",
                                    "add stressed form %s to defaulted {{it-IPA}} pronun"
                                    % new_ipa))
                            ipa = new_ipa
                if "z" in ipa:
                    # The "frobbed" copy has i/u before a vowel turned into
                    # j/w; it is used only to inspect the context of each z,
                    # while the replacements are made in split_z.
                    frobbed_ipa = re.sub("i(" + vowel_c + ")", r"j\1", ipa)
                    frobbed_ipa = re.sub("u(" + vowel_c + ")", r"w\1",
                                         frobbed_ipa)
                    split_frobbed_ipa = re.split("(z+)", frobbed_ipa)
                    split_z = re.split("(z+)", ipa)
                    voiced = getparam(t, "voiced")
                    if voiced not in ["y", "yes", "1", ""]:
                        pagemsg("WARNING: Unrecognized voiced=%s" % voiced)
                        must_continue = True
                        break
                    # Odd indices of the split hold the runs of z.
                    for i in xrange(1, len(split_z), 2):
                        if split_z[i - 1].endswith("d"):
                            continue  # already converted appropriately
                        default_voiced = False
                        if voiced in ["y", "yes"] or i == 1 and voiced == "1":
                            default_voiced = True
                        elif i == 1 and split_frobbed_ipa[0] == "":
                            # Word-initial z.
                            if re.search("^[ij]" + stress_c + "?" + vowel_c,
                                         split_frobbed_ipa[2]):
                                default_voiced = False
                            elif re.search(
                                    "^" + vowel_c + stress_c + "?" + vowel_c,
                                    split_frobbed_ipa[2]):
                                default_voiced = True
                        else:
                            # Single z between vowels.
                            if (split_frobbed_ipa[i] == "z"
                                    and re.search(vowel_c + stress_c + "?$",
                                                  split_frobbed_ipa[i - 1])
                                    and re.search("^" + vowel_c,
                                                  split_frobbed_ipa[i + 1])):
                                default_voiced = True
                            if re.search("i" + CFLEX,
                                         split_frobbed_ipa[i + 1]):
                                default_voiced = False
                        if default_voiced:
                            z_to_voiced = {"z": "dz", "zz": "ddz"}
                            split_z[i] = z_to_voiced.get(
                                split_z[i], split_z[i])
                        else:
                            z_to_voiceless = {"z": "ts", "zz": "tts"}
                            split_z[i] = z_to_voiceless.get(
                                split_z[i], split_z[i])
                    new_ipa = "".join(split_z)
                    if new_ipa != ipa:
                        template_notes.append(
                            unicodedata.normalize(
                                "NFC",
                                "convert z to ts or dz in %s -> %s in {{it-IPA}}"
                                % (ipa, new_ipa)))
                        ipa = new_ipa
                new_ipa = ipa.replace(u"ʦ", "ts")
                new_ipa = new_ipa.replace(u"ʣ", "dz")
                if new_ipa != ipa:
                    template_notes.append(
                        u"normalize ʦ/ʣ to ts/dz in {{it-IPA}} pronun")
                    ipa = new_ipa
                ipa = unicodedata.normalize("NFC", ipa)
                # module special-cases -izzare
                new_ipa = re.sub(u"iddz[àá]re", "izzare", ipa)
                if new_ipa != ipa:
                    template_notes.append(
                        u"normalize -iddzàre to -izzare in {{it-IPA}}")
                    ipa = new_ipa
                new_ipa = ipa.replace(u"á",
                                      u"à").replace(u"í",
                                                    u"ì").replace(u"ú", u"ù")
                if new_ipa != ipa:
                    template_notes.append(
                        unicodedata.normalize(
                            "NFC",
                            u"normalize stress in %s in {{it-IPA}}" % ipa))
                    ipa = new_ipa
                frobbed_pronuns.append(ipa)
            if must_continue:
                # Template left untouched; drop its buffered notes.
                continue
            if frobbed_pronuns == [pagetitle]:
                # The page title is the default, so no explicit param needed.
                frobbed_pronuns = []
                if specified_pronuns:
                    template_notes.append(
                        "remove explicitly specified pronun in {{it-IPA}} because same as page title"
                    )
            blib.set_param_chain(t, frobbed_pronuns, "1", "")
            if t.has("voiced"):
                rmparam(t, "voiced")
                template_notes.append("remove voiced= in {{it-IPA}}")
            notes.extend(template_notes)

        if origt != unicode(t):
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes