def process_text_on_page(index, pagetitle, text):
  """Drop a redundant plural argument from {{en-noun}} templates.

  A template is only touched when it has no parameter other than 1= and the
  value of 1= is either "s" or the page title followed by "s".

  Returns None when `text` does not contain "en-noun" at all; otherwise a
  (new page text, list of change notes) pair.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  # Cheap substring test so that pages without the template are never parsed.
  if "en-noun" not in text:
    return

  notes = []
  parsed = blib.parse_text(text)

  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn != "en-noun":
      continue
    for param in t.params:
      pn = pname(param)
      if pn != "1":
        pagemsg("Template has %s=, not touching: %s" % (pn, origt))
        break
    else:
      # No break, i.e. the template carries nothing besides 1=.
      par1 = getparam(t, "1")
      if par1 in (pagetitle + "s", "s"):
        rmparam(t, "1")
        notes.append("remove redundant 1=%s from {{%s}}" % (par1, tn))
      if unicode(t) != origt:
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  return unicode(parsed), notes
  def process_adj_headt(t):
    """Rewrite an adjective headword template so the head is given in 1=.

    The template is left alone if it already has a non-empty 1= or carries
    any parameter other than head= and tr=. Otherwise all parameters are
    cleared and re-added as 1=<head> (falling back to the page title when
    head= is empty) followed by tr= if it was set.

    Returns True when the template was processed, False when an unrecognized
    parameter was found, and None when 1= was already present.
    """
    origt = unicode(t)
    tr = getparam(t, "tr")
    param1 = getparam(t, "1")
    if param1:
      pagemsg("WARNING: Has 1=%s: %s" % (param1, origt))
      # Bare return: the result on this path is None rather than False.
      return
    for param in t.params:
      pn = pname(param)
      if pn not in ["head", "tr"]:
        pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" %
            (pn, unicode(param.value), origt))
        return False
    # Read head= before the parameters are wiped below.
    head = getparam(t, "head") or pagetitle
    del t.params[:]
    if belib.needs_accents(head):
      pagemsg("WARNING: Head %s missing accents: %s" % (head, origt))
    t.add("1", head)
    if tr:
      t.add("tr", tr)

    if origt != unicode(t):
      notes.append("fix up {{be-adj}} to use new param convention")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    return True
def process_page(page, index, parsed):
    """Convert {{uk-conj-manual}} templates to {{uk-conj-table}}.

    For each matching template:
      * 1= is renamed to aspect=;
      * "_futr_" in parameter names becomes "_fut_";
      * every parameter whose name ends in "2" or "3" is folded into the
        parameter of the same name minus that digit (comma-separated) and
        then removed. Values that are blank, "-" or an em dash are simply
        dropped.

    Args:
        page: page object; only page.title() is used.
        index: page index, used in log messages.
        parsed: parsed wikicode of the page, modified in place.

    Returns:
        (new page text, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    pagemsg("Processing")

    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn != "uk-conj-manual":
            continue
        aspect = getparam(t, "1")
        t.add("aspect", aspect, before="1", preserve_spacing=False)
        rmparam(t, "1")
        for param in t.params:
            pn = pname(param)
            if "_futr_" in pn:
                param.name = pn.replace("_futr_", "_fut_")
        # Collect all the ...2 parameters before any ...3 parameter so that
        # the variants are appended to the base slot in numeric order.
        to_fix = []
        for suffix in ("2", "3"):
            for param in t.params:
                pn = pname(param)
                if pn.endswith(suffix):
                    to_fix.append((pn, unicode(param.value)))
        for pn, pv in to_fix:
            stripped = pv.strip()
            if stripped and stripped not in ["-", u"—"]:
                base = pn[:-1]
                existing = getparam(t, base)
                if not existing:
                    existing = pv
                else:
                    # Insert before any trailing whitespace of the existing
                    # value. count=1 because on Python 3.7+ the pattern can
                    # additionally match the empty string at the very end.
                    existing = re.sub(r"(\s*)$", r", %s\1" % stripped,
                                      existing, 1)
                # Must run for both branches: previously this was nested in
                # the else, so a variant whose base slot was empty was lost
                # when the numbered parameter was removed below.
                t.add(base, existing, preserve_spacing=False)
            rmparam(t, pn)
        blib.set_template_name(t, "uk-conj-table")
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("convert {{%s}} to {{uk-conj-table}}" % tn)

    return unicode(parsed), notes
Beispiel #4
0
def process_text_on_page(index, pagetitle, text):
    """Convert {{vi-hantu}} templates to {{vi-readings}}.

    A template is converted only when the page title passes one_char(), it
    has no pos=, chu= is empty or "Nom", 1= is non-empty after link removal,
    and it has no parameters besides 1=, rs= and chu=. The value of 1= (with
    links removed) is moved to nom= when chu=Nom and to reading= otherwise.

    Returns (new page text, list of change notes).
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    global args

    notes = []
    allowed_params = ["1", "rs", "chu"]

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "vi-hantu":
            if not one_char(pagetitle):
                pagemsg("WARNING: Length of page title is %s > 1, skipping" %
                        len(pagetitle))
                continue
            if getparam(t, "pos"):
                pagemsg("WARNING: Saw pos=, skipping: %s" % unicode(t))
                continue
            chu = getparam(t, "chu")
            if chu and chu != "Nom":
                pagemsg("WARNING: Saw chu=%s not 'Nom', skipping: %s" %
                        (chu, unicode(t)))
                continue
            newparam = "nom" if chu == "Nom" else "reading"
            reading = blib.remove_links(getparam(t, "1"))
            if not reading:
                pagemsg("WARNING: Empty reading, skipping: %s" % unicode(t))
                continue
            # First parameter (if any) that this conversion cannot handle.
            unknown = next(
                (p for p in t.params if pname(p) not in allowed_params), None)
            if unknown is not None:
                pagemsg(
                    "WARNING: Unrecognized parameter %s=%s, skipping: %s" %
                    (pname(unknown), unicode(unknown.value), unicode(t)))
                continue
            t.add(newparam, reading, before="1")
            rmparam(t, "1")
            blib.set_template_name(t, "vi-readings")
            notes.append("{{vi-hantu}} -> {{vi-readings}}")

        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
Beispiel #5
0
 def insert_into_existing_pron_section(k):
   """Insert a new pronunciation template at the top of subsections[k].

   If the subsection already contains one of `pronun_templates`, nothing is
   changed. Otherwise existing Polish rhyme / hyphenation / pl-IPA lines are
   removed, a single existing {{audio|pl|...}} line is folded into the new
   template as a= / ac=, and the new template line is prepended.

   Returns True on completion (including when a pronunciation template was
   already present); returns None when the subsection is skipped because of
   multiple audio lines, an audio line in the wrong language, or an audio
   template with unrecognized parameters.
   """
   parsed = blib.parse_text(subsections[k])
   for t in parsed.filter_templates():
     tn = tname(t)
     if tn in pronun_templates:
       pagemsg("Already saw pronunciation template: %s" % unicode(t))
       break
   else: # no break
     new_pron_template, pron_prefix = construct_new_pron_template()
     # Remove existing rhymes/hyphenation/pl-IPA lines
     for template in ["rhyme|pl", "rhymes|pl", "pl-IPA", "hyph|pl", "hyphenation|pl"]:
       re_template = template.replace("|", r"\|")
       regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
       m = re.search(regex, subsections[k], re.M)
       if m:
         pagemsg("Removed existing %s" % m.group(1).strip())
         notes.append("remove existing {{%s}}" % template)
         subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
     for template in ["audio|pl"]:
       re_template = template.replace("|", r"\|")
       regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
       all_audios = re.findall(regex, subsections[k], re.M)
       if len(all_audios) > 1:
         # all_audios is a list; the original called it like a function here.
         pagemsg("WARNING: Saw multiple {{audio}} templates, skipping: %s" % ",".join(x.strip() for x in all_audios))
         return
       if len(all_audios) == 1:
         # The single matched line, used both for parsing and in the warnings
         # below (the original referenced this name without ever binding it).
         audio_line = all_audios[0].strip()
         audiot = list(blib.parse_text(audio_line).filter_templates())[0]
         assert(tname(audiot) == "audio")
         if getparam(audiot, "1") != "pl":
           pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % audio_line)
           return
         audiofile = getparam(audiot, "2")
         audiogloss = getparam(audiot, "3")
         for param in audiot.params:
           pn = pname(param)
           pv = unicode(param.value)
           if pn not in ["1", "2", "3"]:
             pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (
               pn, pv, audio_line))
             return
         # A gloss that merely says "Audio" carries no information.
         if audiogloss in ["Audio", "audio"]:
           audiogloss = ""
         params = "|a=%s" % audiofile
         if audiogloss:
           params += "|ac=%s" % audiogloss
         # Splice the audio params in just before the closing "}}".
         new_pron_template = new_pron_template[:-2] + params + new_pron_template[-2:]
         pagemsg("Removed existing %s in order to incorporate into {{pl-p}}" % audio_line)
         notes.append("incorporate existing {{%s}} into {{pl-p}}" % template)
         subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
     subsections[k] = pron_prefix + new_pron_template + "\n" + subsections[k]
     notes.append("insert %s into existing Pronunciation section" % new_pron_template)
   return True
def process_page(page, index, parsed):
    """Normalize Old English adjective headword templates.

    {{head|ang|adjective(s)}} with no parameters besides 1=, 2= and head= is
    renamed to {{ang-adj}} with 1= and 2= removed. In an existing
    {{ang-adj}}, 1= is blanked (its value moving to 2= when 2= is empty) and
    4= is moved to 3=.

    The incoming `parsed` argument is ignored; the page text is re-parsed.

    Returns (parsed wikicode, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []

    parsed = blib.parse_text(unicode(page.text))
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        is_generic_head = (
            tn == "head" and getparam(t, "1") == "ang"
            and getparam(t, "2") in ["adjective", "adjectives"])
        if is_generic_head:
            has_extra_params = any(
                pname(param) not in ["1", "2", "head"] for param in t.params)
            if has_extra_params:
                pagemsg(
                    "WARNING: head|ang|adjective with extra params: %s" %
                    unicode(t))
            else:
                blib.set_template_name(t, "ang-adj")
                rmparam(t, "1")
                rmparam(t, "2")
                notes.append("convert {{head|ang|adjective}} into {{ang-adj}}")
        elif tn == "ang-adj":
            if getparam(t, "2"):
                t.add("1", "")
                notes.append("remove unneeded 1= from {{ang-adj}}")
            else:
                old_first = getparam(t, "1")
                if old_first:
                    t.add("1", "")
                    t.add("2", old_first)
                    notes.append("move 1= to 2= in {{ang-adj}}")
            old_fourth = getparam(t, "4")
            if old_fourth:
                rmparam(t, "4")
                # Make sure the lower-numbered positional slots are present
                # before 3= is added.
                for slot in ("1", "2"):
                    if not getparam(t, slot):
                        t.add(slot, "")
                t.add("3", old_fourth)
                notes.append("move 4= to 3= in {{ang-adj}}")
        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    return parsed, notes
def process_page(page, index, parsed):
    """Normalize Old English verb headword templates.

    {{head|ang|verb(s)}} with no parameters besides 1=, 2= and head= is
    renamed to {{ang-verb}}, with 1= and 2= removed and head= moved to 1=.
    In an existing {{ang-verb}}, head= is moved to 1= and head2= / head3= are
    removed and re-added after it.

    The incoming `parsed` argument is ignored; the page text is re-parsed.

    Returns (parsed wikicode, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []

    text = unicode(page.text)
    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn == "head" and getparam(t, "1") == "ang" and getparam(
                t, "2") in ["verb", "verbs"]:
            for param in t.params:
                pn = pname(param)
                if pn not in ["1", "2", "head"]:
                    pagemsg("WARNING: head|ang|verb with extra params: %s" %
                            unicode(t))
                    break
            else:
                # no break
                blib.set_template_name(t, "ang-verb")
                rmparam(t, "1")
                rmparam(t, "2")
                notes.append("convert {{head|ang|verb}} into {{ang-verb}}")
                head = getparam(t, "head")
                if head:
                    t.add("1", head)
                rmparam(t, "head")
        elif tn == "ang-verb":
            head = getparam(t, "head")
            head2 = getparam(t, "head2")
            head3 = getparam(t, "head3")
            rmparam(t, "head")
            rmparam(t, "head2")
            rmparam(t, "head3")
            if head:
                t.add("1", head)
            if head2:
                t.add("head2", head2)
            if head3:
                t.add("head3", head3)
            # Only record a note when the template actually changed;
            # previously every {{ang-verb}} produced one, even those with no
            # head parameters at all.
            if unicode(t) != origt:
                notes.append("move head= to 1= in {{ang-verb}}")
        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    return parsed, notes
Beispiel #8
0
  def etym_section_is_movable(sectext, header):
    """Decide whether an etymology section may be moved.

    The section qualifies when it contains at least one Arabic
    {{inflection of}} whose tag list is in `split_recognized_tag_sets`, no
    {{inflection of}} with an unrecognized tag list, no {{inflection of}} for
    another language, and no templates other than {{inflection of}},
    {{also}}, {{ar-root}}, {{nonlemma}}, {{ar-IPA}} and {{ar-verb-form}}
    (whose 1= must end in waw, with or without a following sukun).

    Args:
      sectext: wikitext of the section.
      header: description of the section, used in warning messages.

    Returns:
      True if the section is movable, False otherwise.
    """
    parsed = blib.parse_text(sectext)
    inflection_of_templates_with_unrecognized_tags = []
    saw_inflection_of_with_recognized_tag = False
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "inflection of":
        # With lang= the positional params are lemma, alt, tags...; without
        # it the language code occupies 1= and everything shifts up by one.
        if getparam(t, "lang"):
          lang = getparam(t, "lang")
          first_tag_param = 3
        else:
          lang = getparam(t, "1")
          first_tag_param = 4
        if lang != "ar":
          pagemsg("WARNING: Non-Arabic language in Arabic {{inflection of}} in %s, skipping: %s" % (header, unicode(t)))
          return False
        tags = []
        for param in t.params:
          pn = pname(param)
          pv = unicode(param.value).strip()
          if re.search("^[0-9]+$", pn) and int(pn) >= first_tag_param:
            tags.append(pv)
        if tags not in split_recognized_tag_sets:
          inflection_of_templates_with_unrecognized_tags.append(unicode(t))
        else:
          saw_inflection_of_with_recognized_tag = True

    if not saw_inflection_of_with_recognized_tag:
      return False

    if inflection_of_templates_with_unrecognized_tags:
      pagemsg("WARNING: Unrecognized {{inflection of}} tag set mixed with recognized ones in %s, skipping: %s" %
        (header, " / ".join(inflection_of_templates_with_unrecognized_tags)))
      return False

    for t in parsed.filter_templates():
      tn = tname(t)
      if tn in ["also", "ar-root", "nonlemma", "ar-IPA"]:
        continue
      if tn == "ar-verb-form":
        form = getparam(t, "1")
        # Reject only when the form ends in neither spelling. The original
        # condition was "not A and B", which fired exactly when the form DID
        # end in waw + diacritic, contradicting the warning text.
        if not form.endswith((u"و", u"وْ")):
          pagemsg("WARNING: ar-verb-form form doesn't end with waw in %s with recognized {{inflection of}} tags, skipping: %s" % (header, unicode(t)))
          return False
        continue
      if tn != "inflection of":
        pagemsg("WARNING: Unrecognized template in %s with recognized {{inflection of}} tags, skipping: %s" % (header, unicode(t)))
        return False
    return True
  def process_verb_headt(t):
    """Rewrite a verb headword template to the 1=head, 2=aspect layout.

    Two input layouts are accepted: if 2= is set, 1= is the head and 2= the
    aspect; otherwise head= is the head and the aspect comes from 1= or a=.
    The pf= and impf= parameter chains are carried over. The template is
    left alone if it has any parameter outside the recognized set.

    Returns True when the template was processed, False when an unrecognized
    parameter was found.
    """
    recognized = ("head", "tr", "1", "a", "2", "pf", "pf2", "pf3",
        "impf", "impf2", "impf3")
    origt = unicode(t)
    tr = getparam(t, "tr")
    if getparam(t, "2"):
      head = getparam(t, "1")
      g = getparam(t, "2")
    else:
      head = getparam(t, "head")
      g = getparam(t, "1") or getparam(t, "a")
    pf = blib.fetch_param_chain(t, "pf", "pf")
    impf = blib.fetch_param_chain(t, "impf", "impf")
    for param in t.params:
      pn = pname(param)
      if pn not in recognized:
        pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" %
            (pn, unicode(param.value), origt))
        return False
    # Everything needed has been read; rebuild the parameter list from scratch.
    del t.params[:]
    head = head or pagetitle
    if belib.needs_accents(head):
      pagemsg("WARNING: Head %s missing accents: %s" % (head, origt))
    if not g:
      pagemsg("WARNING: No aspect in verb headword: %s" % origt)
      g = "?"
    t.add("1", head)
    if tr:
      t.add("tr", tr)
    t.add("2", g)
    blib.set_param_chain(t, pf, "pf", "pf")
    blib.set_param_chain(t, impf, "impf", "impf")

    if origt != unicode(t):
      notes.append("fix up {{be-verb}} to use new param convention")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    return True
Beispiel #10
0
def process_text_on_page(index, pagetitle, text):
    """Report problems in Latin non-lemma headword templates.

    Looks at the la-*-form templates in the Latin section and warns about a
    missing 1= and about any parameter other than 1=, g=, g2=, g3=, g4=.
    This is a read-only check: it always returns (None, None).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    if not args.stdin:
        pagemsg("Processing")

    parts_of_speech = [
        "noun", "proper noun", "pronoun", "verb", "adj", "num", "suffix"
    ]
    form_template_re = "la-(%s)-form" % "|".join(parts_of_speech)

    # Greatly speed things up when --stdin by ignoring non-Latin pages and
    # pages that cannot contain any of the templates of interest.
    if "==Latin==" not in text or not re.search(form_template_re, text):
        return None, None

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    form_templates = ["la-%s-form" % pos for pos in parts_of_speech]
    allowed_params = ["1", "g", "g2", "g3", "g4"]
    for t in blib.parse_text(secbody).filter_templates():
        if tname(t) not in form_templates:
            continue
        if not getparam(t, "1"):
            pagemsg("WARNING: Missing 1=: %s" % unicode(t))
        for param in t.params:
            pn = pname(param)
            if pn not in allowed_params:
                pagemsg("WARNING: Extraneous param %s=: %s" %
                        (pn, unicode(t)))
    return None, None
def process_text_on_page(index, pagetitle, text):
    """Convert {{PIE root|...}} templates to {{root|...|ine-pro|...}}.

    Numbered parameters from 2 upward are shifted up by one to make room for
    a new 2=ine-pro, and each shifted value gets a leading "*" and a
    trailing "-" if it lacks them. All other parameters are kept as they
    are. Templates without 2= are reported and skipped.

    Returns (new page text, list of change notes).
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "PIE root":
            if not getparam(t, "2"):
                pagemsg("WARNING: Something wrong, no 2=: %s" % unicode(t))
                continue
            blib.set_template_name(t, "root")
            rebuilt = []
            for param in t.params:
                pn = pname(param)
                is_root_slot = re.search("^[0-9]+$", pn) and int(pn) >= 2
                if not is_root_slot:
                    rebuilt.append(
                        (unicode(param.name), unicode(param.value)))
                    continue
                if pn == "2":
                    # The language code of the roots takes over slot 2.
                    rebuilt.append(("2", "ine-pro"))
                root = unicode(param.value)
                if not root.startswith("*"):
                    root = "*" + root
                if not root.endswith("-"):
                    root += "-"
                rebuilt.append((unicode(int(pn) + 1), root))
            del t.params[:]
            for name, value in rebuilt:
                t.add(name, value, preserve_spacing=False)
            notes.append("convert {{%s}} to {{root|...|ine-pro}}" % tn)

        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
Beispiel #12
0
 def replace_name_translit(m):
     """Regex substitution callback building a {{name translit}} call.

     `m` is a match with four groups: the source language name, the name
     type, the text of a template, and the trailing period. The template's
     1= must equal the code of the source language, 3= (alt text) must be
     empty, and only 1=, 2=, 3=, 4= and sc= may be present.

     Returns the replacement text, or the matched text unchanged when the
     line cannot be converted (a warning is logged in that case).
     """
     origline = m.group(0)
     source_lang, name_type, template, period = m.groups()
     if source_lang not in blib.languages_byCanonicalName:
         pagemsg(
             "WARNING: Unrecognized source lang %s, can't parse: <from> %s <to> %s <end>"
             % (source_lang, origline, origline))
         return origline
     source_lang_code = blib.languages_byCanonicalName[source_lang][
         "code"]
     parsed = blib.parse_text(template)
     t = list(parsed.filter_templates())[0]
     lang = getparam(t, "1")
     name = getparam(t, "2")
     alt = getparam(t, "3")
     eq = blib.remove_links(getparam(t, "4"))
     if source_lang_code != lang:
         pagemsg(
             "WARNING: Source lang code %s for %s != template lang code %s, can't parse: <from> %s <to> %s <end>"
             %
             (source_lang_code, source_lang, lang, origline, origline))
         return origline
     if alt:
         pagemsg(
             "WARNING: Can't handle alt=%s in %s: <from> %s <to> %s <end>"
             % (alt, unicode(t), origline, origline))
         return origline
     for param in t.params:
         pn = pname(param)
         if pn not in ["1", "2", "3", "4", "sc"]:
             # The format string has five placeholders; the template text was
             # missing from the argument tuple, so this path raised TypeError
             # instead of logging the warning.
             pagemsg(
                 "WARNING: Can't handle %s=%s in %s: <from> %s <to> %s <end>"
                 % (pn, unicode(param.value), unicode(t), origline,
                    origline))
             return origline
     return "{{name translit|%s|%s|%s|type=%s%s}}%s" % (
         thislangcode, source_lang_code, name, name_type,
         "|eq=%s" % eq if eq else "", period)
Beispiel #13
0
def process_page(page, index, parsed):
  """Modernize Bulgarian noun-form headword and definition templates.

  Works in three passes over the templates of `parsed`:
    1. {{bg-noun-form}} is converted to {{head|bg|noun form}}.
    2. {{bg-noun form of}} is converted to {{inflection of}}, and the
       single-purpose "... of" templates get their lemma replaced with an
       accented one; the matching accented form is then written to the
       preceding head template.
    3. Templates listed in `template_to_infl_codes` are converted to
       {{inflection of}}.

  Returns (new page text, list of change notes).
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  # Pass 1: {{bg-noun-form}} -> {{head|bg|noun form}}.
  for t in parsed.filter_templates():
    if tname(t) == "bg-noun-form":
      origt = unicode(t)
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "head"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      # 1= and 2= are discarded; head= and 3= are saved and re-added below
      # (3= becomes g=).
      rmparam(t, "1")
      rmparam(t, "2")
      head = getparam(t, "head")
      rmparam(t, "head")
      g = getparam(t, "3")
      rmparam(t, "3")
      blib.set_template_name(t, "head")
      t.add("1", "bg")
      t.add("2", "noun form")
      if head:
        t.add("head", head)
      else:
        # Only fall back to the page title when it needs no accent marks.
        if bglib.needs_accents(pagetitle):
          pagemsg("WARNING: Can't add head= to {{bg-noun-form}} missing it because pagetitle is multisyllabic: %s" %
              unicode(t))
        else:
          t.add("head", pagetitle)
      if g:
        t.add("g", g)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("replace {{bg-noun-form}} with {{head|bg|noun form}}")

  # Pass 2: walk head and inflection templates in page order. `headt` is the
  # most recent {{head|bg|noun form}}; inflection templates found after it
  # are used to fill in its accented head=.
  headt = None
  saw_infl_after_head = False
  saw_headt = False
  saw_inflt = False
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    # Slot name (e.g. "def_sg") identified for this template, or False.
    saw_infl = False
    already_fetched_forms = False
    if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") == "noun form":
      saw_headt = True
      if headt and not saw_infl_after_head:
        pagemsg("WARNING: Saw two head templates %s and %s without intervening inflection" % (
          unicode(headt), origt))
      saw_infl_after_head = False
      headt = t
    if tn == "bg-noun form of":
      saw_inflt = True
      if not headt:
        pagemsg("WARNING: Saw {{bg-noun form of}} without head template: %s" % origt)
        continue
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2", "3", "noun"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      saw_infl_after_head = True
      noun = getparam(t, "noun")
      if not noun:
        pagemsg("WARNING: Didn't see noun=: %s" % origt)
        continue
      # Translate the spelled-out 2=, 3= and 1= values into inflection tags,
      # in that order.
      infls = []
      param2 = getparam(t, "2")
      if param2 == "indefinite":
        infls.append("indef")
      elif param2 == "definite":
        infls.append("def")
      elif param2 == "vocative":
        infls.append("voc")
      elif param2:
        pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
        continue
      param3 = getparam(t, "3")
      if param3 == "subject":
        infls.append("sbjv")
      elif param3 == "object":
        infls.append("objv")
      elif param3:
        pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
        continue
      param1 = getparam(t, "1")
      if param1 == "singular":
        infls.append("s")
      elif param1 == "plural":
        infls.append("p")
      elif param1 == "count":
        infls.extend(["count", "form"])
      elif param1 == "vocative":
        infls.extend(["voc", "s"])
      else:
        pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
        continue
      blib.set_template_name(t, "inflection of")
      del t.params[:]
      t.add("1", "bg")
      lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
      if not lemma:
        pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
        t.add("2", noun)
      else:
        t.add("2", lemma)
      t.add("3", "")
      for i, infl in enumerate(infls):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{bg-noun form of}} to {{inflection of}}")
      tn = tname(t)
      # NOTE(review): the result is stored in `saw_infls`, but the
      # head-updating code further down tests `saw_infl`, which is still
      # False on this path, so that code never runs for this branch (and
      # `already_fetched_forms` is never consulted). Confirm whether
      # `saw_infl` was intended.
      saw_infls = infls_to_slot(infls)
      already_fetched_forms = True
      if not saw_infls:
        pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "inflection of" and getparam(t, "1") == "bg":
      saw_inflt = True
      # Collect the tags from 4= onward, stopping at the first empty one.
      infls = []
      i = 4
      while True:
        infl = getparam(t, str(i))
        if not infl:
          break
        infls.append(infl)
        i += 1
      # NOTE(review): same `saw_infls` / `saw_infl` mismatch as above; the
      # head template is not updated from this branch.
      saw_infls = infls_to_slot(infls)
      if not saw_infls:
        if "vnoun" in infls:
          pagemsg("Skipping verbal noun inflection %s: %s" % ("|".join(infls), origt))
        elif "part" in infls:
          pagemsg("Skipping participle inflection %s: %s" % ("|".join(infls), origt))
        else:
          pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
    elif tn == "definite singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_sg"
    elif tn == "indefinite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "ind_pl"
    elif tn == "definite plural of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "def_pl"
    elif tn == "vocative singular of" and getparam(t, "1") == "bg":
      saw_inflt = True
      saw_infl = "voc_sg"
    if saw_infl:
      if not already_fetched_forms:
        # Replace the lemma in 2= with its accented equivalent.
        noun = getparam(t, "2")
        lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
        if not lemma:
          pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
          continue
        t.add("2", lemma)
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("replace lemma with accented %s in {{%s}}" % (lemma, tn))
      if saw_infl == "def_sg":
        # "def_sg" has no slot of its own; it is only usable when the
        # definite subject and object singular forms agree.
        def_sub_sg = forms.get("def_sub_sg", None)
        def_obj_sg = forms.get("def_obj_sg", None)
        if def_sub_sg != def_obj_sg:
          pagemsg("WARNING: Inflection is def_sg but def_sub_sg %s != def_obj_sg %s" % (
            def_sub_sg, def_obj_sg))
          continue
        form = def_sub_sg
      else:
        form = forms.get(saw_infl, None)
      if not form:
        pagemsg("WARNING: Inflection is %s but couldn't find form among forms: %s" %
            (saw_infl, format_forms(forms)))
        continue
      # A slot value may hold several comma-separated variants; keep those
      # whose unaccented spelling is the page title.
      form = form.split(",")
      filtered_form = [f for f in form if bglib.remove_accents(f) == pagetitle]
      if not filtered_form:
        pagemsg("WARNING: No forms among %s=%s match page title" % (saw_infl, ",".join(form)))
        continue
      form = filtered_form
      # NOTE(review): headt may still be None here if no head template
      # preceded this inflection template — confirm fetch_param_chain copes.
      existing_form = blib.fetch_param_chain(headt, "head", "head")
      if existing_form:
        must_continue = False
        for f in existing_form:
          if bglib.remove_accents(f) != pagetitle:
            pagemsg("WARNING: Existing head %s doesn't match page title: %s" % (
              f, unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue
        needs_accents = [bglib.needs_accents(f) for f in existing_form]
        if any(needs_accents) and not all(needs_accents):
          pagemsg("WARNING: Some but not all existing heads missing accents: %s" %
              unicode(headt))
          continue
        if not any(needs_accents):
          # The heads are already accented: leave them, warning on mismatch.
          if existing_form != form:
            pagemsg("WARNING: For inflection %s, existing form(s) %s != new form(s) %s" % (
              saw_infl, ",".join(existing_form), ",".join(form)))
          continue
      origheadt = unicode(headt)
      blib.set_param_chain(headt, form, "head", "head")
      pagemsg("Replaced %s with %s" % (origheadt, unicode(headt)))
      notes.append("add accented form %s=%s to {{head|bg|noun form}}" % (saw_infl, ",".join(form)))

  if saw_headt and not saw_inflt:
    pagemsg("WARNING: Saw head template %s but no inflection template" % unicode(headt))

  # Pass 3: convert the templates listed in template_to_infl_codes to
  # {{inflection of}}, appending their tag codes from 4= onward.
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in template_to_infl_codes and getparam(t, "1") == "bg":
      must_continue = False
      for param in t.params:
        if pname(param) not in ["1", "2"]:
          pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
          must_continue = True
          break
      if must_continue:
        continue
      infl_codes = template_to_infl_codes[tn]
      blib.set_template_name(t, "inflection of")
      t.add("3", "")
      for i, infl in enumerate(infl_codes):
        t.add(str(i + 4), infl)
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{%s}} to {{inflection of}}" % tn)

  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Clean up Spanish noun and adjective headword templates on one page.

    Up to three independent conversions are applied to every template in
    ``text``:

    * ``{{es-noun}}``, when ``args.remove_redundant_noun_args`` is set:
      explicit plurals, masculines and feminines that equal what
      ``make_plural`` / ``make_masculine`` / ``make_feminine`` generate are
      removed or replaced with ``+`` (or ``+SPECIAL`` for an entry of
      ``all_specials``).
    * ``{{es-noun}}``, when ``args.make_multiword_plural_explicit`` is set:
      multiword lemmas, masculines and feminines lacking an explicit plural
      get one, obtained by expanding ``make_plural_noun`` from
      ``Module:es-headword``.
    * ``old_adj_template``: rewritten as ``{{es-adj}}`` or ``{{es-adj-inv}}``.

    Args:
        index: Page index, used only in log messages.
        pagetitle: Title of the page; titles containing ":" are skipped.
        text: Wikitext of the page.

    Returns:
        None if the page is skipped (no relevant template name in the text,
        or a title containing ":"); otherwise a tuple
        ``(new_text, notes)`` where ``notes`` lists the changes made.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []

    if old_adj_template not in text and "es-noun" not in text:
        return

    if ":" in pagetitle:
        pagemsg("Skipping non-mainspace title")
        return

    pagemsg("Processing")

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "es-noun" and args.remove_redundant_noun_args:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)
            if not getparam(t, "2") and (getparam(t, "pl2")
                                         or getparam(t, "pl3")):
                pagemsg("WARNING: Saw pl2= or pl3= without 2=: %s" %
                        unicode(t))
                continue
            g = getparam(t, "1")
            ms = blib.fetch_param_chain(t, "m", "m")
            space_in_m = any(" " in m for m in ms)
            mpls = blib.fetch_param_chain(t, "mpl", "mpl")
            if space_in_m and not mpls and not g.endswith("-p"):
                pagemsg(
                    "WARNING: Space in m=%s and old default noun algorithm applying"
                    % ",".join(ms))
            fs = blib.fetch_param_chain(t, "f", "f")
            space_in_f = any(" " in f for f in fs)
            fpls = blib.fetch_param_chain(t, "fpl", "fpl")
            if space_in_f and not fpls and not g.endswith("-p"):
                pagemsg(
                    "WARNING: Space in f=%s and old default noun algorithm applying"
                    % ",".join(fs))
            pls = blib.fetch_param_chain(t, "2", "pl")
            if not pls and not g.endswith("-p"):
                # No explicit plural: nothing to simplify on this template.
                if " " in lemma:
                    pagemsg(
                        "WARNING: Space in headword and old default noun algorithm applying"
                    )
                continue
            defpl = make_plural(lemma)
            if not defpl:
                continue
            if len(defpl) > 1:
                # Several default plurals: only collapse to '+' when the
                # explicit plurals are the same set or a proper subset.
                if set(pls) == set(defpl):
                    pls_with_def = ["+"]
                elif set(pls) < set(defpl):
                    pagemsg(
                        "WARNING: pls=%s subset of defpls=%s, replacing with default"
                        % (",".join(pls), ",".join(defpl)))
                    pls_with_def = ["+"]
                else:
                    pls_with_def = pls
            else:
                pls_with_def = ["+" if pl == defpl[0] else pl for pl in pls]

            # Look for a special plural algorithm reproducing the plurals.
            actual_special = None
            for special in all_specials:
                special_pl = make_plural(lemma, special)
                if special_pl is None:
                    continue
                if len(special_pl) > 1 and set(pls) < set(special_pl):
                    pagemsg(
                        "WARNING: for special=%s, pls=%s subset of special_pl=%s, allowing"
                        % (special, ",".join(pls), ",".join(special_pl)))
                    actual_special = special
                    break
                if set(pls) == set(special_pl):
                    pagemsg("Found special=%s with special_pl=%s" %
                            (special, ",".join(special_pl)))
                    actual_special = special
                    break

            if pls_with_def == ["+"]:
                notes.append("remove redundant plural%s %s from {{es-noun}}" %
                             ("s" if len(pls) > 1 else "", ",".join(pls)))
                blib.remove_param_chain(t, "2", "pl")
            elif actual_special:
                notes.append("replace plural%s %s with +%s in {{es-noun}}" %
                             ("s" if len(pls) > 1 else "", ",".join(pls),
                              actual_special))
                blib.set_param_chain(t, ["+" + actual_special], "2", "pl")
            elif pls_with_def != pls:
                notes.append(
                    "replace default plural %s with '+' in {{es-noun}}" %
                    ",".join(defpl))
                blib.set_param_chain(t, pls_with_def, "2", "pl")

            def handle_mf(mf, mf_full, make_mf):
                # Simplify the m=/f= chain named by `mf` ("m" or "f") and its
                # plural chain (mpl=/fpl=) on the current template `t`.
                # `mf_full` is the word used in notes; `make_mf` generates
                # the default (or special) masculine/feminine of a lemma.
                mfs = blib.fetch_param_chain(t, mf, mf)
                mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl")
                if not mfs or any(x.startswith("+") for x in mfs):
                    # Nothing given, or already uses a '+' indicator.
                    return
                defmf = make_mf(lemma)
                if set(mfs) == {defmf}:
                    # make_plural() can return None; treat as "no plurals".
                    defpls = make_plural(defmf) or []
                    ok = False
                    if not mfpls or set(mfpls) == set(defpls):
                        ok = True
                    elif set(mfpls) < set(defpls):
                        # The original format string had '%pl', an invalid
                        # directive that raised ValueError here.
                        pagemsg(
                            "WARNING: %spl=%s subset of default=%s, allowing"
                            % (mf, ",".join(mfpls), ",".join(defpls)))
                        ok = True
                    if ok:
                        notes.append(
                            "replace %s=%s with '+' in {{es-noun}}" %
                            (mf, ",".join(mfs)))
                        blib.set_param_chain(t, ["+"], mf, mf)
                        blib.remove_param_chain(t, mf + "pl", mf + "pl")
                        return
                actual_special = None
                for special in all_specials:
                    special_mf = make_mf(lemma, special)
                    if special_mf is None:
                        continue
                    if mfs == [special_mf]:
                        pagemsg("Found special=%s with special_mf=%s" %
                                (special, special_mf))
                        actual_special = special
                        break
                if actual_special:
                    if not mfpls:
                        pagemsg(
                            "WARNING: Explicit %s=%s matches special=%s but no %s plural"
                            % (mf, ",".join(mfs), actual_special, mf_full))
                    else:
                        special_mfpl = make_plural(special_mf,
                                                   actual_special)
                        if special_mfpl:
                            if len(special_mfpl) > 1 and set(mfpls) < set(
                                    special_mfpl):
                                pagemsg(
                                    "WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing"
                                    % (mf, ",".join(mfs), actual_special,
                                       mf, ",".join(mfpls), mf,
                                       ",".join(special_mfpl)))
                            elif set(mfpls) == set(special_mfpl):
                                pagemsg(
                                    "Found %s=%s and special=%s, %spls=%s matches special_%spl"
                                    % (mf, ",".join(mfs), actual_special,
                                       mf, ",".join(mfpls), mf))
                            else:
                                pagemsg(
                                    "WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s"
                                    % (mf, ",".join(mfs), actual_special,
                                       mf, ",".join(mfpls), mf,
                                       ",".join(special_mfpl)))
                                # Explicit plural disagrees: don't use the
                                # special after all.
                                actual_special = None
                    if actual_special:
                        notes.append(
                            "replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural"
                            % (mf_full, ",".join(mfs), actual_special,
                               mf_full))
                        blib.set_param_chain(t, ["+%s" % actual_special],
                                             mf, mf)
                        blib.remove_param_chain(t, mf + "pl", mf + "pl")
                if not actual_special:
                    mfs_with_def = ["+" if x == defmf else x for x in mfs]
                    if mfs_with_def != mfs:
                        notes.append(
                            "replace default %s %s with '+' in {{es-noun}}"
                            % (mf_full, defmf))
                        blib.set_param_chain(t, mfs_with_def, mf, mf)
                    if mfpls:
                        defpl = [
                            x for y in mfs for x in (make_plural(y) or [])
                        ]
                        ok = False
                        if set(defpl) == set(mfpls):
                            ok = True
                        elif len(defpl) > 1 and set(mfpls) < set(defpl):
                            pagemsg(
                                "WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing"
                                % (mf, ",".join(mfs), mf, ",".join(mfpls),
                                   ",".join(defpl)))
                            ok = True
                        if ok:
                            pagemsg(
                                "Found %s=%s, %spl=%s matches default pl" %
                                (mf, ",".join(mfs), mf, ",".join(mfpls)))
                            notes.append(
                                "remove redundant explicit %s plural %s in {{es-noun}}"
                                % (mf_full, ",".join(mfpls)))
                            blib.remove_param_chain(
                                t, mf + "pl", mf + "pl")
                        else:
                            for special in all_specials:
                                defpl = [
                                    x for y in mfs for x in (
                                        make_plural(y, special) or [])
                                ]
                                if set(defpl) == set(mfpls):
                                    pagemsg(
                                        "Found %s=%s, %spl=%s matches special=%s"
                                        % (mf, ",".join(mfs), mf,
                                           ",".join(mfpls), special))
                                    notes.append(
                                        "replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}"
                                        % (mf_full, ",".join(mfpls),
                                           special))
                                    blib.set_param_chain(
                                        t, ["+%s" % special], mf + "pl",
                                        mf + "pl")
                                    # First matching special wins, as in
                                    # the other special-matching loops;
                                    # avoids duplicate notes.
                                    break

            handle_mf("f", "feminine", make_feminine)
            handle_mf("m", "masculine", make_masculine)

            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))

        if tn == "es-noun" and args.make_multiword_plural_explicit:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)

            if " " in lemma and not getparam(t, "2"):
                g = getparam(t, "1")
                if not g.endswith("-p"):
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|%s|true}}" %
                        (lemma, g))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to multiword noun, make_plural_noun returned an empty string"
                        )
                        continue
                    plurals = explicit_pl.split(",")
                    blib.set_param_chain(t, plurals, "2", "pl")
                    notes.append("add explicit plural to multiword noun")
            ms = blib.fetch_param_chain(t, "m", "m")
            space_in_m = any(" " in m for m in ms)
            mpls = blib.fetch_param_chain(t, "mpl", "mpl")
            if space_in_m and not mpls:
                mpls = []
                for m in ms:
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|m|true}}" %
                        (blib.remove_links(m)))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to m=%s, make_plural_noun returned an empty string"
                            % m)
                        continue
                    mpls.extend(explicit_pl.split(","))
                blib.set_param_chain(t, mpls, "mpl", "mpl")
                notes.append("add explicit plural to m=%s" % ",".join(ms))
            fs = blib.fetch_param_chain(t, "f", "f")
            space_in_f = any(" " in f for f in fs)
            fpls = blib.fetch_param_chain(t, "fpl", "fpl")
            if space_in_f and not fpls:
                fpls = []
                for f in fs:
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|f|true}}" %
                        (blib.remove_links(f)))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to f=%s, make_plural_noun returned an empty string"
                            % f)
                        continue
                    fpls.extend(explicit_pl.split(","))
                blib.set_param_chain(t, fpls, "fpl", "fpl")
                notes.append("add explicit plural to f=%s" % ",".join(fs))
            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))

        if tn == old_adj_template:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)
            deff = make_feminine(pagetitle)
            # make_plural() can return None; treat that as "no default".
            defmpl = make_plural(pagetitle) or []

            # Feminine singular(s): `fullfs` holds the literal forms, `fs`
            # the abbreviated ones ('+' = default, '#' = same as lemma).
            fs = []
            fullfs = []
            f = getparam(t, "f") or pagetitle
            fullfs.append(f)
            if f == deff:
                f = "+"
            elif f == lemma:
                f = "#"
            fs.append(f)
            f2 = getparam(t, "f2")
            if f2:
                fullfs.append(f2)
                if f2 == deff:
                    # Was 'f2 == "+"', a no-op comparison.
                    f2 = "+"
                fs.append(f2)

            # Masculine plural(s).
            fullmpls = [
                getparam(t, "mpl") or getparam(t, "pl") or pagetitle + "s"
            ]
            mpl2 = getparam(t, "mpl2") or getparam(t, "pl2")
            if mpl2:
                fullmpls.append(mpl2)
            # should really check for subsequence but it never occurs
            if set(fullmpls) == set(defmpl):
                mpls = ["+"]
            elif set(fullmpls) < set(defmpl):
                pagemsg(
                    "WARNING: mpls=%s subset of defmpl=%s, replacing with default"
                    % (",".join(fullmpls), ",".join(defmpl)))
                mpls = ["+"]
            else:
                mpls = fullmpls
            mpls = ["#" if x == lemma else x for x in mpls]

            # Feminine plural(s).
            deffpl = [
                x for fem in fullfs for x in (make_plural(fem) or [])
            ]
            fullfpls = [
                getparam(t, "fpl") or getparam(t, "pl")
                or (getparam(t, "f") or pagetitle) + "s"
            ]
            fpl2 = getparam(t, "fpl2") or getparam(t, "pl2")
            if fpl2:
                fullfpls.append(fpl2)
            # should really check for subsequence but it never occurs
            if set(fullfpls) == set(deffpl):
                fpls = ["+"]
            elif set(fullfpls) < set(deffpl):
                pagemsg(
                    "WARNING: fpls=%s subset of deffpl=%s, replacing with default"
                    % (",".join(fullfpls), ",".join(deffpl)))
                fpls = ["+"]
            else:
                fpls = fullfpls
            fpls = ["#" if x == lemma else x for x in fpls]

            # Check whether one special algorithm reproduces all the forms.
            # Separate names are used so the defaults computed above
            # (defmpl, deffpl) are still intact for the comparison below.
            actual_special = None
            for special in all_specials:
                special_f = make_feminine(pagetitle, special)
                if special_f is None:
                    continue
                special_mpl = make_plural(pagetitle, special)
                special_fpl = make_plural(special_f, special)
                if (fullfs == [special_f] and fullmpls == special_mpl
                        and fullfpls == special_fpl):
                    actual_special = special
                    break

            head = getparam(t, "head")

            must_continue = False
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                if pn == "1" and pv in ["m", "mf"]:
                    pagemsg("WARNING: Extraneous param %s=%s in %s, ignoring" %
                            (pn, pv, unicode(t)))
                    continue
                if pn not in [
                        "head", "f", "f2", "pl", "pl2", "mpl", "mpl2", "fpl",
                        "fpl2"
                ]:
                    pagemsg("WARNING: Saw unrecognized param %s=%s in %s" %
                            (pn, pv, unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue

            # Rebuild the template from scratch in the new format.
            del t.params[:]
            if head:
                t.add("head", head)
            if fullfs == [pagetitle] and fullmpls == [
                    pagetitle
            ] and fullfpls == [pagetitle]:
                blib.set_template_name(t, "es-adj-inv")
            else:
                blib.set_template_name(t, "es-adj")
                if actual_special:
                    t.add("sp", actual_special)
                else:
                    if fs != ["+"]:
                        blib.set_param_chain(t, fs, "f", "f")

                    if mpls == fpls and ("+" not in mpls or defmpl == deffpl):
                        # masc and fem pl the same
                        if mpls != ["+"]:
                            blib.set_param_chain(t, mpls, "pl", "pl")
                    else:
                        if mpls != ["+"]:
                            blib.set_param_chain(t, mpls, "mpl", "mpl")
                        if fpls != ["+"]:
                            blib.set_param_chain(t, fpls, "fpl", "fpl")

            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                notes.append("convert {{%s}} to new {{%s}} format" %
                             (old_adj_template, tname(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))

    return unicode(parsed), notes
Beispiel #15
0
def process_text_on_page(index, pagetitle, text):
    """Fold the old-style params of every {{it-verb}} into a single 1= spec.

    The new spec is assembled as: auxiliaries (with a leading "avere"
    shortened to "a" and a leading "essere" to "e", colon-joined) followed
    by "/", then 1=, then ",2=" and ",3=" when present, then
    ".PARAM:VALUE" for each irregular-form param, then ".only3s" if impers=
    is set and ".only3sp" if only3sp= is set. Templates without 1= or with
    an unrecognized param are left untouched.

    Returns None if the text does not mention "it-verb"; otherwise a tuple
    of the new page text and the list of change notes.
    """

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if "it-verb" not in text:
        return

    notes = []
    irregparams = ["imperf", "fut", "sub", "impsub", "imp"]
    known_params = ["1", "2", "3", "aux", "impers", "only3sp"] + irregparams

    def colon_joined(value):
        # Split off footnotes, then rejoin the pieces with colons.
        return ":".join(split_with_footnotes(value))

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        origt = unicode(t)
        if tname(t) != "it-verb":
            continue

        pagemsg("Saw %s" % unicode(t))
        lemma_spec = getparam(t, "1")
        if not lemma_spec:
            continue

        # Auxiliaries come first, abbreviated, terminated by a slash.
        auxes = []
        for aux in split_with_footnotes(getparam(t, "aux") or "avere"):
            aux = re.sub("^avere", "a", aux)
            auxes.append(re.sub("^essere", "e", aux))
        pieces = [":".join(auxes) + "/", colon_joined(lemma_spec)]

        arg2 = getparam(t, "2")
        arg3 = getparam(t, "3")
        if arg2 or arg3:
            # 2= is emitted (possibly empty) whenever 3= is present so the
            # comma-separated positions stay aligned.
            pieces.append("," + colon_joined(arg2))
        if arg3:
            pieces.append("," + colon_joined(arg3))

        for irregparam in irregparams:
            value = getparam(t, irregparam)
            if value:
                pieces.append("." + irregparam + ":" + colon_joined(value))

        if getparam(t, "impers"):
            pieces.append(".only3s")
        if getparam(t, "only3sp"):
            pieces.append(".only3sp")

        for param in t.params:
            pn = pname(param)
            if pn not in known_params:
                pagemsg("WARNING: Unrecognized param %s=%s" %
                        (pn, unicode(param.value)))
                break
        else:
            # Every param was recognized: replace them all with the spec.
            del t.params[:]
            t.add("1", "".join(pieces))
            notes.append("convert {{it-verb}} params to new form")
            if unicode(t) != origt:
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def verify_template_is_full_line(tn, line):
        """Return the template on `line` if it is the line's entire content.

        `tn` is a single template name or a list/tuple of acceptable names.
        The line is stripped and parsed; its first template is returned only
        if its name is one of the accepted names and the template's text
        equals the whole stripped line. Otherwise a WARNING is logged through
        pagemsg() and None is returned.
        """
        line = line.strip()
        templates = list(blib.parse_text(line).filter_templates())
        # isinstance() rather than an exact type() check, so list subclasses
        # and tuples of names are treated as collections of names too.
        tns = list(tn) if isinstance(tn, (list, tuple)) else [tn]
        tntext = "/".join(tns)
        if not templates:
            pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" %
                    (tntext, line))
            return None
        t = templates[0]
        if tname(t) not in tns:
            pagemsg(
                "WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s"
                % (tntext, tntext, line))
            return None
        if unicode(t) != line:
            pagemsg(
                "WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s"
                % (tntext, tntext, line))
            return None
        return t

    notes = []

    if len(pagetitle) == 1 or pagetitle.endswith("-"):
        pagemsg("Page title is a single letter or a prefix, skipping")
        return

    retval = blib.find_modifiable_lang_section(
        text,
        None if args.partial_page else "Polish",
        pagemsg,
        force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    for k in xrange(1, len(subsections), 2):
        if re.search(r"==\s*Pronunciation\s*==", subsections[k]):
            secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation",
                               subsections[k])
            if secheader != subsections[k]:
                subsections[k] = secheader
                notes.append(
                    "remove extraneous spaces in ==Pronunciation== header")
            extra_notes = []
            parsed = blib.parse_text(subsections[k + 1])
            num_pl_IPA = 0
            saw_pl_p = False
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn in ["pl-p", "pl-pronunciation"]:
                    saw_pl_p = True
                    break
                if tn in ["pl-IPA", "pl-IPA-auto"]:
                    num_pl_IPA += 1
            if saw_pl_p:
                pagemsg("Already saw {{pl-p}}, skipping: %s" % unicode(t))
                continue
            if num_pl_IPA == 0:
                pagemsg(
                    "WARNING: Didn't see {{pl-IPA}} in Pronunciation section, skipping"
                )
                continue
            if num_pl_IPA > 1:
                pagemsg(
                    "WARNING: Saw multiple {{pl-IPA}} in Pronunciation section, skipping"
                )
                continue
            lines = subsections[k + 1].strip().split("\n")
            # Remove blank lines.
            lines = [line for line in lines if line]
            hyph_lines = []
            homophone_lines = []
            rhyme_lines = []
            audio_lines = []
            must_continue = False
            newtemp = None
            next_audio_param = 0
            has_respelling = False
            ipat = None
            for line in lines:
                origline = line
                # In case of "* {{pl-IPA|...}}", chop off the "* ".
                line = re.sub(r"^\*\s*(\{\{pl-IPA)", r"\1", line)
                if line.startswith("{{pl-IPA"):
                    if newtemp:
                        pagemsg(
                            "WARNING: Something wrong, already saw {{pl-IPA}}?: %s"
                            % origline)
                        must_continue = True
                        break
                    ipat = verify_template_is_full_line(
                        ["pl-IPA", "pl-IPA-auto"], line)
                    if ipat is None:
                        must_continue = True
                        break
                    newtemp_str = "{{pl-p}}"
                    newtemp = list(
                        blib.parse_text(newtemp_str).filter_templates())[0]
                    for param in ipat.params:
                        pn = pname(param)
                        pv = unicode(param.value)
                        if re.search("^[0-9]+$", pn):
                            has_respelling = True
                            newtemp.add(pn, pv, preserve_spacing=False)
                        elif re.search("^qual[0-9]*$", pn):
                            newtemp.add(pn.replace("qual", "q"),
                                        pv,
                                        preserve_spacing=False)
                        else:
                            pagemsg(
                                "WARNING: Unrecognized param %s=%s in {{pl-IPA}}, skipping: %s"
                                % (pn, pv, origline))
                            must_continue = True
                            break
                    if has_respelling:
                        pagemsg("WARNING: {{pl-IPA}} has respelling: %s" %
                                unicode(ipat))
                    if must_continue:
                        break
                    continue
                if not line.startswith("* ") and not line.startswith("*{"):
                    pagemsg(
                        "WARNING: Pronunciation section line doesn't start with '* ', skipping: %s"
                        % origline)
                    must_continue = True
                    break
                if line.startswith("* "):
                    line = line[2:]
                else:
                    line = line[1:]
                if line.startswith("{{hyph"):
                    hyph_lines.append(line)
                elif line.startswith("{{homophone") or line.startswith(
                        "{{hmp"):
                    homophone_lines.append(line)
                elif line.startswith("{{audio"):
                    audio_lines.append(line)
                elif line.startswith("{{rhyme"):
                    rhyme_lines.append(line)
                else:
                    pagemsg(
                        "WARNING: Unrecognized Pronunciation section line, skipping: %s"
                        % origline)
                    must_continue = True
                    break
            if has_respelling and (rhyme_lines or hyph_lines):
                rhyme_hyph = []
                if rhyme_lines:
                    rhyme_hyph.append("rhyme line(s) %s" %
                                      ",".join(rhyme_lines))
                if hyph_lines:
                    rhyme_hyph.append("hyphenation line(s) %s" %
                                      ",".join(hyph_lines))
                # We formerly skipped these pages, but [[User:Vininn126]] requested running the bot on them.
                pagemsg("WARNING: Has respelling %s along with %s" %
                        (ipat and unicode(ipat)
                         or "UNKNOWN", " and ".join(rhyme_hyph)))
                #continue
            if must_continue:
                continue

            if audio_lines:
                must_continue = False
                for audio_line in audio_lines:
                    audiot = verify_template_is_full_line("audio", audio_line)
                    if audiot is None:
                        must_continue = True
                        break
                    if getparam(audiot, "1") != "pl":
                        pagemsg(
                            "WARNING: Wrong language in {{audio}}, skipping: %s"
                            % audio_line)
                        must_continue = True
                        break
                    audiofile = getparam(audiot, "2")
                    audiogloss = getparam(audiot, "3")
                    for param in audiot.params:
                        pn = pname(param)
                        pv = unicode(param.value)
                        if pn not in ["1", "2", "3"]:
                            pagemsg(
                                "WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s"
                                % (pn, pv, audio_line))
                            must_continue = True
                            break
                    if must_continue:
                        break
                    if audiogloss in ["Audio", "audio"]:
                        audiogloss = ""
                    if not newtemp:
                        pagemsg(
                            "WARNING: Saw %s without {{pl-IPA}}, skipping: %s"
                            % (unicode(audiot), audio_line))
                        must_continue = True
                        break
                    next_audio_param += 1
                    if next_audio_param == 1:
                        paramsuf = ""
                    else:
                        paramsuf = str(next_audio_param)
                    newtemp.add("a%s" % paramsuf,
                                audiofile,
                                preserve_spacing=False)
                    if audiogloss:
                        newtemp.add("ac%s" % paramsuf,
                                    audiogloss,
                                    preserve_spacing=False)
                    pagemsg("Replacing %s with %s" %
                            (unicode(audiot), unicode(newtemp)))
                    extra_notes.append("incorporate %s into {{pl-p}}" %
                                       unicode(audiot))
                if must_continue:
                    continue

            if rhyme_lines:
                if len(rhyme_lines) > 1:
                    pagemsg("WARNING: Multiple rhyme lines, not removing: %s" %
                            ", ".join(rhyme_lines))
                    continue
                rhyme_line = rhyme_lines[0]
                rhymet = verify_template_is_full_line(["rhyme", "rhymes"],
                                                      rhyme_line)
                if not rhymet:
                    continue
                if getparam(rhymet, "1") != "pl":
                    pagemsg(
                        "WARNING: Wrong language in {{%s}}, not removing: %s" %
                        (tname(rhymet), rhyme_line))
                    continue
                pagemsg("Ignoring rhyme line: %s" % rhyme_line)
                extra_notes.append("remove rhyme template %s" %
                                   unicode(rhymet))

            if hyph_lines:
                if len(hyph_lines) > 1:
                    pagemsg(
                        "WARNING: Multiple hyphenation lines, not removing: %s"
                        % ", ".join(hyph_lines))
                    continue
                hyph_line = hyph_lines[0]
                hypht = verify_template_is_full_line(["hyph", "hyphenation"],
                                                     hyph_line)
                if not hypht:
                    continue
                if getparam(hypht, "1") != "pl":
                    pagemsg(
                        "WARNING: Wrong language in {{%s}}, not removing: %s" %
                        (tname(hypht), hyph_line))
                    continue
                pagemsg("Ignoring hyphenation line: %s" % hyph_line)
                extra_notes.append("remove hyphenation template %s" %
                                   unicode(hypht))

            if homophone_lines:
                next_homophone_param = 0
                must_continue = False
                for homophone_line in homophone_lines:
                    homophones = {}
                    homophone_qualifiers = {}
                    hmpt = verify_template_is_full_line(
                        ["hmp", "homophone", "homophones"], homophone_line)
                    if not hmpt:
                        must_continue = True
                        break
                    if getparam(hmpt, "1") != "pl":
                        pagemsg(
                            "WARNING: Wrong language in {{%s}}, not removing: %s"
                            % (tname(hmpt), homophone_line))
                        must_continue = True
                        break
                    for param in hmpt.params:
                        pn = pname(param)
                        pv = unicode(param.value)
                        if not re.search("^q?[0-9]+$", pn):
                            pagemsg(
                                "WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s"
                                % (pn, pv, tname(hmpt), homophone_line))
                            must_continue = True
                            break
                        if pn.startswith("q"):
                            homophone_qualifiers[int(pn[1:])] = pv
                        elif int(pn) > 1:
                            homophones[int(pn) - 1] = pv
                    if must_continue:
                        break
                    if not newtemp:
                        pagemsg(
                            "WARNING: Something wrong, saw %s without {{pl-IPA}}, skipping"
                            % unicode(hmpt))
                        must_continue = True
                        break
                    hhs = []
                    hhp_args = []
                    for pn, pv in sorted(homophones.items()):
                        next_homophone_param += 1
                        hmp_param = "" if next_homophone_param == 1 else str(
                            next_homophone_param)
                        hhs.append(pv)
                        if pn in homophone_qualifiers:
                            hhp_args.append(("hhp%s" % hmp_param,
                                             homophone_qualifiers[pn]))
                    if hhs:
                        newtemp.add("hh", ",".join(hhs))
                        for pn, pv in hhp_args:
                            newtemp.add(pn, pv, preserve_spacing=False)
                    pagemsg("Replacing %s with %s" %
                            (unicode(hmpt), unicode(newtemp)))
                    extra_notes.append("incorporate homophones into {{pl-p}}")
                if must_continue:
                    continue

            pagemsg("Replaced %s with %s" % (unicode(ipat), unicode(newtemp)))

            all_lines = "\n".join([unicode(newtemp)])
            newsubsec = "%s\n\n" % all_lines
            if subsections[k + 1] != newsubsec:
                this_notes = ["convert {{pl-IPA}} to {{pl-p}}"] + extra_notes
                notes.extend(this_notes)
            subsections[k + 1] = newsubsec

    secbody = "".join(subsections)
    # Strip extra newlines added to secbody
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
Beispiel #17
0
def process_text_on_page(index, pagetitle, text):
    """Convert {{autocat}} and {{ja-readingcat}}/{{ryu-readingcat}} to {{auto cat}}.

    {{autocat}} is simply renamed. For the reading-category templates, the
    language, kanji and reading are re-derived from the page title and the
    template is converted only if they agree with the template name and its
    1=/2= params; the remaining numbered params are then renumbered from 1.

    index and pagetitle are only used to prefix log messages; text is the
    page text to process.

    Returns a tuple of the (possibly modified) page text and the list of
    change notes.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "autocat":
            blib.set_template_name(t, "auto cat")
            notes.append("{{autocat}} -> {{auto cat}}")
        elif tn in ["ja-readingcat", "ryu-readingcat"]:
            m = re.search(
                "^Category:(Japanese|Okinawan) terms spelled with (.*?) read as (.*)$",
                pagetitle)
            if not m:
                pagemsg("WARNING: Can't parse page title")
                continue
            langname, kanji, reading = m.groups()
            auto_lang = "ja" if langname == "Japanese" else "ryu"
            # The template name prefix ("ja" or "ryu") must agree with the
            # language named in the category title.
            t_lang = re.sub("-.*", "", tn)
            if t_lang != auto_lang:
                pagemsg(
                    "WARNING: Auto-determined lang code %s for language name %s != template specified %s: %s"
                    % (auto_lang, langname, t_lang, unicode(t)))
                continue
            t_kanji = getparam(t, "1").strip()
            t_reading = getparam(t, "2").strip()
            if t_kanji != kanji:
                pagemsg(
                    "WARNING: Auto-determined kanji %s != template specified %s: %s"
                    % (kanji, t_kanji, unicode(t)))
                continue
            if t_reading != reading:
                pagemsg(
                    "WARNING: Auto-determined reading %s != template specified %s: %s"
                    % (reading, t_reading, unicode(t)))
                continue
            # Collect the values of numbered params 3 and up; any
            # non-numeric param makes us leave the template alone.
            numbered_params = []
            must_continue = False
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                if pn in ["1", "2"]:
                    pass
                elif re.search("^[0-9]+$", pn):
                    numbered_params.append(pv)
                else:
                    pagemsg(
                        "WARNING: Saw unknown non-numeric param %s=%s, skipping: %s"
                        % (pn, pv, unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue
            if not numbered_params:
                pagemsg("WARNING: No reading types given, skipping: %s" %
                        unicode(t))
                continue
            blib.set_template_name(t, "auto cat")
            del t.params[:]
            # Do not reuse the name `index` for the loop counter: pagemsg()
            # closes over the `index` parameter, so rebinding it here would
            # corrupt the page index in every subsequent log message.
            for param_num, numbered_param in enumerate(numbered_params, 1):
                t.add(str(param_num), numbered_param, preserve_spacing=False)
            notes.append("convert {{%s}} to {{auto cat}}" % tn)

        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
Beispiel #18
0
def process_text_on_page(index, pagetitle, text):
    """Replace old-style German verb headword templates with {{de-verb}}.

    Each {{de-verb-old}}, {{de-verb-strong}}, {{de-verb-weak}} or
    {{head|de|verb}} headword is paired with the next {{de-conj}} on the page.
    For weak and strong headwords, the forms given in the headword are
    compared against those generated from the {{de-conj}} arguments; if any
    disagree, that headword is left alone. Otherwise the headword is rewritten
    as {{de-verb}}, carrying over 1= of {{de-conj}} if present.

    index and pagetitle are only used to prefix log messages; text is the
    page text to process.

    Returns a tuple of the (possibly modified) page text and the list of
    change notes, or None when the whole page is skipped (two headwords
    without an intervening {{de-conj}}, a {{de-conj}} without headword, an
    unknown headword param, or an unrecognized auxiliary= value).
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    global args

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def canonicalize_existing(forms):
        # Turn " or " (optionally wrapped in quote marks) into a comma, split
        # on commas, drop empty pieces and pass the rest through
        # blib.remove_links().
        forms = [re.sub(" '*or'* ", ",", form) for form in forms]
        pieces = [piece for form in forms for piece in form.split(",")]
        return [blib.remove_links(piece) for piece in pieces if piece]

    def compare(old, new, entities_compared, head_template, conj_template):
        # An empty `old` means the headword did not specify these forms, so
        # there is nothing to contradict. Order is ignored in the comparison.
        if not old or set(old) == set(new):
            return True
        pagemsg(
            "WARNING: Old %s %s disagree with new %s %s: head=%s, decl=%s"
            % (entities_compared, ",".join(old), entities_compared,
               ",".join(new), unicode(head_template), unicode(conj_template)))
        return False

    def fetch_aux(head_template):
        # Return the list of auxiliaries specified by the headword (possibly
        # empty), or None after logging a warning if auxiliary= is
        # unrecognized. 4= is consulted only when auxiliary= is absent.
        aux = getparam(head_template, "auxiliary")
        if aux in ["haben", "sein"]:
            return [aux]
        if aux == "both":
            return ["haben", "sein"]
        if aux:
            pagemsg(
                "WARNING: Unrecognized auxiliary=%s, skipping: %s" %
                (aux, unicode(head_template)))
            return None
        param4 = getparam(head_template, "4")
        if param4 in ["h", "haben"]:
            return ["haben"]
        if param4 in ["s", "sein"]:
            return ["sein"]
        return []

    # Params permitted on each kind of headword; any other headword type
    # (i.e. {{de-verb-old}}) may only carry head=.
    allowed_params_by_headword = {
        "head": ["1", "2", "head"],
        "de-verb-weak": ["1", "2", "3", "auxiliary", "cat"],
        "de-verb-strong": [
            "1", "2", "3", "class", "class 2", "pres 2",
            "pres 2 qual", "past 2", "past 2 qual",
            "past participle 2", "past participle 2 qual",
            "past subjunctive", "past subjunctive 2",
            "past subjunctive 2 qual", "auxiliary", "cat"
        ],
    }

    # For each headword type that gets verified: an ordered list of
    # (headword params holding the old forms, key in the generated forms,
    # description used in the warning). A params entry of None stands for the
    # auxiliaries returned by fetch_aux().
    comparisons_by_headword = {
        "de-verb-weak": [
            (["1"], "pres_3s", "pres 3sgs"),
            (["2"], "pret_3s", "pasts"),
            (["3"], "perf_part", "pp's"),
            (None, "aux", "auxes"),
        ],
        "de-verb-strong": [
            (["1", "pres 2"], "pres_3s", "pres 3sgs"),
            (["2", "past 2"], "pret_3s", "pasts"),
            (["3", "past participle 2"], "perf_part", "pp's"),
            (["past subjunctive", "past subjunctive 2"], "subii_3s",
             "past subjs"),
            (None, "aux", "auxes"),
            (["class", "class 2"], "class", "classes"),
        ],
    }

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    headt = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if (tn in ["de-verb-old", "de-verb-strong", "de-verb-weak"]
                or tn == "head" and getparam(t, "1") == "de"
                and getparam(t, "2") == "verb"):
            if headt:
                pagemsg(
                    "WARNING: Encountered headword twice without declension: old %s, current %s"
                    % (unicode(headt), unicode(t)))
                return
            headt = t
            headtn = tn
        if tn != "de-conj":
            continue

        if not headt:
            pagemsg("WARNING: Encountered conj without headword: %s" %
                    unicode(t))
            return

        # Work on a copy so the shared table is never mutated.
        allowed_params = list(
            allowed_params_by_headword.get(headtn, ["head"]))
        # 4= is tolerated only when it holds an auxiliary abbreviation.
        if getparam(headt, "4") in ["h", "haben", "s", "sein"]:
            allowed_params.append("4")
        for param in headt.params:
            pn = pname(param)
            pv = unicode(param.value)
            # Params with empty values are ignored.
            if pv and pn not in allowed_params:
                pagemsg("WARNING: Encountered unknown param %s=%s in %s" %
                        (pn, pv, unicode(headt)))
                return

        comparisons = comparisons_by_headword.get(headtn)
        if comparisons:
            generate_template = re.sub(
                r"^\{\{de-conj(?=[|}])",
                "{{User:Benwing2/de-generate-verb-props", unicode(t))
            result = expand_text(generate_template)
            if not result:
                # NOTE(review): headt stays set here, so a later headword on
                # the page triggers the "headword twice" warning -- confirm
                # this is intended.
                continue
            forms = blib.split_generate_args(result)
            aux = fetch_aux(headt)
            if aux is None:
                return
            forms_agree = True
            for head_params, form_key, description in comparisons:
                if head_params is None:
                    old_forms = aux
                else:
                    old_forms = canonicalize_existing(
                        [getparam(headt, hp) for hp in head_params])
                new_forms = forms.get(form_key, "-").split(",")
                if not compare(old_forms, new_forms, description, headt, t):
                    # Stop at the first disagreement, which has been logged.
                    forms_agree = False
                    break
            if not forms_agree:
                headt = None
                continue

        # Log the change on the headword itself: it is the template being
        # rewritten, while the current template `t` ({{de-conj}}) is untouched.
        orig_headt = unicode(headt)
        del headt.params[:]
        blib.set_template_name(headt, "de-verb")
        arg1 = getparam(t, "1")
        if arg1:
            headt.add("1", arg1)
        notes.append("replace {{%s|...}} with new-style {{de-verb%s}}" %
                     ("head|de|verb" if headtn == "head" else headtn,
                      "|" + arg1 if arg1 else ""))
        pagemsg("Replaced <%s> with <%s>" % (orig_headt, unicode(headt)))
        headt = None

    return unicode(parsed), notes
Beispiel #19
0
def process_text_on_page(index, pagetitle, text):
    """Convert {{autocat}} and affix category templates to {{auto cat}}.

    Handles {{prefix cat}}, {{suffix cat}}, {{circumfix cat}}, {{infix cat}}
    and {{interfix cat}}. The language, part of speech, affix type, term and
    optional ID are parsed out of the page title and checked against the
    template's params; the template is converted only when they agree and no
    unrecognized params are present. Of the old params, only alt (when it
    differs from the term), tr=, sort= (when it yields a different sort key
    than the term) and sc= are carried over.

    index and pagetitle are only used to prefix log messages; text is the
    page text to process.

    Returns a tuple of the (possibly modified) page text and the list of
    change notes.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    global args

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def make_entry_name(lang, term_text):
        # Expand the makeEntryName invocation for term_text in lang.
        return expand_text(
            "{{#invoke:languages/templates|makeEntryName|%s|%s}}" %
            (lang, term_text))

    def make_sort_key(lang, entry_name):
        # Expand the makeSortKey invocation for entry_name in lang.
        return expand_text(
            "{{#invoke:languages/templates|getByCode|%s|makeSortKey|%s}}"
            % (lang, entry_name))

    def add_missing_hyphens(alt, term, affixtype):
        # Give `alt` the same leading and/or trailing hyphen character that
        # `term` (the term from the page title) has, if `alt` lacks it. A
        # leading "*" on either string is set aside while comparing and put
        # back in front of the result.
        hyph_c = "([" + possible_hyphens + "])"
        m = re.search(r"^(\*)(.*)$", alt)
        if m:
            alt_star, altbase = m.groups()
        else:
            alt_star, altbase = "", alt
        m = re.search(r"^(\*)(.*)$", term)
        termbase = m.group(2) if m else term
        if affixtype == "suffix":
            m = re.search("^" + hyph_c, termbase)
            if m:
                initial_hyphen = m.group(1)
                if not altbase.startswith(initial_hyphen):
                    alt = alt_star + initial_hyphen + altbase
        elif affixtype == "prefix":
            m = re.search(hyph_c + "$", termbase)
            if m:
                final_hyphen = m.group(1)
                if not altbase.endswith(final_hyphen):
                    alt = alt_star + altbase + final_hyphen
        elif affixtype in ["infix", "interfix"]:
            m = re.search("^" + hyph_c + ".*" + hyph_c + "$", termbase)
            if m:
                initial_hyphen, final_hyphen = m.groups()
                if not altbase.startswith(initial_hyphen):
                    altbase = initial_hyphen + altbase
                if not altbase.endswith(final_hyphen):
                    altbase = altbase + final_hyphen
                alt = alt_star + altbase
        return alt

    # Every param the old affix category templates may carry; all of them are
    # removed before the surviving ones are re-added.
    all_existing_params = ["1", "2", "3", "tr", "pos", "id", "sc", "sort"]

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "autocat":
            blib.set_template_name(t, "auto cat")
            notes.append("{{autocat}} -> {{auto cat}}")
        elif tn in [
                "prefix cat", "suffix cat", "circumfix cat", "infix cat",
                "interfix cat"
        ]:
            m = re.search("^Category:(.*) ([a-z]+) ([a-z]+fix)ed with (.*)$",
                          pagetitle)
            if not m:
                pagemsg("WARNING: Can't parse page title")
                continue
            langname, pos, affixtype, term_and_id = m.groups()
            # A trailing parenthesized part of the title is the ID.
            m = re.search(r"^(.*?) \((.*)\)$", term_and_id)
            if m:
                term, term_id = m.groups()
            else:
                term, term_id = term_and_id, ""
            t_lang = getparam(t, "1")
            t_term = getparam(t, "2")
            t_alt = getparam(t, "3")
            t_pos = getparam(t, "pos")
            t_id = getparam(t, "id")
            t_tr = getparam(t, "tr")
            t_sort = getparam(t, "sort")
            t_sc = getparam(t, "sc")
            if langname not in blib.languages_byCanonicalName:
                pagemsg("WARNING: Unrecognized language name: %s" % langname)
                continue
            auto_lang = blib.languages_byCanonicalName[langname]["code"]
            if auto_lang != t_lang:
                pagemsg(
                    "WARNING: Auto-determined code %s for language name %s != manually specified %s"
                    % (auto_lang, langname, t_lang))
                continue
            # Strip the trailing " cat" from the template name.
            t_affixtype = tn[:-4]
            if t_affixtype != affixtype:
                pagemsg(
                    "WARNING: Auto-determined affix type %s != manually specified %s"
                    % (affixtype, t_affixtype))
                continue

            orig_t_term = t_term
            t_term = add_missing_hyphens(t_term, term, affixtype)
            already_checked_t_alt = False
            if t_term != term:
                # The template's term differs from the title; accept it only
                # if its entry name is the title term, and then keep it as
                # the alt form.
                manual_entry_name = make_entry_name(t_lang, t_term)
                if manual_entry_name != term:
                    pagemsg(
                        "WARNING: Can't match manually specified term %s (originally %s, entry name %s) to auto-determined term %s"
                        % (t_term, orig_t_term, manual_entry_name, term))
                    continue
                if t_alt:
                    pagemsg(
                        "WARNING: Manually specified term %s has extra diacritics and alt=%s also specified, skipping"
                        % (t_term, t_alt))
                    continue
                t_alt = t_term
                already_checked_t_alt = True
            if t_id != term_id:
                pagemsg(
                    "WARNING: Auto-determined ID %s != manually specified %s" %
                    (term_id, t_id))
                continue
            # For "words" in the title, pos= may be empty, "word" or "words";
            # otherwise pos= must equal the title's part of speech or be its
            # singular (plural formed with "s", or "es" after a final "x").
            if (pos == "words" and t_pos not in ["", "word", "words"]
                    or pos != "words" and t_pos != pos and t_pos + "s" != pos
                    and (not t_pos.endswith("x") or t_pos + "es" != pos)):
                pagemsg(
                    "WARNING: Auto-determined pos %s doesn't match manually specified %s"
                    % (pos, t_pos))
                continue
            if t_alt and not already_checked_t_alt:
                orig_t_alt = t_alt
                t_alt = add_missing_hyphens(t_alt, term, affixtype)
                manual_entry_name = make_entry_name(t_lang, t_alt)
                if manual_entry_name != term:
                    pagemsg(
                        "WARNING: Can't match manually specified alt %s (originally %s, entry name %s) to auto-determined term %s"
                        % (t_alt, orig_t_alt, manual_entry_name, term))
                    continue
            if t_sort:
                # Drop sort= when it produces the same sort key as the term.
                auto_entry_name = make_entry_name(t_lang, term)
                autosort = make_sort_key(t_lang, auto_entry_name)
                manual_entry_name = make_entry_name(
                    t_lang, add_missing_hyphens(t_sort, term, affixtype))
                manual_sort = make_sort_key(t_lang, manual_entry_name)
                if manual_sort != autosort:
                    pagemsg(
                        "Keeping sort key %s because canonicalized sort key %s based on it not same as canonicalized sort key %s based on term %s"
                        % (t_sort, manual_sort, autosort, term))
                else:
                    pagemsg(
                        "Discarding sort key %s because canonicalized sort key %s based on it same as canonicalized sort key based on term %s"
                        % (t_sort, manual_sort, term))
                    t_sort = ""

            must_continue = False
            for param in t.params:
                pn = pname(param)
                if pn not in all_existing_params:
                    pagemsg(
                        "WARNING: Unrecognized param %s=%s in affix cat: %s" %
                        (pn, unicode(param.value), unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue
            for old_param in all_existing_params:
                rmparam(t, old_param)
            blib.set_template_name(t, "auto cat")
            if t_alt:
                if t_alt == term:
                    pagemsg(
                        "Not adding alt=%s because it's the same as the term" %
                        t_alt)
                else:
                    t.add("alt", t_alt)
            if t_tr:
                t.add("tr", t_tr)
            if t_sort:
                t.add("sort", t_sort)
            if t_sc:
                t.add("sc", t_sc)
            notes.append("convert {{%s}} to {{auto cat}}" % tn)

        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
Beispiel #20
0
def process_text_on_page(index, pagetitle, text):
    """Convert raw {{IPA|it}} templates to {{it-pr}} and cross-check {{rhymes|it}}.

    The section to edit is obtained from blib.find_modifiable_lang_section(),
    asking for "Italian" (or passing None when args.partial_page is set). The
    section body is split into subsections and every ==Pronunciation==
    subsection is processed in two passes:

    1. Each {{IPA|it}} template has its pronunciations converted to
       respellings (ipa_to_respelling() + hack_respelling()). The respellings
       are sanity-checked against the page title. If nothing blocking was
       found, or if ipa_directives has an entry for the page, the template is
       rewritten in place as {{it-pr|...}}; otherwise only a log line is
       emitted.
    2. The last {{rhyme|it}}/{{rhymes|it}} template (if any) is converted to
       candidate respellings via rhyme_to_spelling(). When no pronunciation
       template was seen, these are either logged or, if rhyme_directives has
       an entry for the page, used to prepend a new {{it-pr}} line. When a
       pronunciation template was seen, they are only compared against the
       pronunciation-based respellings and mismatches are logged.

    If any parameter named n, n2, ... was added to an {{it-pr}} template, a
    ===References=== section containing <references /> is ensured.

    Args:
        index: Page index, used only in log messages.
        pagetitle: Title of the page; respellings are validated against it.
        text: Full text of the page.

    Returns:
        None if no modifiable section was found; otherwise a tuple
        (new_page_text, notes) where notes is a list of change descriptions.
    """
    global args

    def pagemsg(txt):
        # Prefix every log line with the page index and title.
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    retval = blib.find_modifiable_lang_section(
        text,
        None if args.partial_page else "Italian",
        pagemsg,
        force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    # The capturing group keeps the header lines in the result, so headers end
    # up at odd indices and the corresponding bodies at the following even
    # indices.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    # Set when an {{it-pr}} parameter named n, n2, ... is added (see below).
    need_ref_section = False

    for k in xrange(2, len(subsections), 2):
        if "==Pronunciation==" in subsections[k - 1]:
            parsed = blib.parse_text(subsections[k])

            # Collect all Italian pronunciation templates up front; used by
            # tmsg() to show the sibling templates in log messages.
            all_pronun_templates = []
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn == "it-pr" or tn == "IPA" and getparam(t, "1") == "it":
                    all_pronun_templates.append(t)

            # True once either {{it-pr}} or {{IPA|it}} has been encountered.
            saw_it_pr = False
            # Respellings taken from {{it-pr}} or derived from {{IPA|it}};
            # compared against rhyme-based respellings further down.
            pronun_based_respellings = []
            for t in parsed.filter_templates():
                origt = unicode(t)

                def tmsg(txt):
                    # Log txt followed by the original text of the current
                    # template and, if there are any, the text of the other
                    # pronunciation templates in this subsection.
                    other_templates = []
                    for t in all_pronun_templates:
                        thist = unicode(t)
                        if thist != origt:
                            other_templates.append(thist)
                    pagemsg("%s: %s%s" % (txt, origt, ", other templates %s" %
                                          ", ".join(other_templates)
                                          if len(other_templates) > 0 else ""))

                tn = tname(t)
                if tn == "it-pr":
                    saw_it_pr = True
                    respellings = blib.fetch_param_chain(t, "1")
                    # FIXME, need to split on comma
                    pronun_based_respellings.extend(respellings)
                    # NOTE(review): this `break` ends the scan of the whole
                    # subsection, so templates after the first {{it-pr}} are
                    # not examined -- confirm that this is intended.
                    break
                if tn == "IPA" and getparam(t, "1") == "it":
                    saw_it_pr = True
                    pronuns = blib.fetch_param_chain(t, "2")
                    # State for pairing each phonemic (/.../) pronunciation
                    # with a following phonetic ([...]) one.
                    this_phonemic_pronun = None
                    this_phonemic_respelling = None
                    this_phonetic_pronun = None
                    this_phonetic_respelling = None
                    # Distinct respellings generated for this template, in
                    # order of first appearance.
                    respellings = []
                    # all_warnings is the final, ordered list that gets
                    # logged; the other two lists hold warnings that are
                    # pending until append_warnings() flushes them.
                    all_warnings = []
                    hack_respelling_warnings = []
                    main_warnings = []
                    # One-element list so that the nested set_unable() can
                    # mutate the flag. True means the template must not be
                    # converted automatically.
                    unable = [False]
                    for pronun in pronuns:
                        respelling = ipa_to_respelling(pronun)
                        respelling, this_hack_respelling_warnings = hack_respelling(
                            pagetitle, respelling)
                        hack_respelling_warnings.extend(
                            this_hack_respelling_warnings)

                        def set_unable(msg):
                            # Queue a blocking warning and mark the template
                            # as not convertible.
                            main_warnings.append(msg)
                            unable[0] = True

                        tmsg("For pronun %s, generated respelling %s" %
                             (pronun, respelling))
                        # Check 1: every word that is not a prefix (trailing
                        # hyphen) and has more than one vowel letter must
                        # carry an accented vowel.
                        respelling_words = respelling.split(" ")
                        for rw in respelling_words:
                            if rw.endswith("-"):  # prefix
                                continue
                            hacked_rw = re.sub(
                                u".[\u0323\u0331]", "e", rw
                            )  # pretend vowels with secondary or no stress are 'e'
                            if not re.search(
                                    u"[àèéìòóùÀÈÉÌÒÓÙ]", hacked_rw) and len(
                                        re.sub("[^aeiouAEIOU]", "",
                                               hacked_rw)) > 1:
                                set_unable(
                                    "WARNING: For respelling %s for pronun %s, word %s is missing stress"
                                    % (respelling, pronun, rw))
                        # Check 2: the respelling may only contain the
                        # characters listed in the class below.
                        if not re.search(u"^[a-zA-ZàèéìòóùÀÈÉÌÒÓÙ. ʒʃ\[\]-]+$",
                                         respelling):
                            set_unable(
                                "WARNING: Strange char in respelling %s for pronun %s"
                                % (respelling, pronun))
                        else:
                            # Check 3: reconstruct a page title from the
                            # respelling and compare it with the real one.
                            # Accented vowels followed by a non-space
                            # character are mapped through
                            # vowel_respelling_to_spelling; accented vowels
                            # before a space or at the end of the string are
                            # left untouched.
                            putative_pagetitle = re.sub(
                                u"([àèéìòóùÀÈÉÌÒÓÙ])([^ ])",
                                lambda m: vowel_respelling_to_spelling[m.group(
                                    1)] + m.group(2), respelling)
                            pagetitle_words = pagetitle.split(" ")
                            putative_pagetitle_words = putative_pagetitle.split(
                                " ")
                            if len(pagetitle_words) != len(
                                    putative_pagetitle_words):
                                set_unable(
                                    "WARNING: Page title has %s words but putative page title %s has %s words"
                                    %
                                    (len(pagetitle_words), putative_pagetitle,
                                     len(putative_pagetitle_words)))
                            else:
                                hacked_putative_pagetitle_words = []
                                for ptw, puptw in zip(
                                        pagetitle_words,
                                        putative_pagetitle_words):
                                    # Split the title word on runs of z and
                                    # the putative word on ts/tts/dz/ddz.
                                    # Because of the capturing groups, the
                                    # matched delimiters sit at odd indices.
                                    split_ptw = re.split("([Zz]+)", ptw)
                                    split_puptw = re.split(
                                        "([Tt]?[Tt]s|[Dd]?[Dd]z)", puptw)
                                    if len(split_ptw) != len(split_puptw):
                                        set_unable(
                                            "WARNING: Different # of z's in pagetitle word %s vs. (t)ts/(d)dz's in putative pagetitle word %s"
                                            % (ptw, puptw))
                                        hacked_putative_pagetitle_words.append(
                                            puptw)
                                    else:
                                        # Keep the putative word's text but
                                        # substitute the title's z-runs for
                                        # its (t)ts/(d)dz sequences, so that
                                        # this difference is ignored in the
                                        # comparison below.
                                        parts = []
                                        for i in xrange(len(split_puptw)):
                                            if i % 2 == 0:
                                                parts.append(split_puptw[i])
                                            else:
                                                parts.append(split_ptw[i])
                                        hacked_putative_pagetitle_words.append(
                                            "".join(parts))
                                putative_pagetitle = " ".join(
                                    hacked_putative_pagetitle_words)
                                if putative_pagetitle != pagetitle:
                                    # If respelling already seen, we already warned about it.
                                    if respelling in respellings:
                                        assert unable[0]
                                    else:
                                        set_unable(
                                            "WARNING: Respelling %s doesn't match page title (putative page title %s, pronun %s)"
                                            % (respelling, putative_pagetitle,
                                               pronun))

                        def append_respelling(respelling):
                            # Record the respelling once, preserving order.
                            if respelling not in respellings:
                                respellings.append(respelling)

                        def append_warnings(warning):
                            # Append the given warning (if any), then flush
                            # and clear both pending warning lists into
                            # all_warnings.
                            if warning:
                                all_warnings.append(warning)
                            for warning in hack_respelling_warnings:
                                all_warnings.append(warning)
                            del hack_respelling_warnings[:]
                            for warning in main_warnings:
                                all_warnings.append(warning)
                            del main_warnings[:]

                        append_respelling(respelling)
                        # Pair up phonemic and phonetic pronunciations.
                        if pronun.startswith("/"):
                            # Phonemic. A preceding unpaired phonemic pronun
                            # only produces a warning; `unable` is not set
                            # here.
                            if this_phonemic_pronun is not None:
                                append_warnings(
                                    "WARNING: Saw two phonemic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonetic pronun"
                                    % (this_phonemic_pronun,
                                       this_phonemic_respelling, pronun,
                                       respelling))
                            this_phonemic_pronun = pronun
                            this_phonemic_respelling = respelling
                            this_phonetic_pronun = None
                            this_phonetic_respelling = None
                        elif pronun.startswith("["):
                            # Phonetic.
                            if this_phonemic_pronun is None:
                                # No pending phonemic pronun to pair with.
                                if this_phonetic_pronun is not None:
                                    unable[0] = True
                                    append_warnings(
                                        "WARNING: Saw two phonetic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonemic pronun"
                                        % (this_phonetic_pronun,
                                           this_phonetic_respelling, pronun,
                                           respelling))
                                else:
                                    append_warnings(
                                        "WARNING: Saw phonetic pronun %s (respelling %s) without preceding phonemic pronun"
                                        % (pronun, respelling))
                                this_phonetic_pronun = pronun
                                this_phonetic_respelling = respelling
                            elif this_phonemic_respelling != respelling:
                                unable[0] = True
                                append_warnings(
                                    "WARNING: Phonemic respelling %s (pronun %s) differs from phonetic respelling %s (pronun %s)"
                                    %
                                    (this_phonemic_respelling,
                                     this_phonemic_pronun, respelling, pronun))
                            else:
                                # Phonemic and phonetic respellings agree;
                                # just flush whatever warnings are pending.
                                if unable[0] and len(main_warnings) > 0:
                                    # `unable` could be set from a previous pronunciation but no main warnings this time around
                                    # because the previously generated warnings have already been appended to all_warnings.
                                    mesg = main_warnings[0]
                                    del main_warnings[0]
                                    append_warnings(mesg)
                                else:
                                    append_warnings(None)
                            # The pending phonemic pronun (if any) is now
                            # consumed.
                            this_phonemic_pronun = None
                            this_phonemic_respelling = None
                        else:
                            unable[0] = True
                            append_warnings(
                                "WARNING: Pronun %s (respelling %s) not marked as phonemic or phonetic"
                                % (pronun, respelling))
                    # A trailing phonemic pronun without a phonetic partner
                    # is warned about but does not set `unable`.
                    if this_phonemic_pronun is not None:
                        append_warnings(
                            "WARNING: Saw phonemic pronun %s (respelling %s) without corresponding phonetic pronun"
                            % (this_phonemic_pronun, this_phonemic_respelling))
                    # Only numbered params and nocount= are accepted on the
                    # {{IPA}} template; anything else blocks conversion.
                    if not unable[0]:
                        for param in t.params:
                            pn = pname(param)
                            if not re.search("^[0-9]+$",
                                             pn) and pn != "nocount":
                                unable[0] = True
                                append_warnings(
                                    "WARNING: Saw unrecognized param %s=%s" %
                                    (pn, unicode(param.value)))
                    manual_assist = ""
                    if unable[0]:
                        if pagetitle in ipa_directives:
                            # A manual entry overrides the generated
                            # respellings and unblocks the conversion.
                            respellings = ipa_directives[pagetitle]
                            unable[0] = False
                            manual_assist = " (manually assisted)"
                            tmsg(
                                "%sUsing manually-specified IPA-based respelling%s %s; original warnings follow: %s"
                                % ("[MULTIPLE PRONUN TEMPLATES] "
                                   if len(all_pronun_templates) > 1 else "",
                                   "s" if len(respellings) > 1 else "",
                                   ",".join(respellings),
                                   " ||| ".join(all_warnings)))
                        else:
                            # Not convertible: log the generated respellings
                            # and all warnings, and leave the template alone.
                            tmsg("%s<respelling> %s <end> %s" %
                                 ("[MULTIPLE PRONUN TEMPLATES] "
                                  if len(all_pronun_templates) > 1 else "",
                                  " ".join(respellings),
                                  " ||| ".join(all_warnings)))
                    if not unable[0]:
                        # Rewrite the template in place as {{it-pr}}. Entries
                        # of the form name=value become named params; all
                        # others become consecutive numbered params.
                        del t.params[:]
                        nextparam = 0
                        for param in respellings:
                            if "=" in param:
                                paramname, paramval = param.split("=", 1)
                            else:
                                nextparam += 1
                                paramname = str(nextparam)
                                paramval = param
                            # Params named n, n2, ... trigger the
                            # ===References=== handling at the end.
                            if re.search("^n[0-9]*$", paramname):
                                need_ref_section = True
                            t.add(paramname, paramval)
                        blib.set_template_name(t, "it-pr")
                        notes.append(
                            "replace raw {{IPA|it}} with {{it-pr|%s}}%s" %
                            ("|".join(respellings), manual_assist))
                    pronun_based_respellings.extend(respellings)
                if unicode(t) != origt:
                    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            # Write back the (possibly modified) subsection text.
            subsections[k] = unicode(parsed)

            # Second pass: find the Italian rhymes template. If there are
            # several, each extra one is warned about and the last one wins.
            rhymes_template = None
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn in ["rhyme", "rhymes"] and getparam(t, "1") == "it":
                    if rhymes_template:
                        pagemsg(
                            "WARNING: Saw two {{rhymes|it}} templates: %s and %s"
                            % (unicode(rhymes_template), unicode(t)))
                    rhymes_template = t
            if rhymes_template:
                rhyme_based_respellings = []
                all_warnings = []

                def append_respelling(respelling):
                    # Record the rhyme-based respelling once, preserving
                    # order.
                    if respelling not in rhyme_based_respellings:
                        rhyme_based_respellings.append(respelling)

                def append_warnings(warning):
                    all_warnings.append(warning)

                rhymes = blib.fetch_param_chain(rhymes_template, "2")
                # Plain boolean here (unlike the one-element list used in the
                # {{IPA}} pass above). It is never reset between rhymes.
                unable = False
                for rhy in rhymes:
                    # rhyme_to_spelling() yields (ending, ending_respelling)
                    # pairs; the first ending that the page title ends with
                    # is used and the loop always stops after it.
                    spellings = rhyme_to_spelling(rhy)
                    matched = False
                    bad_rhyme_msgs = []
                    for ending, ending_respelling in spellings:
                        if pagetitle.endswith(ending):
                            prevpart = pagetitle[:-len(ending)]
                            respelling = prevpart + ending_respelling
                            saw_oso_ese = False
                            # For these two endings a second variant with a
                            # leading "#" and a bracketed [s] is recorded as
                            # well.
                            if ending_respelling == u"óso":
                                saw_oso_ese = True
                                append_respelling(respelling)
                                append_respelling("#" + prevpart + u"ó[s]o")
                            elif ending_respelling == u"ése":
                                saw_oso_ese = True
                                append_respelling(respelling)
                                append_respelling("#" + prevpart + u"é[s]e")
                            else:
                                if respelling.endswith(u"zióne"):
                                    new_respelling = re.sub(
                                        u"zióne$", u"tsióne", respelling)
                                    pagemsg(
                                        "Replaced respelling '%s' with '%s'" %
                                        (respelling, new_respelling))
                                    respelling = new_respelling
                                    # NOTE(review): prevpart is rebuilt here
                                    # with ending_respelling appended, so the
                                    # checks below see more than the part
                                    # before the rhyme -- confirm intent.
                                    prevpart = respelling[:-len(
                                        ending)] + ending_respelling
                                append_respelling(respelling)
                            # The checks below reject cases that the
                            # accompanying warning messages describe as not
                            # safely convertible. Each one records a warning,
                            # sets `unable` and stops looking at further
                            # endings.
                            if (re.search(u"[aeiouàèéìòóù]s([aeiouàèéìòóù]|$)",
                                          prevpart.lower())
                                    or not saw_oso_ese and re.search(
                                        u"[aeiouàèéìòóù][sz][aeiouàèéìòóù]",
                                        ending_respelling.lower())):
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to /s/ or /z/ between vowels: %s"
                                    % rhy)
                                unable = True
                                break
                            if "z" in prevpart:
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to z in part before rhyme: %s"
                                    % rhy)
                                unable = True
                                break
                            # Rewrite gu/qu, gli and ci/gi before the hiatus
                            # test so that the i/u in them does not match the
                            # pattern below.
                            hacked_prevpart = re.sub("([gq])u", r"\1w",
                                                     prevpart)
                            hacked_prevpart = hacked_prevpart.replace(
                                "gli", "gl")
                            hacked_prevpart = re.sub("([cg])i", r"\1",
                                                     hacked_prevpart)
                            if re.search("[^aeiou][iu]([aeiou]|$)",
                                         hacked_prevpart.lower()):
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to hiatus in part before rhyme %s"
                                    % rhy)
                                unable = True
                                break
                            if re.search(u"[aeiouàèéìòóù]i([^aeiouàèéìòóù]|$)",
                                         respelling.lower()):
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to falling diphthong in -i: %s"
                                    % rhy)
                                unable = True
                                break
                            matched = True
                            break
                        else:
                            bad_rhyme_msgs.append(
                                "WARNING: Unable to match rhyme %s, spelling %s, respelling %s"
                                % (rhy, ending, ending_respelling))
                    # The non-matching endings are only reported when no
                    # ending matched and `unable` has not been set.
                    if not matched and not unable and bad_rhyme_msgs:
                        for bad_rhyme_msg in bad_rhyme_msgs:
                            pagemsg(bad_rhyme_msg)
                if rhyme_based_respellings:
                    if not saw_it_pr:
                        # No pronunciation template in this subsection: the
                        # rhyme-based respellings are the only source.
                        manual_assist = ""
                        if pagetitle in rhyme_directives:
                            # A manual entry exists: prepend a new {{it-pr}}
                            # line to the subsection.
                            rhyme_based_respellings = rhyme_directives[
                                pagetitle]
                            manual_assist = " (manually assisted)"
                            pagemsg(
                                "Using manually-specified rhyme-based respelling%s %s; original warnings follow: %s: %s"
                                % ("s" if len(rhyme_based_respellings) > 1 else
                                   "", ",".join(rhyme_based_respellings),
                                   " ||| ".join(all_warnings),
                                   unicode(rhymes_template)))
                            subsections[k] = "* {{it-pr|%s}}\n" % ",".join(
                                rhyme_based_respellings) + subsections[k]
                            notes.append(
                                "add Italian rhyme-based respelling%s %s%s" %
                                ("s" if len(rhyme_based_respellings) > 1 else
                                 "", ",".join(rhyme_based_respellings),
                                 manual_assist))
                        else:
                            # No manual entry: only log the candidates. Extra
                            # warnings are put at the front of the list when
                            # the section has several part-of-speech headers
                            # or contains "Etymology 1".
                            different_headers = []
                            for pos in [
                                    "Noun", "Verb", "Adjective", "Adverb",
                                    "Participle"
                            ]:
                                if "==%s==" % pos in secbody:
                                    different_headers.append(pos)
                            if len(different_headers) > 1:
                                all_warnings[0:0] = [
                                    "WARNING: Multiple headers %s seen" %
                                    ",".join(different_headers)
                                ]
                            if "Etymology 1" in secbody:
                                all_warnings[0:0] = [
                                    "WARNING: Multiple etymologies seen"
                                ]

                            # The rhymes template text is emitted as both the
                            # <from> and the <to> value.
                            pagemsg(
                                "<respelling> all: %s <end>%s: <from> %s <to> %s <end>"
                                % (" ".join(rhyme_based_respellings), " " +
                                   " ||| ".join(all_warnings) if all_warnings
                                   else "", unicode(rhymes_template),
                                   unicode(rhymes_template)))
                    else:
                        # A pronunciation template was seen: just report
                        # rhyme-based respellings that are missing from the
                        # pronunciation-based ones (entries starting with
                        # qual=, qual2=, ... are ignored).
                        for respelling in rhyme_based_respellings:
                            if (not re.search("^qual[0-9]*=", respelling)
                                    and pronun_based_respellings and respelling
                                    not in pronun_based_respellings):
                                pagemsg(
                                    "WARNING: Rhyme-based respelling%s %s doesn't match it-pr respelling(s) %s%s"
                                    %
                                    (" (with problems)" if
                                     len(all_warnings) > 0 else "", respelling,
                                     ",".join(pronun_based_respellings),
                                     ": %s" % " ||| ".join(all_warnings)
                                     if len(all_warnings) > 0 else ""))

    if need_ref_section:
        # Look, last subsection first, for an existing ===References===
        # section and make sure it contains a <references> tag.
        # NOTE(review): both reverse scans below stop before k == 2, so the
        # first subsection is never considered -- confirm this is intended.
        for k in xrange(len(subsections) - 1, 2, -2):
            if re.search(r"^===\s*References\s*===$",
                         subsections[k - 1].strip()):
                if not re.search(r"<references\s*/?\s*>", subsections[k]):
                    subsections[k] = subsections[k].rstrip(
                        "\n") + "\n<references />\n\n"
                    notes.append(
                        "add <references /> to existing ===References=== section for pronunciation refs"
                    )
                break
        else:  # no break
            # No References section exists: insert a new one right after the
            # last subsection that is not Anagrams or Further reading.
            for k in xrange(len(subsections) - 1, 2, -2):
                if not re.search(r"==\s*(Anagrams|Further reading)\s*==",
                                 subsections[k - 1]):
                    subsections[k + 1:k + 1] = [
                        "===References===\n", "<references />\n\n"
                    ]
                    notes.append(
                        "add new ===References=== section for pronunciation refs"
                    )
                    break
            else:  # no break
                pagemsg(
                    "WARNING: Something wrong, couldn't find location to insert ===References=== section"
                )

    secbody = "".join(subsections)
    # Strip extra newlines added to secbody
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
def do_headword_template(headt, declts, pagetitle, subsections, subsection_with_head, subsection_with_declts, pagemsg):
  notes = []

  def analyze_declts(declts, pagetitle, headword_gens, headword_pls):
    decl_genders_gens_and_pls = []
    prev_is_weak = None
    prev_is_sg = None
    for declt in declts:
      def getp(param):
        return getparam(declt, param)
      tn = tname(declt)
      gender = re.sub(".*-", "", tn)
      if gender == "pl":
        gender = "p"
      decl_gens = []
      decl_pls = []
      if gender != "p":
        is_weak = False
        is_sg = False
        for param in ["head", "ns", "gs", "ds", "as", "bs", "vs", "np", "gp", "dp", "ap", "notes"]:
          if getp(param):
            pagemsg("WARNING: Saw %s=%s, can't handle yet: %s" % (param, getp(param), unicode(declt)))
            return None
        if gender in ["m", "n"]:
          arg1 = getp("1")
          if not arg1:
            gen = ""
          elif arg1 in ["n", "ns", "en", "ens"]:
            is_weak = True
            gen = arg1
          elif arg1 in ["s", "es", "ses", "(e)s", "(s)", "'"]:
            gen = arg1
          else:
            pagemsg("WARNING: Unrecognized arg1=%s: %s" % (arg1, unicode(declt)))
            return None
          decl_gens = convert_gens(pagetitle, [gen], from_decl=True)
        num = getp("n")
        if num == "sg":
          is_sg = True
        elif num not in ["full", ""]:
          pagemsg("WARNING: Unrecognized n=%s: %s" % (num, unicode(declt)))
          return None
        if not is_sg:
          if gender == "f":
            plsuffix = getp("1")
          else:
            plsuffix = getp("2")
          argpl = getp("pl")
          if argpl:
            pl = argpl
          else:
            pl = pagetitle + plsuffix
          if pl == "-":
            is_sg = True
          else:
            decl_pls = normalize_values([pl])
        if prev_is_weak is not None and prev_is_weak != is_weak:
          pagemsg("WARNING: Saw declension template with weak=%s different from previous weak=%s: %s"
              % (is_weak, prev_is_weak, declts_to_unicode(declts)))
          return None
        prev_is_weak = is_weak
        if prev_is_sg is not None and prev_is_sg != is_sg:
          pagemsg("WARNING: Saw declension template with sg=%s different from previous sg=%s: %s"
              % (is_sg, prev_is_sg, declts_to_unicode(declts)))
          return None
        prev_is_sg = is_sg
      decl_genders_gens_and_pls.append((gender, decl_gens, decl_pls))

    all_decl_genders = []
    all_decl_gens = []
    all_decl_pls = []
    for decl_gender, decl_gens, decl_pls in decl_genders_gens_and_pls:
      if decl_gender not in all_decl_genders:
        all_decl_genders.append(decl_gender)
      for decl_gen in decl_gens:
        if decl_gen not in all_decl_gens:
          all_decl_gens.append(decl_gen)
      for decl_pl in decl_pls:
        if decl_pl not in all_decl_pls:
          all_decl_pls.append(decl_pl)
    first_gender, first_decl_gens, first_decl_pls = decl_genders_gens_and_pls[0]
    if len(all_decl_genders) > 1 and (
      len(all_decl_gens) != len(first_decl_gens) or len(all_decl_pls) != len(first_decl_pls)
    ):
      pagemsg("WARNING: Multiple declension templates with different genders as well as different either genitives or plurals: %s"
          % declts_to_unicode(declts))
      return None
    if len(all_decl_gens) != len(first_decl_gens) and len(all_decl_pls) != len(first_decl_pls):
      pagemsg("WARNING: Multiple declension templates with different both genitives and plurals: %s"
          % declts_to_unicode(declts))
      return None

    is_weak = prev_is_weak
    is_sg = prev_is_sg
    declspec = ":".join(all_decl_genders)

    def compute_part(declspec, headword_parts, all_decl_parts, get_default_part, desc):
      defparts = []
      for gender in all_decl_genders:
        defpart = pagetitle + get_default_part(pagetitle, gender, is_weak)
        if defpart not in defparts:
          defparts.append(defpart)
      if all_decl_parts == defparts:
        declspec += ","
      else:
        all_decl_part_forms = analyze_forms(pagetitle, all_decl_parts, None)
        if set(headword_parts) == set(all_decl_parts):
          headword_part_forms = analyze_forms(pagetitle, headword_parts, None)
          if headword_part_forms != all_decl_part_forms:
            pagemsg("NOTE: Headword %s(s) %s same as all decl %s(s) %s but analyzed form(s) different (probably different ordering), preferring headword analyzed form(s) %s over decl analyzed form(s) %s: declts=%s"
                % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts), headword_part_forms, all_decl_part_forms,
                  declts_to_unicode(declts)))
            all_decl_part_forms = headword_part_forms
        else:
          pagemsg("WARNING: Headword %s(s) %s not same as all decl %s(s) %s, continuing"
              % (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts)))
        declspec += ",%s" % all_decl_part_forms
      return declspec

    if "m" in all_decl_genders or "n" in all_decl_genders:
      declspec = compute_part(declspec, headword_gens, all_decl_gens, get_default_gen, "genitive")
    if "p" not in all_decl_genders:
      declspec = compute_part(declspec, headword_pls, all_decl_pls, get_default_pl, "plural")
    declspec = re.sub(",*$", "", declspec)
    if is_weak:
      declspec += ".weak"
    if is_sg:
      declspec += ".sg"
    if ss:
      declspec += ".ss"
    return declspec, all_decl_genders, all_decl_gens, all_decl_pls

  old_style_headt = False
  for param in ["old", "2", "3", "4", "g1", "g2", "g3", "gen1", "gen2", "gen3", "pl1", "pl2", "pl3"]:
    if getparam(headt, param):
      old_style_headt = True
      break
  if not old_style_headt:
    pagemsg("NOTE: Skipping new-style headt=%s%s" % (unicode(headt),
      declts and ", declts=%s" % declts_to_unicode(declts) or ""))
    return notes

  is_proper = tname(headt) == "de-proper noun"
  ss = False
  if declts:
    sses = [not not getparam(declt, "ss") for declt in declts]
    if len(set(sses)) > 1:
      pagemsg("WARNING: Saw inconsistent values for ss= in decl templates: %s" % declts_to_unicode(declts))
      return
    ss = list(set(sses)) == [True]
  if ss:
    if not pagetitle.endswith(u"ß"):
      pagemsg(u"WARNING: Bad ss=1 setting for pagetitle not ending in -ß: %s" % declts_to_unicode(declts))
      return
    # If ss specified, pretend pagetitle ends in -ss, as it does in post-1996 spelling. Later on we add .ss to the
    # headword and declension specs.
    pagetitle = re.sub(u"ß$", "ss", pagetitle)

  adjectival = any(tname(t).startswith("de-decl-adj+noun") for t in declts)
  genders = blib.fetch_param_chain(headt, "1", "g")
  headword_genders = genders
  gens = normalize_values(blib.fetch_param_chain(headt, "2", "gen", True))
  pls = normalize_values(blib.fetch_param_chain(headt, "3", "pl"))
  dims = normalize_values(blib.fetch_param_chain(headt, "4", "dim"))
  fems = normalize_values(blib.fetch_param_chain(headt, "f"))
  mascs = normalize_values(blib.fetch_param_chain(headt, "m"))
  if gens == [True]:
    gens = []
  for param in headt.params:
    pn = pname(param)
    pv = unicode(param.value)
    if pn not in ["1", "2", "3", "4", "m", "f", "old"] and not re.search("^(g|gen|pl|dim|m|f)[0-9]+$", pn) and (
        not adjectival or pn not in "head"):
      pagemsg("WARNING: Unrecognized param %s=%s: %s" % (pn, pv, unicode(headt)))
      return
  if not genders:
    pagemsg("WARNING: No genders in head template: %s" % unicode(headt))
    return
  if "p" in genders and len(genders) > 1:
    pagemsg("WARNING: Saw gender 'p' and another gender: %s" % unicode(headt))
    return
  if "p" in genders and (gens or pls):
    pagemsg("WARNING: Saw genitive(s) or plural(s) with plural-only: %s" % unicode(headt))
    return
  saw_mn = "m" in genders or "n" in genders
  if not saw_mn and not adjectival:
    if gens and gens == [pagetitle]:
      gens = []
    if gens:
      pagemsg("WARNING: Saw genitive(s) with feminine-only gender: %s" % unicode(headt))
      return

  if adjectival:
    if len(declts) > 1:
      pagemsg("WARNING: Saw adjectival declension along with multiple declension templates, can't handle: %s"
        % declts_to_unicode(declts))
      return
    declt = declts[0]
    def getp(param):
      return getparam(declt, param)
    tn = tname(declt)
    m = re.search(r"^de-decl-adj\+noun(-sg)?-([mfn])$", tn)
    if m:
      default_equiv = None
      is_sg, gender = m.groups()
      adj = getp("1")
      noun = getp("2")
      if gender in ["m", "f"]:
        default_equiv = adj + ("e" if gender == "m" else "er")
        if noun:
          default_equiv += " " + construct_default_equiv(noun, gender)
      if gender in ["m", "n"]:
        noun_gen = getp("3")
        noun_pl = getp("4")
      else:
        noun_gen = "-"
        noun_pl = getp("3")
      noun_pl_full = getp("pl")
      adj_ending = "er" if gender == "m" else "e" if gender == "f" else "es"
      expected_lemma = adj + adj_ending
      if gender == "f":
        # Should be '-er' but we often see '-en' (weak form) instead
        expected_gens = [adj + "er", adj + "en"]
      else:
        expected_gens = [adj + "en"]
      if is_sg:
        expected_pls = []
      else:
        expected_pls = [adj + "e", adj + "en"]
      if not noun:
        if noun_gen != "-" or noun_pl_full or (noun_pl and noun_pl != "-"):
          pagemsg("WARNING: Bad parameters for adjectival noun: %s" % unicode(declt))
          return
        all_decl_genders = [gender]
      else:
        fake_declt = "{{de-decl-noun-%s%s|%s|pl=%s%s}}" % (gender, "" if gender == "f" else "|" + noun_gen, noun_pl, noun_pl_full, "|n=sg" if is_sg else "")
        fake_declt = list(blib.parse_text(fake_declt).filter_templates())[0]
        def analyze_headword_parts_for_noun(parts, desc):
          """Return the noun (second word) of every two-word headword form in `parts`.

          Each form, once stripped, must consist of exactly two words separated by a single space (adjective, then
          noun). If any form does not have that shape, a warning mentioning `desc` (the kind of form, e.g.
          "genitive") is logged and an empty list is returned instead of a partial result.
          """
          nouns = []
          for form in parts:
            two_words = re.search("^([^ ]+) ([^ ]+)$", form.strip())
            if two_words is None:
              pagemsg("WARNING: Can't analyze headword %s '%s' into adjective and noun, continuing: head=%s, decl=%s"
                  % (desc, form, unicode(headt), unicode(declt)))
              return []
            # Only the noun half is needed; the adjective half is checked elsewhere against the decl template.
            nouns.append(two_words.group(2))
          return nouns
        noun_headword_gens = analyze_headword_parts_for_noun(gens, "genitive")
        noun_headword_pls = analyze_headword_parts_for_noun(pls, "plural")

        retval = analyze_declts([fake_declt], noun, noun_headword_gens, noun_headword_pls)
        if retval is None:
          return
        declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
        expected_lemma = "%s %s" % (expected_lemma, noun)
        expected_gens = ["%s %s" % (expected_gen, gen) for expected_gen in expected_gens for gen in ([noun] if gender == "f" else all_decl_gens)]
        if is_sg:
          expected_pls = []
        else:
          expected_pls = ["%se %s" % (adj, pl) for pl in all_decl_pls]
      if pagetitle != expected_lemma:
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected lemma '%s' but saw '%s': head=%s, decl=%s"
            % (expected_lemma, pagetitle, unicode(headt), unicode(declt)))
        return
      if set(genders) != set(all_decl_genders):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected gender(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(all_decl_genders), ",".join(genders), unicode(headt), unicode(declt)))
        return
      if not (set(gens) <= set(expected_gens)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected genitive(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(expected_gens), ",".join(gens), unicode(headt), unicode(declt)))
        return
      if pls == ["-"]:
        if expected_pls:
          pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
              % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
          return
      elif not (set(pls) <= set(expected_pls)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s"
            % (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
        return
      if not noun:
        declspec = "+"
        if is_sg:
          declspec += ".sg"
      else:
        if re.search("^" + CAP, adj):
          adj_lemma = adj.lower()
        else:
          adj_lemma = adj
        if adj_lemma in ["erst", "zweit", "dritt", "viert", u"fünft", "sechst", "siebent", "acht", "neunt", "zehnt"]:
          adj_lemma += "e"
        adj_form = adj + adj_ending
        if adj_form.startswith(adj_lemma):
          adj_link = "[[%s]]%s" % (adj_lemma, adj_form[len(adj_lemma):])
        else:
          adj_link = "[[%s|%s]]" % (adj_lemma, adj_form)
        noun_link = "[[%s]]" % noun
        # This is less accurate than the above. Often head= is wrong.
        # Try to update adjective and noun links from head= if given.
        #head = getparam(headt, "head")
        #if head:
        #  m = re.search("^([^ ]*) ([^ ]*)$", head)
        #  if not m:
        #    pagemsg("WARNING: Can't parse head=%s for adjective-noun combination, continuing: head=%s, decl=%s"
        #        % (head, unicode(headt), unicode(declt)))
        #  else:
        #    head_adj_link, head_noun_link = m.groups()
        #    m = re.search(r"\[\[([^][]*)\|([^][]*)\]\]$", head_adj_link)
        #    if m:
        #      adj_link_lemma, adj_link_form = m.groups()
        #      if adj_link_form.startswith(adj_link_lemma):
        #        head_adj_link = "[[%s]]%s" % (adj_link_lemma, adj_link_form[len(adj_link_lemma):])
        #    if head_adj_link != adj_link:
        #      pagemsg("NOTE: Head-derived adjective link %s not same as decl-template-derived adjective link %s, using the former: head=%s, decl=%s"
        #          % (head_adj_link, adj_link, unicode(headt), unicode(declt)))
        #      adj_link = head_adj_link
        #    if head_noun_link != noun_link:
        #      pagemsg("NOTE: Head-derived noun link %s not same as decl-template-derived noun link %s, using the former: head=%s, decl=%s"
        #          % (head_noun_link, noun_link, unicode(headt), unicode(declt)))
        #      noun_link = head_noun_link
        declspec = "%s<+> %s<%s>" % (adj_link, noun_link, declspec)
      headspec = declspec
      is_both = is_proper and not is_sg
    else:
      pagemsg("WARNING: Unrecognized decl template(s): %s" % declts_to_unicode(declts))
      return

  else: # not adjectival
    if len(genders) == 1 and genders[0] in ["m", "f"]:
      default_equiv = construct_default_equiv(pagetitle, genders[0])
    headspec = ":".join(genders)
    is_sg = False
    is_both = False
    is_weak = False
    headword_gens = []
    headword_pls = []
    if headspec != "p":
      pls = convert_pls(pagetitle, pls, is_proper=is_proper)
      headword_pls = pls
      if saw_mn:
        gens = convert_gens(pagetitle, gens)
        headword_gens = gens
        if (len(gens) == 1 and any(gens[0] == pagetitle + ending for ending in ["n", "en", "ns", "ens"])
          and len(pls) == 1 and (pls[0] == "-" or any(pls[0] == pagetitle + ending for ending in ["n", "en"]))):
          is_weak = True
        def_gens = []
        for gender in genders:
          def_gen = pagetitle + get_default_gen(pagetitle, gender, is_weak)
          if def_gen not in def_gens:
            def_gens.append(def_gen)
        if set(def_gens) == set(gens):
          headspec += ","
        else:
          headspec += ",%s" % analyze_forms(pagetitle, gens, None)
      def_pls = []
      for gender in genders:
        def_pl = pagetitle + get_default_pl(pagetitle, gender, is_weak)
        if def_pl not in def_pls:
          def_pls.append(def_pl)
      if set(def_pls) == set(pls):
        headspec += ","
        if is_proper:
          is_both = True
      elif pls == ["-"]:
        is_sg = True
      else:
        headspec += ",%s" % analyze_forms(pagetitle, pls, None)
    headspec = re.sub(",*$", "", headspec)
    if is_weak:
      headspec += ".weak"
    if is_sg:
      headspec += ".sg"
    if ss:
      headspec += ".ss"

  extraspec = ""
  if dims:
    extraspec += "|dim=%s" % analyze_forms(pagetitle, dims, None, do_stem=True, joiner=",")
  if fems:
    extraspec += "|f=%s" % analyze_forms(pagetitle, fems, default_equiv, do_stem=True, joiner=",")
  if mascs:
    extraspec += "|m=%s" % analyze_forms(pagetitle, mascs, default_equiv, do_stem=True, joiner=",")

  if declts and not adjectival:
    retval = analyze_declts(declts, pagetitle, headword_gens, headword_pls)
    if retval is None:
      return
    declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
    if headspec != declspec:
      if set(all_decl_gens) <= set(headword_gens) and set(all_decl_pls) <= set(headword_pls):
        if set(all_decl_genders) == set(headword_genders):
          pagemsg("NOTE: Headword spec '%s' not same as declension spec '%s', but decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s and gender(s) %s agree: headt=%s, declt=%s"
              % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
                ",".join(headword_pls), ",".join(all_decl_genders), unicode(headt), unicode(declt)))
          declspec = headspec
        else:
          pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s', decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s, but decl gender(s) %s don't agree with headword gender(s) %s: headt=%s, declt=%s"
              % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
                ",".join(headword_pls), ",".join(all_decl_genders), ",".join(headword_genders), unicode(headt), unicode(declt)))

          return
      else:
        pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s' and either decl gens %s not a subset of headword gens %s or decl pls %s not a subset of headword pls %s, with decl gender(s) %s and headword gender(s) %s: headt=%s, declt=%s"
            % (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens), ",".join(all_decl_pls),
              ",".join(headword_pls), ",".join(all_decl_genders), ",".join(headword_genders), unicode(headt), unicode(declt)))
        return

  if is_proper:
    headspec = headspec.replace(".sg", "")
    if is_both:
      if ".ss" in headspec:
        headspec = headspec.replace(".ss", ".both.ss")
      else:
        headspec += ".both"
  newheadt = "{{de-%s|%s%s}}" % ("proper noun" if is_proper else "noun", headspec, extraspec)
  headt_outmsg = "convert %s to new-format %s" % (unicode(headt), newheadt)
  outmsg = "Would " + headt_outmsg
  if declts:
    newdeclt = "{{de-ndecl|%s}}" % declspec
    declt_outmsg = "convert %s to %s" % (declts_to_unicode(declts), newdeclt)
    outmsg += " and " + declt_outmsg
  pagemsg(outmsg)

  if unicode(headt) != newheadt:
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_head], unicode(headt), newheadt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(headt_outmsg)
    subsections[subsection_with_head] = newsectext
  if declts:
    declts_existing = "\n".join(unicode(declt) for declt in declts)
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_declts], declts_existing, newdeclt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(declt_outmsg)
    subsections[subsection_with_declts] = newsectext

  return notes
Beispiel #22
0
            def parse_syns(syns):
                """Parse the raw wikitext of one synonym-style line into structured entries.

                Processing steps:

                1. At most one qualifier is extracted and removed from the text. The notations tried, in order, are
                   a qualifier template (`{{qualifier|...}}`, `{{qual|...}}`, `{{q|...}}`, `{{i|...}}`), a label
                   template `{{lb|LANG|...}}`, `''(...)''` and `(''...'')`.
                2. The remaining text is split into individual synonyms on commas, semicolons and space-surrounded
                   slashes, without splitting inside `{{...}}` or `[[...]]`.
                3. Each synonym is normalized: `{{l|LANG|...}}`/`{{m|LANG|...}}` templates are converted to bracketed
                   links (collecting gender, transliteration, gloss, literal meaning and part of speech), brackets
                   around the entire synonym are stripped, and partially linked synonyms get links added around the
                   remaining words.

                `args`, `pagemsg`, `syntype` and `line` are not defined here and are taken from the surrounding scope;
                LANG above is `args.lang`.

                Returns a list of `(syn, other_params, joiner_after)` tuples, where `other_params` is a list of
                `(param_name, value_or_None)` pairs and `joiner_after` is None for all but the last synonym (for the
                last one it is ";" if a qualifier was seen or there are multiple synonyms, otherwise ","). Returns
                None, after logging a WARNING, if the text can't be handled.
                """
                retval = []
                syns = syns.strip()
                orig_syns = syns
                qualifier = None
                lang_pattern = re.escape(args.lang)

                # Look for a qualifier, trying each notation in turn; the first notation that matches wins.
                saw_label = False
                # check for qualifiers specified using a qualifier template
                m = re.search(
                    r"^(.*?)\{\{(?:qualifier|qual|q|i)\|([^{}|=]*)\}\}(.*?)$",
                    syns)
                if not m:
                    # check for qualifiers using e.g. {{lb|ru|...}}
                    m = re.search(
                        r"^(.*?)\{\{(?:lb)\|%s\|([^{}=]*)\}\}(.*?)$" %
                        lang_pattern, syns)
                    saw_label = bool(m)
                if not m:
                    # check for qualifier-like ''(...)''
                    m = re.search(r"^(.*?)''\(([^'{}]*)\)''(.*?)$", syns)
                if not m:
                    # check for qualifier-like (''...'')
                    m = re.search(r"^(.*?)\(''([^'{}]*)''\)(.*?)$", syns)
                if m:
                    before_text, qualifier, after_text = m.groups()
                    if saw_label:
                        # Convert the |-separated labels into readable text.
                        # do this before handling often/sometimes/etc. in case the label has often|_|pejorative or similar
                        qualifier = qualifier.replace("|_|", " ")
                        terms_no_following_comma = [
                            "also", "and", "or", "by", "with", "except",
                            "outside", "in", "chiefly", "mainly", "mostly",
                            "primarily", "especially", "particularly",
                            "excluding", "extremely", "frequently",
                            "humorously", "including", "many", "markedly",
                            "mildly", "now", "occasionally", "of", "often",
                            "sometimes", "originally", "possibly", "rarely",
                            "slightly", "somewhat", "strongly", "then",
                            "typically", "usually", "very"
                        ]
                        # These labels are joined to the following label by a space rather than a comma.
                        qualifier = re.sub(
                            r"\b(%s)\|" % "|".join(terms_no_following_comma),
                            r"\1 ", qualifier)
                        qualifier = qualifier.replace("|", ", ")
                    # Remove the qualifier from the text that gets split into synonyms.
                    syns = before_text + after_text

                # Split on commas, semicolons, slashes but don't split commas etc. inside of braces or brackets
                split_by_brackets_braces = re.split(
                    r"(\{\{[^{}]*\}\}|\[\[[^\[\]]*\]\])", syns.strip())
                comma_separated_runs = blib.split_alternating_runs(
                    split_by_brackets_braces, "(?: *[,;] *| +/ +)")
                syns = [
                    "".join(comma_separated_run)
                    for comma_separated_run in comma_separated_runs
                ]

                if qualifier and len(syns) > 1:
                    pagemsg(
                        "WARNING: Saw qualifier along with multiple synonyms, not sure how to proceed: <%s>"
                        % orig_syns)
                    return None
                joiner_after = ";" if qualifier or len(syns) > 1 else ","
                for synindex, syn in enumerate(syns):
                    orig_syn = syn
                    # Per-synonym properties; only a whole-synonym {{l|...}}/{{m|...}} template can set them.
                    gender = None
                    translit = None
                    gloss = None
                    lit = None
                    pos = None
                    m = re.search(
                        r"^\{\{[lm]\|%s\|([^{}]*)\}\}$" % lang_pattern,
                        syn)
                    if m:
                        # The synonym is a single link template: pull its params apart.
                        decl = blib.parse_text(syn).filter_templates()[0]
                        raw_syn = None
                        alt = None
                        for param in decl.params:
                            pn = pname(param)
                            pv = unicode(param.value)
                            if pn in ["1"]:
                                # language code, already matched by the regex above
                                pass
                            elif pn == "2":
                                raw_syn = pv
                            elif pn == "3":
                                alt = pv
                            elif pn in ["4", "t", "gloss"]:
                                gloss = pv
                            elif pn == "g":
                                gender = pv
                            elif pn in ["g2", "g3", "g4"]:
                                if not gender:
                                    pagemsg(
                                        "WARNING: Saw %s=%s without g= in %s <%s> in line: %s"
                                        % (pn, pv, syntype, orig_syn, line))
                                    return None
                                # Multiple genders are accumulated comma-separated.
                                gender += "," + pv
                            elif pn == "tr":
                                translit = pv
                            elif pn == "lit":
                                lit = pv
                            elif pn == "pos":
                                pos = pv
                            else:
                                pagemsg(
                                    "WARNING: Unrecognized param %s=%s in %s <%s> in line: %s"
                                    % (pn, pv, syntype, orig_syn, line))
                                return None
                        if not raw_syn:
                            pagemsg(
                                "WARNING: Couldn't find raw synonym in %s <%s> in line: %s"
                                % (syntype, orig_syn, line))
                            return None
                        # raw_syn is known to be non-empty from here on.
                        if alt:
                            if "[[" in raw_syn or "[[" in alt:
                                pagemsg(
                                    "WARNING: Saw both synonym=%s and alt=%s with brackets in one or both in %s <%s> in line: %s"
                                    % (raw_syn, alt, syntype, orig_syn, line))
                                return None
                            syn = "[[%s|%s]]" % (raw_syn, alt)
                        elif "[[" in raw_syn:
                            syn = raw_syn
                        else:
                            syn = "[[%s]]" % raw_syn
                    else:

                        def add_brackets_if_not_already(m):
                            raw_syn = m.group(1)
                            if "[[" not in raw_syn:
                                raw_syn = "[[%s]]" % raw_syn
                            return raw_syn

                        # Replace any embedded single-argument link templates with plain bracketed links.
                        syn = re.sub(
                            r"\{\{[lm]\|%s\|([^{}=]*)\}\}" %
                            lang_pattern, add_brackets_if_not_already,
                            syn)
                    # Anything left over that looks like markup we don't understand makes us give up on the line.
                    if "{{" in syn or "}}" in syn:
                        pagemsg(
                            "WARNING: Unmatched braces in %s <%s> in line: %s"
                            % (syntype, orig_syn, line))
                        return None
                    if "''" in syn:
                        pagemsg(
                            "WARNING: Italicized text in %s <%s> in line: %s" %
                            (syntype, orig_syn, line))
                        return None
                    if "(" in syn or ")" in syn:
                        pagemsg(
                            "WARNING: Unmatched parens in %s <%s> in line: %s"
                            % (syntype, orig_syn, line))
                        return None
                    if ":" in syn:
                        pagemsg(
                            "WARNING: Unmatched colon in %s <%s> in line: %s" %
                            (syntype, orig_syn, line))
                        return None
                    # Strip brackets around entire synonym
                    syn = re.sub(r"^\[\[([^\[\]|{}]*)\]\]$", r"\1", syn)
                    # If there are brackets around some words but not all, put brackets around the remaining words
                    if "[[" in syn:
                        # Odd-indexed elements are the words that already contain a link; even-indexed elements are
                        # the unlinked text in between.
                        split_by_brackets = re.split(
                            r"([^ ]*\[\[[^\[\]]*\]\][^ ]*)", syn)

                        def maybe_add_brackets(m):
                            text = m.group(1)
                            if "[" in text or "]" in text:
                                pagemsg(
                                    "WARNING: Saw nested brackets in %s in %s <%s> in line: %s"
                                    % (text, syntype, orig_syn, line))
                                return text
                            if not re.search(r"\w", text, re.U):
                                pagemsg(
                                    "Not adding brackets around '%s', saw no letters in %s <%s> in line: %s"
                                    % (text, syntype, orig_syn, line))
                                return text
                            return "[[%s]]" % text

                        # Put brackets around the remaining words not already bracketed or partially bracketed. But don't put
                        # brackets around words inside of HTML comments, and don't include punctuation inside the brackets.
                        for i in xrange(0, len(split_by_brackets), 2):
                            split_out_comments = re.split(
                                "(<!--.*?-->)", split_by_brackets[i])
                            for j in xrange(0, len(split_out_comments), 2):
                                split_out_comments[j] = re.sub(
                                    "([^ ,*/{}:;()?!+<>]+)",
                                    maybe_add_brackets, split_out_comments[j])
                            split_by_brackets[i] = "".join(split_out_comments)

                        new_syn = "".join(split_by_brackets)
                        if new_syn != syn:
                            pagemsg("Add brackets to '%s', producing '%s'" %
                                    (syn, new_syn))
                            syn = new_syn
                    other_params = [
                        ("tr", translit),
                        ("t", gloss),
                        ("q", qualifier),
                        ("g", gender),
                        ("pos", pos),
                        ("lit", lit),
                    ]
                    # Set the joiner_after to None for everything but the last synonym on the row; we will then change
                    # all commas to semicolons if there is any semicolon, so we are consistently using commas or
                    # semicolons to separate groups of synonyms.
                    retval.append(
                        (syn, other_params,
                         joiner_after if synindex == len(syns) - 1 else None))
                return retval
  def process_noun_headt(t, declt=None):
    """Rewrite the noun headword template `t` in place to use the new parameter convention.

    `t` is either a generic {{head}} template (part of speech in 2=; it is renamed to "be-" + that part of
    speech) or an existing noun headword template using old-style params (head/sg, g, a, decl, gen, pl, ...).
    All params are removed and then re-added as: 1=/head2=... (headword), 2=/g2=... (cleaned-up genders),
    3=/gen2=... (genitive), 4=/pl2=... (plural), plus m=, f=, collective= and the associated *tr= params.

    If `declt` (the accompanying declension template) is given, the headword, genitive and plural forms found in
    it are used to fill in forms missing from the headword template, and a warning is logged when the two
    disagree. Gender/number mismatches between the templates are also only warned about.

    `pagemsg`, `pagetitle` and `notes` are taken from the surrounding scope; a note is appended to `notes` when
    the template text changed.

    Returns False (leaving `t` untouched) if `t` has a param that isn't recognized, otherwise True.
    """
    origt = unicode(t)
    origdeclt = unicode(declt) if declt else "None"
    def getp(param):
      return getparam(t, param)

    def has_unrecognized_param(recognized):
      # Warn about the first param of `t` whose name is not in `recognized`; return whether there was one.
      for param in t.params:
        pn = pname(param)
        if pn not in recognized:
          pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" %
              (pn, unicode(param.value), origt))
          return True
      return False

    if tname(t) == "head":
      # Generic {{head|LANG|POS|...}}: only the headword, its translit and the genders can be present.
      pos = getp("2")
      head = getp("head")
      headtr = getp("tr")
      g = getp("g")
      g2 = getp("g2")
      g3 = getp("g3")
      anim = ""
      decl = ""
      gen = ""
      gentr = ""
      pl = ""
      pltr = ""
      f = ""
      ftr = ""
      m = ""
      mtr = ""
      collective = ""
      collectivetr = ""
      if has_unrecognized_param(["1", "2", "head", "tr", "g", "g2", "g3",
          # extra params to ignore
          "sc"]):
        return False
    else:
      # Old-style noun headword template; several params have both a named and a numbered spelling.
      pos = getp("pos")
      head = getp("1") or getp("head") or getp("sg")
      headtr = getp("tr")
      g = getp("2") or getp("g")
      g2 = getp("g2")
      g3 = getp("g3")
      anim = getp("a")
      decl = getp("decl")
      gen = getp("gen") or getp("3")
      gentr = getp("gentr")
      pl = getp("pl") or getp("4")
      pltr = getp("pltr")
      f = getp("f")
      ftr = getp("ftr")
      m = getp("m")
      mtr = getp("mtr")
      collective = getp("collective")
      collectivetr = getp("collectivetr")
      if has_unrecognized_param(["pos", "1", "head", "sg", "tr", "2", "g", "g2", "g3",
          "a", "decl", "gen", "gentr", "3", "pl", "pltr", "4",
          "f", "ftr", "m", "mtr", "collective", "collectivetr",
          # extra params to ignore
          "sc"]):
        return False

    def clean_gender(g):
      # Normalize gender spec `g` (hyphen-separated parts such as "m-an-p") to GENDER-ANIMACY[-p], combining it
      # with the animacy given separately in a=. Unknown gender or animacy becomes "?"; if both are unknown the
      # result is just "?" (plus "-p" if plural).
      gparts = g.split("-")
      realg = "?"
      realan = "?"
      realpl = ""
      for part in gparts:
        if part in ["m", "f", "n"]:
          realg = part
        elif part in ["an", "in"]:
          realan = part
        elif part == "p":
          realpl = part
        elif part != "?":
          pagemsg("WARNING: Encountered unrecognized gender part '%s' in gender '%s': %s" % (
            part, g, origt))
      an = anim
      if an in ["a", "an"]:
        an = "an"
      elif an in ["i", "in"]:
        an = "in"
      elif an:
        pagemsg("WARNING: Unrecognized animacy a=%s: %s" % (an, origt))
        an = "?"
      if realan != "?" and an and an != "?" and an != realan:
        pagemsg("WARNING: Animacy mismatch, anim %s in gender spec %s but a=%s: %s" % (
          realan, g, anim, origt))
      # The animacy in the gender spec takes precedence; a= is only a fallback.
      if realan == "?" and an:
        realan = an
      plsuffix = ""
      if realpl:
        plsuffix = "-%s" % realpl
      if realg == "?":
        pagemsg("WARNING: Unknown gender in gender spec %s: %s" % (g, origt))
      if realan == "?":
        pagemsg("WARNING: Unknown animacy in gender spec %s and a=%s: %s" % (g, anim, origt))
      if realg == "?" and realan == "?":
        return "?%s" % plsuffix
      else:
        return "%s-%s%s" % (realg, realan, plsuffix)

    if not g and not g2 and not g3:
      pagemsg("WARNING: No gender specified: %s" % origt)
      g = "?"
    genders = []
    if g:
      genders.append(clean_gender(g))
    if g2:
      genders.append(clean_gender(g2))
    if g3:
      genders.append(clean_gender(g3))

    if not head:
      head = pagetitle
    if decl and decl not in ["off", "no", "indeclinable"]:
      pagemsg("WARNING: Unrecognized value for decl=%s: %s" % (decl, origt))
      decl = ""
    if decl:
      # Indeclinable nouns are marked with a genitive of "-".
      if gen and gen != "-":
        pagemsg("WARNING: Indeclinable but gen=%s specified: %s" % (gen, origt))
      else:
        gen = "-"

    # Everything needed has been read out of the template; start over with an empty param list.
    del t.params[:]
    if tname(t) == "head":
      blib.set_template_name(t, "be-" + pos)
    elif pos:
      t.add("pos", pos)

    def split_form(form):
      # Split a comma-separated form spec into a list of forms, unwrapping forms that are entirely a [[link]],
      # passing each through belib.add_accent_to_o() and dropping "-" placeholders. Warns about forms that still
      # contain a link or that belib.needs_accents() flags.
      forms = re.split(r",\s*", form.strip())
      forms = [re.sub(r"^\[\[([^\[\]]*)\]\]$", r"\1", fm) for fm in forms]
      forms = [belib.add_accent_to_o(fm) for fm in forms]
      for fm in forms:
        if "[[" in fm:
          pagemsg("WARNING: Link in form %s: headword=%s, decl=%s" %
              (fm, origt, origdeclt))
        if belib.needs_accents(fm):
          pagemsg("WARNING: Form %s missing accents: headword=%s, decl=%s" %
              (fm, origt, origdeclt))
      forms = [fm for fm in forms if fm != "-"]
      return forms

    def handle_multiform(firstparam, restparam, form, formtr, declparam=None):
      # Add the (possibly comma-separated) `form` to `t` as a param chain starting at `firstparam` and continuing
      # with `restparam`2, ..., and add its translit `formtr`. If `declparam` is given, it names the param of
      # `declt` holding the corresponding form(s) ("-" meaning the decl template has no such form); those are
      # used when the headword has none, and otherwise cross-checked against the headword's.
      if form:
        form = split_form(form)
      if declparam:
        if declparam == "-":
          declforms = ["-"]
        else:
          declforms = split_form(getparam(declt, declparam))
        if not form:
          form = declforms
        elif set(form) != set(declforms):
          pagemsg("WARNING: For %s=, headword form(s) %s disagree with decl form(s) %s: headword=%s, decl=%s" %
              (restparam, ",".join(form), ",".join(declforms), origt, origdeclt))
      if form:
        blib.set_param_chain(t, form, firstparam, restparam)
      if formtr:
        # The headword's translit param is plain tr=; all others are prefixed (gentr=, pltr=, ...).
        trparam = ("" if restparam == "head" else restparam) + "tr"
        if not form:
          # Report the actual translit param name rather than the literal string "trparam".
          pagemsg("WARNING: Saw %s=%s but no %s=: %s" %
              (trparam, formtr, restparam, origt))
        elif len(form) > 1:
          pagemsg("WARNING: Saw %s=%s and multiple %ss %s: %s" %
              (trparam, formtr, restparam, ",".join(form), origt))
        t.add(trparam, formtr)

    # Work out which params of the decl template hold the headword, genitive and plural, based on its name.
    decl_headparam = None
    decl_genparam = None
    decl_plparam = None
    if declt:
      decl_headparam = "1"
      tn = tname(declt)
      if tn == "be-decl-noun":
        decl_genparam = "3"
        decl_plparam = "2"
      elif tn == "be-decl-noun-unc":
        decl_genparam = "2"
        decl_plparam = "-"
      else:
        decl_genparam = "2"
      # A plural-only decl template should go with plural genders (ending in -p), and vice versa.
      if tn == "be-decl-noun-pl":
        for gender in genders:
          if not gender.endswith("-p"):
            pagemsg("WARNING: Mismatch between headword gender %s and decl template %s: %s" % (
              gender, unicode(declt), origt))
      else:
        for gender in genders:
          if gender.endswith("-p"):
            pagemsg("WARNING: Mismatch between headword gender %s and decl template %s: %s" % (
              gender, unicode(declt), origt))

    handle_multiform("1", "head", head, headtr, decl_headparam)
    blib.set_param_chain(t, genders, "2", "g")
    handle_multiform("3", "gen", gen, gentr, decl_genparam)
    # If no genitive got set but the headword had a plural, add an empty 3= (presumably so that the plural
    # added next ends up in the fourth positional slot).
    if not getp("3") and pl:
      t.add("3", "")
    handle_multiform("4", "pl", pl, pltr, decl_plparam)
    handle_multiform("m", "m", m, mtr)
    handle_multiform("f", "f", f, ftr)
    handle_multiform("collective", "collective", collective, collectivetr)

    if origt != unicode(t):
      notes.append("fix up {{%s}} to use new param convention" % tname(t))
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
    return True
def process_text_on_page(index, pagetitle, text):
    """Rewrite {{autocat}} and {{langcatboiler}} calls in `text` as {{auto cat}}.

    {{autocat}} is simply renamed. For {{langcatboiler}}, the language name is
    taken from a page title of the form "Category:X Language" (the word
    "Language" is kept as part of the name) or "Category:X language" (the
    suffix is dropped), looked up in `blib.languages_byCanonicalName`, and
    its code compared against 1=. On success the template is rebuilt with
    the non-empty numbered params other than 1= renumbered from 1, followed
    by any setwiki=/setwikt=/setsister=/entryname= params that do not merely
    restate the default.

    Returns a tuple (new_text, notes). Returns None (abandoning the whole
    page) if a {{langcatboiler}} carries an unrecognized named parameter.
    Templates that fail the title/language checks are left untouched.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "autocat":
            blib.set_template_name(t, "auto cat")
            notes.append("{{autocat}} -> {{auto cat}}")
        elif tn == "langcatboiler":
            m = re.search("^Category:(.* Language)$", pagetitle)
            if not m:
                m = re.search("^Category:(.*) language$", pagetitle)
            if not m:
                pagemsg("WARNING: Can't parse page title")
                continue
            langname = m.group(1)
            t_lang = getparam(t, "1")
            if langname not in blib.languages_byCanonicalName:
                pagemsg("WARNING: Unrecognized language name: %s" % langname)
                continue
            langobj = blib.languages_byCanonicalName[langname]
            if langobj["code"] != t_lang:
                pagemsg(
                    "WARNING: Auto-determined code %s for language name %s != manually specified %s"
                    % (langobj["code"], langname, t_lang))
                continue
            numbered_params = []
            non_numbered_params = []
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value).strip()
                if pn == "1" or not pv:
                    # The language code and empty params are dropped.
                    pass
                elif re.search("^[0-9]+$", pn):
                    numbered_params.append(pv)
                elif pn not in [
                        "setwiki", "setwikt", "setsister", "entryname"
                ]:
                    pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" %
                            (pn, pv, unicode(t)))
                    return
                elif ((pn in ["setwiki", "setsister"]
                       and pv == langname + " language")
                      or (pn == "entryname" and pv == langname)
                      or (pn == "setwikt" and pv == langobj["code"])):
                    pagemsg("WARNING: Unnecessary param %s=%s, omitting: %s" %
                            (pn, pv, unicode(t)))
                else:
                    non_numbered_params.append((pn, pv))
            if not numbered_params:
                if langobj["type"] == "reconstructed" or langobj[
                        "family"] == "art":
                    pagemsg(
                        "Reconstructed or constructed language, allowing no countries"
                    )
                else:
                    pagemsg(
                        "WARNING: No countries and not reconstructed or constructed language, adding UNKNOWN"
                    )
                    numbered_params.append("UNKNOWN")
            blib.set_template_name(t, "auto cat")
            del t.params[:]
            # Do not name the loop variable `index`: pagemsg() closes over the
            # `index` argument and would otherwise log the wrong page index.
            for param_index, numbered_param in enumerate(numbered_params):
                t.add(str(param_index + 1), numbered_param,
                      preserve_spacing=False)
            for name, value in non_numbered_params:
                t.add(name, value, preserve_spacing=False)
            notes.append("convert {{%s}} to {{auto cat}}" % tn)

        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
# Example #25
# 0
def process_text_on_page(index, pagetitle, text):
  """Normalize Sanskrit noun declension templates on a mainspace page.

  Walks the templates in `text`, remembering the most recent {{sa-noun}}
  headword. For each following {{sa-decl-noun-*}} template:

  * {{sa-decl-noun-m}}, {{sa-decl-noun-f}} and {{sa-decl-noun-n}} get their
    1= transliteration filled in or cleaned up (missing, Devanagari,
    unaccented, hyphenated, or padded with spaces), using the headword's tr=
    or, failing that, the result of expanding {{xlit|sa|PAGETITLE}}.
  * Templates listed in `old_template_to_gender` are renamed to
    {{sa-decl-noun-GENDER}} with the transliteration as the only parameter.

  Returns None if the page mentions neither template or is not in mainspace
  (title contains ":"); otherwise returns a tuple (new_text, notes).
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  notes = []

  if "sa-noun" not in text and "sa-decl-noun" not in text:
    return

  if ":" in pagetitle:
    pagemsg("Skipping non-mainspace title")
    return

  pagemsg("Processing")

  parsed = blib.parse_text(text)

  # Most recent {{sa-noun}} seen, and whether a declension template followed it.
  headt = None
  saw_decl = False

  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)

    if tn == "sa-noun":
      pagemsg("Saw headt=%s" % unicode(t))
      if headt and not saw_decl:
        pagemsg("WARNING: Saw two {{sa-noun}} without {{sa-decl-noun}}: %s and %s" % (unicode(headt), unicode(t)))
      headt = t
      saw_decl = False
      continue

    if tn in ["sa-decl-noun", "sa-decl"]:
      pagemsg("WARNING: Saw raw {{%s}}: %s, headt=%s" % (tn, unicode(t), unicode(headt) if headt else None))
      continue

    if tn.startswith("sa-decl-noun-"):
      pagemsg("Saw declt=%s" % unicode(t))
      if not headt:
        pagemsg("WARNING: Saw {{%s}} without {{sa-noun}}: %s" % (tn, unicode(t)))
        continue
      saw_decl = True

      # Work out the transliteration to use and whether it carries an accent.
      tr = getparam(headt, "tr")
      accented_tr = False
      if not tr:
        tr = expand_text("{{xlit|sa|%s}}" % pagetitle)
        pagemsg("WARNING: No translit in %s, using %s from pagetitle: declt=%s" % (unicode(headt), tr, unicode(t)))
      else:
        if "-" in tr:
          pagemsg("WARNING: Saw translit %s in head with hyphen: headt=%s, declt=%s" % (tr, unicode(headt), unicode(t)))
          tr = tr.replace("-", "")
        # Recompose s + acute so that the acute of ś is not mistaken for an
        # accent mark in the check below.
        decomptr = unicodedata.normalize("NFD", tr).replace("s" + AC, u"ś")
        if AC not in decomptr and GR not in decomptr:
          pagemsg("WARNING: Saw translit %s in head without accent: headt=%s, declt=%s" % (tr, unicode(headt), unicode(t)))
        else:
          accented_tr = True

      # Reduce headword genders to bare m/f/n, splitting combined values.
      genders = blib.fetch_param_chain(headt, "g")
      genders = [g.replace("-p", "").replace("bysense", "") for g in genders]
      genders = [g for gs in genders for g in (
        ["m", "f"] if gs in ["mf", "fm"] else ["m", "n"] if gs in ["mn", "nm"] else [gs]
      )]

      if tn in ["sa-decl-noun-m", "sa-decl-noun-f", "sa-decl-noun-n"]:
        tg = tn[-1]
        if tg not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            tg, ",".join(genders), unicode(headt), unicode(t)))
          continue

        decltr = getparam(t, "1")
        if not decltr:
          if not accented_tr:
            pagemsg("WARNING: No param in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add (unaccented) translit %s to {{%s}}" % (tr, tn))
          else:
            pagemsg("WARNING: No param in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("add accented translit %s to {{%s}}" % (tr, tn))
        elif re.search(u"[\u0900-\u097F]", decltr): # translit is actually Devanagari
          if not accented_tr:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with unaccented tr %s from head or pagename: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            # Argument order must follow the format string: template name, then translit.
            notes.append("replace Devanagari in {{%s}} with (unaccented) translit %s" % (tn, tr))
          else:
            pagemsg("WARNING: Devanagari in {{%s}}, replacing with accented tr %s from head: headt=%s, declt=%s" % (tn, tr, unicode(headt), unicode(t)))
            t.add("1", tr)
            notes.append("replace Devanagari in {{%s}} with accented translit %s" % (tn, tr))
        else:
          decompdecltr = unicodedata.normalize("NFD", decltr).replace("s" + AC, u"ś")
          subbed = False
          if AC not in decompdecltr and GR not in decompdecltr:
            if accented_tr:
              pagemsg("WARNING: Saw translit %s in decl without accent, subbing accented tr %s from head: headt=%s, declt=%s" %
                  (decltr, tr, unicode(headt), unicode(t)))
              t.add("1", tr)
              notes.append("replace existing translit %s with accented translit %s in {{%s}}" % (decltr, tr, tn))
              subbed = True
            else:
              pagemsg("WARNING: Saw translit %s in decl without accent and unable to replace with accented tr from head: headt=%s, declt=%s" %
                  (decltr, unicode(headt), unicode(t)))
          if not subbed and "-" in decltr:
            pagemsg("WARNING: Saw translit %s in decl with hyphen: headt=%s, declt=%s" %
                (decltr, unicode(headt), unicode(t)))
            notes.append("remove hyphen from existing translit %s in {{%s}}" % (decltr, tn))
            decltr = decltr.replace("-", "")
            t.add("1", decltr)
            subbed = True
          stripped_decltr = decltr.strip()
          if "\n" not in decltr and stripped_decltr != decltr:
            pagemsg("WARNING: Saw translit '%s' in decl with extraneous space: headt=%s, declt=%s" %
                (decltr, unicode(headt), unicode(t)))
            notes.append("remove extraneous space from existing translit '%s' in {{%s}}" % (decltr, tn))
            decltr = stripped_decltr
            t.add("1", decltr)
            subbed = True
        continue

      if tn in [u"sa-decl-noun-ī", u"sa-decl-noun-ī-f"] and getparam(t, "mono"):
        pagemsg("WARNING: Saw mono=, skipping: headt=%s, declt=%s" % (unicode(headt), unicode(t)))
        continue

      if tn in old_template_to_gender:
        must_continue = False
        for param in t.params:
          pn = pname(param)
          if pn not in ["1", "2", "3", "4", "n"]:
            pagemsg("WARNING: Saw unknown param %s=%s in %s: headt=%s" % (pn, unicode(param.value), unicode(t),
              unicode(headt)))
            must_continue = True
            break
        if must_continue:
          continue

        g = old_template_to_gender[tn]
        if g not in genders:
          pagemsg("WARNING: Saw decl gender %s that disagrees with headword gender(s) %s: headt=%s, declt=%s" % (
            g, ",".join(genders), unicode(headt), unicode(t)))
          continue

        blib.set_template_name(t, "sa-decl-noun-%s" % g)
        rmparam(t, "n")
        rmparam(t, "4")
        rmparam(t, "3")
        rmparam(t, "2")
        t.add("1", tr)
        notes.append("convert {{%s}} to {{sa-decl-noun-%s}}" % (tn, g))
      else:
        pagemsg("WARNING: Saw unrecognized decl template: %s" % unicode(t))

    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  # Only complain when the last headword really had no declension after it;
  # headt is never reset, so testing it alone would warn on every page.
  if headt and not saw_decl:
    pagemsg("WARNING: Saw {{sa-noun}} without {{sa-decl-noun-*}}: %s" % unicode(headt))

  return unicode(parsed), notes
# Example #26
# 0
def process_text_on_page(index, pagetitle, text):
    """Replace {{pt-noun form of}} with {{plural of|pt}} or {{female equivalent of|pt}}.

    The replacement is chosen from 2= (gender) and 3= (number):

    * gender m/mf/m-f/onlym/onlyf with number pl -> {{plural of|pt|LEMMA}}
    * gender f with number sg -> {{female equivalent of|pt|LEMMA}}
    * gender f with number pl -> {{plural of|pt|SINGULAR}}, where SINGULAR is
      the page title with its final "s" removed (title must end in "as")

    A gloss in t= is carried over as 4= (with an empty 3=).

    Returns None if `text` does not contain the template, or if any
    {{pt-noun form of}} has an unrecognized parameter, a 4= value, or a
    gender/number combination that is not handled (the whole page is then
    skipped). Otherwise returns a tuple (new_text, notes).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def singularize_feminine_noun(noun, tempstr):
        # Strip the plural -s from a feminine noun in -as; warn and return
        # None for any other ending.
        if noun.endswith("as"):
            return noun[:-1]
        pagemsg(
            "WARNING: Don't know how to compute singular equivalent of feminine noun %s: %s"
            % (noun, tempstr))
        return None

    if not re.search(r"\{\{pt-noun form of", text):
        return

    pagemsg("Processing")

    notes = []

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)

        if tn == "pt-noun form of":
            for param in t.params:
                pn = pname(param)
                if pn not in ["1", "2", "3", "4", "t", "nocap", "nodot"]:
                    pagemsg("WARNING: Saw unrecognized param %s=%s: %s" %
                            (pn, unicode(param.value), origt))
                    return

            lemma = blib.remove_links(getparam(t, "1"))
            gender = getparam(t, "2")
            number = getparam(t, "3")
            dimaug = getparam(t, "4")
            gloss = getparam(t, "t")
            if dimaug:
                pagemsg("WARNING: Not sure what to do with 4=%s: %s" %
                        (dimaug, origt))
                return
            if gender in ["m", "mf", "m-f", "onlym", "onlyf"]:
                if number == "sg":
                    pagemsg("WARNING: Not sure what to do with 2=%s 3=%s: %s" %
                            (gender, number, origt))
                    return
                if number != "pl":
                    pagemsg("WARNING: Unrecognized number 3=%s: %s" %
                            (number, origt))
                    return
                newname = "plural of"
            elif gender != "f":
                pagemsg("WARNING: Unrecognized gender 2=%s: %s" %
                        (gender, origt))
                return
            elif number == "sg":
                newname = "female equivalent of"
            elif number != "pl":
                pagemsg("WARNING: Unrecognized number 3=%s: %s" %
                        (number, origt))
                return
            else:
                # Feminine plural: link to the feminine singular, derived
                # from the page title rather than from 1=.
                lemma = singularize_feminine_noun(pagetitle, origt)
                if not lemma:
                    return
                newname = "plural of"
            del t.params[:]
            blib.set_template_name(t, newname)
            t.add("1", "pt")
            t.add("2", lemma)
            if gloss:
                t.add("3", "")
                t.add("4", gloss)
            notes.append("replace {{pt-noun form of}} with {{%s|pt}}" %
                         newname)
        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes
# Example #27
# 0
def process_text_on_page(index, pagetitle, text):
    """Convert German adjective-form entries to use {{de-adj form of}}.

    Only pages containing a {{head|de|...}} call for an adjective form
    (plain, comparative or superlative) or participle form are considered,
    and only when the German section has no "Etymology 1" header.

    Within the German section:

    * {{inflection of|de|LEMMA||TAGS...}} (or {{infl of}}) whose tags are a
      key of `tags_to_ending` is rewritten as {{de-adj form of|LEMMA}}, with
      the ending added as 2= when
      `check_if_lemma_and_ending_match_pagetitle` reports that it cannot be
      inferred. The head template's 2= is switched to the comparative or
      superlative adjective form category when the tags contain "comd" or
      "supd".
    * If {{comparative of}} / {{superlative of}} is present with no
      {{inflection of}}, the head is switched to "comparative adjective" /
      "superlative adjective" and a new Adjective subsection with
      {{de-adj form of}} is appended after it.
    * If an inflection with ending -sten/-esten is present without a
      {{superlative of}}, a {{superlative of|de|LEMMA}} subsection is
      inserted before it.

    Returns None when the page is skipped (see the WARNING messages);
    otherwise returns a tuple (new_text, notes).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if not re.search(
            r"\{\{head\|de\|(adjective (|comparative |superlative )|participle )form",
            text):
        return

    pagemsg("Processing")

    notes = []

    retval = blib.find_modifiable_lang_section(text, "German", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find German section")
        return
    sections, j, secbody, sectail, has_non_lang = retval

    if re.search("== *Etymology 1 *==", secbody):
        pagemsg("WARNING: Multiple etymology sections, skipping")
        return

    parsed = blib.parse_text(secbody)

    headt = None
    comparative_of_t = None
    superlative_of_t = None
    inflection_of_t = None
    # Lemma for which a {{superlative of}} subsection may need to be added.
    need_superlative_of_t_lemma = None
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)

        def do_comparative_superlative_of(pos, existing_t, should_end):
            # Validate the current {{comparative of}}/{{superlative of}}
            # template and fix up the head template's 2=. Returns the
            # template on success, False if the page must be skipped.
            if getparam(t, "1") != "de":
                pagemsg(
                    "WARNING: Saw wrong language in {{%s of}}, skipping: %s" %
                    (pos, origt))
                return False
            if existing_t:
                pagemsg(
                    "WARNING: Saw two {{%s of}} templates, skipping: %s and %s"
                    % (pos, unicode(existing_t), origt))
                return False
            if not headt:
                pagemsg(
                    "WARNING: Saw {{%s of}} without head template, skipping: %s"
                    % (pos, origt))
                return False
            if not pagetitle.endswith(should_end):
                pagemsg(
                    "WARNING: Incorrect ending for %s, should be -%s, skipping"
                    % (pos, should_end))
                return False
            param2 = getparam(headt, "2")
            if param2 != "%s adjective" % pos:
                headt.add("2", "%s adjective" % pos)
                notes.append(
                    "convert {{head|de|%s}} to {{head|de|%s adjective}}" %
                    (param2, pos))
            return t

        if tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") in [
                "adjective form", "adjective comparative form",
                "adjective superlative form", "participle form"
        ]:
            if headt:
                pagemsg(
                    "WARNING: Saw two head templates, skipping: %s and %s" %
                    (unicode(headt), origt))
                return
            headt = t
        elif tn == "head" and getparam(t, "1") == "de" and getparam(
                t, "2") == "verb form":
            pagemsg("Allowing and ignoring {{head|de|verb form}}: %s" % origt)
        elif tn == "head":
            pagemsg("WARNING: Saw unrecognized head template, skipping: %s" %
                    origt)
            return
        elif tn == "comparative of":
            comparative_of_t = do_comparative_superlative_of(
                "comparative", comparative_of_t, "er")
            if not comparative_of_t:
                return
        elif tn == "superlative of":
            superlative_of_t = do_comparative_superlative_of(
                "superlative", superlative_of_t, "sten")
            if not superlative_of_t:
                return
        elif tn == "de-adj form of":
            pagemsg("Saw {{de-adj form of}}, assuming already converted: %s" %
                    origt)
            return
        elif tn in ["inflection of", "infl of"]:
            if getparam(t, "1") != "de":
                pagemsg(
                    "WARNING: Saw wrong language in {{inflection of}}, skipping: %s"
                    % origt)
                return
            if not headt:
                pagemsg(
                    "WARNING: Saw {{inflection of}} without head template, skipping: %s"
                    % origt)
                return
            if inflection_of_t:
                pagemsg(
                    "WARNING: Saw {{inflection of}} twice, skipping: %s and %s"
                    % (unicode(inflection_of_t), origt))
                return
            inflection_of_t = t
            lemma = getparam(t, "2")
            if getparam(t, "3"):
                pagemsg(
                    "WARNING: Saw alt form in {{inflection of}}, skipping: %s"
                    % origt)
                return
            # Params 4 and up are the inflection tags.
            infl_tags = []
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^[0-9]+$", pn):
                    pagemsg(
                        "WARNING: Saw unrecognized param %s=%s in {{inflection of}}, skipping: %s"
                        % (pn, pv, origt))
                    return
                if int(pn) >= 4:
                    infl_tags.append(pv)
            tags = "|".join(infl_tags)
            if tags not in tags_to_ending:
                pagemsg(
                    "WARNING: Saw unrecognized tags in {{inflection of}}, skipping: %s"
                    % origt)
                return
            del t.params[:]
            ending = tags_to_ending[tags]
            if ending in ["sten", "esten"]:
                need_superlative_of_t_lemma = lemma
            blib.set_template_name(t, "de-adj form of")
            t.add("1", lemma)

            no_explicit = check_if_lemma_and_ending_match_pagetitle(
                lemma, ending, pagetitle, allow_umlaut=True)
            if not no_explicit:
                pagemsg("WARNING: Explicit ending %s required for lemma %s" %
                        (ending, lemma))
                t.add("2", ending)
            notes.append(
                "convert {{inflection of|de|...}} to {{de-adj form of}}")
            if "comd" in tags:
                param2 = getparam(headt, "2")
                if param2 != "comparative adjective form":
                    headt.add("2", "comparative adjective form")
                    notes.append(
                        "convert {{head|de|%s}} to {{head|de|comparative adjective form}}"
                        % param2)
            elif "supd" in tags:
                param2 = getparam(headt, "2")
                if param2 != "superlative adjective form":
                    headt.add("2", "superlative adjective form")
                    notes.append(
                        "convert {{head|de|%s}} to {{head|de|superlative adjective form}}"
                        % param2)

    secbody = unicode(parsed)

    def add_adj_form_of(secbody, pos, comparative_superlative_t, ending):
        # Append an Adjective subsection with {{de-adj form of}} right after
        # the given {{comparative of}}/{{superlative of}} template. Returns
        # (secbody, ok); ok is False only if the text replacement failed.
        lemma = getparam(comparative_superlative_t, "2")
        if check_if_lemma_and_ending_match_pagetitle(lemma,
                                                     ending,
                                                     pagetitle,
                                                     allow_umlaut=False):
            form_pos = "superlative adjective form" if pos == "superlative" else "adjective form"
            newsec = """

===Adjective===
{{head|de|%s}}

# {{de-adj form of|%s}}""" % (form_pos, lemma)
            secbody, replaced = blib.replace_in_text(
                secbody,
                unicode(comparative_superlative_t),
                unicode(comparative_superlative_t) + newsec,
                pagemsg,
                abort_if_warning=True)
            if not replaced:
                # Report the template that was actually passed in, which may
                # be the superlative one.
                pagemsg("WARNING: Couldn't add -%s inflection, skipping: %s" %
                        (ending, unicode(comparative_superlative_t)))
                return secbody, False
            notes.append("add {{de-adj form of}} for %s" % pos)
        else:
            pagemsg(
                "WARNING: Lemma %s + %s ending %s doesn't match pagetitle" %
                (lemma, pos, ending))
        return secbody, True

    if comparative_of_t and not inflection_of_t:
        secbody, ok = add_adj_form_of(secbody, "comparative", comparative_of_t,
                                      "er")
        if not ok:
            return

    if superlative_of_t and not inflection_of_t:
        secbody, ok = add_adj_form_of(secbody, "superlative", superlative_of_t,
                                      "sten")
        if not ok:
            return

    if inflection_of_t and not superlative_of_t and need_superlative_of_t_lemma:
        # inflection_of_t has already been rewritten in place, so this is
        # the text of the new {{de-adj form of}} call.
        cursec = """===Adjective===
{{head|de|superlative adjective form}}

# %s""" % unicode(inflection_of_t)
        newsec = """===Adjective===
{{head|de|superlative adjective}}

# {{superlative of|de|%s}}

""" % need_superlative_of_t_lemma
        secbody, replaced = blib.replace_in_text(secbody,
                                                 cursec,
                                                 newsec + cursec,
                                                 pagemsg,
                                                 abort_if_warning=True)
        if not replaced:
            pagemsg("WARNING: Couldn't add {{superlative of}}, skipping: %s" %
                    unicode(inflection_of_t))
            return
        notes.append("add {{superlative of|de|...}}")

    sections[j] = secbody + sectail
    text = "".join(sections)

    if not notes:
        pagemsg("WARNING: Couldn't convert page")

    return text, notes
# Example #28
# 0
def process_text_on_page(index, pagetitle, text):
  """Report the Italian pronunciation respellings found (or defaulted) on a page.

  This function only logs; it always returns None and does not modify
  `text`. The Italian section (or the whole text when `args.partial_page`
  is set) is split into subsections. For each ==Pronunciation== subsection
  the params of {{it-IPA}} and {{it-pr}} are collected and logged as a line
  of the form

      <respelling> ETYMSECTION: RESPELLING ... <end> MSG ...

  where ETYMSECTION is "all" (no numbered etymology sections), "top"
  (before ==Etymology 1==) or the etymology number. Where no pronunciation
  exists, the result of `apply_default_pronun(pagetitle)` is logged instead.
  With `args.include_defns`, definitions found by `blib.find_defns` are
  appended to the messages for numbered etymology sections.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "Italian", pagemsg,
    force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  # Odd indices hold the headers, even indices the text that follows them.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  has_etym_sections = "==Etymology 1==" in secbody
  saw_pronun_section_at_top = False
  split_pronun_sections = False
  saw_pronun_section_this_etym_section = False
  saw_existing_pron = False
  saw_existing_pron_this_etym_section = False

  etymsection = "top" if has_etym_sections else "all"
  # Maps etymology number -> index in `subsections` of its first body chunk.
  etymsections_to_first_subsection = {}
  if etymsection == "top":
    after_etym_1 = False
    for k in xrange(2, len(subsections), 2):
      if "==Etymology 1==" in subsections[k - 1]:
        after_etym_1 = True
      if "==Pronunciation==" in subsections[k - 1]:
        if after_etym_1:
          split_pronun_sections = True
        else:
          saw_pronun_section_at_top = True
      m = re.search("==Etymology ([0-9]*)==", subsections[k - 1])
      if m:
        etymsections_to_first_subsection[int(m.group(1))] = k

  msgs = []

  def append_msg(txt):
    if txt not in msgs:
      msgs.append(txt)

  def apply_default_pronun_to_pagetitle():
    respellings, this_msgs = apply_default_pronun(pagetitle)
    # Not named `msg`, to avoid shadowing the module-level logging function.
    for this_msg in this_msgs:
      append_msg(this_msg)
    return respellings

  def fix_ref(m):
    # Abbreviate <ref:{{R:it:NAME|PARAMS}}> to <r:NAME|PARAMS>.
    refname, refparams = m.groups()
    refname = "Treccani" if refname == "Trec" else refname
    return "<r:%s%s>" % (refname, refparams or "")

  # Defined before the loop (not inside it) so that the call after the loop
  # works even when there are no subsections and the loop body never runs.
  def check_missing_pronun(etymsection):
    if split_pronun_sections and not saw_existing_pron_this_etym_section:
      pagemsg("WARNING: Missing pronunciations in etym section %s" % etymsection)
      append_msg("MISSING_PRONUN")
      append_msg("NEW_DEFAULTED")
      respellings = apply_default_pronun_to_pagetitle()
      pagemsg("<respelling> %s: %s <end> %s" % (etymsection, " ".join(respellings), " ".join(msgs)))

  for k in xrange(2, len(subsections), 2):
    msgs = []

    m = re.search("==Etymology ([0-9]*)==", subsections[k - 1])
    if m:
      if etymsection != "top":
        check_missing_pronun(etymsection)
      etymsection = m.group(1)
      saw_pronun_section_this_etym_section = False
      saw_existing_pron_this_etym_section = False
    if "==Pronunciation " in subsections[k - 1]:
      pagemsg("WARNING: Saw Pronunciation N section header: %s" % subsections[k - 1].strip())
    if "==Pronunciation==" in subsections[k - 1]:
      if saw_pronun_section_this_etym_section:
        pagemsg("WARNING: Saw two Pronunciation sections under etym section %s" % etymsection)
      if saw_pronun_section_at_top and etymsection != "top":
        pagemsg("WARNING: Saw Pronunciation sections both at top and in etym section %s" % etymsection)
      saw_pronun_section_this_etym_section = True
      parsed = blib.parse_text(subsections[k])

      respellings = []
      prev_it_IPA_t = None
      prev_it_pr_t = None
      must_continue = False
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "it-IPA":
          saw_existing_pron = True
          saw_existing_pron_this_etym_section = True
          if prev_it_IPA_t:
            pronun_lines = re.findall(r"^.*\{\{it-IPA.*$", subsections[k], re.M)
            pagemsg("WARNING: Saw multiple {{it-IPA}} templates in a single Pronunciation section: %s" %
              " ||| ".join(pronun_lines))
            must_continue = True
            break
          prev_it_IPA_t = t
          this_respellings = []
          saw_pronun = False
          last_numbered_param = 0
          for param in t.params:
            pn = pname(param)
            pv = unicode(param.value).strip().replace(" ", "_")
            if re.search("^[0-9]+$", pn):
              last_numbered_param += 1
              saw_pronun = True
              if pv == "+":
                append_msg("EXISTING_DEFAULTED")
                this_respellings.extend(apply_default_pronun_to_pagetitle())
              else:
                append_msg("EXISTING")
                this_respellings.append(pv)
            elif re.search("^ref[0-9]*$", pn) and int(pn[3:] or "1") == last_numbered_param:
              # A refN= belonging to the respelling just seen; abbreviate
              # known reference templates to n:NAME|PARAMS.
              m = re.search(r"^\{\{R:it:(DiPI|Olivetti|Treccani|Trec)(\|[^{}]*)?\}\}$", pv)
              if m:
                refname, refparams = m.groups()
                refname = "Treccani" if refname == "Trec" else refname
                this_respellings.append("n:%s%s" % (refname, refparams or ""))
              else:
                this_respellings.append("%s=%s" % (pn, pv))
            else:
              this_respellings.append("%s=%s" % (pn, pv))
          if not saw_pronun:
            append_msg("EXISTING_DEFAULTED")
            this_respellings.extend(apply_default_pronun_to_pagetitle())
          respellings.extend(this_respellings)
        if tn == "it-pr":
          saw_existing_pron = True
          saw_existing_pron_this_etym_section = True
          if prev_it_pr_t:
            pronun_lines = re.findall(r"^.*\{\{it-pr.*$", subsections[k], re.M)
            pagemsg("WARNING: Saw multiple {{it-pr}} templates in a single Pronunciation section: %s" %
              " ||| ".join(pronun_lines))
            must_continue = True
            break
          prev_it_pr_t = t
          this_respellings = []
          saw_pronun = False
          for param in t.params:
            pn = pname(param)
            pv = unicode(param.value).strip().replace(" ", "_")
            if re.search("^[0-9]+$", pn):
              saw_pronun = True
              # Unlike for {{it-IPA}}, a "+" respelling is passed through
              # as-is rather than expanded to the default pronunciation.
              pv = re.sub(r"<ref:\{\{R:it:(DiPI|Olivetti|Treccani|Trec|DOP)(\|[^{}]*)?\}\}>", fix_ref, pv)
              append_msg("EXISTING")
              this_respellings.append(pv)
            else:
              this_respellings.append("%s=%s" % (pn, pv))
          if not saw_pronun:
            append_msg("EXISTING_DEFAULTED")
            this_respellings.append("+")
          respellings.extend(this_respellings)
      if must_continue:
        continue

      if args.include_defns and etymsection not in ["top", "all"]:
        first_etym_subsec = etymsections_to_first_subsection.get(int(etymsection), None)
        next_etym_subsec = etymsections_to_first_subsection.get(1 + int(etymsection), None)
        if first_etym_subsec is None:
          pagemsg("WARNING: Internal error: Unknown first etym section for =Etymology %s=" % etymsection)
        else:
          if next_etym_subsec is None:
            next_etym_subsec = len(subsections)
          defns = blib.find_defns("".join(subsections[first_etym_subsec:next_etym_subsec]), "it")
          append_msg("defns: %s" % ";".join(defns))

      if respellings:
        pagemsg("<respelling> %s: %s <end> %s" % (etymsection, " ".join(respellings), " ".join(msgs)))

  check_missing_pronun(etymsection)
  if not saw_existing_pron:
    if args.include_defns and has_etym_sections:
      for etymsec in sorted(etymsections_to_first_subsection):
        msgs = []
        first_etym_subsec = etymsections_to_first_subsection[etymsec]
        next_etym_subsec = etymsections_to_first_subsection.get(1 + etymsec, None)
        if next_etym_subsec is None:
          next_etym_subsec = len(subsections)
        append_msg("NEW_DEFAULTED")
        defns = blib.find_defns("".join(subsections[first_etym_subsec:next_etym_subsec]), "it")
        append_msg("defns: %s" % ";".join(defns))
        respellings = apply_default_pronun_to_pagetitle()
        pagemsg("<respelling> %s: %s <end> %s" % (etymsec, " ".join(respellings), " ".join(msgs)))
    else:
      msgs = []
      append_msg("NEW_DEFAULTED")
      respellings = apply_default_pronun_to_pagetitle()
      pagemsg("<respelling> %s: %s <end> %s" % ("top" if has_etym_sections else "all", " ".join(respellings), " ".join(msgs)))
def process_page(page, index, parsed):
    """Rewrite Bulgarian adjective-form templates as their generic equivalents.

    Two passes are made over the templates in ``parsed``:

    1. Every {{bg-adj-form}} becomes {{head|bg|adjective form}}. An explicit
       head= is kept; if there is none, the page title is used as head= unless
       ``bglib.needs_accents(pagetitle)`` is true, in which case a warning is
       logged and no head= is added. The old 3= is carried over as g=.
    2. Every {{bg-adj form of}} that follows an adjective-form head template
       becomes {{inflection of|bg|<adj>||<tags...>}}, where the tags are derived
       from the old 2=, 3= and 1= values (in that order).

    Templates carrying unrecognized parameters or values are left untouched
    and a warning is logged.

    Args:
        page: Page object; only ``page.title()`` is used here.
        index: Page index, used as a prefix in log messages.
        parsed: Parsed page text exposing ``filter_templates()``; its templates
            are modified in place.

    Returns:
        Tuple ``(new_text, notes)``: the stringified ``parsed`` and the list of
        change notes accumulated while converting.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def first_unrecognized_param(template, allowed):
        # Return the first parameter whose name is not in `allowed`, else None.
        for param in template.params:
            if pname(param) not in allowed:
                return param
        return None

    def warn_unrecognized_param(param, origt):
        pagemsg("WARNING: Saw unrecognized param %s=%s: %s" %
                (pname(param), unicode(param.value), origt))

    notes = []

    pagemsg("Processing")

    # Pass 1: {{bg-adj-form}} -> {{head|bg|adjective form}}.
    for t in parsed.filter_templates():
        if tname(t) != "bg-adj-form":
            continue
        origt = unicode(t)
        bad_param = first_unrecognized_param(t, ["1", "2", "3", "head"])
        # Compare against None explicitly: a parameter object may be falsy
        # when its string form is empty.
        if bad_param is not None:
            warn_unrecognized_param(bad_param, origt)
            continue
        # Read what we want to keep before wiping the old parameters.
        head = getparam(t, "head")
        g = getparam(t, "3")
        for old_param in ["1", "2", "head", "3"]:
            rmparam(t, old_param)
        blib.set_template_name(t, "head")
        t.add("1", "bg")
        t.add("2", "adjective form")
        if head:
            t.add("head", head)
        elif bglib.needs_accents(pagetitle):
            pagemsg(
                "WARNING: Can't add head= to {{bg-adj-form}} missing it because pagetitle is multisyllabic: %s"
                % unicode(t))
        else:
            t.add("head", pagetitle)
        if g:
            t.add("g", g)
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append(
            "replace {{bg-adj-form}} with {{head|bg|adjective form}}")

    # Old parameter values -> inflection tags for {{inflection of}}.
    definiteness_tags = {
        "indefinite": "indef",
        "definite": "def",
        "extended": "voc",
    }
    case_tags = {
        "subject": "sbjv",
        "object": "objv",
    }
    gender_number_tags = {
        "masculine": ["m", "s"],
        "feminine": ["f", "s"],
        "neuter": ["n", "s"],
        "plural": ["p"],
    }

    # Pass 2: {{bg-adj form of}} -> {{inflection of|bg|...}}, while checking
    # that head templates and inflection templates alternate sensibly.
    headt = None
    saw_infl_after_head = False
    saw_headt = False
    saw_inflt = False
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if (tn == "head" and getparam(t, "1") == "bg"
                and getparam(t, "2") == "adjective form"):
            saw_headt = True
            if headt and not saw_infl_after_head:
                pagemsg(
                    "WARNING: Saw two head templates %s and %s without intervening inflection"
                    % (unicode(headt), origt))
            saw_infl_after_head = False
            headt = t
        if tn == "bg-adj form of":
            saw_inflt = True
            if not headt:
                pagemsg(
                    "WARNING: Saw {{bg-adj form of}} without head template: %s"
                    % origt)
                continue
            bad_param = first_unrecognized_param(t, ["1", "2", "3", "adj"])
            if bad_param is not None:
                warn_unrecognized_param(bad_param, origt)
                continue
            saw_infl_after_head = True
            adj = getparam(t, "adj")
            if not adj:
                pagemsg("WARNING: Didn't see adj=: %s" % origt)
                continue

            # Validate 2=, 3=, 1= in that order so warnings come out in the
            # same order as the tags are emitted.
            param2 = getparam(t, "2")
            if param2 not in definiteness_tags:
                pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
                continue
            infls = [definiteness_tags[param2]]

            param3 = getparam(t, "3")
            if param3 in case_tags:
                infls.append(case_tags[param3])
            elif param3:
                # 3= is optional; only a non-empty unknown value is an error.
                pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
                continue

            param1 = getparam(t, "1")
            if param1 not in gender_number_tags:
                pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
                continue
            infls.extend(gender_number_tags[param1])

            blib.set_template_name(t, "inflection of")
            del t.params[:]
            t.add("1", "bg")
            if adj in adjs_to_accents:
                adj = adjs_to_accents[adj]
            else:
                pagemsg(
                    "WARNING: Unable to find accented equivalent of %s: %s" %
                    (adj, origt))
            t.add("2", adj)
            t.add("3", "")
            # Inflection tags occupy positional params 4, 5, ...
            for param_no, infl in enumerate(infls, 4):
                t.add(str(param_no), infl)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("convert {{bg-adj form of}} to {{inflection of}}")
        elif tn == "inflection of" and getparam(t, "1") == "bg":
            saw_inflt = True

    if saw_headt and not saw_inflt:
        pagemsg("WARNING: Saw head template %s but no inflection template" %
                unicode(headt))

    return unicode(parsed), notes
# Example #30
# 0
def process_text_on_page(index, pagetitle, text):
  """Convert {{it-IPA}} in ==Pronunciation== sections to {{it-pr}}.

  The section to modify is located with blib.find_modifiable_lang_section()
  (the "Italian" section unless program_args.partial_page is set) and split
  into subsections. For each Pronunciation subsection that holds exactly one
  {{it-IPA}} and no {{it-pr}}/{{it-pronunciation}}, a replacement {{it-pr}}
  call is built. Accompanying {{audio}}, {{rhymes}}, {{hyph}} and {{homophones}}
  lines are either folded into inline modifiers (<audio:...>, <rhyme:...>,
  <hyph:...>, <hmp:...>) on the last respelling or dropped when they match the
  automatically derived value; lines that cannot be folded in are kept after
  the new {{it-pr}} line. A {{wiki...}} line is moved to the start of the
  enclosing etymology section. Subsections that cannot be handled are left
  alone and a warning is logged.

  Args:
    index: Page index, used as a prefix in log messages.
    pagetitle: Title of the page being processed.
    text: Full page text.

  Returns:
    Tuple (new_text, notes), or None if no modifiable section was found.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose)
  def verify_template_is_full_line(tn, line):
    # Return the template if `line` consists of exactly one template whose name is `tn` (or one of `tn`, if a
    # list); otherwise log a warning and return None.
    templates = list(blib.parse_text(line).filter_templates())
    if isinstance(tn, list):
      tns = tn
    else:
      tns = [tn]
    tntext = "/".join(tns)
    if len(templates) == 0:
      pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" % (tntext, line))
      return None
    t = templates[0]
    if tname(t) not in tns:
      pagemsg("WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s" %
          (tntext, tntext, line))
      return None
    if unicode(t) != line:
      pagemsg("WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s" % (tntext, tntext, line))
      return None
    return t

  notes = []

  retval = blib.find_modifiable_lang_section(text, None if program_args.partial_page else "Italian", pagemsg,
    force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  # Odd indices of `subsections` are headers, even indices are the bodies that follow them.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  # Index of the subsection body that a {{wikipedia}} line should be moved into: the body right after the most
  # recent "Etymology N" header, or the text before any header if there is none.
  sect_for_wiki = 0
  for k in xrange(1, len(subsections), 2):
    if re.search(r"==\s*Etymology [0-9]+\s*==", subsections[k]):
      sect_for_wiki = k + 1
    elif re.search(r"==\s*Pronunciation\s*==", subsections[k]):
      secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation", subsections[k])
      if secheader != subsections[k]:
        subsections[k] = secheader
        notes.append("remove extraneous spaces in ==Pronunciation== header")
      extra_notes = []
      parsed = blib.parse_text(subsections[k + 1])
      num_it_IPA = 0
      saw_it_pr = False
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["it-pr", "it-pronunciation"]:
          saw_it_pr = True
          break
        if tn == "it-IPA":
          num_it_IPA += 1
      if saw_it_pr:
        pagemsg("Already saw {{it-pr}}, skipping: %s" % unicode(t))
        continue
      if num_it_IPA == 0:
        pagemsg("WARNING: Didn't see {{it-IPA}} in Pronunciation section, skipping")
        continue
      if num_it_IPA > 1:
        pagemsg("WARNING: Saw multiple {{it-IPA}} in Pronunciation section, skipping")
        continue
      lines = subsections[k + 1].strip().split("\n")
      # Remove blank lines.
      lines = [line for line in lines if line]
      hyph_lines = []
      homophone_lines = []
      rfap_lines = []
      rhyme_lines = []
      must_continue = False
      audioarg = ""
      args = []
      bare_args = []
      # Reset per section so that state from an earlier Pronunciation section can never leak into this one.
      normalized_bare_args = []
      ipat = None
      args_for_hyph = []
      lines_so_far = []
      for lineind, line in enumerate(lines):
        origline = line
        lines_so_far.append(line)
        # In case of "* {{it-IPA|...}}", chop off the "* ".
        line = re.sub(r"^\*\s*(\{\{it-IPA)", r"\1", line)
        if line.startswith("{{it-IPA"):
          if args:
            pagemsg("WARNING: Something wrong, already saw {{it-IPA}}?: %s" % origline)
            must_continue = True
            break
          # A trailing <ref>...</ref> after the template is split off and later attached to the last respelling.
          outer_ref_arg = None
          m = re.search("^(.*?) *<ref>(.*?)</ref>$", line)
          if m:
            line, outer_ref_arg = m.groups()
          ipat = verify_template_is_full_line("it-IPA", line)
          if ipat is None:
            must_continue = True
            break
          bare_args = blib.fetch_param_chain(ipat, "1") or [u"+"]
          bare_args = [u"+" if arg == pagetitle else arg for arg in bare_args]
          bare_args = [adjust_initial_capital(arg, pagetitle, pagemsg, origline) for arg in bare_args]
          bare_args = [re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], arg) for arg in bare_args]
          normalized_bare_args = [
            normalize_bare_arg(arg, pagetitle, lambda msg: pagemsg("%s: %s" % (msg, origline)))
            for arg in bare_args
          ]
          if None in normalized_bare_args:
            must_continue = True
            break
          # `args` starts as a copy of the respellings and accumulates inline modifiers below.
          args = list(bare_args)

          args_for_hyph = []
          for arg in normalized_bare_args:
            hypharg = (
              arg.replace("ddz", "zz").replace("tts", "zz").replace("dz", "z").replace("ts", "z")
              .replace("Dz", "Z").replace("Ts", "Z").replace("[s]", "s").replace("[z]", "z")
            )
            hypharg = re.sub(pron_sign_c, "", hypharg)
            putative_pagetitle = remove_secondary_stress(hypharg.replace(".", "").replace("_", ""))
            putative_pagetitle = remove_non_final_accents(putative_pagetitle)
            # Check if the normalized pronunciation is the same as the page title, if so use the semi-normalized
            # pronunciation for hyphenation. If a word in the page title is a single syllable, it may or may not
            # have an accent on it, so also remove final monosyllabic accents from the normalized pronunciation
            # when comparing. (Don't remove from both normalized pronunciation and page title because we don't want
            # pronunciation rè to match page title ré or vice versa.)
            if putative_pagetitle == pagetitle or remove_final_monosyllabic_accents(putative_pagetitle) == pagetitle:
              args_for_hyph.append(hypharg)

          # Fold refN=/qualN= into inline modifiers on the corresponding respelling; anything else is unsupported.
          for param in ipat.params:
            pn = pname(param)
            pv = unicode(param.value)
            if re.search("^[0-9]+$", pn):
              continue
            m = re.search("^(ref|qual)([0-9]*)$", pn)
            if m:
              parampref, argnum = m.groups()
              argnum = int(argnum or "1") - 1
              if argnum >= len(args):
                pagemsg("WARNING: Argument %s=%s specifies nonexistent pronun, skipping: %s" % (
                  pn, pv, origline))
                must_continue = True
                break
              args[argnum] += "<%s:%s>" % (parampref, pv)
            else:
              pagemsg("WARNING: Unrecognized param %s=%s in {{it-IPA}}, skipping: %s" % (
                pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break
          if outer_ref_arg:
            if "<ref:" in args[-1]:
              pagemsg("WARNING: Trying to add outside ref %s into {{it-IPA}} but already has ref in arg %s, skipping: %s"
                  % (outer_ref_arg, args[-1], origline))
              must_continue = True
              break
            else:
              args[-1] += "<ref:%s>"  % outer_ref_arg
              extra_notes.append("incorporate outer <ref>...</ref> into {{it-pr}}")
          continue
        if line.startswith("{{rfap"):
          line = "* " + line
        if line.startswith("{{wiki"):
          subsections[sect_for_wiki] = line + "\n" + subsections[sect_for_wiki]
          # Remove the {{wikipedia}} line from lines seen so far. Put back the remaining lines in case we
          # run into a problem later on, so we don't end up duplicating the {{wikipedia}} line. We accumulate
          # lines like this in case for some reason we have two {{wikipedia}} lines in the Pronunciation section.
          del lines_so_far[-1]
          subsections[k + 1] = "%s\n\n" % "\n".join(lines_so_far + lines[lineind + 1:])
          notes.append("move {{wikipedia}} line to top of etym section")
          continue
        if not line.startswith("* ") and not line.startswith("*{"):
          pagemsg("WARNING: Pronunciation section line doesn't start with '* ', skipping: %s"
              % origline)
          must_continue = True
          break
        if line.startswith("* "):
          line = line[2:]
        else:
          line = line[1:]
        if line.startswith("{{hyph"):
          hyph_lines.append("* " + line)
        elif line.startswith("{{homophone"):
          homophone_lines.append("* " + line)
        elif line.startswith("{{rfap"):
          rfap_lines.append(line)
        elif line.startswith("{{audio"):
          audiot = verify_template_is_full_line("audio", line)
          if audiot is None:
            must_continue = True
            break
          if getparam(audiot, "1") != "it":
            pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % origline)
            must_continue = True
            break
          audiofile = getparam(audiot, "2")
          audiogloss = getparam(audiot, "3")
          for param in audiot.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pn not in ["1", "2", "3"]:
              pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (
                pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break
          # A gloss of just "Audio"/"audio" carries no information, so it is dropped.
          if audiogloss in ["Audio", "audio"]:
            audiogloss = ""
          if audiogloss:
            audiogloss = ";%s" % audiogloss
          audiopart = "<audio:%s%s>" % (audiofile, audiogloss)
          audioarg += audiopart
          pagemsg("Replacing %s with argument part %s" % (unicode(audiot), audiopart))
          extra_notes.append("incorporate %s into {{it-pr}}" % unicode(audiot))
        elif line.startswith("{{rhyme"):
          rhyme_lines.append(line)
        elif remove_accents(line) == remove_accents(pagetitle):
          pagemsg("Ignoring Pronunciation section line that looks like a possibly-accented page title: %s" % origline)
        else:
          pagemsg("WARNING: Unrecognized Pronunciation section line, skipping: %s" % origline)
          must_continue = True
          break
      if must_continue:
        continue

      # This guard must come before the rhyme handling: that code reads `normalized_bare_args` and may append to
      # `args[-1]`, both of which are only valid once the {{it-IPA}} line itself has been processed.
      if not args:
        pagemsg("WARNING: Something wrong, didn't see {{it-IPA}}?")
        continue

      if rhyme_lines:
        rhyme_error = False
        rhyme_pronuns = []
        # Work out the rhyme implied by each respelling: everything from the vowel of the last stressed
        # syllable onward in the phonemic output.
        for bare_arg in normalized_bare_args:
          pronun = expand_text(u"{{#invoke:it-pronunciation|to_phonemic_bot|%s}}" % re.sub(pron_sign_c, "", bare_arg))
          if not pronun:
            rhyme_error = True
            break
          rhyme_pronun = (
            re.sub(u"^[^aeiouɛɔ]*", "", re.sub(u".*[ˌˈ]", "", pronun)).replace(TIE, "")
            .replace(".", ""))
          if rhyme_pronun not in rhyme_pronuns:
            rhyme_pronuns.append(rhyme_pronun)
        if not rhyme_error:
          saw_non_matching_rhyme = False
          normalized_rhymes = []
          rhyme_line_text = ", ".join(rhyme_lines)
          normalized_bare_arg_text = ",".join(normalized_bare_args)
          rhyme_pronun_text = ",".join(rhyme_pronuns)
          for rhyme_line in rhyme_lines:
            rhymet = verify_template_is_full_line(["rhyme", "rhymes"], rhyme_line)
            if not rhymet:
              break
            if getparam(rhymet, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(rhymet), rhyme_line))
              break
            # Each entry of `rhymes` is a mutable [rhyme, syllable-count modifier] pair.
            rhymes = []
            must_break = False
            num_syl = ""
            rhyme_specific_num_syl = []
            for param in rhymet.params:
              pn = pname(param)
              pv = unicode(param.value)
              if not re.search("^s?[0-9]*$", pn):
                pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" %
                    (pn, pv, tname(rhymet), rhyme_line))
                must_break = True
                break
              if pn == "s":
                num_syl = "<s:%s>" % pv
              elif pn.startswith("s"):
                rhyme_no = int(pn[1:]) - 1
                rhyme_specific_num_syl.append((rhyme_no, pv))
              elif int(pn) > 1:
                if pv:
                  rhymes.append([pv, ""])
            if must_break:
              break
            for rhyme_no, this_num_syl in rhyme_specific_num_syl:
              if rhyme_no >= len(rhymes):
                pagemsg("WARNING: Argument s%s=%s specifies nonexistent rhyme, skipping: %s" % (
                  rhyme_no + 1, this_num_syl, rhyme_line))
                must_break = True
                break
              rhymes[rhyme_no][1] = "<s:%s>" % this_num_syl
            if must_break:
              break
            for rhyme, this_num_syl in rhymes:
              normalized_rhyme = re.sub(u"([aeɛoɔu])i", r"\1j", rhyme).replace("sm", "zm")
              normalized_rhyme = re.sub(u"a[uu̯](" + C + ")", r"aw\1", normalized_rhyme)
              this_num_syl = this_num_syl or num_syl
              if this_num_syl and not args_for_hyph and not hyph_lines:
                pagemsg("WARNING: Explicit number of syllables %s given for explicit rhyme %s and no default or explicit hyphenation: %s"
                    % (this_num_syl, rhyme, rhyme_line_text))
                saw_non_matching_rhyme = True
                normalized_rhymes.append(normalized_rhyme + this_num_syl)
              else:
                normalized_rhymes.append(normalized_rhyme)
                if rhyme in rhyme_pronuns:
                  pagemsg("Removing explicit rhyme %s, same as pronunciation-based rhyme for spelling(s) '%s': %s"
                      % (rhyme, normalized_bare_arg_text, rhyme_line_text))
                elif normalized_rhyme in rhyme_pronuns:
                  pagemsg("Removing explicit rhyme %s normalized to %s, same as pronunciation-based rhyme for spelling(s) '%s': %s"
                      % (rhyme, normalized_rhyme, normalized_bare_arg_text, rhyme_line_text))
                elif rhyme != normalized_rhyme:
                  pagemsg("WARNING: Explicit rhyme %s normalized to %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s"
                      % (rhyme, normalized_rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                  saw_non_matching_rhyme = True
                else:
                  pagemsg("WARNING: Explicit rhyme %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s"
                      % (rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                  saw_non_matching_rhyme = True
          else: # no break
            # Every rhyme line was parsed successfully, so the lines can be dropped: either they are redundant
            # or they get carried over as an explicit <rhyme:...> modifier.
            if saw_non_matching_rhyme:
              pagemsg("Not all explicit rhymes (%s) could be matched against pronunciation-based rhyme(s) (%s) for spelling(s) '%s', adding explicitly: %s"
                  % (",".join(normalized_rhymes), rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
              args[-1] += "<rhyme:%s>" % ",".join(normalized_rhymes)
              extra_notes.append("incorporate non-default rhymes into {{it-pr}}")
            else:
              extra_notes.append("remove rhymes that are generated automatically by {{it-pr}}")
            rhyme_lines = []

      # Audio modifiers go after any <rhyme:...> modifier and before <hyph:...>/<hmp:...>.
      args[-1] += audioarg

      if hyph_lines:
        if len(hyph_lines) > 1:
          pagemsg("WARNING: Multiple hyphenation lines, not removing: %s" % ", ".join(hyph_lines))
        else:
          assert hyph_lines[0].startswith("* ")
          hyph_line = hyph_lines[0][2:]
          hyph_templates = re.split(", *", hyph_line)
          hyphs = []
          for hyph_template in hyph_templates:
            hypht = verify_template_is_full_line(["hyph", "hyphenation"], hyph_template)
            if not hypht:
              break
            syls = []
            if getparam(hypht, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hypht), hyph_template))
              break
            else:
              must_break = False
              for param in hypht.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^[0-9]+$", pn) and pn != "nocaption":
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" %
                      (pn, pv, tname(hypht), hyph_line))
                  must_break = True
                  break
                if pn != "nocaption" and int(pn) > 1:
                  # An empty positional param separates alternative hyphenations within one template.
                  if not pv:
                    hyphs.append(syls)
                    syls = []
                  else:
                    syls.append(pv)
              if must_break:
                break
              if syls:
                hyphs.append(syls)
          else: # no break
            if hyphs:
              specified_hyphenations = [".".join(syls) for syls in hyphs]
              specified_hyphenations = [
                re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [re.sub("''+", "", hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [
                adjust_initial_capital(hyph, pagetitle, pagemsg, hyph_line) for hyph in specified_hyphenations]
              specified_hyphenations = [re.sub(u"î([ -]|$)", r"i\1", hyph) for hyph in specified_hyphenations]
              hyphenations = [syllabify_from_spelling(arg) for arg in args_for_hyph]
              if set(specified_hyphenations) < set(hyphenations):
                pagemsg("Removing explicit hyphenation(s) %s that are a subset of auto-hyphenation(s) %s: %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
              elif set(specified_hyphenations) != set(hyphenations):
                hyphenations_without_accents = [remove_accents(hyph) for hyph in hyphenations]
                rehyphenated_specified_hyphenations = [syllabify_from_spelling(hyph) for hyph in specified_hyphenations]
                def indices_of_syllable_markers(hyph):
                  # Get the character indices of the syllable markers, but not counting the syllable markers themselves
                  # (i.e. return the number of characters preceding the syllable marker).
                  raw_indices = [ind for ind, ch in enumerate(hyph) if ch == "."]
                  adjusted_indices = [ind - offset for offset, ind in enumerate(raw_indices)]
                  return set(adjusted_indices)
                if set(specified_hyphenations) == set(hyphenations_without_accents):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing accents but otherwise same as auto-hyphenation(s) %s: %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif set(rehyphenated_specified_hyphenations) == set(hyphenations):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified by rehyphenation): %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif (len(specified_hyphenations) == 1 and len(hyphenations) == 1
                    and specified_hyphenations[0].replace(".", "") == hyphenations[0].replace(".", "")
                    and indices_of_syllable_markers(specified_hyphenations[0]) < indices_of_syllable_markers(hyphenations[0])):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified that explicit hyphenation indices are subset of auto-hyphenation indices): %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                else:
                  if not hyphenations:
                    pagemsg("WARNING: Explicit hyphenation(s) %s but no auto-hyphenations, adding explicitly: %s" %
                        (",".join(specified_hyphenations), hyph_line))
                  else:
                    pagemsg("WARNING: Explicit hyphenation(s) %s not equal to auto-hyphenation(s) %s, adding explicitly: %s" %
                        (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                  args[-1] += "<hyph:%s>" % ",".join(specified_hyphenations)
                  extra_notes.append("incorporate non-default hyphenations into {{it-pr}}")
              else:
                pagemsg("Removed explicit hyphenation(s) same as auto-hyphenation(s): %s" % hyph_line)
                extra_notes.append("remove hyphenations that are generated automatically by {{it-pr}}")
              hyph_lines = []

      if homophone_lines:
        if len(homophone_lines) > 1:
          pagemsg("WARNING: Multiple homophone lines, not removing: %s" % ", ".join(homophone_lines))
        else:
          assert homophone_lines[0].startswith("* ")
          homophone_line = homophone_lines[0][2:]
          # Both dicts are keyed by 1-based homophone number (positional param N+1 pairs with qN=).
          homophones = {}
          homophone_qualifiers = {}
          hmpt = verify_template_is_full_line(["hmp", "homophone", "homophones"], homophone_line)
          if hmpt:
            if getparam(hmpt, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hmpt), homophone_line))
            else:
              for param in hmpt.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^q?[0-9]+$", pn):
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" %
                      (pn, pv, tname(hmpt), homophone_line))
                  break
                if pn.startswith("q"):
                  homophone_qualifiers[int(pn[1:])] = pv
                elif int(pn) > 1:
                  homophones[int(pn) - 1] = pv
              else: # no break
                hmp_args = []
                for pn, pv in sorted(homophones.items()):
                  hmp_args.append(pv)
                  if pn in homophone_qualifiers:
                    hmp_args[-1] += "<qual:%s>" % homophone_qualifiers[pn]
                args[-1] += "<hmp:%s>" % ",".join(hmp_args)
                extra_notes.append("incorporate homophones into {{it-pr}}")
                homophone_lines = []

      # A lone default respelling needs no explicit argument.
      if args == ["+"]:
        it_pr = "{{it-pr}}"
      else:
        it_pr = "{{it-pr|%s}}" % ",".join(args)
      pagemsg("Replaced %s with %s" % (unicode(ipat), it_pr))

      # Any rhyme/hyphenation/homophone lines still in their lists could not be folded in and are kept as-is.
      all_lines = "\n".join([it_pr] + rhyme_lines + rfap_lines + hyph_lines + homophone_lines)
      newsubsec = "%s\n\n" % all_lines
      if subsections[k + 1] != newsubsec:
        this_notes = ["convert {{it-IPA}} to {{it-pr}}"] + extra_notes
        notes.extend(this_notes)
      subsections[k + 1] = newsubsec

  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes