Example 1
0
def process_text_on_page(index, pagetitle, text):
  # Process one page's German section, dispatching either per numbered
  # etymology subsection or over the whole section body; returns
  # (newtext, notes) when something changed, else None.
  global args

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  langname = None if args.partial_page else "German"
  retval = blib.find_modifiable_lang_section(text, langname, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  if "=Etymology 1=" not in secbody:
    # Single etymology: process the whole section body at once.
    retval = process_text_in_section(index, pagetitle, secbody)
    if retval:
      secbody, notes = retval
      sections[j] = secbody + sectail
      return "".join(sections), notes
    return

  # Multiple etymologies: run each numbered subsection separately.
  # Even indices >= 2 of the split are the per-etymology bodies.
  notes = []
  etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
  for k in xrange(2, len(etym_sections), 2):
    result = process_text_in_section(index, pagetitle, etym_sections[k])
    if result:
      etym_sections[k], secnotes = result
      notes.extend(secnotes)
  sections[j] = "".join(etym_sections) + sectail
  return "".join(sections), notes
def process_text_on_page(index, pagetitle, text):
    """Move any ==Anagrams== subsection to the end of the language section.

    Returns (newtext, notes), or None if the language section is absent.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    langsec = None if args.partial_page else args.langname
    retval = blib.find_modifiable_lang_section(
        text, langsec, pagemsg, force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    # Odd indices of the split hold '==...==' header lines, even indices
    # the bodies that follow them.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    for k in xrange(1, len(subsections), 2):
        if not re.search("==Anagrams==", subsections[k]):
            continue
        if k + 2 < len(subsections):
            # Rotate the Anagrams header and body to the end.
            subsections = (subsections[:k] + subsections[k + 2:] +
                           subsections[k:k + 2])
            notes.append("put Anagrams last in %s section" % args.langname)

    secbody = "".join(subsections)
    # Strip the extra newlines that force_final_nls=True appended.
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
Example 3
0
def process_page(page, index, parsed):
    """Add pos=noun to Hungarian {{compound}}/{{affix}}/{{af}} templates
    that lack a pos= parameter, skipping pages that have an
    'Alternative forms' section. Returns (newtext, notes) or None.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    pagemsg("Processing")
    text = unicode(page.text)

    retval = blib.find_modifiable_lang_section(text, "Hungarian", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find Hungarian section")
        return
    sections, j, secbody, sectail, has_non_lang = retval
    if "==Alternative forms==" in secbody:
        pagemsg("WARNING: Skipping page with 'Alternative forms' section")
        return

    parsed = blib.parse_text(secbody)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        is_affix_tmpl = tn in ["compound", "affix", "af"]
        if is_affix_tmpl and getparam(t, "1") == "hu" and not getparam(t, "pos"):
            t.add("pos", "noun")
        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("add pos=noun to {{%s|hu}}" % tn)
    sections[j] = unicode(parsed) + sectail
    return "".join(sections), notes
Example 4
0
def process_page(page, index, parsed):
    """Clean up the Chinese section of a page.

    Two fixes are applied: (1) remove several specific bad external links
    (see [[Wiktionary:Grease pit/2019/September#Requesting bot help]]);
    (2) delete References subsections that have no content.
    Returns (newtext, notes), or (None, None) if no Chinese section exists.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        # Log with page index and title prefixed.
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = blib.find_modifiable_lang_section(text, "Chinese", pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    # Peel off the section's trailing newlines; they are re-added verbatim
    # at the end so the original spacing is preserved.
    m = re.search(r"\A(.*?)(\n*)\Z", secbody, re.S)
    secbody, secbody_finalnl = m.groups()
    secbody += "\n\n"

    notes = []

    # Remove known-bad external links, each expected on its own bulleted line.
    new_secbody = secbody
    new_secbody = re.sub(
        r"^\* http://www\.trade\.gov\.bt/administration/mktbriefs/10\.pdf\n",
        "", new_secbody, 0, re.M)
    new_secbody = re.sub(
        r"^\* http://www\.koreantk\.com/en/m_sta/med_stat_search\.jsp\?searchGbn=statis\n",
        "", new_secbody, 0, re.M)
    new_secbody = re.sub(r"^\* http://www1\.dict\.li/?\n", "", new_secbody, 0,
                         re.M)
    # When the bad link shares a bullet with other text, keep the remainder.
    new_secbody = re.sub(r"^\* http://www1\.dict\.li/ and ", "* ", new_secbody,
                         0, re.M)
    if new_secbody != secbody:
        notes.append(
            "remove bad Chinese links (see [[Wiktionary:Grease pit/2019/September#Requesting bot help]])"
        )
        secbody = new_secbody
    # Odd indices of the split hold '==...==' header lines, even indices
    # the bodies that follow them.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    # Collect indices of empty References sections first, then delete them
    # in reverse so earlier indices remain valid during deletion.
    subsections_to_delete = []
    for k in xrange(1, len(subsections), 2):
        if (subsections[k] in ["===References===\n", "====References====\n"]
                and not subsections[k + 1].strip()):
            subsections_to_delete.append(k)
    if subsections_to_delete:
        for k in reversed(subsections_to_delete):
            del subsections[k:k + 2]
        notes.append("remove empty References section")

    secbody = "".join(subsections)
    sections[j] = secbody.rstrip("\n") + secbody_finalnl + sectail
    return "".join(sections), notes
Example 5
0
def process_page_for_modification(index, pagetitle, text, new_pronuns):
    """Insert pronunciation(s) from new_pronuns into the Old English section.

    If the page has multiple etymology sections whose headwords differ, a
    pronunciation section is added to each etymology section (header level
    4); otherwise a single pronunciation section is added for the whole
    Old English section (header level 3). Returns (newtext, changelog) or
    None when the page is not in new_pronuns or lacks an Old English
    section.
    """
    if pagetitle not in new_pronuns:
        return

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    retval = blib.find_modifiable_lang_section(text, "Old English", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find Old English section")
        return
    sections, j, secbody, sectail, has_non_lang = retval
    heads = None
    if "Etymology 1" in secbody:
        # Even indices >= 2 of the split are the per-etymology bodies;
        # odd indices are the '===Etymology N===' header lines.
        etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0,
                                 re.M)
        for k in xrange(2, len(etym_sections), 2):
            parsed = blib.parse_text(etym_sections[k])
            # Gather the distinct headword(s) used in this etym section.
            secheads = []
            for t in parsed.filter_templates():
                this_heads = get_head_param(t, pagetitle)
                if this_heads:
                    this_heads = [blib.remove_links(x) for x in this_heads]
                    for head in this_heads:
                        if head not in secheads:
                            secheads.append(head)
            if heads is None:
                # First etym section seen: remember its heads as baseline.
                heads = secheads
            elif set(heads) != set(secheads):
                # Heads differ between etym sections, so one shared pronun
                # section won't do: add one per etym section and return.
                pagemsg(
                    "Saw head(s) %s in one etym section and %s in another, splitting pronuns per etym section"
                    % (",".join(heads), ",".join(secheads)))
                for k in xrange(2, len(etym_sections), 2):
                    etym_sections[k] = process_section_for_modification(
                        index, pagetitle, etym_sections[k], 4,
                        new_pronuns[pagetitle])
                sections[j] = "".join(etym_sections) + sectail
                return "".join(
                    sections), "add pronunciation(s) to Old English lemma(s)"
        pagemsg(
            "All etym sections have same head(s) %s, creating a single pronun section"
            % ",".join(heads))
    # Either a single etymology or all etym sections agree on heads: add
    # one pronunciation section covering the whole language section.
    secbody = process_section_for_modification(index, pagetitle, secbody, 3,
                                               new_pronuns[pagetitle])
    sections[j] = secbody + sectail
    return "".join(sections), "add pronunciation(s) to Old English lemma(s)"
def process_page(page, index, parsed):
    """Normalize Japanese 'Compounds' sections.

    Renames ====Compounds==== headers to ====Derived terms==== and then
    moves each Derived terms section after any immediately following run
    of Synonyms/Antonyms sections (see [[Wiktionary:Grease
    pit/2019/September#Requesting bot help]]). Returns (newtext, notes)
    or None if there is no Japanese section.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = blib.find_modifiable_lang_section(text,
                                               "Japanese",
                                               pagemsg,
                                               force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_latin = retval

    notes = []

    # Rename the header first so the reordering pass below only has to
    # look for 'Derived terms'.
    newsecbody = re.sub("^====Compounds====$", "====Derived terms====",
                        secbody, 0, re.M)
    if newsecbody != secbody:
        notes.append(
            "Compounds -> Derived terms in Japanese section (see [[Wiktionary:Grease pit/2019/September#Requesting bot help]])"
        )
        secbody = newsecbody

    # Odd indices of the split hold '==...==' header lines, even indices
    # the bodies that follow them.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    for k in xrange(1, len(subsections), 2):
        if subsections[k] == "====Derived terms====\n":
            # Scan past the run of Synonyms/Antonyms headers directly
            # following the Derived terms section (step 2 = header+body).
            endk = k + 2
            while endk < len(subsections) and (re.search(
                    "^====(Synonyms|Antonyms)====\n$", subsections[endk])):
                endk += 2
            if endk > k + 2:
                # Move the Synonyms/Antonyms run in front of Derived terms.
                subsections = (subsections[0:k] + subsections[k + 2:endk] +
                               subsections[k:k + 2] + subsections[endk:])
                notes.append("reorder Derived terms after Synonyms/Antonyms")

    secbody = "".join(subsections)
    # Strip extra newlines added to secbody
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
def process_text_on_page(index, pagetitle, text):
  """Fix Hungarian sections where a ===Verb=== header covers both a verb
  form and a participle.

  With args.split_participle set, the {{participle of}} definition(s) are
  split into their own ===Participle=== section (after first trying to
  reorder {{inflection of}} before {{participle of}}); otherwise the whole
  section is relabeled from Verb to Participle. Returns (newtext, notes)
  or None if there is no Hungarian section.
  """
  global args

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  notes = []

  retval = blib.find_modifiable_lang_section(text, "Hungarian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Hungarian section")
    return
  sections, j, secbody, sectail, has_non_lang = retval

  # Odd indices of the split hold '==...==' header lines, even indices
  # the bodies that follow them.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  for k in xrange(1, len(subsections), 2):
    if ("===Verb===" in subsections[k] and "{{head|hu|verb form" in subsections[k + 1] and
      "{{participle of|hu|" in subsections[k + 1]):
      if args.split_participle:
        # Swap adjacent defn lines so {{inflection of}} precedes
        # {{participle of}}, letting the section split cleanly below.
        newsubsec = re.sub(r"^(#.*\{\{participle of\|hu\|.*)\n(#.*\{\{inflection of\|hu\|.*)\n\n", r"\2\n\1\n\n",
            subsections[k + 1], 0, re.M)
        if newsubsec != subsections[k + 1]:
          notes.append("reorder {{inflection of|hu|...}} before {{participle of|hu|...}}")
          subsections[k + 1] = newsubsec
        elif re.search(r"\{\{participle of\|hu\|.*\{\{inflection of\|hu\|", subsections[k + 1], re.S):
          # Reorder didn't apply yet participle-of still precedes
          # inflection-of — presumably intervening content (e.g. usage
          # examples), so leave this section alone.
          pagemsg("WARNING: Saw {{participle of|hu|...}} before {{inflection of|hu|...}} with likely usage examples")
          continue
      if args.split_participle and "{{inflection of|hu|" in subsections[k + 1]:
        # Insert a new Participle section header right before the first
        # {{participle of}} definition line.
        subsections[k + 1] = re.sub(r"^(#.*\{\{participle of\|hu\|)", r"\n===Participle===\n{{head|hu|participle}}\n\n\1", subsections[k + 1], 0, re.M)
        notes.append("split Hungarian verb form from participle")
      else:
        # Not splitting (flag unset, or no {{inflection of}} present):
        # relabel the whole section as a participle.
        subsections[k] = subsections[k].replace("===Verb===", "===Participle===")
        subsections[k + 1] = re.sub(r"\{\{head\|hu\|verb form", "{{head|hu|participle", subsections[k + 1])
        notes.append("Hungarian verb form -> participle in section with {{participle of}}")

  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
Example 8
0
def process_page(page, index, parsed):
  """Run process_section over each etymology section (or over the whole
  Belarusian section) and, if anything changed, strip the
  {{cln|be|(in)transitive verbs}} category templates. Returns
  (newtext, notes) or None if no Belarusian section is found.
  """
  notes = []
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  text = unicode(page.text)
  retval = blib.find_modifiable_lang_section(text, "Belarusian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Belarusian section")
    return
  sections, j, secbody, sectail, has_non_lang = retval

  if "Etymology 1" not in secbody:
    # Single etymology: process the whole section body at once.
    secbody, this_notes = process_section(index, pagetitle, secbody)
    notes.extend(this_notes)
  else:
    # Even indices >= 2 of the split are the per-etymology bodies.
    etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
    for k in xrange(2, len(etym_sections), 2):
      etym_sections[k], this_notes = process_section(index, pagetitle, etym_sections[k])
      notes.extend(this_notes)
    secbody = "".join(etym_sections)

  sections[j] = secbody + sectail
  if notes:
    # Something changed: drop the manual transitivity categories.
    sections[j] = re.sub(r"\{\{cln\|be\|(in)?transitive verbs\}\}\n?", "", sections[j])
  return "".join(sections), notes
def process_text_on_page(index, pagetitle, text):
    """Swap any section whose header matches headers_to_swap_regex with an
    immediately following ==Translations== section. Returns
    (newtext, notes), or None if the English section is absent.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    retval = blib.find_modifiable_lang_section(
        text,
        None if args.partial_page else "English",
        pagemsg,
        force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    # Odd indices of the split hold '==...==' header lines, even indices
    # the bodies that follow them.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    for k in xrange(1, len(subsections) - 2, 2):
        header_hit = re.search(r"==%s==" % headers_to_swap_regex,
                               subsections[k])
        if header_hit and re.search("==Translations==", subsections[k + 2]):
            # Record the note before swapping so it names the original order.
            notes.append("swap %s and %s sections" %
                         (subsections[k].strip(), subsections[k + 2].strip()))
            # Exchange both the headers and their bodies.
            subsections[k], subsections[k + 2] = (subsections[k + 2],
                                                  subsections[k])
            subsections[k + 1], subsections[k + 3] = (subsections[k + 3],
                                                      subsections[k + 1])

    secbody = "".join(subsections)
    # Strip the extra newlines that force_final_nls=True appended.
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
Example 10
0
def process_text_on_page(index, pagetitle, text):
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    retval = blib.find_modifiable_lang_section(
        text,
        None if args.partial_page else "Italian",
        pagemsg,
        force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    need_ref_section = False

    for k in xrange(2, len(subsections), 2):
        if "==Pronunciation==" in subsections[k - 1]:
            parsed = blib.parse_text(subsections[k])

            all_pronun_templates = []
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn == "it-pr" or tn == "IPA" and getparam(t, "1") == "it":
                    all_pronun_templates.append(t)

            saw_it_pr = False
            pronun_based_respellings = []
            for t in parsed.filter_templates():
                origt = unicode(t)

                def tmsg(txt):
                    other_templates = []
                    for t in all_pronun_templates:
                        thist = unicode(t)
                        if thist != origt:
                            other_templates.append(thist)
                    pagemsg("%s: %s%s" % (txt, origt, ", other templates %s" %
                                          ", ".join(other_templates)
                                          if len(other_templates) > 0 else ""))

                tn = tname(t)
                if tn == "it-pr":
                    saw_it_pr = True
                    respellings = blib.fetch_param_chain(t, "1")
                    # FIXME, need to split on comma
                    pronun_based_respellings.extend(respellings)
                    break
                if tn == "IPA" and getparam(t, "1") == "it":
                    saw_it_pr = True
                    pronuns = blib.fetch_param_chain(t, "2")
                    this_phonemic_pronun = None
                    this_phonemic_respelling = None
                    this_phonetic_pronun = None
                    this_phonetic_respelling = None
                    respellings = []
                    all_warnings = []
                    hack_respelling_warnings = []
                    main_warnings = []
                    unable = [False]
                    for pronun in pronuns:
                        respelling = ipa_to_respelling(pronun)
                        respelling, this_hack_respelling_warnings = hack_respelling(
                            pagetitle, respelling)
                        hack_respelling_warnings.extend(
                            this_hack_respelling_warnings)

                        def set_unable(msg):
                            main_warnings.append(msg)
                            unable[0] = True

                        tmsg("For pronun %s, generated respelling %s" %
                             (pronun, respelling))
                        respelling_words = respelling.split(" ")
                        for rw in respelling_words:
                            if rw.endswith("-"):  # prefix
                                continue
                            hacked_rw = re.sub(
                                u".[\u0323\u0331]", "e", rw
                            )  # pretend vowels with secondary or no stress are 'e'
                            if not re.search(
                                    u"[àèéìòóùÀÈÉÌÒÓÙ]", hacked_rw) and len(
                                        re.sub("[^aeiouAEIOU]", "",
                                               hacked_rw)) > 1:
                                set_unable(
                                    "WARNING: For respelling %s for pronun %s, word %s is missing stress"
                                    % (respelling, pronun, rw))
                        if not re.search(u"^[a-zA-ZàèéìòóùÀÈÉÌÒÓÙ. ʒʃ\[\]-]+$",
                                         respelling):
                            set_unable(
                                "WARNING: Strange char in respelling %s for pronun %s"
                                % (respelling, pronun))
                        else:
                            putative_pagetitle = re.sub(
                                u"([àèéìòóùÀÈÉÌÒÓÙ])([^ ])",
                                lambda m: vowel_respelling_to_spelling[m.group(
                                    1)] + m.group(2), respelling)
                            pagetitle_words = pagetitle.split(" ")
                            putative_pagetitle_words = putative_pagetitle.split(
                                " ")
                            if len(pagetitle_words) != len(
                                    putative_pagetitle_words):
                                set_unable(
                                    "WARNING: Page title has %s words but putative page title %s has %s words"
                                    %
                                    (len(pagetitle_words), putative_pagetitle,
                                     len(putative_pagetitle_words)))
                            else:
                                hacked_putative_pagetitle_words = []
                                for ptw, puptw in zip(
                                        pagetitle_words,
                                        putative_pagetitle_words):
                                    split_ptw = re.split("([Zz]+)", ptw)
                                    split_puptw = re.split(
                                        "([Tt]?[Tt]s|[Dd]?[Dd]z)", puptw)
                                    if len(split_ptw) != len(split_puptw):
                                        set_unable(
                                            "WARNING: Different # of z's in pagetitle word %s vs. (t)ts/(d)dz's in putative pagetitle word %s"
                                            % (ptw, puptw))
                                        hacked_putative_pagetitle_words.append(
                                            puptw)
                                    else:
                                        parts = []
                                        for i in xrange(len(split_puptw)):
                                            if i % 2 == 0:
                                                parts.append(split_puptw[i])
                                            else:
                                                parts.append(split_ptw[i])
                                        hacked_putative_pagetitle_words.append(
                                            "".join(parts))
                                putative_pagetitle = " ".join(
                                    hacked_putative_pagetitle_words)
                                if putative_pagetitle != pagetitle:
                                    # If respelling already seen, we already warned about it.
                                    if respelling in respellings:
                                        assert unable[0]
                                    else:
                                        set_unable(
                                            "WARNING: Respelling %s doesn't match page title (putative page title %s, pronun %s)"
                                            % (respelling, putative_pagetitle,
                                               pronun))

                        def append_respelling(respelling):
                            if respelling not in respellings:
                                respellings.append(respelling)

                        def append_warnings(warning):
                            if warning:
                                all_warnings.append(warning)
                            for warning in hack_respelling_warnings:
                                all_warnings.append(warning)
                            del hack_respelling_warnings[:]
                            for warning in main_warnings:
                                all_warnings.append(warning)
                            del main_warnings[:]

                        append_respelling(respelling)
                        if pronun.startswith("/"):
                            if this_phonemic_pronun is not None:
                                append_warnings(
                                    "WARNING: Saw two phonemic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonetic pronun"
                                    % (this_phonemic_pronun,
                                       this_phonemic_respelling, pronun,
                                       respelling))
                            this_phonemic_pronun = pronun
                            this_phonemic_respelling = respelling
                            this_phonetic_pronun = None
                            this_phonetic_respelling = None
                        elif pronun.startswith("["):
                            if this_phonemic_pronun is None:
                                if this_phonetic_pronun is not None:
                                    unable[0] = True
                                    append_warnings(
                                        "WARNING: Saw two phonetic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonemic pronun"
                                        % (this_phonetic_pronun,
                                           this_phonetic_respelling, pronun,
                                           respelling))
                                else:
                                    append_warnings(
                                        "WARNING: Saw phonetic pronun %s (respelling %s) without preceding phonemic pronun"
                                        % (pronun, respelling))
                                this_phonetic_pronun = pronun
                                this_phonetic_respelling = respelling
                            elif this_phonemic_respelling != respelling:
                                unable[0] = True
                                append_warnings(
                                    "WARNING: Phonemic respelling %s (pronun %s) differs from phonetic respelling %s (pronun %s)"
                                    %
                                    (this_phonemic_respelling,
                                     this_phonemic_pronun, respelling, pronun))
                            else:
                                if unable[0] and len(main_warnings) > 0:
                                    # `unable` could be set from a previous pronunciation but no main warnings this time around
                                    # because the previously generated warnings have already been appended to all_warnings.
                                    mesg = main_warnings[0]
                                    del main_warnings[0]
                                    append_warnings(mesg)
                                else:
                                    append_warnings(None)
                            this_phonemic_pronun = None
                            this_phonemic_respelling = None
                        else:
                            unable[0] = True
                            append_warnings(
                                "WARNING: Pronun %s (respelling %s) not marked as phonemic or phonetic"
                                % (pronun, respelling))
                    if this_phonemic_pronun is not None:
                        append_warnings(
                            "WARNING: Saw phonemic pronun %s (respelling %s) without corresponding phonetic pronun"
                            % (this_phonemic_pronun, this_phonemic_respelling))
                    if not unable[0]:
                        for param in t.params:
                            pn = pname(param)
                            if not re.search("^[0-9]+$",
                                             pn) and pn != "nocount":
                                unable[0] = True
                                append_warnings(
                                    "WARNING: Saw unrecognized param %s=%s" %
                                    (pn, unicode(param.value)))
                    manual_assist = ""
                    if unable[0]:
                        if pagetitle in ipa_directives:
                            respellings = ipa_directives[pagetitle]
                            unable[0] = False
                            manual_assist = " (manually assisted)"
                            tmsg(
                                "%sUsing manually-specified IPA-based respelling%s %s; original warnings follow: %s"
                                % ("[MULTIPLE PRONUN TEMPLATES] "
                                   if len(all_pronun_templates) > 1 else "",
                                   "s" if len(respellings) > 1 else "",
                                   ",".join(respellings),
                                   " ||| ".join(all_warnings)))
                        else:
                            tmsg("%s<respelling> %s <end> %s" %
                                 ("[MULTIPLE PRONUN TEMPLATES] "
                                  if len(all_pronun_templates) > 1 else "",
                                  " ".join(respellings),
                                  " ||| ".join(all_warnings)))
                    if not unable[0]:
                        del t.params[:]
                        nextparam = 0
                        for param in respellings:
                            if "=" in param:
                                paramname, paramval = param.split("=", 1)
                            else:
                                nextparam += 1
                                paramname = str(nextparam)
                                paramval = param
                            if re.search("^n[0-9]*$", paramname):
                                need_ref_section = True
                            t.add(paramname, paramval)
                        blib.set_template_name(t, "it-pr")
                        notes.append(
                            "replace raw {{IPA|it}} with {{it-pr|%s}}%s" %
                            ("|".join(respellings), manual_assist))
                    pronun_based_respellings.extend(respellings)
                if unicode(t) != origt:
                    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            subsections[k] = unicode(parsed)

            rhymes_template = None
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn in ["rhyme", "rhymes"] and getparam(t, "1") == "it":
                    if rhymes_template:
                        pagemsg(
                            "WARNING: Saw two {{rhymes|it}} templates: %s and %s"
                            % (unicode(rhymes_template), unicode(t)))
                    rhymes_template = t
            if rhymes_template:
                rhyme_based_respellings = []
                all_warnings = []

                def append_respelling(respelling):
                    if respelling not in rhyme_based_respellings:
                        rhyme_based_respellings.append(respelling)

                def append_warnings(warning):
                    all_warnings.append(warning)

                rhymes = blib.fetch_param_chain(rhymes_template, "2")
                unable = False
                for rhy in rhymes:
                    spellings = rhyme_to_spelling(rhy)
                    matched = False
                    bad_rhyme_msgs = []
                    for ending, ending_respelling in spellings:
                        if pagetitle.endswith(ending):
                            prevpart = pagetitle[:-len(ending)]
                            respelling = prevpart + ending_respelling
                            saw_oso_ese = False
                            if ending_respelling == u"óso":
                                saw_oso_ese = True
                                append_respelling(respelling)
                                append_respelling("#" + prevpart + u"ó[s]o")
                            elif ending_respelling == u"ése":
                                saw_oso_ese = True
                                append_respelling(respelling)
                                append_respelling("#" + prevpart + u"é[s]e")
                            else:
                                if respelling.endswith(u"zióne"):
                                    new_respelling = re.sub(
                                        u"zióne$", u"tsióne", respelling)
                                    pagemsg(
                                        "Replaced respelling '%s' with '%s'" %
                                        (respelling, new_respelling))
                                    respelling = new_respelling
                                    prevpart = respelling[:-len(
                                        ending)] + ending_respelling
                                append_respelling(respelling)
                            if (re.search(u"[aeiouàèéìòóù]s([aeiouàèéìòóù]|$)",
                                          prevpart.lower())
                                    or not saw_oso_ese and re.search(
                                        u"[aeiouàèéìòóù][sz][aeiouàèéìòóù]",
                                        ending_respelling.lower())):
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to /s/ or /z/ between vowels: %s"
                                    % rhy)
                                unable = True
                                break
                            if "z" in prevpart:
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to z in part before rhyme: %s"
                                    % rhy)
                                unable = True
                                break
                            hacked_prevpart = re.sub("([gq])u", r"\1w",
                                                     prevpart)
                            hacked_prevpart = hacked_prevpart.replace(
                                "gli", "gl")
                            hacked_prevpart = re.sub("([cg])i", r"\1",
                                                     hacked_prevpart)
                            if re.search("[^aeiou][iu]([aeiou]|$)",
                                         hacked_prevpart.lower()):
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to hiatus in part before rhyme %s"
                                    % rhy)
                                unable = True
                                break
                            if re.search(u"[aeiouàèéìòóù]i([^aeiouàèéìòóù]|$)",
                                         respelling.lower()):
                                append_warnings(
                                    "WARNING: Unable to add pronunciation due to falling diphthong in -i: %s"
                                    % rhy)
                                unable = True
                                break
                            matched = True
                            break
                        else:
                            bad_rhyme_msgs.append(
                                "WARNING: Unable to match rhyme %s, spelling %s, respelling %s"
                                % (rhy, ending, ending_respelling))
                    if not matched and not unable and bad_rhyme_msgs:
                        for bad_rhyme_msg in bad_rhyme_msgs:
                            pagemsg(bad_rhyme_msg)
                if rhyme_based_respellings:
                    if not saw_it_pr:
                        manual_assist = ""
                        if pagetitle in rhyme_directives:
                            rhyme_based_respellings = rhyme_directives[
                                pagetitle]
                            manual_assist = " (manually assisted)"
                            pagemsg(
                                "Using manually-specified rhyme-based respelling%s %s; original warnings follow: %s: %s"
                                % ("s" if len(rhyme_based_respellings) > 1 else
                                   "", ",".join(rhyme_based_respellings),
                                   " ||| ".join(all_warnings),
                                   unicode(rhymes_template)))
                            subsections[k] = "* {{it-pr|%s}}\n" % ",".join(
                                rhyme_based_respellings) + subsections[k]
                            notes.append(
                                "add Italian rhyme-based respelling%s %s%s" %
                                ("s" if len(rhyme_based_respellings) > 1 else
                                 "", ",".join(rhyme_based_respellings),
                                 manual_assist))
                        else:
                            different_headers = []
                            for pos in [
                                    "Noun", "Verb", "Adjective", "Adverb",
                                    "Participle"
                            ]:
                                if "==%s==" % pos in secbody:
                                    different_headers.append(pos)
                            if len(different_headers) > 1:
                                all_warnings[0:0] = [
                                    "WARNING: Multiple headers %s seen" %
                                    ",".join(different_headers)
                                ]
                            if "Etymology 1" in secbody:
                                all_warnings[0:0] = [
                                    "WARNING: Multiple etymologies seen"
                                ]

                            pagemsg(
                                "<respelling> all: %s <end>%s: <from> %s <to> %s <end>"
                                % (" ".join(rhyme_based_respellings), " " +
                                   " ||| ".join(all_warnings) if all_warnings
                                   else "", unicode(rhymes_template),
                                   unicode(rhymes_template)))
                    else:
                        for respelling in rhyme_based_respellings:
                            if (not re.search("^qual[0-9]*=", respelling)
                                    and pronun_based_respellings and respelling
                                    not in pronun_based_respellings):
                                pagemsg(
                                    "WARNING: Rhyme-based respelling%s %s doesn't match it-pr respelling(s) %s%s"
                                    %
                                    (" (with problems)" if
                                     len(all_warnings) > 0 else "", respelling,
                                     ",".join(pronun_based_respellings),
                                     ": %s" % " ||| ".join(all_warnings)
                                     if len(all_warnings) > 0 else ""))

    if need_ref_section:
        for k in xrange(len(subsections) - 1, 2, -2):
            if re.search(r"^===\s*References\s*===$",
                         subsections[k - 1].strip()):
                if not re.search(r"<references\s*/?\s*>", subsections[k]):
                    subsections[k] = subsections[k].rstrip(
                        "\n") + "\n<references />\n\n"
                    notes.append(
                        "add <references /> to existing ===References=== section for pronunciation refs"
                    )
                break
        else:  # no break
            for k in xrange(len(subsections) - 1, 2, -2):
                if not re.search(r"==\s*(Anagrams|Further reading)\s*==",
                                 subsections[k - 1]):
                    subsections[k + 1:k + 1] = [
                        "===References===\n", "<references />\n\n"
                    ]
                    notes.append(
                        "add new ===References=== section for pronunciation refs"
                    )
                    break
            else:  # no break
                pagemsg(
                    "WARNING: Something wrong, couldn't find location to insert ===References=== section"
                )

    secbody = "".join(subsections)
    # Strip extra newlines added to secbody
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
Esempio n. 11
0
def process_text_on_page(index, pagetitle, text):
  """Add a default {{pl-p}} pronunciation template to the Polish section.

  Skips the page (returns None) when a pronunciation template already
  exists, when the Etymology-section layout can't be handled, or when the
  lemma of this non-lemma form needs a respelling (unless
  --ignore-lemma-respelling is given).  On success returns
  (new_page_text, notes).
  """
  global args
  def pagemsg(txt):
    # Prefix every log message with the page index and title.
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "Polish", pagemsg,
    force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  # Alternating [text, ==Header==, text, ==Header==, ...]; odd indices are
  # the header lines, even indices the section bodies.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  has_etym_sections = "==Etymology 1==" in secbody
  if has_etym_sections:
    # Check if either Pronunciation with pronunciation template above Etymology 1, or every
    # Etymology N section has Pronunciation with pronunciation template.
    saw_etym_1 = False
    cur_etym_header = None
    saw_pron_in_etym = False
    for k in range(1, len(subsections), 2):
      if "==Pronunciation==" in subsections[k]:
        secparsed = blib.parse_text(subsections[k + 1])
        for t in secparsed.filter_templates():
          tn = tname(t)
          if tn in pronun_templates:
            if saw_etym_1:
              saw_pron_in_etym = True
              break
            else:
              # A page-wide pronunciation already exists; nothing to do.
              pagemsg("Already saw pronunciation template above ==Etymology 1==: %s" % unicode(t))
              return
        else: # no break
          pagemsg("WARNING: Saw ==Pronunciation== section without pronunciation template, along with ==Etymology 1==; can't handle, skipping")
          return

      if "==Etymology 1==" in subsections[k]:
        saw_etym_1 = True
        cur_etym_header = subsections[k].strip()
      elif re.search("==Etymology [0-9]+==", subsections[k]):
        if not saw_pron_in_etym:
          pagemsg("WARNING: No ==Pronunciation== section above ==Etymology N== headers and saw %s without pronunciation template; can't handle, skipping"
              % cur_etym_header)
          return
        saw_pron_in_etym = False
        cur_etym_header = subsections[k].strip()
    if not saw_pron_in_etym:
      # Last Etymology N section didn't have pronunciation template.
      pagemsg("WARNING: No ==Pronunciation== section above ==Etymology N== headers and saw %s without pronunciation template; can't handle, skipping"
          % cur_etym_header)
      return

  parsed = blib.parse_text(secbody)

  # Bail out if any pronunciation template exists anywhere in the section.
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in pronun_templates:
      pagemsg("Already saw pronunciation template: %s" % unicode(t))
      return

  if not args.ignore_lemma_respelling:
    # This page should be a non-lemma form; verify its single lemma carries
    # a plain {{pl-p}} (no respelling) before adding a default {{pl-p}} here.
    lemmas = set()
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn in infl_templates:
        def getp(param):
          return getparam(t, param)
        if getp("1") != "pl":
          pagemsg("WARNING: Wrong language in {{%s}}, skipping: %s" % (tn, unicode(t)))
          return
        lemma = getparam(t, "2")
        lemmas.add(lemma)
    if len(lemmas) > 1:
      pagemsg("WARNING: Saw inflection of multiple lemmas %s, skipping" % ",".join(lemmas))
      return
    if not lemmas:
      pagemsg("WARNING: Didn't see inflection template, skipping")
      return
    lemma = list(lemmas)[0]
    pl_p_prop, pl_p_respellings = get_pl_p_property(index, lemma)
    if pl_p_prop == "no-pl-p":
      pagemsg("WARNING: Lemma page %s has no {{pl-p}}, not sure what to do, skipping" % lemma)
      return
    elif pl_p_prop == "pl-p-respelling":
      pagemsg("WARNING: Lemma page %s has respelling(s) %s, skipping" % (
        lemma, ",".join(pl_p_respellings)))
      return
    else:
      pagemsg("Lemma page %s has {{pl-p}} without respelling, proceeding" % lemma)

  def construct_new_pron_template():
    # Returns (template_text, prefix_to_place_before_it).
    return "{{pl-p}}", ""

  def insert_into_existing_pron_section(k):
    """Insert {{pl-p}} into the existing Pronunciation body subsections[k].

    Removes superseded rhymes/hyphenation/pl-IPA lines and folds a single
    {{audio}} line into the new template.  Returns a falsy value (after
    logging a warning) to tell the caller to skip the whole page.
    """
    parsed = blib.parse_text(subsections[k])
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn in pronun_templates:
        pagemsg("Already saw pronunciation template: %s" % unicode(t))
        break
    else: # no break
      new_pron_template, pron_prefix = construct_new_pron_template()
      # Remove existing rhymes/hyphenation/pl-IPA lines
      for template in ["rhyme|pl", "rhymes|pl", "pl-IPA", "hyph|pl", "hyphenation|pl"]:
        re_template = template.replace("|", r"\|")
        regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
        m = re.search(regex, subsections[k], re.M)
        if m:
          pagemsg("Removed existing %s" % m.group(1).strip())
          notes.append("remove existing {{%s}}" % template)
          subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
      for template in ["audio|pl"]:
        re_template = template.replace("|", r"\|")
        regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
        all_audios = re.findall(regex, subsections[k], re.M)
        if len(all_audios) > 1:
          # FIX: all_audios is a list; the original called it like a
          # function (all_audios()), raising TypeError on this path.
          pagemsg("WARNING: Saw multiple {{audio}} templates, skipping: %s" % ",".join(x.strip() for x in all_audios))
          return
        if len(all_audios) == 1:
          # FIX: bind audio_line; it was referenced below but never defined,
          # causing NameError on the warning paths.
          audio_line = all_audios[0].strip()
          audiot = list(blib.parse_text(audio_line).filter_templates())[0]
          assert tname(audiot) == "audio"
          if getparam(audiot, "1") != "pl":
            pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % audio_line)
            return
          audiofile = getparam(audiot, "2")
          audiogloss = getparam(audiot, "3")
          for param in audiot.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pn not in ["1", "2", "3"]:
              pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (
                pn, pv, audio_line))
              return
          if audiogloss in ["Audio", "audio"]:
            # A generic caption carries no information; drop it.
            audiogloss = ""
          params = "|a=%s" % audiofile
          if audiogloss:
            params += "|ac=%s" % audiogloss
          # Splice |a=/|ac= params just before the closing braces of {{pl-p}}.
          new_pron_template = new_pron_template[:-2] + params + new_pron_template[-2:]
          pagemsg("Removed existing %s in order to incorporate into {{pl-p}}" % audio_line)
          notes.append("incorporate existing {{%s}} into {{pl-p}}" % template)
          subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
      subsections[k] = pron_prefix + new_pron_template + "\n" + subsections[k]
      notes.append("insert %s into existing Pronunciation section" % new_pron_template)
    return True

  def insert_new_l3_pron_section(k):
    # Insert a brand-new ===Pronunciation=== section before subsections[k].
    new_pron_template, pron_prefix = construct_new_pron_template()
    subsections[k:k] = ["===Pronunciation===\n", pron_prefix + new_pron_template + "\n\n"]
    notes.append("add top-level Polish pron %s" % new_pron_template)

  for k in xrange(2, len(subsections), 2):
    if "==Pronunciation==" in subsections[k - 1]:
      if not insert_into_existing_pron_section(k):
        return
      break
  else: # no break
    # No Pronunciation section anywhere: place a new one after any leading
    # Alternative forms / Etymology sections.
    k = 2
    while k < len(subsections) and re.search("==(Alternative forms|Etymology)==", subsections[k - 1]):
      k += 2
    if k - 1 >= len(subsections):
      pagemsg("WARNING: No lemma or non-lemma section at top level")
      return
    insert_new_l3_pron_section(k - 1)

  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
Esempio n. 12
0
def process_text_on_page(pageindex, pagetitle, text):
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (pageindex, pagetitle, txt))

    notes = []

    retval = blib.find_modifiable_lang_section(
        text,
        None if args.partial_page else args.langname,
        pagemsg,
        force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    defn_subsection = None
    saw_two_defn_subsections = False
    for k in xrange(2, len(subsections), 2):
        if re.search("=Etymology", subsections[k - 1]):
            defn_subsection = None
            saw_two_defn_subsections = False
        if "\n#" in subsections[k] and not re.search(
                "=(Etymology|Pronunciation|Usage notes)", subsections[k - 1]):
            if defn_subsection:
                saw_two_defn_subsections = True
            defn_subsection = k
            defn_subsection_level = get_subsection_level(subsections[k - 1])
            saw_nyms_already = set()
        m = re.search("=(Synonyms|Antonyms)=", subsections[k - 1])
        if m:
            syntype = m.group(1).lower()[:-1]
            if defn_subsection is None:
                pagemsg(
                    "WARNING: Encountered %ss section #%s without preceding definition section"
                    % (syntype, k // 2 + 1))
                continue
            synant_subsection_level = get_subsection_level(subsections[k - 1])
            if saw_two_defn_subsections and synant_subsection_level <= defn_subsection_level:
                pagemsg(
                    "WARNING: Saw two definition sections followed by %s section #%s at same level or higher, skipping section"
                    % (syntype, k // 2 + 1))
                continue
            if syntype in saw_nyms_already:
                pagemsg(
                    "WARNING: Encountered two %s sections without intervening definition section"
                    % syntype)
                continue

            def parse_syns(syns):
                retval = []
                syns = syns.strip()
                orig_syns = syns
                qualifier = None
                while True:
                    # check for qualifiers specified using a qualifier template
                    m = re.search(
                        "^(.*?)\{\{(?:qualifier|qual|q|i)\|([^{}|=]*)\}\}(.*?)$",
                        syns)
                    if m:
                        before_text, qualifier, after_text = m.groups()
                        syns = before_text + after_text
                        break
                    # check for qualifiers using e.g. {{lb|ru|...}}
                    m = re.search(
                        "^(.*?)\{\{(?:lb)\|%s\|([^{}=]*)\}\}(.*?)$" %
                        re.escape(args.lang), syns)
                    if m:
                        before_text, qualifier, after_text = m.groups()
                        # do this before handling often/sometimes/etc. in case the label has often|_|pejorative or similar
                        qualifier = qualifier.replace("|_|", " ")
                        terms_no_following_comma = [
                            "also", "and", "or", "by", "with", "except",
                            "outside", "in", "chiefly", "mainly", "mostly",
                            "primarily", "especially", "particularly",
                            "excluding", "extremely", "frequently",
                            "humorously", "including", "many", "markedly",
                            "mildly", "now", "occasionally", "of", "often",
                            "sometimes", "originally", "possibly", "rarely",
                            "slightly", "somewhat", "strongly", "then",
                            "typically", "usually", "very"
                        ]
                        qualifier = re.sub(
                            r"\b(%s)\|" % "|".join(terms_no_following_comma),
                            r"\1 ", qualifier)
                        qualifier = qualifier.replace("|", ", ")
                        syns = before_text + after_text
                        break
                    # check for qualifier-like ''(...)''
                    m = re.search("^(.*?)''\(([^'{}]*)\)''(.*?)$", syns)
                    if m:
                        before_text, qualifier, after_text = m.groups()
                        syns = before_text + after_text
                        break
                    # check for qualifier-like (''...'')
                    m = re.search("^(.*?)\(''([^'{}]*)''\)(.*?)$", syns)
                    if m:
                        before_text, qualifier, after_text = m.groups()
                        syns = before_text + after_text
                        break
                    break

                # Split on commas, semicolons, slashes but don't split commas etc. inside of braces or brackets
                split_by_brackets_braces = re.split(
                    r"(\{\{[^{}]*\}\}|\[\[[^\[\]]*\]\])", syns.strip())
                comma_separated_runs = blib.split_alternating_runs(
                    split_by_brackets_braces, "(?: *[,;] *| +/ +)")
                syns = [
                    "".join(comma_separated_run)
                    for comma_separated_run in comma_separated_runs
                ]

                if qualifier and len(syns) > 1:
                    pagemsg(
                        "WARNING: Saw qualifier along with multiple synonyms, not sure how to proceed: <%s>"
                        % orig_syns)
                    return None
                joiner_after = ";" if qualifier or len(syns) > 1 else ","
                for synindex, syn in enumerate(syns):
                    orig_syn = syn
                    m = re.search(
                        r"^\{\{[lm]\|%s\|([^{}]*)\}\}$" % re.escape(args.lang),
                        syn)
                    if m:
                        decl = blib.parse_text(syn).filter_templates()[0]
                        gender = None
                        translit = None
                        raw_syn = None
                        alt = None
                        gloss = None
                        lit = None
                        pos = None
                        for param in decl.params:
                            pn = pname(param)
                            pv = unicode(param.value)
                            if pn in ["1"]:
                                pass
                            elif pn == "2":
                                raw_syn = pv
                            elif pn == "3":
                                alt = pv
                            elif pn in ["4", "t", "gloss"]:
                                gloss = pv
                            elif pn == "g":
                                gender = pv
                            elif pn in ["g2", "g3", "g4"]:
                                if not gender:
                                    pagemsg(
                                        "WARNING: Saw %s=%s without g= in %s <%s> in line: %s"
                                        % (pn, pv, syntype, orig_syn, line))
                                    return None
                                gender += "," + pv
                            elif pn == "tr":
                                translit = pv
                            elif pn == "lit":
                                lit = pv
                            elif pn == "pos":
                                pos = pv
                            else:
                                pagemsg(
                                    "WARNING: Unrecognized param %s=%s in %s <%s> in line: %s"
                                    % (pn, pv, syntype, orig_syn, line))
                                return None
                        if not raw_syn:
                            pagemsg(
                                "WARNING: Couldn't find raw synonym in %s <%s> in line: %s"
                                % (syntype, orig_syn, line))
                            return None
                        if raw_syn and alt:
                            if "[[" in raw_syn or "[[" in alt:
                                pagemsg(
                                    "WARNING: Saw both synonym=%s and alt=%s with brackets in one or both in %s <%s> in line: %s"
                                    % (raw_syn, alt, syntype, orig_syn, line))
                                return None
                            syn = "[[%s|%s]]" % (raw_syn, alt)
                        elif raw_syn:
                            if "[[" in raw_syn:
                                syn = raw_syn
                            else:
                                syn = "[[%s]]" % raw_syn
                        elif alt:
                            pagemsg(
                                "WARNING: Saw alt=%s but no link text in %s <%s> in line: %s"
                                % (alt, syntype, orig_syn, line))
                            return
                    else:

                        def add_brackets_if_not_already(m):
                            raw_syn = m.group(1)
                            if "[[" not in raw_syn:
                                raw_syn = "[[%s]]" % raw_syn
                            return raw_syn

                        syn = re.sub(
                            r"\{\{[lm]\|%s\|([^{}=]*)\}\}" %
                            re.escape(args.lang), add_brackets_if_not_already,
                            syn)
                        gender = None
                        translit = None
                        gloss = None
                        lit = None
                        pos = None
                    if "{{" in syn or "}}" in syn:
                        pagemsg(
                            "WARNING: Unmatched braces in %s <%s> in line: %s"
                            % (syntype, orig_syn, line))
                        return None
                    if "''" in syn:
                        pagemsg(
                            "WARNING: Italicized text in %s <%s> in line: %s" %
                            (syntype, orig_syn, line))
                        return None
                    if "(" in syn or ")" in syn:
                        pagemsg(
                            "WARNING: Unmatched parens in %s <%s> in line: %s"
                            % (syntype, orig_syn, line))
                        return None
                    if ":" in syn:
                        pagemsg(
                            "WARNING: Unmatched colon in %s <%s> in line: %s" %
                            (syntype, orig_syn, line))
                        return None
                    # Strip brackets around entire synonym
                    syn = re.sub(r"^\[\[([^\[\]|{}]*)\]\]$", r"\1", syn)
                    # If there are brackets around some words but not all, put brackets around the remaining words
                    if "[[" in syn:
                        split_by_brackets = re.split(
                            r"([^ ]*\[\[[^\[\]]*\]\][^ ]*)", syn)

                        def maybe_add_brackets(m):
                            text = m.group(1)
                            if "[" in text or "]" in text:
                                pagemsg(
                                    "WARNING: Saw nested brackets in %s in %s <%s> in line: %s"
                                    % (text, syntype, orig_syn, line))
                                return text
                            if not re.search(r"\w", text, re.U):
                                pagemsg(
                                    "Not adding brackets around '%s', saw no letters in %s <%s> in line: %s"
                                    % (text, syntype, orig_syn, line))
                                return text
                            return "[[%s]]" % text

                        # Put brackets around the remainin words not already bracketed or partially bracketed. But don't put
                        # brackets around words inside of HTML comments, and don't include punctuation inside the brackets.
                        for i in xrange(0, len(split_by_brackets), 2):
                            split_out_comments = re.split(
                                "(<!--.*?-->)", split_by_brackets[i])
                            for j in xrange(0, len(split_out_comments), 2):
                                split_out_comments[j] = re.sub(
                                    "([^ ,*/{}:;()?!+<>]+)",
                                    maybe_add_brackets, split_out_comments[j])
                            split_by_brackets[i] = "".join(split_out_comments)

                        new_syn = "".join(split_by_brackets)
                        if new_syn != syn:
                            pagemsg("Add brackets to '%s', producing '%s'" %
                                    (syn, new_syn))
                            syn = new_syn
                    other_params = [
                        ("tr", translit),
                        ("t", gloss),
                        ("q", qualifier),
                        ("g", gender),
                        ("pos", pos),
                        ("lit", lit),
                    ]
                    # Set the joiner_after to None for everything but the last synonym on the row; we will then change
                    # all commas to semicolons if there is any semicolon, so we are consistently using commas or
                    # semicolons to separate groups of synonyms.
                    retval.append(
                        (syn, other_params,
                         joiner_after if synindex == len(syns) - 1 else None))
                return retval

            def find_defns():
                m = re.search(r"\A(.*?)((?:^#[^\n]*\n)+)(.*?)\Z",
                              subsections[defn_subsection], re.M | re.S)
                if not m:
                    pagemsg(
                        "WARNING: Couldn't find definitions in definition subsection #%s"
                        % (defn_subsection // 2 + 1))
                    return None, None, None
                before_defn_text, defn_text, after_defn_text = m.groups()
                if re.search("^#", before_defn_text, re.M) or re.search(
                        "^#", after_defn_text, re.M):
                    pagemsg(
                        "WARNING: Saw definitions in before or after text in definition subsection #%s, not sure what to do"
                        % (defn_subsection // 2 + 1))
                    return None, None, None
                if re.search("^##", defn_text, re.M):
                    pagemsg(
                        "WARNING: Found ## definition in definition subsection #%s, not sure what to do"
                        % (defn_subsection // 2 + 1))
                    return None, None, None
                defns = re.split("^(#[^*:].*\n(?:#[*:].*\n)*)", defn_text, 0,
                                 re.M)
                for between_index in xrange(0, len(defns), 2):
                    if defns[between_index]:
                        pagemsg(
                            "WARNING: Saw unknown text <%s> between definitions, not sure what to do"
                            % defns[between_index].strip())
                        return None, None, None
                defns = [x for i, x in enumerate(defns) if i % 2 == 1]
                return before_defn_text, defns, after_defn_text

            def add_syns_to_defn(syns, defn, add_fixme):
                for syn, other_params, joiner_after in syns:
                    if not syn and joiner_after is not None:
                        pagemsg(
                            "WARNING: Would remove last synonym from a group: %s"
                            % ",".join(
                                syn
                                for syn, other_params, joiner_after in syns))
                        return None
                syns = [(syn, other_params, joiner_after)
                        for syn, other_params, joiner_after in syns if syn]
                if len(syns) == 0:
                    return defn
                any_semicolon = any(joiner_after == ";"
                                    for sy, other_params, joiner_after in syns)
                if any_semicolon:
                    syns = [(syn, other_params, ";" if joiner_after is not None
                             and any_semicolon else joiner_after)
                            for syn, other_params, joiner_after in syns]
                saw_nyms_already.add(syntype)
                joined_syns = "|".join(
                    "%s%s%s" %
                    (syn, "".join("<%s:%s>" % (param, val) if val else ""
                                  for param, val in other_params),
                     "|" + joiner_after if i < len(syns) - 1 and joiner_after
                     is not None and joiner_after != "," else "")
                    for i, (syn, other_params,
                            joiner_after) in enumerate(syns))
                fixme_msg = " FIXME" if add_fixme else ""
                if syntype == "synonym":
                    if re.search(r"\{\{(syn|synonyms)\|", defn):
                        pagemsg(
                            "WARNING: Already saw inline synonyms in definition: <%s>"
                            % defn)
                        return None
                    return re.sub(
                        r"^(.*\n)", r"\1#: {{syn|%s|%s}}%s" %
                        (args.lang, joined_syns, fixme_msg) + "\n", defn)
                else:
                    if re.search(r"\{\{(ant|antonyms)\|", defn):
                        pagemsg(
                            "WARNING: Already saw inline antonyms in definition: <%s>"
                            % defn)
                        return None
                    # Need to put antonyms after any inline synonyms
                    return re.sub(
                        r"^(.*\n(?:#: *\{\{(?:syn|synonyms)\|.*\n)*)",
                        r"\1#: {{ant|%s|%s}}%s" %
                        (args.lang, joined_syns, fixme_msg) + "\n", defn)

            # Find definitions
            before_defn_text, defns, after_defn_text = find_defns()
            if before_defn_text is None:
                continue

            def put_back_new_defns(defns, syndesc, skipped_a_line, lines,
                                   skipped_linenos):
                """Splice the modified definitions back into the definition
                subsection and shrink or clear the Synonyms/Antonyms
                subsection whose lines were inlined.

                defns: modified definition chunks, rejoined in order.
                syndesc: human-readable description used in the change note.
                skipped_a_line: True if some synonym lines couldn't be parsed
                    and must be kept in the synonyms subsection.
                lines: original lines of the synonyms subsection.
                skipped_linenos: indices into `lines` of the lines to keep.

                Mutates the closed-over `subsections` and `notes`; also reads
                the closed-over defn_subsection, before_defn_text,
                after_defn_text, k, syntype and args.
                """
                subsections[defn_subsection] = before_defn_text + "".join(
                    defns) + after_defn_text
                if skipped_a_line:
                    # Keep only the unparsed lines, in original order.
                    skipped_linenos = sorted(skipped_linenos)
                    skipped_lines = [
                        lines[lineno] for lineno in skipped_linenos
                    ]
                    # NOTE(review): the join drops the subsection's trailing
                    # newline unless the last kept line is empty -- presumably
                    # the original split leaves a trailing blank line; confirm.
                    subsections[k] = "\n".join(skipped_lines)
                else:
                    # All synonym lines were consumed: blank both the header
                    # (index k - 1) and the body (index k) of the subsection.
                    subsections[k - 1] = ""
                    subsections[k] = ""
                notes.append(
                    "Convert %ss in %s subsection %s to inline %ss in subsection %s based on %s"
                    % (syntype, args.langname, k // 2 + 1, syntype,
                       defn_subsection // 2 + 1, syndesc))

            # Pull out all synonyms by number
            unparsable = False
            syns_by_number = defaultdict(list)
            skipped_lines = []
            skipped_a_line = False
            lines = subsections[k].split("\n")
            for lineno, line in enumerate(lines):
                if not line.strip():
                    skipped_lines.append(lineno)
                    continue
                # Look for '* (1) {{l|...}}'
                m = re.search(r"^\* *\(([0-9]+)\) *(.*?)$", line)
                if m:
                    defnum, syns = m.groups()
                else:
                    # Look for '* {{l|...}} (1)'
                    m = re.search(r"^\* *(.*?) *\(([0-9]+)\)$", line)
                    if m:
                        syns, defnum = m.groups()
                    else:
                        # Look for '* {{sense|1}} {{l|...}}'
                        m = re.search(
                            r"^\* *\{\{(?:s|sense)\|([0-9]+)\}\} *(.*?)$",
                            line)
                        if m:
                            defnum, syns = m.groups()
                        else:
                            # couldn't parse line
                            pagemsg("Couldn't parse %s line for numbers: %s" %
                                    (syntype, line))
                            unparsable = True
                            break

                parsed_syns = parse_syns(syns)
                if parsed_syns is None:
                    skipped_a_line = True
                    skipped_lines.append(lineno)
                else:
                    syns_by_number[int(defnum)] += parsed_syns

            if not unparsable:
                # Find definitions
                before_defn_text, defns, after_defn_text = find_defns()
                if before_defn_text is None:
                    continue

                # Don't consider definitions with {{reflexive of|...}} in them
                reindexed_defns = {}
                next_index = 1
                for index, defn in enumerate(defns):
                    if "{{reflexive of|" in defn:
                        continue
                    reindexed_defns[next_index] = index
                    next_index += 1

                # Make sure synonyms don't refer to nonexistent definition
                max_syn = max(syns_by_number.keys())
                max_defn = max(reindexed_defns.keys())
                if max_syn > max_defn:
                    pagemsg(
                        "WARNING: Numbered synonyms refer to maximum %s > maximum defn %s"
                        % (max_syn, max_defn))
                    continue

                # Add inline synonyms
                must_continue = False
                for synno, syns in syns_by_number.iteritems():
                    index = reindexed_defns[synno]
                    new_defn = add_syns_to_defn(syns, defns[index], False)
                    if new_defn is None:
                        must_continue = True
                        break
                    defns[index] = new_defn
                if must_continue:
                    continue

                # Put back new definition text and clear out synonyms
                put_back_new_defns(defns, "numbered %ss" % syntype,
                                   skipped_a_line, lines, skipped_lines)
                continue

            # Try checking for {{sense|...}} or (''...'') indicators
            unparsable = False
            syns_by_tag = {}
            skipped_lines = []
            skipped_a_line = False
            must_continue = False
            lines = subsections[k].split("\n")
            for lineno, line in enumerate(lines):
                if not line.strip():
                    skipped_lines.append(lineno)
                    continue
                m = re.search(r"^\* *\(''([^']*?)''\) *(.*?)$", line)
                if m:
                    tag, syns = m.groups()
                else:
                    m = re.search(r"^\* *''\(([^']*?)\)'' *(.*?)$", line)
                    if m:
                        tag, syns = m.groups()
                    else:
                        m = re.search(
                            r"^\* *\{\{(?:s|sense)\|([^{}|]*?)\}\} *(.*?)$",
                            line)
                        if m:
                            tag, syns = m.groups()
                        else:
                            # couldn't parse line
                            pagemsg("Couldn't parse %s line for tags: %s" %
                                    (syntype, line))
                            unparsable = True
                            break
                tag = re.sub(r",? +etc\.?$", "", tag)
                parsed_syns = parse_syns(syns)
                if parsed_syns is None:
                    skipped_a_line = True
                    skipped_lines.append(lineno)
                else:
                    if tag in syns_by_number:
                        pagemsg("WARNING: Saw the same tag '%s' twice" % tag)
                        must_continue = True
                        break
                    syns_by_tag[tag] = (parsed_syns, lineno)
            if must_continue:
                continue

            if not unparsable:
                # Pull out each definition (not including continuations) and remove links
                unlinked_defns = []
                must_continue = False
                for defn in defns:
                    m = re.search("^# *(.*)\n", defn)
                    if not m:
                        pagemsg(
                            "WARNING: Something wrong, can't pull out definition from <%s>"
                            % defn)
                        must_continue = True
                        break
                    unlinked_defns.append(blib.remove_links(m.group(1)))
                if must_continue:
                    continue

                # Match tags against definitions
                tag_to_defn = {}
                defn_to_tag = {}
                must_continue = False
                bad = False
                for tag in syns_by_tag.keys():
                    matching_defn = None
                    must_break = False
                    for defno, unlinked_defn in enumerate(unlinked_defns):
                        tag_re = r"\b" + re.sub(r"[ ,.*/{}:;()?!\[\]+]+",
                                                r"\\b.*\\b", tag) + r"\b"
                        if re.search(tag_re, unlinked_defn):
                            if matching_defn is not None:
                                pagemsg(
                                    "WARNING: Matched tag '%s' against both defn <%s> and <%s>"
                                    % (tag, unlinked_defns[matching_defn],
                                       unlinked_defn))
                                if args.do_your_best:
                                    bad = True
                                else:
                                    must_break = True
                                    must_continue = True
                                    break
                            else:
                                matching_defn = defno
                    if must_break:
                        break
                    if not bad and matching_defn is None:
                        pagemsg(
                            "WARNING: Couldn't match tag '%s' against definitions %s"
                            % (tag, ", ".join(
                                "<%s>" % unlinked_defn
                                for unlinked_defn in unlinked_defns)))
                        if args.do_your_best:
                            bad = True
                        else:
                            must_continue = True
                            break
                    if not bad and matching_defn in defn_to_tag:
                        pagemsg(
                            "WARNING: Matched two tags '%s' and '%s' against the same defn <%s>"
                            % (tag, defn_to_tag[matching_defn],
                               unlinked_defns[matching_defn]))
                        if args.do_your_best:
                            bad = True
                        else:
                            must_continue = True
                            break
                    if not bad:
                        defn_to_tag[matching_defn] = tag
                        tag_to_defn[tag] = matching_defn
                if must_continue:
                    continue

                # Add inline synonyms
                must_continue = False
                for tag, (syns, lineno) in syns_by_tag.iteritems():
                    if tag in tag_to_defn:
                        index = tag_to_defn[tag]
                        new_defn = add_syns_to_defn(syns, defns[index], bad)
                        if new_defn is None:
                            must_continue = True
                            break
                        defns[index] = new_defn
                    else:
                        skipped_a_line = True
                        skipped_lines.append(lineno)
                if must_continue:
                    continue

                # Put back new definition text and clear out synonyms
                put_back_new_defns(defns, "tagged %ss" % syntype,
                                   skipped_a_line, lines, skipped_lines)
                continue

            # Add synonyms if only one definition or --do-your-best
            if len(defns) > 1:
                pagemsg(
                    "WARNING: Saw %s subsection %s with %s definitions and don't know where to add, %s"
                    % (syntype, k // 2 + 1, len(defns),
                       "adding to first definition"
                       if args.do_your_best else "can't add"))
            if len(defns) == 1 or args.do_your_best:
                unparsable = False
                all_syns = []
                syns_by_tag = {}
                skipped_lines = []
                skipped_a_line = False
                lines = subsections[k].split("\n")
                total_syns = 0
                for lineno, line in enumerate(lines):
                    if not line.strip():
                        skipped_lines.append(lineno)
                        continue
                    m = re.search(r"^\* *(.*?)$", line)
                    if m:
                        syns = m.group(1)
                    else:
                        # couldn't parse line
                        pagemsg(
                            "WARNING: Couldn't parse %s line in last stage: %s"
                            % (syntype, line))
                        unparsable = True
                        break
                    parsed_syns = parse_syns(syns)
                    if parsed_syns is None:
                        skipped_a_line = True
                        skipped_lines.append(lineno)
                    else:
                        all_syns.append((lineno, total_syns, parsed_syns))
                    total_syns += 1

                if not unparsable:
                    changed = False
                    if total_syns > 1 and len(defns) == total_syns:
                        # only happens when --do-your-best
                        pagemsg(
                            "Saw %s definitions and %s synonym lines, matching definitions and synonym lines"
                            % (len(defns), total_syns))
                        for lineno, synno, parsed_syns in all_syns:
                            # Add inline synonyms
                            new_defn = add_syns_to_defn(
                                parsed_syns, defns[synno], True)
                            if new_defn is None:
                                pagemsg(
                                    "WARNING: Couldn't add %s line when matching definitions and synonym lines: %s"
                                    % (syntype, lines[lineno]))
                                skipped_a_line = True
                                skipped_lines.append(lineno)
                                continue
                            defns[synno] = new_defn
                            changed = True
                    else:
                        if len(defns) > 1:
                            # only happens when --do-your-best
                            pagemsg(
                                "WARNING: Saw %s definitions but %s synonym lines, adding to first definition"
                                % (len(defns), total_syns))
                            # If more than one synonym line, add a qualifier specifying the original synonym line number
                            # to the first synonym on the line to make it easier to manually line up synonyms with definitions.
                            if total_syns > 1:
                                all_syns = [(lineno, synno, [
                                    (syn, other_params +
                                     [("qq", "l%s" % (synno + 1))] if synindex
                                     == 0 else other_params, joiner_after)
                                    for synindex,
                                    (syn, other_params,
                                     joiner_after) in enumerate(parsed_syns)
                                ]) for lineno, synno, parsed_syns in all_syns]
                        # Add inline synonyms
                        all_syns = [
                            syn for lineno, synno, parsed_syns in all_syns
                            for syn in parsed_syns
                        ]  # flatten
                        new_defn = add_syns_to_defn(all_syns, defns[0],
                                                    len(defns) > 1)
                        if new_defn is None:
                            continue
                        defns[0] = new_defn
                        changed = True

                    # Put back new definition text and clear out moved synonyms
                    if changed:
                        put_back_new_defns(
                            defns, "%ss with only one definition" % syntype,
                            skipped_a_line, lines, skipped_lines)
                    continue

    secbody = "".join(subsections)
    # Strip extra newlines added to secbody
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
# Esempio n. 13
def process_text_on_page(index, pagetitle, text):
    """Determine the reading type(s) for a 'Category:<lang> terms spelled
    with X read as Y' category page.

    First looks for reading Y in the {{ja-readings}}/{{ryu-readings}}
    template on the page for spelling X; failing that, scans the pages in
    the category for {{ja-kanjitab}}/{{ryu-kanjitab}} templates whose
    per-character readings identify the reading type.  On success returns
    (new category contents '{{auto cat|...}}', notes); otherwise returns
    None after logging warnings.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    m = re.search(
        "^Category:(Japanese|Okinawan) terms spelled with (.*) read as (.*)$",
        pagetitle)
    if not m:
        # Not a spelled-with/read-as category page; nothing to do.
        pagemsg("Skipped")
        return

    notes = []

    lang, spelling, reading = m.groups()
    langcode = "ja" if lang == "Japanese" else "ryu"
    spelling_page = pywikibot.Page(site, spelling)

    # Maps the yomi= codes used in {{*-kanjitab}} to canonical reading-type
    # names.  Defined once here instead of being rebuilt on every iteration
    # of the per-character loop below.
    yomi_to_canonical_reading_type = {
        "o": "on",
        "on": "on",
        "kanon": "kanon",
        "goon": "goon",
        "soon": "soon",
        "toon": "toon",
        "kan": "kanyoon",
        "kanyo": "kanyoon",
        "kanyoon": "kanyoon",
        "k": "kun",
        "kun": "kun",
        "juku": "jukujikun",
        "jukuji": "jukujikun",
        "jukujikun": "jukujikun",
        "n": "nanori",
        "nanori": "nanori",
        "ok": "jubakoyomi",
        "j": "jubakoyomi",
        "ko": "yutoyomi",
        "y": "yutoyomi",
        "irr": "irregular",
        "irreg": "irregular",
        "irregular": "irregular",
    }

    def pagemsg_with_spelling(txt):
        pagemsg("%s: %s" % (spelling, txt))

    def errandpagemsg_with_spelling(txt):
        pagemsg_with_spelling(txt)
        errmsg("Page %s %s: %s: %s" % (index, pagetitle, spelling, txt))

    if not blib.safe_page_exists(spelling_page, pagemsg_with_spelling):
        pagemsg_with_spelling("Spelling page doesn't exist, skipping")
        return
    spelling_page_text = blib.safe_page_text(spelling_page,
                                             pagemsg_with_spelling)
    retval = blib.find_modifiable_lang_section(spelling_page_text, lang,
                                               pagemsg_with_spelling)
    if retval is None:
        pagemsg_with_spelling("WARNING: Couldn't find %s section" % lang)
        return
    sections, j, secbody, sectail, has_non_lang = retval

    # First attempt: find the reading in the readings template on the
    # spelling page itself.
    parsed = blib.parse_text(secbody)
    saw_readings_template = False
    reading_types = []
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "%s-readings" % langcode:
            saw_readings_template = True
            for reading_type in allowed_reading_types:
                readings = getparam(t, reading_type).strip()
                if readings:
                    readings = re.split(r"\s*,\s*", readings)
                    # Truncate each listed reading at the first '<' or '-'
                    # (inline modifiers / okurigana separators).
                    readings = [re.sub("[<-].*", "", r) for r in readings]
                    if reading in readings:
                        reading_type = canonicalize_reading_types.get(
                            reading_type, reading_type)
                        pagemsg_with_spelling(
                            "Appending reading type %s based on %s" %
                            (reading_type, unicode(t)))
                        if reading_type not in reading_types:
                            reading_types.append(reading_type)
                            notes.append(
                                "add %s reading based on {{%s-readings}} on page [[%s]]"
                                % (reading_type, langcode, spelling))
            if not reading_types:
                pagemsg_with_spelling(
                    "WARNING: Can't find reading %s among readings listed in %s"
                    % (reading, unicode(t).replace("\n", r"\n")))

    if not saw_readings_template:
        pagemsg_with_spelling(
            "WARNING: Couldn't find reading template {{%s-readings}}" %
            langcode)

    if reading_types:
        contents = "{{auto cat|%s}}" % "|".join(reading_types)
        return contents, notes
    else:
        pagemsg_with_spelling("WARNING: Can't find reading %s on page" %
                              reading)

    # Second attempt: scan the category's member pages for kanjitab
    # templates that spell out the reading of each kanji.
    for i, contents_page in blib.cat_articles(
            re.sub("^Category:", "", pagetitle)):
        contents_title = unicode(contents_page.title())

        def pagemsg_with_contents(txt):
            pagemsg("%s: %s" % (contents_title, txt))

        def errandpagemsg_with_contents(txt):
            pagemsg_with_contents(txt)
            errmsg("Page %s %s: %s: %s" %
                   (index, pagetitle, contents_title, txt))

        contents_page_text = blib.safe_page_text(contents_page,
                                                 pagemsg_with_contents)
        retval = blib.find_modifiable_lang_section(contents_page_text, lang,
                                                   pagemsg_with_contents)
        if retval is None:
            pagemsg_with_contents("WARNING: Couldn't find %s section" % lang)
            # BUGFIX: was 'return', which aborted the whole category scan
            # (and discarded reading types gathered from earlier member
            # pages) whenever a single page lacked the language section.
            continue
        sections, j, secbody, sectail, has_non_lang = retval

        saw_kanjitab = False
        must_continue = False
        for ch in contents_title:
            if 0xD800 <= ord(ch) <= 0xDFFF:
                pagemsg_with_contents(
                    "WARNING: Surrogates in page name, skipping: %s" % ord(ch))
                must_continue = True
                break
        if must_continue:
            continue
        chars_in_contents_title = list(contents_title)
        for charno, ch in enumerate(chars_in_contents_title):
            if ch == u"々":  # kanji repeat char
                if charno == 0:
                    pagemsg_with_contents(
                        u"Repeat char 々 found at beginning of contents title")
                    must_continue = True
                    break
                else:
                    # Substitute the kanji that the repeat mark repeats.
                    chars_in_contents_title[charno] = chars_in_contents_title[
                        charno - 1]
        if must_continue:
            continue
        kanji_in_contents_title = [
            x for x in chars_in_contents_title
            if unicodedata.name(x).startswith("CJK UNIFIED IDEOGRAPH")
        ]
        parsed = blib.parse_text(secbody)
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "%s-kanjitab" % langcode:
                saw_kanjitab = True
                # Positional params 1..9 give the reading of each kanji.
                readings = []
                for parno in range(1, 10):
                    contents_reading = getparam(t, str(parno))
                    if contents_reading:
                        readings.append(contents_reading)
                if len(kanji_in_contents_title) != len(readings):
                    pagemsg_with_contents(
                        "WARNING: Saw %s chars in contents title but %s readings %s, skipping: %s"
                        % (len(kanji_in_contents_title), len(readings),
                           ",".join(readings), unicode(t)))
                    continue
                yomi = getparam(t, "yomi")
                if not yomi:
                    pagemsg_with_contents("WARNING: No yomi, skipping: %s" %
                                          unicode(t))
                    continue
                if "," in yomi or re.search("[0-9]$", yomi):
                    yomi = yomi.split(",")
                if type(yomi) is list:
                    # Expand run-length shorthand like 'on2' -> ['on', 'on'].
                    expanded_yomi = []
                    for y in yomi:
                        m = re.search("^(.*?)([0-9]+)$", y)
                        if m:
                            baseyomi, numyomi = m.groups()
                            numyomi = int(numyomi)
                            expanded_yomi.extend([baseyomi] * numyomi)
                        else:
                            expanded_yomi.append(y)
                    if expanded_yomi != yomi:
                        pagemsg_with_contents(
                            "Expanding yomi %s to %s" %
                            (",".join(yomi), ",".join(expanded_yomi)))
                    yomi = expanded_yomi
                if type(yomi) is list and len(yomi) != len(
                        kanji_in_contents_title):
                    pagemsg_with_contents(
                        "WARNING: %s values in yomi=%s but %s chars in contents, skipping: %s"
                        % (len(yomi), ",".join(yomi),
                           len(kanji_in_contents_title), unicode(t)))
                    continue
                saw_spelling_in_contents = False
                must_continue = False
                for charno, (ch, contents_reading) in enumerate(
                        zip(kanji_in_contents_title, readings)):
                    if ch == spelling:
                        saw_spelling_in_contents = True
                        if contents_reading == reading:
                            # A scalar yomi applies to every character.
                            if type(yomi) is list:
                                reading_type = yomi[charno]
                            else:
                                reading_type = yomi
                            if reading_type not in yomi_to_canonical_reading_type:
                                pagemsg_with_contents(
                                    "WARNING: Unrecognized reading type %s: %s"
                                    % (reading_type, unicode(t)))
                                must_continue = True
                                break
                            reading_type = yomi_to_canonical_reading_type[
                                reading_type]
                            if reading_type not in allowed_reading_types:
                                pagemsg_with_contents(
                                    "WARNING: Disallowed reading type %s: %s" %
                                    (reading_type, unicode(t)))
                                must_continue = True
                                break
                            reading_type = canonicalize_reading_types.get(
                                reading_type, reading_type)
                            pagemsg_with_contents(
                                "Appending reading type %s based on %s" %
                                (reading_type, unicode(t)))
                            if reading_type not in reading_types:
                                reading_types.append(reading_type)
                                notes.append(
                                    "add %s reading based on {{%s-kanjitab}} on page [[%s]]"
                                    % (reading_type, langcode, contents_title))
                if must_continue:
                    continue
                if not saw_spelling_in_contents:
                    pagemsg_with_contents(
                        "WARNING: Didn't see spelling in contents: %s" %
                        unicode(t))
                    continue
        if not saw_kanjitab:
            pagemsg_with_contents("WARNING: Didn't see {{%s-kanjitab}}" %
                                  langcode)

    if reading_types:
        contents = "{{auto cat|%s}}" % "|".join(reading_types)
        return contents, notes
    else:
        pagemsg_with_spelling(
            "WARNING: Can't find reading %s by looking through category contents"
            % reading)
# Esempio n. 14
def process_text_on_page(index, pagetitle, text):
    """Templatize raw 'Compound of ...' definitions in the Italian section
    of a page into {{it-compound of|...}}, returning (new text, notes) on
    success and None if no modifiable Italian section is found."""
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    retval = blib.find_modifiable_lang_section(
        text,
        None if args.partial_page else "Italian",
        pagemsg,
        force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    def extract_pronouns(form1, form2):
        """Return the list of clitic pronouns from the one or two parsed
        pronoun forms; a form beginning with 'glie' is split into 'glie'
        plus the remainder."""
        prons = []
        if form1:
            prons.append(form1)
        if form2.startswith("glie"):
            prons.extend(["glie", form2[4:]])
        else:
            prons.append(form2)
        return prons

    def extract_base(pron1, pron2):
        """Strip the trailing pronoun text from the page title and return
        the base form, or None (with a warning) if the title doesn't end in
        that text."""
        if pron1:
            prontext = pron1 + pron2
        else:
            prontext = pron2
        m = re.search(r"^(.*)%s$" % prontext, pagetitle)
        if not m:
            pagemsg("WARNING: Page title should end in '%s' but doesn't" %
                    prontext)
            return None
        return m.group(1)

    def fix_compound_of(m):
        """re.sub callback: rewrite one '# Compound of ...' definition line
        as '# {{it-compound of|...}}', or return it unchanged if the
        expression isn't recognized."""
        origtext = m.group(0)
        m = re.search(r"^# Compound of (.*?)\.*\n$", origtext)
        if not m:
            pagemsg("WARNING: Internal error: Can't match line: %s" % origtext)
            return origtext
        text = m.group(1)

        def do_fix_compound_of(text):
            """Return the parameter string (possibly empty) to put inside
            {{it-compound of...}}, or None if the compound-of expression
            can't be parsed."""
            # Convert {{m|it|ci}} to [[ci]]
            text = re.sub(r"\{\{m\|it\|([^{}]*?)\}\}", r"[[\1]]", text)
            # Convert [[ci#Italian|ci]] to [[ci]]
            text = re.sub(r"\[\[[^\[\]|]*?#Italian\|([^\[\]|]*?)\]\]",
                          r"[[\1]]", text)
            # Case 1: "gerund of [[inf]](, [[pron1]])? and [[pron2]]".
            m = re.search(
                r"^(?:the )?gerund of '*\[\[([^\[\]|]*?)\]\]'*(?:, '*\[\[([^\[\]|]*?)\]\]'*)? and '*\[\[([^\[\]|]*?)\]\]'*$",
                text)
            if m:
                inf, pron1, pron2 = m.groups()
                prons = extract_pronouns(pron1, pron2)
                base = extract_base(pron1, pron2)
                if not base:
                    return None
                notes.append(
                    "templatize Italian gerund compound-of expression")
                # inf= is only needed when the base isn't itself the -ando
                # gerund; the pronoun list is omitted when there's just one.
                if len(prons) == 1 and base.endswith("ando"):
                    return ""
                elif len(prons) == 1:
                    return "|inf=%s" % inf
                elif base.endswith("ando"):
                    return "|%s" % "|".join(prons)
                else:
                    return "|%s|inf=%s" % ("|".join(prons), inf)
            # Case 2: "imperative (person) of [[inf]] ... and [[pron2]]".
            m = re.search(
                r"^imperative(?: \(\[*(tu|noi|voi?|singular|plural|let's|)\]*(?: (?:form|person))?\))? of '*\[\[([^\[\]|]*?)\]\]'*(?:, '*\[\[([^\[\]|]*?)\]\]'*)? and '*\[\[([^\[\]|]*?)\]\]'*$",
                text)
            if m:
                imp_pers, inf, pron1, pron2 = m.groups()
                if not imp_pers:
                    # No explicit person given: infer it from the base form
                    # (a base in -te is the voi form, otherwise assume tu).
                    base = extract_base(pron1, pron2)
                    if not base:
                        return None
                    if base.endswith("te"):
                        imp_pers = "voi"
                    else:
                        imp_pers = "tu"
                prons = extract_pronouns(pron1, pron2)
                imp_pers_to_pos = {
                    "tu": "imp2s",
                    "noi": "imp1p",
                    "voi": "imp2p",
                    "vo": "imp2p",
                    "singular": "imp2s",
                    "plural": "imp2p",
                    "let's": "imp1p"
                }
                pos = imp_pers_to_pos[imp_pers]
                notes.append(
                    "templatize Italian imperative compound-of expression")
                if len(prons) == 1:
                    return "|pos=%s|inf=%s" % (pos, inf)
                else:
                    return "|%s|pos=%s|inf=%s" % ("|".join(prons), pos, inf)
            # Case 3: bare "[[form]](, [[pron1]])? and [[pron2]]" where the
            # form is an -ando gerund or an infinitive (possibly with a
            # clitic already attached, e.g. ending in -rmi/-rti/...).
            m = re.search(
                r"^'*\[\[([^\[\]|]*?)\]\]'*(?:, '*\[\[([^\[\]|]*?)\]\]'*)? and '*\[\[([^\[\]|]*?)\]\]'*$",
                text)
            if m:
                inf, pron1, pron2 = m.groups()
                prons = extract_pronouns(pron1, pron2)
                if inf.endswith("ando"):
                    notes.append(
                        "templatize Italian gerund compound-of expression")
                    if len(prons) == 1:
                        return ""
                    else:
                        return "|%s" % "|".join(prons)
                if not inf.endswith("re") and not re.search("r[mtscv]i$", inf):
                    pagemsg("WARNING: Unrecognized infinitive %s: %s" %
                            (inf, origtext.strip()))
                    return None
                notes.append(
                    "templatize Italian infinitive compound-of expression")
                if len(prons) == 1 and inf.endswith("re"):
                    return ""
                inf_pron_to_pos = {
                    "mi": "inf1s",
                    "ti": "inf2s",
                    "ci": "inf1p",
                    "vi": "inf2p"
                }
                if re.search("[mtcv]i$", inf):
                    # Infinitive already fused with a clitic pronoun.
                    pos = inf_pron_to_pos[inf[-2:]]
                    return "|%s|%s|pos=%s" % (inf, "|".join(prons), pos)
                elif len(prons) == 1 and pagetitle.endswith(prons[0]):
                    return "|pos=inf|inf=%s" % inf
                elif inf.endswith("re"):
                    return "|%s" % "|".join(prons)
                else:
                    return "|%s|pos=inf|inf=%s" % ("|".join(prons), inf)
            # Case 4: "(form) past participle of [[inf]] ... and [[pron2]]".
            m = re.search(
                r"^(feminine|plural|masculine plural|feminine plural|) *past participle of '*\[\[([^\[\]|]*?)\]\]'*(?:, '*\[\[([^\[\]|]*?)\]\]'*)? and '*\[\[([^\[\]|]*?)\]\]'*$",
                text)
            if m:
                ppform, inf, pron1, pron2 = m.groups()
                prons = extract_pronouns(pron1, pron2)
                ppform_to_pos = {
                    "": "ppms",
                    "feminine": "ppfs",
                    "plural": "ppmp",
                    "masculine plural": "ppmp",
                    "feminine plural": "ppfp"
                }
                pos = ppform_to_pos[ppform]
                notes.append(
                    "templatize Italian past participle compound-of expression"
                )
                if len(prons) == 1:
                    return "|pos=%s|inf=%s" % (pos, inf)
                else:
                    return "|%s|pos=%s|inf=%s" % ("|".join(prons), pos, inf)
            pagemsg("WARNING: Unrecognized raw compound-of expression: %s" %
                    origtext.strip())
            return None

        retval = do_fix_compound_of(text)
        if retval is None:
            return origtext
        return "# {{it-compound of%s}}\n" % retval

    # Normalize the various spellings of '# Compound' so the single regex
    # below catches all of them.
    hacked_secbody = re.sub(r"# \[\[[Cc]ompound\|[Cc]ompound\]\]",
                            "# Compound", secbody)
    hacked_secbody = re.sub(r"# compound", "# Compound", hacked_secbody)
    hacked_secbody = re.sub(
        r"# \{\{(?:non-gloss definition|n-g)\|[Cc]ompound (.*)\}\}",
        r"# Compound \1", hacked_secbody)
    fixed_secbody = re.sub(r"# (Compound of.*?\.*)\n", fix_compound_of,
                           hacked_secbody)
    if "{{it-compound of" in fixed_secbody:
        # Only keep the normalizations if at least one line was templatized.
        newsecbody = re.sub(r"\{\{head\|it\|combined forms?\}\}",
                            "{{head|it|verb form}}", fixed_secbody)
        if newsecbody != fixed_secbody:
            notes.append(
                "replace {{head|it|combined form}} with {{head|it|verb form}}")
            fixed_secbody = newsecbody
        secbody = fixed_secbody

    # Strip extra newlines added to secbody
    sections[j] = secbody.rstrip("\n") + sectail
    text = "".join(sections)
    return text, notes
# Esempio n. 15
def process_text_on_page(index, pagetitle, text):
  """Ensure a Polish lemma page has a level-3 ===Further reading=== section
  containing {{R:pl:WSJP}} followed by {{R:pl:PWN}}.

  Normalizes bullet formatting, skips pages with no "good" definition lines
  (surnames, given names, and pure inflection entries), demotes a deeper
  Further reading header to level 3 where safe, reorders/completes the two
  reference templates, or appends a brand-new Further reading section (before
  any trailing Anagrams sections).

  Returns (newtext, notes) on success, or None to skip the page.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "Polish", pagemsg,
    force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  # Add missing space between * and { in case of {{R:pl:WSJP}} or {{R:pl:PWN}} directly after * without space
  newsecbody = re.sub("^\*\{", "* {", secbody, 0, re.M)
  if newsecbody != secbody:
    notes.append("add missing space after bullet *")
    secbody = newsecbody

  # Remove trailing spaces to avoid issues with spaces after {{R:pl:WSJP}} or {{R:pl:PWN}}
  newsecbody = re.sub(" *\n", "\n", secbody)
  if newsecbody != secbody:
    notes.append("remove extraneous trailing spaces")
    secbody = newsecbody

  # See if there are definition lines that do not contain {{surname}}, {{given name}}, {{verbal noun of}},
  # {{inflection of}} and {{infl of}}.
  lines = secbody.split("\n")
  saw_good_defn_line = False
  bad_templates = ["surname", "given name", "verbal noun of", "inflection of", "infl of"]
  for line in lines:
    if line.startswith("#") and not re.search(r"\{\{(%s)\|pl[|}]" % "|".join(bad_templates), line):
      saw_good_defn_line = True
  if not saw_good_defn_line:
    # Page only carries name/inflection definitions (or none we recognize);
    # not a lemma page worth adding references to, so bail out.
    saw_bad_templates = []
    for bad_template in bad_templates:
      if re.search(r"\{\{%s\|pl[|}]" % bad_template, secbody):
        saw_bad_templates.append(bad_template)
    if saw_bad_templates:
      pagemsg("Skipping page because saw no good definition lines, and saw %s" % (
        " and ".join("{{%s|pl}}" % bad_template for bad_template in saw_bad_templates)))
    else:
      pagemsg("WARNING: Skipping page because saw no good definition lines; didn't see any of %s" % (
        ", ".join("{{%s|pl}}" % bad_template for bad_template in bad_templates)))
    return

  # Result alternates [pre-text, header, body, header, body, ...]: odd
  # indices are the captured == headers, even indices >= 2 their bodies.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  # Check for templates in sections outside of 'Further reading'
  for k in xrange(2, len(subsections), 2):
    if not re.search("^==+Further reading==+\n", subsections[k - 1]):
      if "{{R:pl:WSJP}}" in subsections[k] or "{{R:pl:PWN}}" in subsections[k]:
        if re.search("^==+References==+\n", subsections[k - 1]):
          pagemsg("WARNING: Saw {{R:pl:WSJP}} or {{R:pl:PWN}} in %s section, can't handle" % subsections[k - 1].strip())
          return
        else:
          pagemsg("WARNING: Saw {{R:pl:WSJP}} or {{R:pl:PWN}} in %s section, need to review manually" % subsections[k - 1].strip())

  # Check for References or Further reading already present
  for k in xrange(2, len(subsections), 2):
    if re.search("^==+Further reading==+\n", subsections[k - 1]):
      newsubsecval = "===Further reading===\n"
      if subsections[k - 1] != newsubsecval:
        # Header exists but at the wrong level; only safe to demote it to
        # level 3 if everything after it is an Anagrams section.
        for l in xrange(k + 2, len(subsections), 2):
          if not re.search("^===Anagrams===\n", subsections[l - 1]):
            pagemsg("WARNING: Saw level > 3 Further reading and a following non-Anagrams section %s, can't handle"
                % subsections[l - 1].strip())
            return
        notes.append("replaced %s with level-3 %s" % (subsections[k - 1].strip(), newsubsecval.strip()))
        subsections[k - 1] = newsubsecval
      # If PWN precedes WSJP, swap them so WSJP comes first.
      newsubsec = re.sub(r"^(\* \{\{R:pl:PWN\}\}\n)(.*)(\* \{\{R:pl:WSJP\}\}\n)", r"\3\1\2", subsections[k],
          0, re.M | re.S)
      if newsubsec != subsections[k]:
        notes.append("standardize order of ===Further reading=== with {{R:pl:WSJP}} followed by {{R:pl:PWN}} followed by anything else")
        subsections[k] = newsubsec
      else:
        has_wsjp = "{{R:pl:WSJP}}" in subsections[k]
        has_pwn = "{{R:pl:PWN}}" in subsections[k]
        if has_wsjp and not has_pwn:
          newsubseck = subsections[k].replace("* {{R:pl:WSJP}}\n", "* {{R:pl:WSJP}}\n* {{R:pl:PWN}}\n")
          if newsubseck == subsections[k]:
            pagemsg("WARNING: Unable to add {{R:pl:PWN}} after {{R:pl:WSJP}}")
          else:
            subsections[k] = newsubseck
            notes.append("add {{R:pl:PWN}} to Polish lemma in ===Further reading===")
        elif has_pwn and not has_wsjp:
          newsubseck = subsections[k].replace("* {{R:pl:PWN}}\n", "* {{R:pl:WSJP}}\n* {{R:pl:PWN}}\n")
          if newsubseck == subsections[k]:
            pagemsg("WARNING: Unable to add {{R:pl:WSJP}} before {{R:pl:PWN}}")
          else:
            subsections[k] = newsubseck
            notes.append("add {{R:pl:WSJP}} to Polish lemma in ===Further reading===")
        elif has_wsjp and has_pwn:
          pagemsg("Already has {{R:pl:WSJP}} and {{R:pl:PWN}}")
        else:
          subsections[k] = "* {{R:pl:WSJP}}\n* {{R:pl:PWN}}\n" + subsections[k]
          notes.append("add {{R:pl:WSJP}} and {{R:pl:PWN}} to Polish lemma in ===Further reading===")
      break
  else: # no break
    # No Further reading section at all: back up past any trailing Anagrams
    # sections and insert a fresh one before them.
    k = len(subsections) - 1
    while k >= 2 and re.search(r"==\s*Anagrams\s*==", subsections[k - 1]):
      k -= 2
    if k < 2:
      pagemsg("WARNING: No lemma or non-lemma section")
      return
    subsections[k + 1:k + 1] = ["===Further reading===\n* {{R:pl:WSJP}}\n* {{R:pl:PWN}}\n\n"]
    notes.append("add new ===Further reading=== section to Polish lemma with {{R:pl:WSJP}} and {{R:pl:PWN}}")

  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
Esempio n. 16
0
def process_text_on_page(index, pagetitle, text):
    """Normalize Hungarian possession tags in {{inflection of|hu}} templates.

    Rewrites the literal tags "(single possession)" -> "spos" and
    "(multiple possessions)" / "(multiple possession)" -> "mpos" in numbered
    parameters 4..29, and once an "mpos"+"poss" tag pair has been seen,
    upgrades n=sg to n=isg in subsequent {{hu-infl-nom}} templates.

    Returns (newtext, notes) on success, or None to skip the page.
    """

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    notes = []

    retval = blib.find_modifiable_lang_section(text, "Hungarian", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find Hungarian section")
        return
    sections, j, secbody, sectail, has_non_lang = retval

    parsed = blib.parse_text(secbody)
    saw_mpos_inflection_of = False
    for template in parsed.filter_templates():
        before = unicode(template)
        name = tname(template)
        if name == "inflection of":
            if getparam(template, "1") != "hu":
                pagemsg(
                    "WARNING: Saw non-Hungarian {{inflection of}}, skipping")
                return
            for slot in range(4, 30):
                param = str(slot)
                value = getparam(template, param)
                if value == "(single possession)":
                    template.add(param, "spos")
                    notes.append(
                        "(single possession) -> spos in {{inflection of|hu}}")
                if value in ("(multiple possessions)",
                             "(multiple possession)"):
                    template.add(param, "mpos")
                    notes.append(
                        "(multiple possessions) -> mpos in {{inflection of|hu}}"
                    )
                # Re-read the parameter: a rewrite just above may have turned
                # it into "mpos", and that must be detected here too.
                if (getparam(template, param) == "mpos"
                        and getparam(template, str(slot + 1)) == "poss"):
                    saw_mpos_inflection_of = True
        if name == "hu-infl-nom" and saw_mpos_inflection_of:
            n = getparam(template, "n")
            if n == "sg":
                template.add("n", "isg")
                notes.append(
                    "n=sg -> n=isg in {{hu-infl-nom}} in the context of {{inflection of|hu|...|mpos|poss}}"
                )
            elif n != "isg":
                # n=isg is already correct; anything else is unexpected.
                pagemsg("WARNING: Saw strange value n=%s in %s" %
                        (n, unicode(template)))
        if unicode(template) != before:
            pagemsg("Replaced %s with %s" % (before, unicode(template)))

    secbody = unicode(parsed)
    if notes and "==Etymology 1==" in secbody:
        pagemsg(
            "WARNING: Would make a change, but saw ==Etymology 1==, skipping")
        return
    sections[j] = secbody + sectail
    return "".join(sections), notes
Esempio n. 17
0
def process_text_on_page(index, pagetitle, text, lang, pos):
    """Insert {{rfinfl}} (request-for-inflection) into POS sections of type
    `pos` that have a headword but no inflection table.

    Walks the `lang` section's subsections; for each ==POS== section it
    collects the headword/inflection templates and, when a declinable lemma
    headword is present but no inflection template and no existing
    Declension/Inflection/Conjugation subsection, inserts a new subsection
    containing {{rfinfl|lang|pos}} (after any Usage notes). Also warns about
    probably misindented inflection headers.

    Returns (newtext, notes), or None to skip the page.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    cappos = pos.capitalize()
    notes = []

    pagemsg("Processing")

    retval = blib.find_modifiable_lang_section(text, lang_to_name[lang],
                                               pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find %s section" % lang_to_name[lang])
        return
    sections, j, secbody, sectail, has_non_lang = retval
    # Alternating [pre-text, header, body, ...]: odd indices are headers.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
    k = 1
    last_pos = None
    if "indeclinable %ss" % pos in secbody + sectail:
        # FIX: the original format string had two %s conversions but was
        # given a single non-tuple argument, raising TypeError whenever this
        # branch was taken.
        pagemsg("Saw 'indeclinable %ss' in text, skipping" % pos)
        return
    while k < len(subsections):
        if re.search(r"=\s*%s\s*=" % cappos, subsections[k]):
            level = get_indentation_level(subsections[k])
            last_pos = cappos
            # endk: first subsection at the same or shallower level, i.e. the
            # end of this POS section's subtree.
            endk = k + 2
            while endk < len(subsections) and get_indentation_level(
                    subsections[endk]) > level:
                endk += 2
            if endk < len(subsections) and re.search(
                    r"=\s*(Declension|Inflection|Conjugation)\s*=",
                    subsections[endk]):
                pagemsg(
                    "WARNING: Found probably misindented inflection header after ==%s== header: %s"
                    % (cappos, subsections[endk].strip()))
                k = endk + 2
                continue
            pos_text = "".join(subsections[k:endk])
            parsed = blib.parse_text(pos_text)
            saw_head = False
            saw_head_form = False
            head_is_indeclinable = False
            saw_inflection_of = False
            inflt = None
            found_rfinfl = False
            # Classify every template in the POS section.
            for t in parsed.filter_templates():
                tn = tname(t)
                if re.search(
                        "^" + pos_to_headword_template[lang][pos] + "$",
                        tn) or (tn == "head" and getparam(t, "1") == lang
                                and getparam(t, "2") in [pos, "%ss" % pos]):
                    if saw_head:
                        pagemsg(
                            "WARNING: Found two heads under one POS section: second is %s"
                            % unicode(t))
                    saw_head = True
                    if tn != "head" and lemma_is_indeclinable[lang](
                            t, pagetitle, pagemsg):
                        pagemsg("Headword template is indeclinable: %s" %
                                unicode(t))
                        head_is_indeclinable = True
                        break
                if re.search("^" + pos_to_infl_template[lang][pos] + "$", tn):
                    exclude_re = pos_to_infl_template_exclude.get(
                        lang, {}).get(pos, None)
                    if not exclude_re or not re.search("^" + exclude_re + "$",
                                                       tn):
                        if inflt:
                            pagemsg(
                                "WARNING: Found two inflection templates under one POS section: %s and %s"
                                % (unicode(inflt), unicode(t)))
                        inflt = t
                        pagemsg("Found %s inflection: %s" % (pos, unicode(t)))
                if tn in ["inflection of", "infl of"]:
                    pagemsg("Saw 'inflection of': %s" % unicode(t))
                    saw_inflection_of = True
                if pos_to_nonlemma_template[lang] and re.search(
                        "^" + pos_to_nonlemma_template[lang] + "$",
                        tn) or (tn == "head" and getparam(t, "1") == lang
                                and re.search(" forms?$", getparam(t, "2"))):
                    pagemsg("Saw non-lemma headword template: %s" % unicode(t))
                    saw_head_form = True
            if not inflt:
                # No inflection table found; decide whether {{rfinfl}} is
                # appropriate, and where to put it.
                pagemsg("Didn't find %s inflection" % pos)
                if saw_head_form:
                    pagemsg(
                        "Saw non-lemma headword template, not adding {{rfinfl}}"
                    )
                elif saw_inflection_of:
                    pagemsg(
                        "WARNING: Didn't see non-lemma headword template but saw 'inflection of'; not adding {{rfinfl}}"
                    )
                elif not saw_head:
                    pagemsg(
                        "WARNING: Didn't see lemma or non-lemma headword template; not adding {{rfinfl}}"
                    )
                elif head_is_indeclinable:
                    pagemsg(
                        "Headword template is indeclinable, not adding {{rfinfl}}"
                    )
                else:
                    # If an inflection subsection already exists, only report
                    # on its contents; otherwise insert a new one.
                    for l in xrange(k, endk, 2):
                        if re.search(
                                r"=\s*(Declension|Inflection|Conjugation)\s*=",
                                subsections[l]):
                            secparsed = blib.parse_text(subsections[l + 1])
                            for t in secparsed.filter_templates():
                                tn = tname(t)
                                if tname(t) != "rfinfl":
                                    pagemsg(
                                        "WARNING: Saw unknown template %s in existing inflection section, skipping"
                                        % (unicode(t)))
                                    break
                                else:
                                    pagemsg("Found %s" % unicode(t))
                            break
                    else:  # no break
                        insert_k = k + 2
                        while insert_k < endk and "Usage notes" in subsections[
                                insert_k]:
                            insert_k += 2
                        if not subsections[insert_k - 1].endswith("\n\n"):
                            subsections[insert_k - 1] = re.sub(
                                "\n*$", "\n\n",
                                subsections[insert_k - 1] + "\n\n")
                        subsections[insert_k:insert_k] = [
                            "%s%s%s\n" %
                            ("=" * (level + 1), "Conjugation" if pos == "verb"
                             else "Declension", "=" * (level + 1)),
                            "{{rfinfl|%s|%s}}\n\n" % (lang, pos)
                        ]
                        pagemsg(
                            "Inserted level-%s inflection section with {{rfinfl|%s|%s}}"
                            % (level + 1, lang, pos))
                        notes.append("add {{rfinfl|%s|%s}}" % (lang, pos))
                        endk += 2  # for the two subsections we inserted

            k = endk
        else:
            m = re.search(
                r"=\s*(Noun|Proper noun|Pronoun|Determiner|Verb|Adverb|Adjective|Interjection|Conjunction)\s*=",
                subsections[k])
            if m:
                last_pos = m.group(1)
            if re.search(r"=\s*(Declension|Inflection|Conjugation)\s*=",
                         subsections[k]):
                if not last_pos:
                    pagemsg(
                        "WARNING: Found inflection header before seeing any parts of speech: %s"
                        % (subsections[k].strip()))
                elif last_pos == cappos:
                    pagemsg(
                        "WARNING: Found probably misindented inflection header after ==%s== header: %s"
                        % (cappos, subsections[k].strip()))
            k += 2

    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    text = "".join(sections)
    text = re.sub("\n\n\n+", "\n\n", text)
    if not notes:
        # NOTE(review): appended even if the regex changed nothing; harmless
        # since an unchanged page won't be saved, but confirm against caller.
        notes.append("convert 3+ newlines to 2")
    return text, notes
def delete_form_1(page, index, lemma, formind, formval, lang):
    """Delete (or excise inflections from) non-lemma form subsections of
    `lemma` on `page`, in the `lang` language section.

    A subsection is deletable when it contains a form headword plus only
    inflection-of templates pointing at `lemma`; if it also references other
    lemmas, only the `lemma` inflection lines are removed. If nothing is left
    of the language section (modulo References/Anagrams), the whole section —
    or the whole page — is slated for deletion (via the module-level
    pages_to_delete list).

    Returns (newtext, notes) on change, or None to skip/leave the page.
    """
    notes = []

    def pagemsg(txt):
        msg("Page %s %s: form %s %s: %s" %
            (index, lemma, formind, formval, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: form %s %s: %s" %
                  (index, lemma, formind, formval, txt))

    text = unicode(page.text)
    origtext = text

    retval = blib.find_modifiable_lang_section(text, lang_to_langname[lang],
                                               pagemsg)
    if retval is None:
        return

    sections, j, secbody, sectail, has_non_lang = retval

    # FIXME!

    #if "==Etymology 1==" in secbody:
    #  etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
    #  for k in xrange(2, len(etym_sections), 2):
    #    etym_sections[k] = fix_up_section(etym_sections[k], warn_on_multiple_heads=True)
    #  secbody = "".join(etym_sections)

    subsections_to_delete = []
    subsections_to_remove_inflections_from = []

    # Alternating [pre-text, header, body, ...]: odd indices are headers.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        saw_head = False
        saw_infl = False
        saw_other_infl = False
        remove_deletable_tag_sets_from_subsection = False
        saw_bad_template = False
        # Pass 1: classify the templates in this subsection.
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn in lang_headword_templates[lang] or (
                    tn == "head" and getparam(t, "1") == lang
                    and getparam(t, "2") in form_poses):
                saw_head = True
            elif tn in inflection_of_templates:
                langcode = getparam(t, "1")
                if langcode != lang:
                    errandpagemsg(
                        "WARNING: In %s section, found {{%s}} for different language %s: %s"
                        % (lang_to_langname[lang], tn, langcode, unicode(t)))
                    return
                actual_lemma = getparam(t, "2")
                if actual_lemma == lemma:
                    saw_infl = True
                else:
                    pagemsg("Found {{%s}} for different lemma %s: %s" %
                            (tn, actual_lemma, unicode(t)))
                    saw_other_infl = True
            elif tn in lang_inflection_of_templates[lang]:
                # Language-specific inflection templates take the lemma in
                # param 1 (no language-code param).
                actual_lemma = getparam(t, "1")
                if actual_lemma == lemma:
                    saw_infl = True
                else:
                    pagemsg("Found {{%s}} for different lemma %s: %s" %
                            (tn, actual_lemma, unicode(t)))
                    saw_other_infl = True
        if saw_head and saw_infl:
            if saw_other_infl:
                pagemsg(
                    "Found subsection #%s to delete but has inflection template for different lemma or nondeletable tag set, will remove only deletable tag sets"
                    % (k // 2))
                remove_deletable_tag_sets_from_subsection = True
            # Pass 2: any unrecognized template makes the subsection
            # non-deletable.
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn not in lang_headword_templates[
                        lang] + lang_inflection_of_templates[
                            lang] + inflection_of_templates and not (
                                tn == "head" and getparam(t, "1") == lang
                                and getparam(t, "2") in form_poses):
                    pagemsg(
                        "WARNING: Saw unrecognized template in otherwise deletable subsection #%s: %s"
                        % (k // 2, unicode(t)))
                    saw_bad_template = True
                    break
            else:
                # No break
                if re.search("===(Noun|Verb|Adjective)===",
                             subsections[k - 1]):
                    indent_header = subsections[k - 1].strip()
                    indent = len(re.sub("^(=+).*", r"\1", indent_header))
                    has_non_deletable_subsubsection = False
                    extra_subsubsections_to_delete = []
                    l = k
                    # Examine nested subsubsections; only Synonyms/Related
                    # terms may be deleted along with their parent.
                    while l + 1 < len(subsections):
                        nextindent = len(
                            re.sub("^(=+).*", r"\1",
                                   subsections[l + 1].strip()))
                        if nextindent <= indent:
                            break
                        # Italian verb forms often have Synonyms sections for alternative forms, and random Related terms sections
                        if re.search("==(Synonyms|Related terms)==",
                                     subsections[l + 1]):
                            extra_subsubsections_to_delete.append(l + 2)
                            l += 2
                        else:
                            has_non_deletable_subsubsection = True
                            pagemsg(
                                "WARNING: Subsection #%s (header %s, indent %s) has subsubsection with header %s (indent %s), not deleting"
                                % (l // 2, indent_header, indent,
                                   subsections[l + 1].strip(), nextindent))
                            break
                    if not has_non_deletable_subsubsection:
                        if remove_deletable_tag_sets_from_subsection:
                            subsections_to_remove_inflections_from.append(k)
                        else:
                            subsections_to_delete.append(k)
                            subsections_to_delete.extend(
                                extra_subsubsections_to_delete)
                else:
                    pagemsg(
                        "WARNING: Wrong header in otherwise deletable subsection #%s: %s"
                        % (k // 2, subsections[k - 1].strip()))

    if not subsections_to_delete and not subsections_to_remove_inflections_from:
        pagemsg("Found %s section but no deletable or excisable subsections" %
                lang_to_langname[lang])
        return

    #### Now, we can delete an inflection, a subsection or the whole section or page

    for k in subsections_to_remove_inflections_from:
        newsubsec = subsections[k]
        if not newsubsec.endswith("\n"):
            # This applies to the last subsection on the page
            newsubsec += "\n"

        def remove_inflections(m):
            # Drop a '# {{...}}' definition line entirely when its template
            # points at our lemma; otherwise leave it unchanged.
            parsed = blib.parse_text(m.group(0))
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn in inflection_of_templates:
                    langcode = getparam(t, "1")
                    assert langcode == lang
                    actual_lemma = getparam(t, "2")
                    if actual_lemma == lemma:
                        return ""
                if tn in lang_inflection_of_templates[lang]:
                    actual_lemma = getparam(t, "1")
                    if actual_lemma == lemma:
                        return ""
            return unicode(parsed)

        for tn in lang_inflection_of_templates[lang] + inflection_of_templates:
            newnewsubsec = re.sub(r"^# \{\{%s\|[^{}\n]*\}\}\n" % re.escape(tn),
                                  remove_inflections, newsubsec, 0, re.M)
            if newnewsubsec != newsubsec:
                newsubsec = newnewsubsec
                notes.append(
                    "removed {{%s}} inflection(s) for bad %s form(s) of [[%s]]"
                    % (tn, lang_to_langname[lang], lemma))
                subsections[k] = newsubsec

    for k in reversed(subsections_to_delete):
        # Do in reverse order so indices don't change
        del subsections[k]
        del subsections[k - 1]

    whole_section_deletable = False
    if len(subsections) == 1:
        whole_section_deletable = True
    else:
        # Section is still deletable if only References/Anagrams remain.
        for k in xrange(3, len(subsections), 2):
            if not re.search("^==+(References|Anagrams)==+$",
                             subsections[k].strip()):
                break
        else:
            # no break
            whole_section_deletable = True
    if whole_section_deletable:
        # Whole section deletable
        if subsections[0].strip():
            pagemsg(
                "WARNING: Whole %s section deletable except that there's text above all subsections: <%s>"
                % (lang_to_langname[lang], subsections[0].strip()))
            return
        if "[[Category:" in sectail:
            pagemsg(
                "WARNING: Whole %s section deletable except that there's a category at the end: <%s>"
                % (lang_to_langname[lang], sectail.strip()))
            return
        if not has_non_lang:
            # Can delete the whole page, but check for non-blank section 0
            cleaned_sec0 = re.sub("^\{\{also\|.*?\}\}\n", "", sections[0])
            if cleaned_sec0.strip():
                pagemsg(
                    "WARNING: Whole page deletable except that there's text above all sections: <%s>"
                    % cleaned_sec0.strip())
                return
            pagetitle = unicode(page.title())
            pagemsg("Page %s should be deleted" % pagetitle)
            pages_to_delete.append(pagetitle)
            return
        del sections[j]
        del sections[j - 1]
        notes.append(
            "excised %s subsection%s for bad %s form(s) of [[%s]], leaving no %s section"
            % ((len(subsections_to_delete),
                "" if len(subsections_to_delete) == 1 else "s",
                lang_to_langname[lang], lemma, lang_to_langname[lang])))
        if j > len(sections):
            # We deleted the last section, remove the separator at the end of the
            # previous section.
            sections[-1] = re.sub(r"\n+--+\n*\Z", "", sections[-1])
        text = "".join(sections)

    else:
        # Some but not all subsections remain
        secbody = "".join(subsections)
        sections[j] = secbody + sectail
        # Build human-readable summaries of what was deleted/excised.
        if subsections_to_delete and subsections_to_remove_inflections_from:
            deletable_subsec_text = "Subsection(s) %s deletable and subsection(s) %s excisable" % (
                ",".join(str(k // 2) for k in subsections_to_delete), ",".join(
                    str(k // 2)
                    for k in subsections_to_remove_inflections_from))
            deletable_subsec_note_text = "deleted %s subsection%s and partly excised %s subsection%s" % (
                len(subsections_to_delete), "" if len(subsections_to_delete)
                == 1 else "s", len(subsections_to_remove_inflections_from), ""
                if len(subsections_to_remove_inflections_from) == 1 else "s")
        elif subsections_to_delete:
            deletable_subsec_text = "Subsection(s) %s deletable" % (",".join(
                str(k // 2) for k in subsections_to_delete))
            deletable_subsec_note_text = "deleted %s subsection%s" % (
                len(subsections_to_delete),
                "" if len(subsections_to_delete) == 1 else "s")
        else:
            deletable_subsec_text = "Subsection(s) %s excisable" % (",".join(
                str(k // 2) for k in subsections_to_remove_inflections_from))
            deletable_subsec_note_text = "partly excised %s subsection%s" % (
                len(subsections_to_remove_inflections_from), ""
                if len(subsections_to_remove_inflections_from) == 1 else "s")

        if "==Etymology" in sections[j]:
            pagemsg(
                "WARNING: %s but found Etymology subsection, don't know how to handle"
                % deletable_subsec_text)
            return
        if "==Pronunciation" in sections[j]:
            pagemsg(
                "WARNING: %s but found Pronunciation subsection, don't know how to handle"
                % deletable_subsec_text)
            return

        notes.append(
            "%s for bad %s form(s) of %s, leaving some subsections remaining" %
            (deletable_subsec_note_text, lang_to_langname[lang], lemma))
        text = "".join(sections)

    return text, notes
def add_rel_adj_or_dim_to_noun_page(nounpage, index, new_adj_or_dims, param, desc):
  """Add related adjectives or diminutives (`new_adj_or_dims`) to the `param`
  parameter chain of the Russian noun headword on `nounpage`, and remove the
  same terms from any Derived/Related terms subsections (they are now carried
  by the headword).

  `desc` is a human-readable description used in log messages and notes.
  Returns (newtext, notes) on success, or None to skip the page.
  """
  notes = []
  pagetitle = unicode(nounpage.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  text = unicode(nounpage.text)
  retval = blib.find_modifiable_lang_section(text, "Russian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Russian section for noun of %s %s" % (
      desc, ",".join(new_adj_or_dims)))
    return
  sections, j, secbody, sectail, has_non_lang = retval
  parsed = blib.parse_text(secbody)
  head = None
  # Locate the unique noun headword template; bail out on ambiguity.
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["ru-noun+", "ru-proper noun+", "ru-noun", "ru-proper noun"]:
      if head:
        pagemsg("WARNING: Saw multiple heads %s and %s for noun of %s %s, not modifying" %
            (unicode(head), unicode(t), desc, ",".join(new_adj_or_dims)))
        return
      head = t
  if not head:
    pagemsg("WARNING: Couldn't find head for noun of %s %s" % (desc, ",".join(new_adj_or_dims)))
    return
  # Two copies: orig_adjs_or_dims stays untouched for the change check below.
  orig_adjs_or_dims = blib.fetch_param_chain(head, param, param)
  adjs_or_dims = blib.fetch_param_chain(head, param, param)
  added_adjs_or_dims = []
  for adj_or_dim in new_adj_or_dims:
    if adj_or_dim in adjs_or_dims:
      pagemsg("Already saw %s %s in head %s" % (desc, adj_or_dim, unicode(head)))
    else:
      adjs_or_dims.append(adj_or_dim)
      added_adjs_or_dims.append(adj_or_dim)
  if adjs_or_dims != orig_adjs_or_dims:
    orighead = unicode(head)
    blib.set_param_chain(head, adjs_or_dims, param, param)
    pagemsg("Replaced %s with %s" % (orighead, unicode(head)))
    notes.append("add %s=%s to Russian noun" % (param, ",".join(added_adjs_or_dims)))
    secbody = unicode(parsed)
  # Remove the terms from Derived/Related terms lists, then tidy leftover
  # commas/bullets line by line.
  subsecs = re.split("(^==.*==\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsecs), 2):
    if "==Derived terms==" in subsecs[k - 1] or "==Related terms==" in subsecs[k - 1]:
      header = re.sub("=", "", subsecs[k - 1]).strip()
      for adj_or_dim in adjs_or_dims:
        def note_removed_text(m):
          # Only log when the link carried extra params/gloss (group 1).
          if m.group(1):
            pagemsg("Removed '%s' term with gloss for noun of %s %s: %s" %
                (header, desc, adj_or_dim, m.group(0)))
          return ""
        # NOTE(review): adj_or_dim is interpolated unescaped into the regex;
        # presumably always a plain word with no regex metachars — confirm.
        newsubsecsk = re.sub(r"\{\{[lm]\|ru\|%s((?:\|[^{}\n]*)?)\}\}" % adj_or_dim, note_removed_text, subsecs[k])
        if newsubsecsk != subsecs[k]:
          notes.append("remove %s %s from %s" % (desc, adj_or_dim, header))
        subsecs[k] = newsubsecsk
        subsecs[k] = re.sub(", *,", ",", subsecs[k])
        # Repeat in case adjacent terms removed (unlikely though).
        subsecs[k] = re.sub(", *,", ",", subsecs[k])
        subsecs[k] = re.sub(" *, *$", "", subsecs[k], 0, re.M)
        subsecs[k] = re.sub(r"^\* *, *", "* ", subsecs[k], 0, re.M)
        subsecs[k] = re.sub(r"^\* *(\n|$)", "", subsecs[k], 0, re.M)
      # Drop the header too if the whole list became empty.
      if re.search(r"^\s*$", subsecs[k]):
        subsecs[k] = ""
        subsecs[k - 1] = ""
  secbody = "".join(subsecs)
  secj = secbody + sectail
  newsecj = re.sub(r"\n\n\n+", "\n\n", secj)
  if newsecj != secj and not notes:
    notes.append("eliminate sequences of 3 or more newlines")
  secj = newsecj
  sections[j] = secj
  return "".join(sections), notes
Esempio n. 20
0
def process_page(page, index, pos):
    """Ensure each Old English POS section for `pos` has an inflection
    (declension/conjugation) template.

    For every ==<Pos>== section, look for an existing new-style or
    old-style inflection template; if none is found and `pages_to_infls`
    supplies one for this page, either replace the body of an existing
    Declension/Inflection/Conjugation subsection or insert a new such
    subsection after the headword (and any Usage notes).

    `page` is a pywikibot Page, `index` is the page index for log
    messages, `pos` is a lowercase part of speech ("noun", "verb", ...).
    Returns (newtext, notes) or None if no Old English section exists.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    cappos = pos.capitalize()
    notes = []

    pagemsg("Processing")

    text = unicode(page.text)
    retval = blib.find_modifiable_lang_section(text, "Old English", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find Old English section")
        return
    sections, j, secbody, sectail, has_non_lang = retval
    # Alternating chunks: even indices are bodies, odd indices are the
    # ==...== header lines themselves.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
    k = 1
    last_pos = None
    while k < len(subsections):
        if re.search(r"=\s*%s\s*=" % cappos, subsections[k]):
            level = get_indentation_level(subsections[k])
            last_pos = cappos
            # Gather the whole POS section: all following subsections at a
            # deeper indentation level belong to it.
            endk = k + 2
            while endk < len(subsections) and get_indentation_level(
                    subsections[endk]) > level:
                endk += 2
            pos_text = "".join(subsections[k:endk])
            parsed = blib.parse_text(pos_text)
            head = None
            inflt = None
            found_rfinfl = False
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn == pos_to_headword_template[pos] or (
                        tn == "head" and getparam(t, "1") == "ang"
                        and getparam(t, "2") in [pos, "%ss" % pos]):
                    newhead = getparam(t, "head").strip() or pagetitle
                    if head:
                        pagemsg(
                            "WARNING: Found two heads under one POS section: %s and %s"
                            % (head, newhead))
                    head = newhead
                if tn == pos_to_new_style_infl_template[pos] or (
                        pos_to_old_style_infl_template_prefix[pos]
                        and tn.startswith(
                            pos_to_old_style_infl_template_prefix[pos])):
                    if inflt:
                        pagemsg(
                            "WARNING: Found two inflection templates under one POS section: %s and %s"
                            % (unicode(inflt), unicode(t)))
                    inflt = t
                    pagemsg(
                        "Found %s inflection for headword %s: <from> %s <to> {{%s|%s}} <end>"
                        %
                        (pos, head or pagetitle, unicode(t),
                         pos_to_new_style_infl_template[pos], getparam(t, "1")
                         if pos == "verb" else head or pagetitle))
            if not inflt:
                pagemsg(
                    "Didn't find %s inflection for headword %s: <new> {{%s|%s%s}} <end>"
                    % (pos, head
                       or pagetitle, pos_to_new_style_infl_template[pos], head
                       or pagetitle, "" if pos == "noun" else "<>"))
                if pages_to_infls:
                    # First try to replace the body of an existing
                    # inflection-type subsection within this POS section.
                    for l in xrange(k, endk, 2):
                        if re.search(
                                r"=\s*(Declension|Inflection|Conjugation)\s*=",
                                subsections[l]):
                            secparsed = blib.parse_text(subsections[l + 1])
                            for t in secparsed.filter_templates():
                                tn = tname(t)
                                if tn != "rfinfl":
                                    # Anything other than a request-for-inflection
                                    # template means we shouldn't clobber the section.
                                    pagemsg(
                                        "WARNING: Saw unknown template %s in existing inflection section, skipping"
                                        % (unicode(t)))
                                    break
                            else:  # no break
                                if pagetitle not in pages_to_infls:
                                    pagemsg(
                                        "WARNING: Couldn't find inflection for headword %s"
                                        % (head or pagetitle))
                                else:
                                    # Preserve the section's trailing newlines.
                                    m = re.search(r"\A(.*?)(\n*)\Z",
                                                  subsections[l + 1], re.S)
                                    sectext, final_newlines = m.groups()
                                    subsections[l + 1] = pages_to_infls[
                                        pagetitle] + final_newlines
                                    pagemsg(
                                        "Replaced existing decl text <%s> with <%s>"
                                        % (sectext, pages_to_infls[pagetitle]))
                                    notes.append(
                                        "replace decl text <%s> with <%s>" %
                                        (sectext, pages_to_infls[pagetitle]))
                            break
                    else:  # no break: no existing inflection subsection found
                        if pagetitle not in pages_to_infls:
                            pagemsg(
                                "WARNING: Couldn't find inflection for headword %s"
                                % (head or pagetitle))
                        else:
                            # Insert after the headword subsection and any
                            # Usage notes subsections.
                            insert_k = k + 2
                            while insert_k < endk and "Usage notes" in subsections[
                                    insert_k]:
                                insert_k += 2
                            if not subsections[insert_k - 1].endswith("\n\n"):
                                subsections[insert_k - 1] = re.sub(
                                    "\n*$", "\n\n",
                                    subsections[insert_k - 1] + "\n\n")
                            subsections[insert_k:insert_k] = [
                                "%s%s%s\n" %
                                ("=" * (level + 1), "Conjugation"
                                 if pos == "verb" else "Declension", "=" *
                                 (level + 1)),
                                pages_to_infls[pagetitle] + "\n\n"
                            ]
                            pagemsg(
                                "Inserted level-%s inflection section with inflection <%s>"
                                % (level + 1, pages_to_infls[pagetitle]))
                            notes.append("add decl <%s>" %
                                         pages_to_infls[pagetitle])
                            endk += 2  # for the two subsections we inserted

            k = endk
        else:
            m = re.search(
                r"=\s*(Noun|Proper noun|Pronoun|Determiner|Verb|Adjective|Adverb|Interjection|Conjunction)\s*=",
                subsections[k])
            if m:
                last_pos = m.group(1)
            if re.search(r"=\s*(Declension|Inflection|Conjugation)\s*=",
                         subsections[k]):
                if not last_pos:
                    pagemsg(
                        "WARNING: Found inflection header before seeing any parts of speech: %s"
                        % (subsections[k].strip()))
                elif last_pos == cappos:
                    pagemsg(
                        "WARNING: Found probably misindented inflection header after ==%s== header: %s"
                        % (cappos, subsections[k].strip()))
            k += 2

    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    text = "".join(sections)
    # Collapse runs of 3+ newlines. Only record the changelog note when the
    # substitution actually changed something; previously the note was added
    # unconditionally whenever `notes` was empty, yielding a misleading edit
    # summary on pages with no newline runs (cf. the same check pattern used
    # elsewhere in this file).
    newtext = re.sub("\n\n\n+", "\n\n", text)
    if newtext != text and not notes:
        notes.append("convert 3+ newlines to 2")
    text = newtext
    return text, notes
Esempio n. 21
0
def process_text_on_page(index, pagetitle, text):
    """Convert the Pronunciation section of a Polish entry from {{pl-IPA}}
    to a single consolidated {{pl-p}} template, folding in audio and
    homophone lines and dropping rhyme and hyphenation lines.

    Returns (newtext, notes) where `notes` lists changelog messages, or
    None if the page is skipped (no Polish section, or the title is a
    single letter or a prefix).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        # Expand a template call server-side (helper; availability of
        # args.verbose controls logging).
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def verify_template_is_full_line(tn, line):
        # Return the first template on `line` if the stripped line consists
        # of exactly one template whose name is (one of) `tn`; otherwise log
        # a warning and return None. `tn` may be a name or a list of names.
        line = line.strip()
        templates = list(blib.parse_text(line).filter_templates())
        if type(tn) is list:
            tns = tn
        else:
            tns = [tn]
        tntext = "/".join(tns)
        if len(templates) == 0:
            pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" %
                    (tntext, line))
            return None
        t = templates[0]
        if tname(t) not in tns:
            pagemsg(
                "WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s"
                % (tntext, tntext, line))
            return None
        if unicode(t) != line:
            # Extra text around the template would be lost by the rewrite.
            pagemsg(
                "WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s"
                % (tntext, tntext, line))
            return None
        return t

    notes = []

    if len(pagetitle) == 1 or pagetitle.endswith("-"):
        pagemsg("Page title is a single letter or a prefix, skipping")
        return

    retval = blib.find_modifiable_lang_section(
        text,
        None if args.partial_page else "Polish",
        pagemsg,
        force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    # Alternating chunks: even indices are bodies, odd indices are the
    # ==...== header lines themselves.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    for k in xrange(1, len(subsections), 2):
        if re.search(r"==\s*Pronunciation\s*==", subsections[k]):
            # Normalize whitespace inside the header itself.
            secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation",
                               subsections[k])
            if secheader != subsections[k]:
                subsections[k] = secheader
                notes.append(
                    "remove extraneous spaces in ==Pronunciation== header")
            extra_notes = []
            parsed = blib.parse_text(subsections[k + 1])
            num_pl_IPA = 0
            saw_pl_p = False
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn in ["pl-p", "pl-pronunciation"]:
                    saw_pl_p = True
                    break
                if tn in ["pl-IPA", "pl-IPA-auto"]:
                    num_pl_IPA += 1
            # Only handle sections with exactly one {{pl-IPA}} and no
            # preexisting {{pl-p}}.
            if saw_pl_p:
                pagemsg("Already saw {{pl-p}}, skipping: %s" % unicode(t))
                continue
            if num_pl_IPA == 0:
                pagemsg(
                    "WARNING: Didn't see {{pl-IPA}} in Pronunciation section, skipping"
                )
                continue
            if num_pl_IPA > 1:
                pagemsg(
                    "WARNING: Saw multiple {{pl-IPA}} in Pronunciation section, skipping"
                )
                continue
            lines = subsections[k + 1].strip().split("\n")
            # Remove blank lines.
            lines = [line for line in lines if line]
            # Classify each line of the Pronunciation section; any line we
            # don't recognize aborts the conversion for this section.
            hyph_lines = []
            homophone_lines = []
            rhyme_lines = []
            audio_lines = []
            must_continue = False
            newtemp = None          # the {{pl-p}} template being built
            next_audio_param = 0
            has_respelling = False  # {{pl-IPA}} had explicit respelling args
            ipat = None             # the original {{pl-IPA}} template
            for line in lines:
                origline = line
                # In case of "* {{pl-IPA|...}}", chop off the "* ".
                line = re.sub(r"^\*\s*(\{\{pl-IPA)", r"\1", line)
                if line.startswith("{{pl-IPA"):
                    if newtemp:
                        pagemsg(
                            "WARNING: Something wrong, already saw {{pl-IPA}}?: %s"
                            % origline)
                        must_continue = True
                        break
                    ipat = verify_template_is_full_line(
                        ["pl-IPA", "pl-IPA-auto"], line)
                    if ipat is None:
                        must_continue = True
                        break
                    newtemp_str = "{{pl-p}}"
                    newtemp = list(
                        blib.parse_text(newtemp_str).filter_templates())[0]
                    # Copy numbered (respelling) and qual* params across;
                    # qualN becomes qN in {{pl-p}}.
                    for param in ipat.params:
                        pn = pname(param)
                        pv = unicode(param.value)
                        if re.search("^[0-9]+$", pn):
                            has_respelling = True
                            newtemp.add(pn, pv, preserve_spacing=False)
                        elif re.search("^qual[0-9]*$", pn):
                            newtemp.add(pn.replace("qual", "q"),
                                        pv,
                                        preserve_spacing=False)
                        else:
                            pagemsg(
                                "WARNING: Unrecognized param %s=%s in {{pl-IPA}}, skipping: %s"
                                % (pn, pv, origline))
                            must_continue = True
                            break
                    if has_respelling:
                        pagemsg("WARNING: {{pl-IPA}} has respelling: %s" %
                                unicode(ipat))
                    if must_continue:
                        break
                    continue
                if not line.startswith("* ") and not line.startswith("*{"):
                    pagemsg(
                        "WARNING: Pronunciation section line doesn't start with '* ', skipping: %s"
                        % origline)
                    must_continue = True
                    break
                # Strip the leading "* " or "*".
                if line.startswith("* "):
                    line = line[2:]
                else:
                    line = line[1:]
                if line.startswith("{{hyph"):
                    hyph_lines.append(line)
                elif line.startswith("{{homophone") or line.startswith(
                        "{{hmp"):
                    homophone_lines.append(line)
                elif line.startswith("{{audio"):
                    audio_lines.append(line)
                elif line.startswith("{{rhyme"):
                    rhyme_lines.append(line)
                else:
                    pagemsg(
                        "WARNING: Unrecognized Pronunciation section line, skipping: %s"
                        % origline)
                    must_continue = True
                    break
            if has_respelling and (rhyme_lines or hyph_lines):
                # Respelling can make automatic rhyme/hyphenation derivation
                # unreliable; warn but proceed (see comment below).
                rhyme_hyph = []
                if rhyme_lines:
                    rhyme_hyph.append("rhyme line(s) %s" %
                                      ",".join(rhyme_lines))
                if hyph_lines:
                    rhyme_hyph.append("hyphenation line(s) %s" %
                                      ",".join(hyph_lines))
                # We formerly skipped these pages, but [[User:Vininn126]] requested running the bot on them.
                pagemsg("WARNING: Has respelling %s along with %s" %
                        (ipat and unicode(ipat)
                         or "UNKNOWN", " and ".join(rhyme_hyph)))
                #continue
            if must_continue:
                continue

            # Fold {{audio}} lines into the new {{pl-p}} as a=/ac= params.
            if audio_lines:
                must_continue = False
                for audio_line in audio_lines:
                    audiot = verify_template_is_full_line("audio", audio_line)
                    if audiot is None:
                        must_continue = True
                        break
                    if getparam(audiot, "1") != "pl":
                        pagemsg(
                            "WARNING: Wrong language in {{audio}}, skipping: %s"
                            % audio_line)
                        must_continue = True
                        break
                    audiofile = getparam(audiot, "2")
                    audiogloss = getparam(audiot, "3")
                    for param in audiot.params:
                        pn = pname(param)
                        pv = unicode(param.value)
                        if pn not in ["1", "2", "3"]:
                            pagemsg(
                                "WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s"
                                % (pn, pv, audio_line))
                            must_continue = True
                            break
                    if must_continue:
                        break
                    # A bare "Audio" caption carries no information; drop it.
                    if audiogloss in ["Audio", "audio"]:
                        audiogloss = ""
                    if not newtemp:
                        pagemsg(
                            "WARNING: Saw %s without {{pl-IPA}}, skipping: %s"
                            % (unicode(audiot), audio_line))
                        must_continue = True
                        break
                    # First audio uses a=/ac=, later ones a2=/ac2=, etc.
                    next_audio_param += 1
                    if next_audio_param == 1:
                        paramsuf = ""
                    else:
                        paramsuf = str(next_audio_param)
                    newtemp.add("a%s" % paramsuf,
                                audiofile,
                                preserve_spacing=False)
                    if audiogloss:
                        newtemp.add("ac%s" % paramsuf,
                                    audiogloss,
                                    preserve_spacing=False)
                    pagemsg("Replacing %s with %s" %
                            (unicode(audiot), unicode(newtemp)))
                    extra_notes.append("incorporate %s into {{pl-p}}" %
                                       unicode(audiot))
                if must_continue:
                    continue

            # Rhyme lines are simply dropped ({{pl-p}} derives rhymes itself).
            if rhyme_lines:
                if len(rhyme_lines) > 1:
                    pagemsg("WARNING: Multiple rhyme lines, not removing: %s" %
                            ", ".join(rhyme_lines))
                    continue
                rhyme_line = rhyme_lines[0]
                rhymet = verify_template_is_full_line(["rhyme", "rhymes"],
                                                      rhyme_line)
                if not rhymet:
                    continue
                if getparam(rhymet, "1") != "pl":
                    pagemsg(
                        "WARNING: Wrong language in {{%s}}, not removing: %s" %
                        (tname(rhymet), rhyme_line))
                    continue
                pagemsg("Ignoring rhyme line: %s" % rhyme_line)
                extra_notes.append("remove rhyme template %s" %
                                   unicode(rhymet))

            # Hyphenation lines are likewise dropped.
            if hyph_lines:
                if len(hyph_lines) > 1:
                    pagemsg(
                        "WARNING: Multiple hyphenation lines, not removing: %s"
                        % ", ".join(hyph_lines))
                    continue
                hyph_line = hyph_lines[0]
                hypht = verify_template_is_full_line(["hyph", "hyphenation"],
                                                     hyph_line)
                if not hypht:
                    continue
                if getparam(hypht, "1") != "pl":
                    pagemsg(
                        "WARNING: Wrong language in {{%s}}, not removing: %s" %
                        (tname(hypht), hyph_line))
                    continue
                pagemsg("Ignoring hyphenation line: %s" % hyph_line)
                extra_notes.append("remove hyphenation template %s" %
                                   unicode(hypht))

            # Fold homophones into the new template as hh=/hhpN= params.
            if homophone_lines:
                next_homophone_param = 0
                must_continue = False
                for homophone_line in homophone_lines:
                    homophones = {}
                    homophone_qualifiers = {}
                    hmpt = verify_template_is_full_line(
                        ["hmp", "homophone", "homophones"], homophone_line)
                    if not hmpt:
                        must_continue = True
                        break
                    if getparam(hmpt, "1") != "pl":
                        pagemsg(
                            "WARNING: Wrong language in {{%s}}, not removing: %s"
                            % (tname(hmpt), homophone_line))
                        must_continue = True
                        break
                    for param in hmpt.params:
                        pn = pname(param)
                        pv = unicode(param.value)
                        if not re.search("^q?[0-9]+$", pn):
                            pagemsg(
                                "WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s"
                                % (pn, pv, tname(hmpt), homophone_line))
                            must_continue = True
                            break
                        if pn.startswith("q"):
                            homophone_qualifiers[int(pn[1:])] = pv
                        elif int(pn) > 1:
                            # Param 1 is the language; shift the rest down.
                            homophones[int(pn) - 1] = pv
                    if must_continue:
                        break
                    if not newtemp:
                        pagemsg(
                            "WARNING: Something wrong, saw %s without {{pl-IPA}}, skipping"
                            % unicode(hmpt))
                        must_continue = True
                        break
                    hhs = []
                    hhp_args = []
                    for pn, pv in sorted(homophones.items()):
                        next_homophone_param += 1
                        hmp_param = "" if next_homophone_param == 1 else str(
                            next_homophone_param)
                        hhs.append(pv)
                        if pn in homophone_qualifiers:
                            hhp_args.append(("hhp%s" % hmp_param,
                                             homophone_qualifiers[pn]))
                    if hhs:
                        newtemp.add("hh", ",".join(hhs))
                        for pn, pv in hhp_args:
                            newtemp.add(pn, pv, preserve_spacing=False)
                    pagemsg("Replacing %s with %s" %
                            (unicode(hmpt), unicode(newtemp)))
                    extra_notes.append("incorporate homophones into {{pl-p}}")
                if must_continue:
                    continue

            pagemsg("Replaced %s with %s" % (unicode(ipat), unicode(newtemp)))

            # The whole section body is replaced by the single {{pl-p}} line.
            all_lines = "\n".join([unicode(newtemp)])
            newsubsec = "%s\n\n" % all_lines
            if subsections[k + 1] != newsubsec:
                this_notes = ["convert {{pl-IPA}} to {{pl-p}}"] + extra_notes
                notes.extend(this_notes)
            subsections[k + 1] = newsubsec

    secbody = "".join(subsections)
    # Strip extra newlines added to secbody
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
Esempio n. 22
0
def process_text_on_page(index, pagetitle, text):
  """Analyze the Italian section of a page and log pronunciation
  respellings (existing {{it-IPA}}/{{it-pr}} params or defaulted ones)
  for the whole entry or per etymology section.

  This is an analysis-only pass: it emits "<respelling> ... <end>" lines
  through pagemsg() for downstream consumption and neither modifies nor
  returns the page text.
  """
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []  # NOTE(review): unused; this function never returns changed text

  retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "Italian", pagemsg,
    force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  # Alternating chunks: even indices are bodies, odd indices are headers.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  has_etym_sections = "==Etymology 1==" in secbody
  saw_pronun_section_at_top = False
  split_pronun_sections = False
  saw_pronun_section_this_etym_section = False
  saw_existing_pron = False
  saw_existing_pron_this_etym_section = False

  # etymsection tracks where we are: "top" (before Etymology 1), "all"
  # (entry has no numbered etym sections) or the etym section number.
  etymsection = "top" if has_etym_sections else "all"
  etymsections_to_first_subsection = {}
  if etymsection == "top":
    # Determine whether pronunciations live at the top of the entry or are
    # split among the individual Etymology sections, and remember the first
    # subsection index of each numbered etym section.
    after_etym_1 = False
    for k in xrange(2, len(subsections), 2):
      if "==Etymology 1==" in subsections[k - 1]:
        after_etym_1 = True
      if "==Pronunciation==" in subsections[k - 1]:
        if after_etym_1:
          split_pronun_sections = True
        else:
          saw_pronun_section_at_top = True
      m = re.search("==Etymology ([0-9]*)==", subsections[k - 1])
      if m:
        etymsections_to_first_subsection[int(m.group(1))] = k

  msgs = []

  def append_msg(txt):
    if txt not in msgs:
      msgs.append(txt)

  def apply_default_pronun_to_pagetitle():
    respellings, this_msgs = apply_default_pronun(pagetitle)
    for msg in this_msgs:
      append_msg(msg)
    return respellings

  def check_missing_pronun(etymsection):
    # Log a defaulted respelling for an etym section that lacks its own
    # pronunciation. Defined once here rather than inside the loop below
    # (as formerly): the final call after the loop would otherwise raise
    # NameError whenever the loop body never ran (entry with no
    # subsections). Behavior is unchanged — the closure reads the same
    # enclosing-function locals at call time either way.
    if split_pronun_sections and not saw_existing_pron_this_etym_section:
      pagemsg("WARNING: Missing pronunciations in etym section %s" % etymsection)
      append_msg("MISSING_PRONUN")
      append_msg("NEW_DEFAULTED")
      respellings = apply_default_pronun_to_pagetitle()
      pagemsg("<respelling> %s: %s <end> %s" % (etymsection, " ".join(respellings), " ".join(msgs)))

    #pagemsg("<respelling> %s: %s <end> %s" % ("top" if has_etym_sections else "all",
    #  " ".join(x.replace(" ", "_") for x in respellings), " ".join(msgs)))

  for k in xrange(2, len(subsections), 2):
    msgs = []
    m = re.search("==Etymology ([0-9]*)==", subsections[k - 1])
    if m:
      # Entering a new etym section; first flush any missing-pronun
      # warning for the one we're leaving.
      if etymsection != "top":
        check_missing_pronun(etymsection)
      etymsection = m.group(1)
      saw_pronun_section_this_etym_section = False
      saw_existing_pron_this_etym_section = False
    if "==Pronunciation " in subsections[k - 1]:
      pagemsg("WARNING: Saw Pronunciation N section header: %s" % subsections[k - 1].strip())
    if "==Pronunciation==" in subsections[k - 1]:
      if saw_pronun_section_this_etym_section:
        pagemsg("WARNING: Saw two Pronunciation sections under etym section %s" % etymsection)
      if saw_pronun_section_at_top and etymsection != "top":
        pagemsg("WARNING: Saw Pronunciation sections both at top and in etym section %s" % etymsection)
      saw_pronun_section_this_etym_section = True
      parsed = blib.parse_text(subsections[k])

      respellings = []
      prev_it_IPA_t = None
      prev_it_pr_t = None
      must_continue = False
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "it-IPA":
          saw_existing_pron = True
          saw_existing_pron_this_etym_section = True
          if prev_it_IPA_t:
            pronun_lines = re.findall(r"^.*\{\{it-IPA.*$", subsections[k], re.M)
            pagemsg("WARNING: Saw multiple {{it-IPA}} templates in a single Pronunciation section: %s" %
              " ||| ".join(pronun_lines))
            must_continue = True
            break
          prev_it_IPA_t = t
          this_respellings = []
          saw_pronun = False
          last_numbered_param = 0
          for param in t.params:
            pn = pname(param)
            pv = unicode(param.value).strip().replace(" ", "_")
            if re.search("^[0-9]+$", pn):
              last_numbered_param += 1
              saw_pronun = True
              if pv == "+":
                append_msg("EXISTING_DEFAULTED")
                this_respellings.extend(apply_default_pronun_to_pagetitle())
              else:
                append_msg("EXISTING")
                this_respellings.append(pv)
            elif re.search("^ref[0-9]*$", pn) and int(pn[3:] or "1") == last_numbered_param:
              # refN attached to the last respelling: translate a known
              # reference template into the compact n:Name form.
              m = re.search(r"^\{\{R:it:(DiPI|Olivetti|Treccani|Trec)(\|[^{}]*)?\}\}$", pv)
              if m:
                refname, refparams = m.groups()
                refname = "Treccani" if refname == "Trec" else refname
                this_respellings.append("n:%s%s" % (refname, refparams or ""))
              else:
                this_respellings.append("%s=%s" % (pn, pv))
            else:
              this_respellings.append("%s=%s" % (pn, pv))
          if not saw_pronun:
            append_msg("EXISTING_DEFAULTED")
            this_respellings.extend(apply_default_pronun_to_pagetitle())
          respellings.extend(this_respellings)
        if tn == "it-pr":
          saw_existing_pron = True
          saw_existing_pron_this_etym_section = True
          if prev_it_pr_t:
            pronun_lines = re.findall(r"^.*\{\{it-pr.*$", subsections[k], re.M)
            pagemsg("WARNING: Saw multiple {{it-pr}} templates in a single Pronunciation section: %s" %
              " ||| ".join(pronun_lines))
            must_continue = True
            break
          prev_it_pr_t = t
          this_respellings = []
          saw_pronun = False
          for param in t.params:
            pn = pname(param)
            pv = unicode(param.value).strip().replace(" ", "_")
            if re.search("^[0-9]+$", pn):
              saw_pronun = True
              #if pv == "+":
              #  append_msg("EXISTING_DEFAULTED")
              #  this_respellings.extend(apply_default_pronun_to_pagetitle())
              #else:
              def fix_ref(m):
                # Rewrite inline <ref:{{R:it:...}}> into compact <r:Name...>.
                refname, refparams = m.groups()
                refname = "Treccani" if refname == "Trec" else refname
                return "<r:%s%s>" % (refname, refparams or "")
              pv = re.sub(r"<ref:\{\{R:it:(DiPI|Olivetti|Treccani|Trec|DOP)(\|[^{}]*)?\}\}>", fix_ref, pv)
              append_msg("EXISTING")
              this_respellings.append(pv)
            else:
              this_respellings.append("%s=%s" % (pn, pv))
          if not saw_pronun:
            append_msg("EXISTING_DEFAULTED")
            #this_respellings.extend(apply_default_pronun_to_pagetitle())
            this_respellings.append("+")
          respellings.extend(this_respellings)
      if must_continue:
        continue

      if args.include_defns and etymsection not in ["top", "all"]:
        # Also emit the definitions of this etym section alongside the
        # respellings, for downstream matching.
        first_etym_subsec = etymsections_to_first_subsection.get(int(etymsection), None)
        next_etym_subsec = etymsections_to_first_subsection.get(1 + int(etymsection), None)
        if first_etym_subsec is None:
          pagemsg("WARNING: Internal error: Unknown first etym section for =Etymology %s=" % etymsection)
        else:
          if next_etym_subsec is None:
            next_etym_subsec = len(subsections)
          defns = blib.find_defns("".join(subsections[first_etym_subsec:next_etym_subsec]), "it")
          append_msg("defns: %s" % ";".join(defns))

      if respellings:
        pagemsg("<respelling> %s: %s <end> %s" % (etymsection, " ".join(respellings), " ".join(msgs)))

  check_missing_pronun(etymsection)
  if not saw_existing_pron:
    # No pronunciation anywhere in the entry: emit defaulted respellings,
    # either per etym section (with defns if requested) or once overall.
    if args.include_defns and has_etym_sections:
      for etymsec in sorted(list(etymsections_to_first_subsection.keys())):
        msgs = []
        first_etym_subsec = etymsections_to_first_subsection[etymsec]
        next_etym_subsec = etymsections_to_first_subsection.get(1 + etymsec, None)
        if next_etym_subsec is None:
          next_etym_subsec = len(subsections)
        append_msg("NEW_DEFAULTED")
        defns = blib.find_defns("".join(subsections[first_etym_subsec:next_etym_subsec]), "it")
        append_msg("defns: %s" % ";".join(defns))
        respellings = apply_default_pronun_to_pagetitle()
        pagemsg("<respelling> %s: %s <end> %s" % (etymsec, " ".join(respellings), " ".join(msgs)))
    else:
      msgs = []
      append_msg("NEW_DEFAULTED")
      respellings = apply_default_pronun_to_pagetitle()
      pagemsg("<respelling> %s: %s <end> %s" % ("top" if has_etym_sections else "all", " ".join(respellings), " ".join(msgs)))
def process_text_on_page(index, pagetitle, text):
    """Categorize a Hungarian entry that has multiple numbered etymology
    sections according to the combinations of lemma and non-lemma parts of
    speech seen across those sections.

    Returns (newtext, notes) when categories were added, or None to skip.
    """

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    if pagetitle in hu_pages_seen:
        pagemsg("Skipping because already seen")
        return
    hu_pages_seen.add(pagetitle)
    pagemsg("Processing")

    retval = blib.find_modifiable_lang_section(text, "Hungarian", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find Hungarian section")
        return
    sections, j, secbody, sectail, has_non_lang = retval

    # Only pages split into numbered etymology sections are of interest.
    if "==Etymology 1==" not in secbody:
        return
    etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
    # The split yields [pre, header1, body1, header2, body2, ...]; fewer than
    # five pieces means fewer than two etymology sections.
    if len(etym_sections) < 5:
        pagemsg("WARNING: Not enough etym sections, found %s, expected >= 5" %
                len(etym_sections))
        return

    def lemma_pos_of(t, tn, p2):
        # Part of speech if this template marks a lemma, else None.
        if tn in hu_lemma_template_mapping:
            return hu_lemma_template_mapping[tn]
        if tn == "head" and getparam(t, "1") == "hu":
            if p2 in hu_lemmas:
                return hu_lemma_mapping.get(p2, p2)
            if p2 and p2[-1] == "s" and p2[:-1] in hu_lemmas:
                return hu_lemma_mapping.get(p2[:-1], p2[:-1])
        return None

    def nonlemma_pos_of(t, tn, p2):
        # Part of speech if this template marks a non-lemma form, else None.
        if tn == "head" and getparam(t, "1") == "hu":
            if p2 in hu_nonlemma_forms:
                return p2
            if p2 and p2[-1] == "s" and p2[:-1] in hu_nonlemma_forms:
                return p2[:-1]
        return None

    num_lemmas = 0
    num_nonlemma_forms = 0
    poses_seen_per_section = defaultdict(set)
    for secidx in range(2, len(etym_sections), 2):
        parsed = blib.parse_text(etym_sections[secidx])
        saw_lemma = False
        saw_nonlemma_form = False
        for t in parsed.filter_templates():
            tn = tname(t)
            p2 = getparam(t, "2")
            pos = lemma_pos_of(t, tn, p2)
            if pos:
                poses_seen_per_section[secidx // 2 - 1].add(pos)
                if not saw_lemma:
                    num_lemmas += 1
                    saw_lemma = True
            pos = nonlemma_pos_of(t, tn, p2)
            if pos:
                poses_seen_per_section[secidx // 2 - 1].add(pos)
                if not saw_nonlemma_form:
                    num_nonlemma_forms += 1
                    saw_nonlemma_form = True
        if not saw_lemma and not saw_nonlemma_form:
            pagemsg("WARNING: In %s, didn't see lemma or non-lemma" %
                    etym_sections[secidx - 1].strip())
    pagemsg("Saw num_lemmas=%s, num_nonlemma_forms=%s" %
            (num_lemmas, num_nonlemma_forms))

    # Overall lemma/non-lemma combination categories.
    for applies, cat in [
        (num_lemmas and num_nonlemma_forms,
         "terms with lemma and non-lemma form etymologies"),
        (num_lemmas > 1, "terms with multiple lemma etymologies"),
        (num_nonlemma_forms > 1,
         "terms with multiple non-lemma form etymologies"),
    ]:
        if applies:
            secbody, sectail = add_category(secbody, sectail, pagemsg, notes,
                                            cat)

    # Collect every cross-section pair of parts of speech, normalizing the
    # pair order: lemmas precede non-lemma forms, and pairs of the same kind
    # are sorted alphabetically.
    pairs_seen = set()
    num_secs = (len(etym_sections) - 1) // 2
    for sec1 in range(num_secs):
        for sec2 in range(sec1 + 1, num_secs):
            for pos1 in poses_seen_per_section[sec1]:
                for pos2 in poses_seen_per_section[sec2]:
                    swap = (pos1 in hu_nonlemma_forms and pos2 in hu_lemmas
                            or (pos1 in hu_lemmas and pos2 in hu_lemmas
                                or pos1 in hu_nonlemma_forms
                                and pos2 in hu_nonlemma_forms)
                            and pos1 > pos2)
                    pairs_seen.add((pos2, pos1) if swap else (pos1, pos2))
    pagemsg("; ".join(
        "%s: %s" % (sec + 1, ",".join(poses))
        for sec, poses in sorted(poses_seen_per_section.items())))
    for pos1, pos2 in pairs_seen:
        hu_pos_pos_pairs[(pos1, pos2)] += 1
        if pos1 == pos2:
            secbody, sectail = add_category(
                secbody, sectail, pagemsg, notes,
                "terms with multiple %s etymologies" % pos1)
        else:
            secbody, sectail = add_category(
                secbody, sectail, pagemsg, notes,
                "terms with %s and %s etymologies" % (pos1, pos2))
    sections[j] = secbody + sectail
    return "".join(sections), notes
Esempio n. 24
0
def process_page(index, page, spec):
    """Insert or update an Italian pronunciation on a page from a manual spec.

    `spec` has the form "LOCATION: PRONSPEC PRONSPEC ...", where LOCATION is
    an etymology-section number, "top" or "all", and each PRONSPEC is a
    respelling or a reference/footnote indicator. With args.old_it_ipa the
    result is an {{it-IPA}} template with separate reference lines/params;
    otherwise an {{it-pr}} template with inline <ref:...> modifiers.
    Returns ("".join(sections), notes) on success, or None to skip the page.
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing pronunciation spec: %s" % spec)
    m = re.search("^([a-z0-9]*): (.*)$", spec)
    if not m:
        pagemsg("WARNING: Unrecognized pronunciation spec: %s" % spec)
        return
    location, pronspecs = m.groups()
    if (pagetitle, location) in seen_pages:
        pagemsg("WARNING: Already saw page, skipping")
        return
    seen_pages.add((pagetitle, location))
    # Underscores in a pronspec stand for spaces (spaces separate pronspecs).
    pronspecs = [
        pronspec.replace("_", " ") for pronspec in pronspecs.split(" ")
    ]
    if args.old_it_ipa:
        # Old {{it-IPA}} style: "r:NAME" adds a reference line under
        # ===References===; "n:NAME" attaches a footnote to the preceding
        # pronunciation via a ref param; anything else is a respelling.
        prons = []
        refs = []
        have_footnotes = False
        next_num_pron = 0
        last_num_pron = None
        last_footnote_param_index = None

        for pronspec in pronspecs:
            if pronspec.startswith("r:"):
                ref = pronspec[2:]
                if not re.search(r"^%s\b" % refs_re, ref):
                    pagemsg("WARNING: Unrecognized reference %s: pronspec=%s" %
                            (pronspec, spec))
                    return
                refs.append("{{R:it:%s}}" % ref)
            elif pronspec.startswith("n:"):
                ref = pronspec[2:]
                if not re.search(r"^%s\b" % refs_re, ref):
                    pagemsg("WARNING: Unrecognized reference %s: pronspec=%s" %
                            (pronspec, spec))
                    return
                if next_num_pron == 0:
                    pagemsg(
                        "WARNING: No preceding pronunciations for footnote %s: %s"
                        % (pronspec, spec))
                    return
                reftemp = "{{R:it:%s}}" % ref
                # Several footnotes on the same pronunciation are joined with
                # " !!! " into the existing ref param rather than a new one.
                if next_num_pron == last_num_pron:
                    prons[last_footnote_param_index] += " !!! " + reftemp
                else:
                    last_footnote_param_index = len(prons)
                    last_num_pron = next_num_pron
                    prons.append(
                        "ref%s=%s" %
                        ("" if next_num_pron == 1 else next_num_pron, reftemp))
                have_footnotes = True
            else:
                if re.search("^ref[0-9]*=", pronspec):
                    have_footnotes = True
                if "=" not in pronspec:
                    respellings, msgs = apply_default_pronun(pronspec)
                    if "NEED_ACCENT" in msgs:
                        pagemsg(
                            "WARNING: Missing accent for pronunciation %s" %
                            pronspec)
                        return
                    if "Z" in msgs:
                        pagemsg("WARNING: Unconverted z in pronunciation %s" %
                                pronspec)
                        return
                    next_num_pron += 1
                prons.append(pronspec)
    else:
        # New {{it-pr}} style: references are embedded inline as "<r:...>"
        # modifiers and rewritten to "<ref:{{R:it:...}}".
        prons = []
        refs = []
        have_footnotes = False
        for pronspec in pronspecs:
            pronspec_parts = re.split("(<r:[^<>]*)", pronspec)
            for i, pronspec_part in enumerate(pronspec_parts):
                if i % 2 == 1:  # a reference
                    if pronspec_part == "<r:":  # a cross-reference to another reference
                        pronspec_parts[i] = "<ref:"
                    else:
                        if not re.search(r"^<r:%s\b" % refs_re, pronspec_part):
                            pagemsg(
                                "WARNING: Unrecognized reference %s: pronspec=%s"
                                % (pronspec_part, spec))
                            return
                        ref_template_text = pronspec_part[3:]
                        # If the argument to the reference template is the page title, remove it.
                        # NOTE(review): this relies on refs_re containing exactly one
                        # capture group, so group(1) is the reference name and group(2)
                        # the argument after "|" — confirm; with a group-free refs_re,
                        # m.group(2) would raise IndexError.
                        m = re.search(r"^%s\|(.*)$" % refs_re,
                                      ref_template_text)
                        if m and m.group(2) == pagetitle:
                            ref_template_text = m.group(1)
                        pronspec_parts[
                            i] = "<ref:{{R:it:%s}}" % ref_template_text
            pronspec = "".join(pronspec_parts)
            if "<ref:" in pronspec:
                have_footnotes = True  # <r: or original <ref:
            # FIXME: Verify respellings checking for NEED_ACCENT and Z, as above.
            prons.append(pronspec)
    # LOCATION must be a section number, "top" or "all".
    if not re.search("^[0-9]+$", location) and location not in ["top", "all"]:
        pagemsg("WARNING: Unrecognized location %s: pronspec=%s" %
                (location, spec))
        return

    notes = []

    text = unicode(page.text)
    retval = blib.find_modifiable_lang_section(text,
                                               "Italian",
                                               pagemsg,
                                               force_final_nls=True)
    if retval is None:
        return
    sections, j, secbody, sectail, has_non_lang = retval

    # Split into [pre, header1, body1, header2, body2, ...]; odd indices are
    # headers, even indices (>= 2) their bodies.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    has_etym_sections = "==Etymology 1==" in secbody
    if has_etym_sections and location == "all":
        pagemsg("WARNING: With ==Etymology 1==, location cannot be 'all': %s" %
                spec)
        return
    if not has_etym_sections and location != "all":
        pagemsg(
            "WARNING: Without split etymology sections, location must be 'all': %s"
            % spec)
        return

    # Build the template text plus the list prefix ("* " for old style).
    def construct_new_pron_template():
        if args.old_it_ipa:
            return "{{it-IPA|%s}}" % "|".join(prons), "* "
        else:
            return "{{it-pr|%s}}" % "|".join(prons), ""

    # Replace the first matching pron template in subsection k, or prepend a
    # new template line. Returns False to abort (would clobber existing refs).
    def insert_into_existing_pron_section(k):
        parsed = blib.parse_text(subsections[k])
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "it-IPA" and args.old_it_ipa:
                origt = unicode(t)
                # Compute set of current reference params
                current_refs = set()
                for param in t.params:
                    pn = pname(param)
                    m = re.search("^n([0-9]*)$", pn)
                    if m:
                        current_refs.add(m.group(1) or "1")
                # Compute params to add along with set of new reference params
                params_to_add = []
                new_refs = set()
                nextparam = 0
                for param in prons:
                    if "=" in param:
                        pn, pv = param.split("=", 1)
                    else:
                        nextparam += 1
                        pn = str(nextparam)
                        pv = param
                    # NOTE(review): footnote params are appended above as
                    # "ref"/"refN" but matched here as "n"/"nN" — one of the
                    # two namings looks inconsistent; confirm against the
                    # actual {{it-IPA}} parameter names.
                    m = re.search("^n([0-9]*)$", pn)
                    if m:
                        new_refs.add(m.group(1) or "1")
                    params_to_add.append((pn, pv))

                # Make sure we're not removing references
                if len(current_refs - new_refs) > 0 and not args.override_refs:
                    pagemsg(
                        "WARNING: Saw existing refs not in new refs, not removing: existing=%s, new=%s"
                        % (origt, "{{it-IPA|%s}}" % "|".join(prons)))
                    return False

                # Now change the params
                del t.params[:]
                for pn, pv in params_to_add:
                    t.add(pn, pv)
                if origt != unicode(t):
                    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                    notes.append(
                        "replace existing %s with %s (manually assisted)" %
                        (origt, unicode(t)))
                    subsections[k] = unicode(parsed)
                break
            if tn == "it-pr" and not args.old_it_ipa:
                origt = unicode(t)
                # Now change the params
                del t.params[:]
                for pn, pv in enumerate(prons):
                    t.add(str(pn + 1), pv)
                if origt != unicode(t):
                    # Make sure we're not removing references
                    if "<ref:" in origt and not args.override_refs:
                        pagemsg(
                            "WARNING: Saw existing refs not in new refs, not removing: existing=%s, new=%s"
                            % (origt, unicode(t)))
                        return False

                    # Make sure we're not removing audio or other modifiers
                    if re.search("<(audio|hmp|rhyme|hyph|pre|post):",
                                 origt) and not args.override_refs:
                        pagemsg(
                            "WARNING: Saw existing audio/hmp/rhyme/hyph/pre/post not in new refs, not removing: existing=%s, new=%s"
                            % (origt, unicode(t)))
                        return False

                    pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                    notes.append(
                        "replace existing %s with %s (manually assisted)" %
                        (origt, unicode(t)))
                    subsections[k] = unicode(parsed)
                break
        else:  # no break
            new_pron_template, pron_prefix = construct_new_pron_template()
            if not args.old_it_ipa:
                # Remove existing rhymes/hyphenation/it-IPA lines
                for template in [
                        "rhyme|it", "rhymes|it", "it-IPA", "hyph|it",
                        "hyphenation|it"
                ]:
                    re_template = template.replace("|", r"\|")
                    regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
                    m = re.search(regex, subsections[k], re.M)
                    if m:
                        pagemsg("Removed existing %s" % m.group(1).strip())
                        notes.append("remove existing {{%s}}" % template)
                        subsections[k] = re.sub(regex, "", subsections[k], 0,
                                                re.M)
            subsections[
                k] = pron_prefix + new_pron_template + "\n" + subsections[k]
            notes.append(
                "insert %s into existing Pronunciation section (manually assisted)"
                % new_pron_template)
        return True

    # Splice a brand-new L3 ===Pronunciation=== section in before index k.
    def insert_new_l3_pron_section(k):
        new_pron_template, pron_prefix = construct_new_pron_template()
        subsections[k:k] = [
            "===Pronunciation===\n", pron_prefix + new_pron_template + "\n\n"
        ]
        notes.append("add top-level Italian pron %s (manually assisted)" %
                     new_pron_template)

    # Dispatch on location: "all" (no etym split), "top" (above Etymology 1)
    # or a specific Etymology N section.
    if location == "all":
        for k in xrange(2, len(subsections), 2):
            if "==Pronunciation==" in subsections[k - 1]:
                if not insert_into_existing_pron_section(k):
                    return
                break
        else:  # no break
            # Skip leading Alternative forms/Etymology sections, then insert.
            k = 2
            while k < len(subsections) and re.search(
                    "==(Alternative forms|Etymology)==", subsections[k - 1]):
                k += 2
            if k - 1 >= len(subsections):
                pagemsg("WARNING: No lemma or non-lemma section at top level")
                return
            insert_new_l3_pron_section(k - 1)
    elif location == "top":
        for k in xrange(2, len(subsections), 2):
            if "==Pronunciation==" in subsections[k - 1]:
                if not insert_into_existing_pron_section(k):
                    return
                break
        else:  # no break
            for k in xrange(2, len(subsections), 2):
                if "==Etymology 1==" in subsections[k - 1]:
                    insert_new_l3_pron_section(k - 1)
                    break
            else:  # no break
                pagemsg(
                    "WARNING: Something wrong, location == 'top' but can't find Etymology 1 section"
                )
                return
    else:
        # Numeric location: find ==Etymology N== and insert within it.
        begin_etym_n_section = None

        def insert_pron_section_in_etym_section():
            k = begin_etym_n_section + 2
            while k < len(subsections) and re.search("==Alternative forms==",
                                                     subsections[k - 1]):
                k += 2
            if k - 1 >= len(subsections):
                pagemsg(
                    "WARNING: No lemma or non-lemma section in Etymology N section: %s"
                    % subsections[begin_etym_n_section].strip())
                return
            new_pron_template, pron_prefix = construct_new_pron_template()
            subsections[k - 1:k - 1] = [
                "====Pronunciation====\n",
                pron_prefix + new_pron_template + "\n\n"
            ]
            notes.append(
                "add Italian pron %s to Etymology %s (manually assisted)" %
                (new_pron_template, location))

        for k in xrange(2, len(subsections), 2):
            if "==Etymology %s==" % location in subsections[k - 1]:
                begin_etym_n_section = k
            elif re.search("==Etymology [0-9]", subsections[k - 1]):
                if begin_etym_n_section:
                    # We encountered the next Etymology section and didn't see Pronunciation; insert a Pronunciation section.
                    insert_pron_section_in_etym_section()
                    break
            elif begin_etym_n_section and "==Pronunciation==" in subsections[
                    k - 1]:
                if not insert_into_existing_pron_section(k):
                    return
                break
        else:  # no break
            # We reached the end.
            if begin_etym_n_section:
                # We found the Etymology section to insert in; it was the last one and didn't see Pronunciation.
                # Insert a pronunciation section.
                insert_pron_section_in_etym_section()
            else:
                pagemsg(
                    "WARNING: Didn't find Etymology N section for location=%s: spec=%s"
                    % (location, spec))
                return

        if refs or have_footnotes:
            # Check for refs in References or Further reading embedded in Etym section
            begin_etym_n_section = None
            for k in xrange(2, len(subsections), 2):
                if "==Etymology %s==" % location in subsections[k - 1]:
                    begin_etym_n_section = k - 1
                elif re.search("==Etymology [0-9]", subsections[k - 1]):
                    # next etym section
                    break
                elif begin_etym_n_section:
                    if refs and re.search(
                            r"====\s*(References|Further reading)\s*====",
                            subsections[k - 1]):
                        # Found References or Further reading embedded in Etym section
                        pagemsg("Found %s in Etymology %s section" %
                                (subsections[k - 1].strip(), location))
                        needed_refs = []
                        for ref in refs:
                            if ref in subsections[k]:
                                pagemsg(
                                    "Already found %s in %s section %s under Etymology %s"
                                    % (ref, subsections[k - 1].strip(), k // 2,
                                       location))
                            else:
                                needed_refs.append(ref)
                        refs = needed_refs
                    if have_footnotes and re.search(
                            r"====\s*References\s*====", subsections[k - 1]):
                        # Check for <references/> in References embedded in Etym section
                        if re.search(r"<references\s*/?\s*>", subsections[k]):
                            pagemsg(
                                "Already found <references /> in ===References=== section %s under Etymology %s"
                                % (k // 2, location))
                            have_footnotes = False

    # Add any remaining "r:" reference lines to a top-level References (or
    # Further reading) section, creating one before Anagrams if necessary.
    if refs:
        # Check for references already present
        for k in xrange(2, len(subsections), 2):
            if re.search("^===(References|Further reading)===\n",
                         subsections[k - 1]):
                needed_refs = []
                for ref in refs:
                    if ref in subsections[k]:
                        pagemsg("Already found %s in %s section %s" %
                                (ref, subsections[k - 1].strip(), k // 2))
                    else:
                        needed_refs.append(ref)
                refs = needed_refs
        if refs:
            added_ref_text = "\n".join("* " + ref for ref in refs) + "\n\n"
            # Still some references, need to add them to existing References section or create new one
            for k in xrange(2, len(subsections), 2):
                if re.search("^===References===\n", subsections[k - 1]):
                    subsections[k] = subsections[k].rstrip(
                        "\n") + "\n" + added_ref_text
                    notes.append(
                        "add Italian pronun reference%s %s to existing ===References=== section"
                        % ("s" if len(refs) > 1 else "", ", ".join(refs)))
                    break
            else:  # no break
                k = len(subsections) - 1
                while k >= 2 and re.search(
                        r"==\s*(Anagrams|Further reading)\s*==",
                        subsections[k - 1]):
                    k -= 2
                if k < 2:
                    pagemsg("WARNING: No lemma or non-lemma section")
                    return
                subsections[k + 1:k +
                            1] = ["===References===\n", added_ref_text]
                notes.append(
                    "add new ===References=== section for pron reference%s %s"
                    % ("s" if len(refs) > 1 else "", ", ".join(refs)))

    # Footnotes need a <references /> tag somewhere; add one if missing.
    if have_footnotes:
        # Need <references/>; check if already present
        for k in xrange(len(subsections) - 1, 2, -2):
            if re.search(r"^===\s*References\s*===$",
                         subsections[k - 1].strip()):
                if re.search(r"<references\s*/?\s*>", subsections[k]):
                    pagemsg(
                        "Already found <references /> in ===References=== section %s"
                        % (k // 2))
                else:
                    subsections[k] = "<references />\n" + subsections[k]
                    notes.append(
                        "add <references /> to existing ===References=== section for pron footnotes"
                    )
                break
        else:  # no break
            k = len(subsections) - 1
            while k >= 2 and re.search(r"==\s*(Anagrams|Further reading)\s*==",
                                       subsections[k - 1]):
                k -= 2
            if k < 2:
                pagemsg("WARNING: No lemma or non-lemma section")
                return
            subsections[k + 1:k +
                        1] = ["===References===\n", "<references />\n\n"]
            notes.append("add new ===References=== section for pron footnotes")

    secbody = "".join(subsections)
    # Strip extra newlines added to secbody
    sections[j] = secbody.rstrip("\n") + sectail
    return "".join(sections), notes
Esempio n. 25
0
def process_text_on_page(index, pagetitle, text):
  """Convert old Georgian noun declension templates to {{ka-infl-noun}}.

  Handles the single-suffix variants {{ka-noun-c/a/o/u/e}} as well as
  {{ka-noun-c-2}}, whose third parameter (the truncated stem) must be
  carried over. Returns ("".join(sections), notes) on success, or None if
  the Georgian section couldn't be located.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  retval = blib.find_modifiable_lang_section(text, "Georgian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Georgian section")
    return
  sections, j, secbody, sectail, has_non_lang = retval

  # All single-suffix variants convert identically: a call mentioning
  # "plural" becomes {{ka-infl-noun|-}} (plural suppressed), any other call
  # becomes plain {{ka-infl-noun}}. The "plural" substitution must run first
  # since the general pattern would also match those calls.
  for suffix in ["c", "a", "o", "u", "e"]:
    newtext = re.sub(r"\{\{ka-noun-%s\|.*plural.*\}\}" % suffix,
                     "{{ka-infl-noun|-}}", secbody)
    newtext = re.sub(r"\{\{ka-noun-%s\|.*\}\}" % suffix, "{{ka-infl-noun}}",
                     newtext)
    if secbody != newtext:
      notes.append("convert {{ka-noun-%s}} to {{ka-infl-noun}}" % suffix)
      secbody = newtext

  # {{ka-noun-c-2}} keeps its third parameter as the first argument of
  # {{ka-infl-noun}}.
  newtext = re.sub(r"\{\{ka-noun-c-2\|.*?\|.*?\|(.*?)\|.*plural.*\}\}",
                   r"{{ka-infl-noun|\1|-}}", secbody)
  newtext = re.sub(r"\{\{ka-noun-c-2\|.*?\|.*?\|(.*?)\|.*\}\}",
                   r"{{ka-infl-noun|\1}}", newtext)
  if secbody != newtext:
    notes.append("convert {{ka-noun-c-2}} to {{ka-infl-noun}}")
    secbody = newtext

  sections[j] = secbody + sectail
  return "".join(sections), notes
Esempio n. 26
0
def process_text_on_page(index, pagetitle, text):
  """Convert {{it-IPA}} templates in the Pronunciation section(s) of the
  Italian entry on a page to the newer {{it-pr}} template, folding associated
  {{audio}}, {{rhyme}}/{{rhymes}}, {{hyph}}/{{hyphenation}} and
  {{hmp}}/{{homophone(s)}} lines into {{it-pr}}'s inline-modifier syntax
  (<audio:...>, <rhyme:...>, <hyph:...>, <hmp:...>) where they either match
  what {{it-pr}} would generate automatically (and can be dropped) or must be
  carried over explicitly.

  index is the page index (used only in log messages), pagetitle the page
  name and text the full page wikitext. Returns (new_text, notes) where
  notes is a list of changelog messages, or None (implicitly) when the page
  is skipped."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose)
  def verify_template_is_full_line(tn, line):
    # Return the template on `line` if the line consists of exactly one
    # template whose name is `tn` (or one of the names in `tn`, if a list);
    # otherwise log a warning and return None.
    templates = list(blib.parse_text(line).filter_templates())
    if type(tn) is list:
      tns = tn
    else:
      tns = [tn]
    tntext = "/".join(tns)
    if len(templates) == 0:
      pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" % (tntext, line))
      return None
    t = templates[0]
    if tname(t) not in tns:
      pagemsg("WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s" %
          (tntext, tntext, line))
      return None
    if unicode(t) != line:
      pagemsg("WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s" % (tntext, tntext, line))
      return None
    return t

  notes = []

  retval = blib.find_modifiable_lang_section(text, None if program_args.partial_page else "Italian", pagemsg,
    force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  # Split the language section into alternating pieces: even indices hold
  # subsection bodies, odd indices hold the ==...== header lines.
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  # Index of the subsection body into which stray {{wikipedia}} lines get
  # moved: the body right after the most recent "Etymology N" header, or the
  # text before the first header when there are no etymology sections.
  sect_for_wiki = 0
  for k in xrange(1, len(subsections), 2):
    if re.search(r"==\s*Etymology [0-9]+\s*==", subsections[k]):
      sect_for_wiki = k + 1
    elif re.search(r"==\s*Pronunciation\s*==", subsections[k]):
      secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation", subsections[k])
      if secheader != subsections[k]:
        subsections[k] = secheader
        notes.append("remove extraneous spaces in ==Pronunciation== header")
      extra_notes = []
      parsed = blib.parse_text(subsections[k + 1])
      num_it_IPA = 0
      saw_it_pr = False
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["it-pr", "it-pronunciation"]:
          saw_it_pr = True
          break
        if tn == "it-IPA":
          num_it_IPA += 1
      if saw_it_pr:
        pagemsg("Already saw {{it-pr}}, skipping: %s" % unicode(t))
        continue
      # Only handle sections with exactly one {{it-IPA}}.
      if num_it_IPA == 0:
        pagemsg("WARNING: Didn't see {{it-IPA}} in Pronunciation section, skipping")
        continue
      if num_it_IPA > 1:
        pagemsg("WARNING: Saw multiple {{it-IPA}} in Pronunciation section, skipping")
        continue
      lines = subsections[k + 1].strip().split("\n")
      # Remove blank lines.
      lines = [line for line in lines if line]
      hyph_lines = []
      homophone_lines = []
      rfap_lines = []
      rhyme_lines = []
      must_continue = False
      audioarg = ""
      # args accumulates the {{it-pr}} pronunciation arguments (respelled
      # pronunciations plus inline modifiers); bare_args are the raw
      # respellings without modifiers.
      args = []
      bare_args = []
      args_for_hyph = []
      lines_so_far = []
      # Classify each line of the Pronunciation section, accumulating the
      # pieces that will be folded into (or kept alongside) {{it-pr}}.
      for lineind, line in enumerate(lines):
        origline = line
        lines_so_far.append(line)
        # In case of "* {{it-IPA|...}}", chop off the "* ".
        line = re.sub(r"^\*\s*(\{\{it-IPA)", r"\1", line)
        if line.startswith("{{it-IPA"):
          if args:
            pagemsg("WARNING: Something wrong, already saw {{it-IPA}}?: %s" % origline)
            must_continue = True
            break
          # A trailing <ref>...</ref> after the template becomes a <ref:...>
          # inline modifier on the last pronunciation.
          outer_ref_arg = None
          m = re.search("^(.*?) *<ref>(.*?)</ref>$", line)
          if m:
            line, outer_ref_arg = m.groups()
          ipat = verify_template_is_full_line("it-IPA", line)
          if ipat is None:
            must_continue = True
            break
          bare_args = blib.fetch_param_chain(ipat, "1") or [u"+"]
          bare_args = [u"+" if arg == pagetitle else arg for arg in bare_args]
          bare_args = [adjust_initial_capital(arg, pagetitle, pagemsg, origline) for arg in bare_args]
          bare_args = [re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], arg) for arg in bare_args]
          normalized_bare_args = [
            normalize_bare_arg(arg, pagetitle, lambda msg: pagemsg("%s: %s" % (msg, origline)))
            for arg in bare_args
          ]
          if None in normalized_bare_args:
            must_continue = True
            break
          args = [x for x in bare_args]

          # Derive candidate spellings usable for hyphenation: strip
          # pronunciation-only notation and keep only those that still match
          # the page title (modulo accents).
          args_for_hyph = []
          for arg in normalized_bare_args:
            hypharg = (
              arg.replace("ddz", "zz").replace("tts", "zz").replace("dz", "z").replace("ts", "z")
              .replace("Dz", "Z").replace("Ts", "Z").replace("[s]", "s").replace("[z]", "z")
            )
            hypharg = re.sub(pron_sign_c, "", hypharg)
            putative_pagetitle = remove_secondary_stress(hypharg.replace(".", "").replace("_", ""))
            putative_pagetitle = remove_non_final_accents(putative_pagetitle)
            # Check if the normalized pronunciation is the same as the page title, if so use the semi-normalized
            # pronunciation for hyphenation. If a word in the page title is a single syllable, it may or may not
            # have an accent on it, so also remove final monosyllabic accents from the normalized pronunciation
            # when comparing. (Don't remove from both normalized pronunciation and page title because we don't want
            # pronunciation rè to match page title ré or vice versa.)
            if putative_pagetitle == pagetitle or remove_final_monosyllabic_accents(putative_pagetitle) == pagetitle:
              args_for_hyph.append(hypharg)

          # Convert refN=/qualN= params to <ref:...>/<qual:...> inline
          # modifiers on the corresponding pronunciation argument.
          for param in ipat.params:
            pn = pname(param)
            pv = unicode(param.value)
            if re.search("^[0-9]+$", pn):
              continue
            m = re.search("^(ref|qual)([0-9]*)$", pn)
            if m:
              parampref, argnum = m.groups()
              argnum = int(argnum or "1") - 1
              if argnum >= len(args):
                pagemsg("WARNING: Argument %s=%s specifies nonexistent pronun, skipping: %s" % (
                  pn, pv, origline))
                must_continue = True
                break
              args[argnum] += "<%s:%s>" % (parampref, pv)
            else:
              pagemsg("WARNING: Unrecognized param %s=%s in {{it-IPA}}, skipping: %s" % (
                pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break
          if outer_ref_arg:
            if "<ref:" in args[-1]:
              pagemsg("WARNING: Trying to add outside ref %s into {{it-IPA}} but already has ref in arg %s, skipping: %s"
                  % (outer_ref_arg, args[-1], origline))
              must_continue = True
              break
            else:
              args[-1] += "<ref:%s>"  % outer_ref_arg
              extra_notes.append("incorporate outer <ref>...</ref> into {{it-pr}}")
          continue
        if line.startswith("{{rfap"):
          line = "* " + line
        if line.startswith("{{wiki"):
          subsections[sect_for_wiki] = line + "\n" + subsections[sect_for_wiki]
          # Remove the {{wikipedia}} line from lines seen so far. Put back the remaining lines in case we
          # run into a problem later on, so we don't end up duplicating the {{wikipedia}} line. We accumulate
          # lines like this in case for some reason we have two {{wikipedia}} lines in the Pronunciation section.
          del lines_so_far[-1]
          subsections[k + 1] = "%s\n\n" % "\n".join(lines_so_far + lines[lineind + 1:])
          notes.append("move {{wikipedia}} line to top of etym section")
          continue
        if not line.startswith("* ") and not line.startswith("*{"):
          pagemsg("WARNING: Pronunciation section line doesn't start with '* ', skipping: %s"
              % origline)
          must_continue = True
          break
        if line.startswith("* "):
          line = line[2:]
        else:
          line = line[1:]
        if line.startswith("{{hyph"):
          hyph_lines.append("* " + line)
        elif line.startswith("{{homophone"):
          homophone_lines.append("* " + line)
        elif line.startswith("{{rfap"):
          rfap_lines.append(line)
        elif line.startswith("{{audio"):
          audiot = verify_template_is_full_line("audio", line)
          if audiot is None:
            must_continue = True
            break
          if getparam(audiot, "1") != "it":
            pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % origline)
            must_continue = True
            break
          audiofile = getparam(audiot, "2")
          audiogloss = getparam(audiot, "3")
          for param in audiot.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pn not in ["1", "2", "3"]:
              pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (
                pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break
          if audiogloss in ["Audio", "audio"]:
            audiogloss = ""
          if audiogloss:
            audiogloss = ";%s" % audiogloss
          audiopart = "<audio:%s%s>" % (audiofile, audiogloss)
          audioarg += audiopart
          pagemsg("Replacing %s with argument part %s" % (unicode(audiot), audiopart))
          extra_notes.append("incorporate %s into {{it-pr}}" % unicode(audiot))
        elif line.startswith("{{rhyme"):
          rhyme_lines.append(line)
        elif remove_accents(line) == remove_accents(pagetitle):
          pagemsg("Ignoring Pronunciation section line that looks like a possibly-accented page title: %s" % origline)
        else:
          pagemsg("WARNING: Unrecognized Pronunciation section line, skipping: %s" % origline)
          must_continue = True
          break
      if must_continue:
        continue

      # Compare the explicit {{rhyme}} lines against the rhymes {{it-pr}}
      # generates automatically; drop them when redundant, otherwise carry
      # them over as an explicit <rhyme:...> modifier.
      if rhyme_lines:
        rhyme_error = False
        rhyme_pronuns = []
        for bare_arg in normalized_bare_args:
          pronun = expand_text(u"{{#invoke:it-pronunciation|to_phonemic_bot|%s}}" % re.sub(pron_sign_c, "", bare_arg))
          if not pronun:
            rhyme_error = True
            break
          # The rhyme is the last stressed syllable onward, minus the onset.
          rhyme_pronun = (
            re.sub(u"^[^aeiouɛɔ]*", "", re.sub(u".*[ˌˈ]", "", pronun)).replace(TIE, "")
            .replace(".", ""))
          if rhyme_pronun not in rhyme_pronuns:
            rhyme_pronuns.append(rhyme_pronun)
        if not rhyme_error:
          saw_non_matching_rhyme = False
          normalized_rhymes = []
          rhyme_line_text = ", ".join(rhyme_lines)
          normalized_bare_arg_text = ",".join(normalized_bare_args)
          rhyme_pronun_text = ",".join(rhyme_pronuns)
          for rhyme_line in rhyme_lines:
            rhymet = verify_template_is_full_line(["rhyme", "rhymes"], rhyme_line)
            if not rhymet:
              break
            if getparam(rhymet, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(rhymet), rhyme_line))
              break
            rhymes = []
            must_break = False
            num_syl = ""
            rhyme_specific_num_syl = []
            for param in rhymet.params:
              pn = pname(param)
              pv = unicode(param.value)
              if not re.search("^s?[0-9]*$", pn):
                pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" %
                    (pn, pv, tname(rhymet), rhyme_line))
                must_break = True
                break
              if pn == "s":
                num_syl = "<s:%s>" % pv
              elif pn.startswith("s"):
                rhyme_no = int(pn[1:]) - 1
                rhyme_specific_num_syl.append((rhyme_no, pv))
              elif int(pn) > 1:
                if pv:
                  rhymes.append([pv, ""])
            if must_break:
              break
            for rhyme_no, this_num_syl in rhyme_specific_num_syl:
              if rhyme_no >= len(rhymes):
                pagemsg("WARNING: Argument s%s=%s specifies nonexistent rhyme, skipping: %s" % (
                  rhyme_no + 1, this_num_syl, rhyme_line))
                must_break = True
                break
              rhymes[rhyme_no][1] = "<s:%s>" % this_num_syl
            if must_break:
              break
            for rhyme, this_num_syl in rhymes:
              # Normalize spelling differences ({{it-pr}} writes Vj not Vi,
              # zm not sm, aw before consonant) before comparing.
              normalized_rhyme = re.sub(u"([aeɛoɔu])i", r"\1j", rhyme).replace("sm", "zm")
              normalized_rhyme = re.sub(u"a[uu̯](" + C + ")", r"aw\1", normalized_rhyme)
              this_num_syl = this_num_syl or num_syl
              if this_num_syl and not args_for_hyph and not hyph_lines:
                pagemsg("WARNING: Explicit number of syllables %s given for explicit rhyme %s and no default or explicit hyphenation: %s"
                    % (this_num_syl, rhyme, rhyme_line_text))
                saw_non_matching_rhyme = True
                normalized_rhymes.append(normalized_rhyme + this_num_syl)
              else:
                normalized_rhymes.append(normalized_rhyme)
                if rhyme in rhyme_pronuns:
                  pagemsg("Removing explicit rhyme %s, same as pronunciation-based rhyme for spelling(s) '%s': %s"
                      % (rhyme, normalized_bare_arg_text, rhyme_line_text))
                elif normalized_rhyme in rhyme_pronuns:
                  pagemsg("Removing explicit rhyme %s normalized to %s, same as pronunciation-based rhyme for spelling(s) '%s': %s"
                      % (rhyme, normalized_rhyme, normalized_bare_arg_text, rhyme_line_text))
                elif rhyme != normalized_rhyme:
                  pagemsg("WARNING: Explicit rhyme %s normalized to %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s"
                      % (rhyme, normalized_rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                  saw_non_matching_rhyme = True
                else:
                  pagemsg("WARNING: Explicit rhyme %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s"
                      % (rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                  saw_non_matching_rhyme = True
          else: # no break
            if saw_non_matching_rhyme:
              pagemsg("Not all explicit rhymes (%s) could be matched against pronunciation-based rhyme(s) (%s) for spelling(s) '%s', adding explicitly: %s"
                  % (",".join(normalized_rhymes), rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
              args[-1] += "<rhyme:%s>" % ",".join(normalized_rhymes)
              extra_notes.append("incorporate non-default rhymes into {{it-pr}}")
            else:
              extra_notes.append("remove rhymes that are generated automatically by {{it-pr}}")
            rhyme_lines = []

      if not args:
        pagemsg("WARNING: Something wrong, didn't see {{it-IPA}}?")
        continue
      args[-1] += audioarg

      # Compare explicit hyphenations against the auto-generated ones; drop
      # redundant ones (including accent-less or coarser-syllabified
      # variants), otherwise carry them over as <hyph:...>.
      if hyph_lines:
        if len(hyph_lines) > 1:
          pagemsg("WARNING: Multiple hyphenation lines, not removing: %s" % ", ".join(hyph_lines))
        else:
          assert hyph_lines[0].startswith("* ")
          hyph_line = hyph_lines[0][2:]
          hyph_templates = re.split(", *", hyph_line)
          hyphs = []
          for hyph_template in hyph_templates:
            hypht = verify_template_is_full_line(["hyph", "hyphenation"], hyph_template)
            if not hypht:
              break
            syls = []
            if getparam(hypht, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hypht), hyph_template))
              break
            else:
              must_break = False
              for param in hypht.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^[0-9]+$", pn) and pn != "nocaption":
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" %
                      (pn, pv, tname(hypht), hyph_line))
                  must_break = True
                  break
                # An empty positional param separates multiple hyphenations.
                if pn != "nocaption" and int(pn) > 1:
                  if not pv:
                    hyphs.append(syls)
                    syls = []
                  else:
                    syls.append(pv)
              if must_break:
                break
              if syls:
                hyphs.append(syls)
          else: # no break
            if hyphs:
              specified_hyphenations = [".".join(syls) for syls in hyphs]
              specified_hyphenations = [
                re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [re.sub("''+", "", hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [
                adjust_initial_capital(hyph, pagetitle, pagemsg, hyph_line) for hyph in specified_hyphenations]
              specified_hyphenations = [re.sub(u"î([ -]|$)", r"i\1", hyph) for hyph in specified_hyphenations]
              hyphenations = [syllabify_from_spelling(arg) for arg in args_for_hyph]
              if set(specified_hyphenations) < set(hyphenations):
                pagemsg("Removing explicit hyphenation(s) %s that are a subset of auto-hyphenation(s) %s: %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
              elif set(specified_hyphenations) != set(hyphenations):
                hyphenations_without_accents = [remove_accents(hyph) for hyph in hyphenations]
                rehyphenated_specified_hyphenations = [syllabify_from_spelling(hyph) for hyph in specified_hyphenations]
                def indices_of_syllable_markers(hyph):
                  # Get the character indices of the syllable markers, but not counting the syllable markers themselves
                  # (i.e. return the number of characters preceding the syllable marker).
                  raw_indices = [ind for ind, ch in enumerate(hyph) if ch == "."]
                  adjusted_indices = [ind - offset for offset, ind in enumerate(raw_indices)]
                  return set(adjusted_indices)
                if set(specified_hyphenations) == set(hyphenations_without_accents):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing accents but otherwise same as auto-hyphenation(s) %s: %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif set(rehyphenated_specified_hyphenations) == set(hyphenations):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified by rehyphenation): %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif (len(specified_hyphenations) == 1 and len(hyphenations) == 1
                    and specified_hyphenations[0].replace(".", "") == hyphenations[0].replace(".", "")
                    and indices_of_syllable_markers(specified_hyphenations[0]) < indices_of_syllable_markers(hyphenations[0])):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified that explicit hyphenation indices are subset of auto-hyphenation indices): %s" %
                    (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                else:
                  if not hyphenations:
                    pagemsg("WARNING: Explicit hyphenation(s) %s but no auto-hyphenations, adding explicitly: %s" %
                        (",".join(specified_hyphenations), hyph_line))
                  else:
                    pagemsg("WARNING: Explicit hyphenation(s) %s not equal to auto-hyphenation(s) %s, adding explicitly: %s" %
                        (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                  args[-1] += "<hyph:%s>" % ",".join(specified_hyphenations)
                  extra_notes.append("incorporate non-default hyphenations into {{it-pr}}")
              else:
                pagemsg("Removed explicit hyphenation(s) same as auto-hyphenation(s): %s" % hyph_line)
                extra_notes.append("remove hyphenations that are generated automatically by {{it-pr}}")
              hyph_lines = []

      # Fold a single homophone line (with optional qN= qualifiers) into an
      # <hmp:...> modifier on the last pronunciation argument.
      if homophone_lines:
        if len(homophone_lines) > 1:
          pagemsg("WARNING: Multiple homophone lines, not removing: %s" % ", ".join(homophone_lines))
        else:
          assert homophone_lines[0].startswith("* ")
          homophone_line = homophone_lines[0][2:]
          homophones = {}
          homophone_qualifiers = {}
          hmpt = verify_template_is_full_line(["hmp", "homophone", "homophones"], homophone_line)
          if hmpt:
            if getparam(hmpt, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hmpt), homophone_line))
            else:
              for param in hmpt.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^q?[0-9]+$", pn):
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" %
                      (pn, pv, tname(hmpt), homophone_line))
                  break
                if pn.startswith("q"):
                  homophone_qualifiers[int(pn[1:])] = pv
                elif int(pn) > 1:
                  homophones[int(pn) - 1] = pv
              else: # no break
                hmp_args = []
                for pn, pv in sorted(homophones.items()):
                  hmp_args.append(pv)
                  if pn in homophone_qualifiers:
                    hmp_args[-1] += "<qual:%s>" % homophone_qualifiers[pn]
                args[-1] += "<hmp:%s>" % ",".join(hmp_args)
                extra_notes.append("incorporate homophones into {{it-pr}}")
                homophone_lines = []

      # Build the replacement template and rewrite the subsection, keeping
      # any lines that could not be folded in.
      if args == ["+"]:
        it_pr = "{{it-pr}}"
      else:
        it_pr = "{{it-pr|%s}}" % ",".join(args)
      pagemsg("Replaced %s with %s" % (unicode(ipat), it_pr))

      all_lines = "\n".join([it_pr] + rhyme_lines + rfap_lines + hyph_lines + homophone_lines)
      newsubsec = "%s\n\n" % all_lines
      if subsections[k + 1] != newsubsec:
        this_notes = ["convert {{it-IPA}} to {{it-pr}}"] + extra_notes
        notes.extend(this_notes)
      subsections[k + 1] = newsubsec

  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
Esempio n. 27
0
def find_latin_section(text, pagemsg):
    """Locate the modifiable ==Latin== section of `text`.

    Thin wrapper around blib.find_modifiable_lang_section with the language
    fixed to "Latin"; `pagemsg` is the logging callback passed through.
    Returns whatever blib.find_modifiable_lang_section returns.
    """
    retval = blib.find_modifiable_lang_section(text, "Latin", pagemsg)
    return retval
Esempio n. 28
0
def process_text_on_page(index, pagetitle, text):
    """Replace manually-specified {{IPA|fr|...}} pronunciations in the French
    section of a page with {{fr-IPA}} (optionally with pos=v/vnv), but only
    when the manual pronunciation can be matched -- after normalizing several
    known systematic discrepancies -- against the module-generated one.

    index is the page index (used only in log messages), pagetitle the page
    name and text the full page wikitext.

    NOTE(review): fix_up_section is defined but never invoked within this
    function as shown; the tail of this function (applying it to secbody and
    returning the result) appears to be truncated in this excerpt -- confirm
    against the original script.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    if not args.stdin:
        pagemsg("Processing")

    # Cheap pre-filter before doing any real parsing.
    if "==French==" not in text or "{{IPA|" not in text:
        return

    retval = blib.find_modifiable_lang_section(text, "French", pagemsg)
    if retval is None:
        return

    sections, j, secbody, sectail, has_non_french = retval

    if "{{IPA|" not in secbody:
        return

    notes = []

    def fix_up_section(sectext):
        # Rewrite one section's {{IPA|fr|...}} templates as {{fr-IPA}} where
        # the manual pronunciation matches the auto-generated one; returns
        # the (possibly modified) section text.
        parsed = blib.parse_text(sectext)

        # Classify headword templates to decide the pos= value for
        # {{fr-IPA}} (verb-only pronunciations differ, e.g. final -ai).
        pronun_templates = []
        verb_templates = []
        nonverb_templates = []
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn in french_nonverb_head_templates:
                nonverb_templates.append(t)
            elif tn in french_verb_head_templates:
                verb_templates.append(t)
            elif tn == "head":
                if getparam(t, "1").strip() != "fr":
                    pagemsg(
                        "WARNING: Saw wrong-language {{head}} template: %s" %
                        unicode(t))
                else:
                    pos = getparam(t, "2").strip()
                    if pos in french_verb_head_pos:
                        verb_templates.append(t)
                    else:
                        nonverb_templates.append(t)
        if verb_templates and nonverb_templates:
            pagemsg(
                "WARNING: Saw both verb template(s) %s and non-verb template(s) %s, using pos=vnv"
                % (",".join(unicode(x) for x in verb_templates), ",".join(
                    unicode(x) for x in nonverb_templates)))
        if not verb_templates and not nonverb_templates:
            pagemsg("WARNING: Didn't see any French templates")
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "IPA":
                # Recover the full source line containing this template, for
                # log messages and for the {{a|...}} accent-spec check below.
                m = re.search("^.*?%s.*$" % re.escape(unicode(t)), sectext,
                              re.M)
                if not m:
                    pagemsg(
                        "WARNING: Couldn't find template %s in section text" %
                        unicode(t))
                    line = "(unknown)"
                else:
                    line = m.group(0)
                # Handle both the old lang= and the new positional-language
                # calling conventions of {{IPA}}.
                if t.has("lang"):
                    first_param = 1
                    lang = getparam(t, "lang")
                else:
                    first_param = 2
                    lang = getparam(t, "1")
                if lang != "fr":
                    pagemsg(
                        "WARNING: Saw wrong-language {{IPA}} template: %s in line <%s>"
                        % (unicode(t), line))
                    continue
                pron = getparam(t, str(first_param))
                if not pron:
                    pagemsg(
                        "WARNING: No pronun in {{IPA}} template: %s in line <%s>"
                        % (unicode(t), line))
                    continue
                if getparam(t, str(first_param + 1)) or getparam(
                        t, str(first_param + 2)) or getparam(
                            t, str(first_param + 3)):
                    pagemsg(
                        "WARNING: Multiple pronuns in {{IPA}} template: %s in line <%s>"
                        % (unicode(t), line))
                    continue
                pos_val = ("vnv" if verb_templates and nonverb_templates else
                           "v" if verb_templates else "")
                pos_arg = "|pos=%s" % pos_val if pos_val else ""
                #autopron = expand_text("{{#invoke:User:Benwing2/fr-pron|show|%s%s}}" % (
                autopron = expand_text("{{#invoke:fr-pron|show|%s%s}}" %
                                       (pagetitle, pos_arg))
                if not autopron:
                    continue
                # Normalize the manual pronunciation: strip enclosing /.../
                # or [...], whitespace, and use the standard French r.
                pron = re.sub("^/(.*)/$", r"\1", pron)
                pron = re.sub(r"^\[(.*)\]$", r"\1", pron)
                pron = pron.strip()
                pron = pron.replace("r", u"ʁ")
                # account for various common errors in Dawnraybot's generated pronunciations:
                # #1
                if pagetitle.endswith("rez") and pron.endswith(u"ʁɔe"):
                    pron = re.sub(u"ʁɔe$", u"ʁe", pron)
                # #2
                if re.search("ai(s|t|ent)$",
                             pagetitle) and pron.endswith(u"e"):
                    pron = re.sub(u"e$", u"ɛ", pron)
                # #3
                if pos_val == "v" and pagetitle.endswith(
                        "ai") and pron.endswith(u"ɛ"):
                    pron = re.sub(u"ɛ$", u"e", pron)
                # If the manual pronunciation has no syllable dots, compare
                # without them.
                if "." not in pron:
                    autopron = autopron.replace(".", "")
                if autopron.endswith(u"ɑ") and pron.endswith("a"):
                    autopron = autopron[:-1] + "a"
                if re.search(ur"ɑ[mt]$", autopron) and re.search(
                        u"a[mt]$", pron):
                    autopron = re.sub(ur"ɑ([mt])$", r"a\1", autopron)
                for i in xrange(2):
                    # {{fr-IPA}} deletes schwa in the sequence V.Cə.CV esp. in the
                    # sequence V.Cə.ʁV in verbs, whereas the bot-generated pronunciation
                    # doesn't. We have separate cases depending on the identity of C,
                    # which may go before or after the syllable break. Do it twice in
                    # case it occurs twice in a row in a single word.
                    pron = re.sub(
                        ur"([aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])\.([jlmnɲwʃʒ])ə\.(ʁ[aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])",
                        r"\1\2.\3", pron)
                    pron = re.sub(
                        ur"([aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])\.([szfvtdpbkɡ])ə\.(ʁ[aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])",
                        r"\1.\2\3", pron)
                # {{fr-IPA}} converts sequences of Crj and Clj to Cri.j and Cli.j,
                # which is correct, but Dawnraybot doesn't do that.
                pron = re.sub(u"([szfvtdpbkɡ][ʁl])j", r"\1i.j", pron)
                # If the pronunciations still differ, log which known class
                # of mismatch it is; only a pure syllable-division mismatch
                # is allowed through.
                allow_mismatch = False
                if pron != autopron:
                    tempcall = "{{fr-IPA%s}}" % pos_arg
                    if pron.replace(u"ɑ", "a") == autopron.replace(u"ɑ", "a"):
                        pagemsg(
                            u"WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in ɑ vs. a only: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                    elif re.sub(u"ɛ(.)", r"e\1",
                                pron) == re.sub(u"ɛ(.)", r"e\1", autopron):
                        pagemsg(
                            u"WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in ɛ vs. e only: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                    elif pron.replace(".", "") == autopron.replace(".", ""):
                        pagemsg(
                            "WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in syllable division only: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                        allow_mismatch = True
                    elif pron.replace(".",
                                      "").replace(" ", "") == autopron.replace(
                                          ".", "").replace(" ", ""):
                        pagemsg(
                            "WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in syllable/word division only: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                    else:
                        pagemsg(
                            "WARNING: Can't replace %s with %s because auto-generated pron %s doesn't match %s: line <%s>"
                            % (unicode(t), tempcall, autopron, pron, line))
                    if not allow_mismatch:
                        continue
                # Rewrite the template in place as {{fr-IPA}}.
                origt = unicode(t)
                rmparam(t, "lang")
                rmparam(t, "1")
                rmparam(t, str(first_param))
                blib.set_template_name(t, "fr-IPA")
                if pos_val:
                    t.add("pos", pos_val)
                notes.append(
                    "replace manually-specified {{IPA|fr}} pronun with {{fr-IPA}}"
                )
                pagemsg("Replaced %s with %s: line <%s>" %
                        (origt, unicode(t), line))
                if "{{a|" in line:
                    pagemsg(
                        "WARNING: Replaced %s with %s on a line with an accent spec: line <%s>"
                        % (origt, unicode(t), line))
        return unicode(parsed)
Esempio n. 29
0
def process_text_on_page(index, pagetitle, text):
    """Convert German adjective/participle form entries to {{de-adj form of}}.

    Within the German section of the page:
    * {{comparative of}}/{{superlative of}} lemma entries get their
      {{head|de|...}} POS normalized to "comparative adjective" /
      "superlative adjective";
    * {{inflection of|de|...}} entries whose tags appear in tags_to_ending are
      rewritten to {{de-adj form of}}, adding an explicit ending param only
      when lemma + ending don't already produce the page title;
    * a missing {{de-adj form of}} section is inserted after a bare
      {{comparative of}}/{{superlative of}} entry, and a missing
      {{superlative of}} section before a superlative inflection entry.

    Returns (newtext, notes) on success, or None to skip the page.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    # Cheap pre-filter: only pages with a relevant {{head|de|...}} headword
    # are worth parsing at all.
    if not re.search(
            r"\{\{head\|de\|(adjective (|comparative |superlative )|participle )form",
            text):
        return

    pagemsg("Processing")

    notes = []

    retval = blib.find_modifiable_lang_section(text, "German", pagemsg)
    if retval is None:
        pagemsg("WARNING: Couldn't find German section")
        return
    sections, j, secbody, sectail, has_non_lang = retval

    # Multiple etymology sections would need per-section handling; punt.
    if re.search("== *Etymology 1 *==", secbody):
        pagemsg("WARNING: Multiple etymology sections, skipping")
        return

    parsed = blib.parse_text(secbody)

    headt = None
    comparative_of_t = None
    superlative_of_t = None
    inflection_of_t = None
    # Lemma for which a {{superlative of}} section may need to be synthesized
    # at the end (set when a superlative inflection ending is seen).
    need_superlative_of_t_lemma = None
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)

        def do_comparative_superlative_of(pos, existing_t, should_end):
            # Validate a {{comparative of}}/{{superlative of}} template and
            # normalize the headword POS to "<pos> adjective". Returns the
            # template on success, False on any problem (caller aborts).
            if getparam(t, "1") != "de":
                pagemsg(
                    "WARNING: Saw wrong language in {{%s of}}, skipping: %s" %
                    (pos, origt))
                return False
            if existing_t:
                pagemsg(
                    "WARNING: Saw two {{%s of}} templates, skipping: %s and %s"
                    % (pos, unicode(existing_t), origt))
                return False
            if not headt:
                pagemsg(
                    "WARNING: Saw {{%s of}} without head template, skipping: %s"
                    % (pos, origt))
                return False
            if not pagetitle.endswith(should_end):
                pagemsg(
                    "WARNING: Incorrect ending for %s, should be -%s, skipping"
                    % (pos, should_end))
                return False
            param2 = getparam(headt, "2")
            if param2 != "%s adjective" % pos:
                headt.add("2", "%s adjective" % pos)
                notes.append(
                    "convert {{head|de|%s}} to {{head|de|%s adjective}}" %
                    (param2, pos))
            return t

        if tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") in [
                "adjective form", "adjective comparative form",
                "adjective superlative form", "participle form"
        ]:
            if headt:
                pagemsg(
                    "WARNING: Saw two head templates, skipping: %s and %s" %
                    (unicode(headt), origt))
                return
            headt = t
        elif tn == "head" and getparam(t, "1") == "de" and getparam(
                t, "2") == "verb form":
            # Verb forms coexist with adjective forms; harmless, skip over.
            pagemsg("Allowing and ignoring {{head|de|verb form}}: %s" % origt)
        elif tn == "head":
            pagemsg("WARNING: Saw unrecognized head template, skipping: %s" %
                    origt)
            return
        elif tn == "comparative of":
            comparative_of_t = do_comparative_superlative_of(
                "comparative", comparative_of_t, "er")
            if not comparative_of_t:
                return
        elif tn == "superlative of":
            superlative_of_t = do_comparative_superlative_of(
                "superlative", superlative_of_t, "sten")
            if not superlative_of_t:
                return
        elif tn == "de-adj form of":
            pagemsg("Saw {{de-adj form of}}, assuming already converted: %s" %
                    origt)
            return
        elif tn in ["inflection of", "infl of"]:
            if getparam(t, "1") != "de":
                pagemsg(
                    "WARNING: Saw wrong language in {{inflection of}}, skipping: %s"
                    % origt)
                return
            if not headt:
                pagemsg(
                    "WARNING: Saw {{inflection of}} without head template, skipping: %s"
                    % origt)
                return
            if inflection_of_t:
                pagemsg(
                    "WARNING: Saw {{inflection of}} twice, skipping: %s and %s"
                    % (unicode(inflection_of_t), origt))
                return
            inflection_of_t = t
            lemma = getparam(t, "2")
            if getparam(t, "3"):
                pagemsg(
                    "WARNING: Saw alt form in {{inflection of}}, skipping: %s"
                    % origt)
                return
            # Collect inflection tags (numbered params 4 and up); any named
            # param means a layout we don't handle.
            infl_tags = []
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^[0-9]+$", pn):
                    pagemsg(
                        "WARNING: Saw unrecognized param %s=%s in {{inflection of}}, skipping: %s"
                        % (pn, pv, origt))
                    return
                if int(pn) >= 4:
                    infl_tags.append(pv)
            tags = "|".join(infl_tags)
            if tags not in tags_to_ending:
                pagemsg(
                    "WARNING: Saw unrecognized tags in {{inflection of}}, skipping: %s"
                    % origt)
                return
            # Rebuild the template in place as {{de-adj form of}}.
            del t.params[:]
            ending = tags_to_ending[tags]
            if ending in ["sten", "esten"]:
                # Superlative inflection seen; remember the lemma in case a
                # {{superlative of}} section has to be synthesized below.
                need_superlative_of_t_lemma = lemma
            blib.set_template_name(t, "de-adj form of")
            t.add("1", lemma)

            no_explicit = check_if_lemma_and_ending_match_pagetitle(
                lemma, ending, pagetitle, allow_umlaut=True)
            if not no_explicit:
                pagemsg("WARNING: Explicit ending %s required for lemma %s" %
                        (ending, lemma))
                t.add("2", ending)
            notes.append(
                "convert {{inflection of|de|...}} to {{de-adj form of}}")
            if "comd" in tags:
                param2 = getparam(headt, "2")
                if param2 != "comparative adjective form":
                    headt.add("2", "comparative adjective form")
                    notes.append(
                        "convert {{head|de|%s}} to {{head|de|comparative adjective form}}"
                        % param2)
            elif "supd" in tags:
                param2 = getparam(headt, "2")
                if param2 != "superlative adjective form":
                    headt.add("2", "superlative adjective form")
                    notes.append(
                        "convert {{head|de|%s}} to {{head|de|superlative adjective form}}"
                        % param2)

    secbody = unicode(parsed)

    def add_adj_form_of(secbody, pos, comparative_superlative_t, ending):
        # Insert a new ===Adjective=== section with {{de-adj form of}} right
        # after the given {{comparative of}}/{{superlative of}} template.
        # Returns (secbody, ok); ok=False aborts the page.
        lemma = getparam(comparative_superlative_t, "2")
        if check_if_lemma_and_ending_match_pagetitle(lemma,
                                                     ending,
                                                     pagetitle,
                                                     allow_umlaut=False):
            form_pos = "superlative adjective form" if pos == "superlative" else "adjective form"
            newsec = """

===Adjective===
{{head|de|%s}}

# {{de-adj form of|%s}}""" % (form_pos, lemma)
            secbody, replaced = blib.replace_in_text(
                secbody,
                unicode(comparative_superlative_t),
                unicode(comparative_superlative_t) + newsec,
                pagemsg,
                abort_if_warning=True)
            if not replaced:
                # FIX: previously referenced the outer comparative_of_t here,
                # which printed "None" when called for the superlative.
                pagemsg("WARNING: Couldn't add -%s inflection, skipping: %s" %
                        (ending, unicode(comparative_superlative_t)))
                return secbody, False
            notes.append("add {{de-adj form of}} for %s" % pos)
        else:
            pagemsg(
                "WARNING: Lemma %s + %s ending %s doesn't match pagetitle" %
                (lemma, pos, ending))
        return secbody, True

    if comparative_of_t and not inflection_of_t:
        secbody, ok = add_adj_form_of(secbody, "comparative", comparative_of_t,
                                      "er")
        if not ok:
            return

    if superlative_of_t and not inflection_of_t:
        secbody, ok = add_adj_form_of(secbody, "superlative", superlative_of_t,
                                      "sten")
        if not ok:
            return

    if inflection_of_t and not superlative_of_t and need_superlative_of_t_lemma:
        # Superlative inflection present but no {{superlative of}} lemma
        # entry; synthesize one just before the inflection's section.
        cursec = """===Adjective===
{{head|de|superlative adjective form}}

# %s""" % unicode(inflection_of_t)
        newsec = """===Adjective===
{{head|de|superlative adjective}}

# {{superlative of|de|%s}}

""" % need_superlative_of_t_lemma
        secbody, replaced = blib.replace_in_text(secbody,
                                                 cursec,
                                                 newsec + cursec,
                                                 pagemsg,
                                                 abort_if_warning=True)
        if not replaced:
            pagemsg("WARNING: Couldn't add {{superlative of}}, skipping: %s" %
                    unicode(inflection_of_t))
            return
        notes.append("add {{superlative of|de|...}}")

    sections[j] = secbody + sectail
    text = "".join(sections)

    if not notes:
        pagemsg("WARNING: Couldn't convert page")

    return text, notes
def process_text_on_page(index, pagetitle, text):
    """Fix self-links to English terms on definition ("#") lines.

    Depending on command-line flags:
    * args.convert_raw_self_links: raw [[pagetitle]] self-links are converted
      to a templated (or raw-anchored, per args.self_links_use_raw) self-link;
    * otherwise: {{l|en|...}} links are converted to raw [[...]] links, except
      that self-links stay templated/anchored.

    If args.lang is given, only that language's section is processed;
    otherwise the whole page is. Returns (newtext, notes) or None if the
    requested language section can't be found.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    def get_templated_self_link(link):
        # A link to the page itself: either a raw anchored link to the
        # English section or a templated {{l|en|...}} link, per flag.
        if args.self_links_use_raw:
            return "[[#English|%s]]" % link
        else:
            return "{{l|en|%s}}" % link

    def fix_sec_links(sectext):
        lines = sectext.split("\n")
        new_lines = []
        for line in lines:
            # Only definition lines are touched.
            if line.startswith("#"):
                if args.convert_raw_self_links:
                    template_split_re = r"(\{\{(?:[^{}]|\{\{[^{}]*\}\})*\}\})"
                    # Split templates and only change non-template text
                    # (even indices of the split are the non-template runs).
                    split_templates = re.split(template_split_re, line)
                    for l in xrange(0, len(split_templates), 2):
                        while True:
                            # FIX: escape the page title -- it may contain
                            # regex metacharacters such as '.', '(' or '?'.
                            newtext = re.sub(
                                r"^#(.*?)\[\[%s\]\]" % re.escape(pagetitle),
                                r"#\1" + get_templated_self_link(pagetitle),
                                split_templates[l], 0, re.M)
                            if newtext == split_templates[l]:
                                break
                            notes.append(
                                "replace raw self link to English terms with templated one"
                            )
                            split_templates[l] = newtext
                    line = "".join(split_templates)
                else:

                    def replace_templated(m):
                        # m.group(1) is the contents of a {{l|en|...}} call.
                        origm1 = m.group(1)
                        m1 = origm1
                        if "[[" not in m1:
                            m1 = "[[%s]]" % m1
                        # Self-links must stay templated/anchored.
                        m1_new = m1.replace("[[%s]]" % pagetitle,
                                            get_templated_self_link(pagetitle))
                        saw_self_link = False
                        if m1_new != m1:
                            saw_self_link = True
                            m1 = m1_new
                        # Only record a note if the output actually differs
                        # from leaving the whole thing templated.
                        if m1 != get_templated_self_link(origm1):
                            notes.append(
                                "replace templated link to English terms in defns with raw link(s)"
                                + (", keeping self-links templated"
                                   if saw_self_link else ""))
                        return m1

                    line = re.sub(
                        r"\{\{l\|en\|((?:[^{}|]|\[\[[^{}\[\]]*\]\])*?)\}\}",
                        replace_templated, line)
            new_lines.append(line)
        return "\n".join(new_lines)

    if args.lang:
        retval = blib.find_modifiable_lang_section(
            text, None if args.partial_page else args.lang, pagemsg)
        if retval is None:
            pagemsg("WARNING: Couldn't find %s section" % args.lang)
            return
        sections, j, secbody, sectail, has_non_lang = retval

        secbody = fix_sec_links(secbody)
        sections[j] = secbody + sectail
        text = "".join(sections)
    else:
        text = fix_sec_links(text)

    return text, notes