コード例 #1
0
def process_text_on_page(index, pagetitle, text):
    """Warn about Latin headword templates whose (de-macroned) headword
    does not match the page title.  Always returns (None, None): this is
    a read-only check that never modifies the page."""
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if not args.stdin:
        pagemsg("Processing")

    section_info = lalib.find_latin_section(text, pagemsg)
    if section_info is None:
        return None, None
    sections, j, secbody, sectail, has_non_latin = section_info
    for template in blib.parse_text(secbody).filter_templates():
        if tname(template) not in lalib.la_headword_templates:
            continue
        heads = lalib.la_get_headword_from_template(
            template, pagetitle, pagemsg)
        for head in heads:
            stripped_head = remove_macrons(blib.remove_links(head))
            # The expected headword is the page title; reconstruction
            # pages compare against "*" + the part after the last slash.
            if pagetitle.startswith("Reconstruction"):
                expected = "*" + re.sub(".*/", "", pagetitle)
            else:
                expected = pagetitle
            if stripped_head != expected:
                pagemsg("WARNING: Bad Latin head: %s" % unicode(template))
    return None, None
コード例 #2
0
def investigate_possible_adj(index, adj_pagename, adv, adv_defns):
    """Look up a candidate adjective page for the given adverb and, when a
    Latin adjective or participle headword ({{la-adj}}/{{la-part}}) is
    found, log a line pairing the adverb and adjective with their
    definitions.

    index: page index, used only for log messages.
    adj_pagename: title of the candidate adjective page to check.
    adv: the adverb whose base adjective we are looking for.
    adv_defns: list of definition strings for the adverb.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, adj_pagename, txt))

    pagemsg("Trying for adverb %s" % adv)
    page = pywikibot.Page(site, adj_pagename)
    if not page.exists():
        pagemsg("Doesn't exist for adverb %s" % adv)
        return

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return

    sections, j, secbody, sectail, has_non_latin = retval

    subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)

    # Odd indices hold the headers captured by re.split; even indices
    # (from 2) hold the corresponding subsection bodies.
    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        for t in parsed.filter_templates():
            # NOTE: the unused local origt was removed; the original
            # stringified every template without ever using the result.
            tn = tname(t)
            if tn in ["la-adj", "la-part"]:
                adj = lalib.la_get_headword_from_template(
                    t, adj_pagename, pagemsg)[0]
                adj_defns = lalib.find_defns(subsections[k])
                msg("%s /// %s /// %s /// %s" %
                    (adv, adj, ";".join(adv_defns), ";".join(adj_defns)))
コード例 #3
0
def process_page(page, index, add_dot_after_i, convert_j):
  """Normalize 1= of {{la-IPA}} templates in the page's Latin section.

  Depending on the flags, insert a dot after a prevocalic i following a
  known prefix (forcing vocalic pronunciation) or convert a prevocalic j
  to i.  Relies on module-level `prefixes` and `vowel_re`.
  Returns (newtext, notes), or (None, None) if there is no Latin section.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  origtext = text

  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None

  sections, j, secbody, sectail, has_non_latin = retval

  notes = []

  parsed = blib.parse_text(secbody)

  for t in parsed.filter_templates():
    if tname(t) == "la-IPA":
      # 1= defaults to the page title when absent.
      param1 = getparam(t, "1") or pagetitle
      for prefix in prefixes:
        # A list entry pairs a plain prefix with its macron-bearing form;
        # a bare entry serves as both.
        if type(prefix) is list:
          prefix, macron_prefix = prefix
        else:
          macron_prefix = prefix
        orig_param1 = param1
        if re.search("^%s[ij]" % macron_prefix, param1):
          if re.search(u"^%si%s" % (macron_prefix, vowel_re), param1) and add_dot_after_i:
            param1 = re.sub("^%si" % macron_prefix, "%si." % macron_prefix, param1)
            notes.append("add dot after i in {{la-IPA}} to force vocalic pronunciation")
          elif re.search("^%sj%s" % (macron_prefix, vowel_re), param1) and convert_j:
            param1 = re.sub("^%sj" % macron_prefix, "%si" % macron_prefix, param1)
            notes.append("convert j to i in {{la-IPA}} to match pagename; j no longer necessary to force consonantal pronunciation")
          if param1 != orig_param1:
            origt = unicode(t)
            # Fetch all params.
            params = []
            for param in t.params:
              pname = unicode(param.name)
              if pname.strip() not in ["1"]:
                params.append((pname, param.value, param.showkey))
            # Erase all params.
            del t.params[:]
            t.add("1", param1)
            # Put remaining parameters in order.
            for name, value, showkey in params:
              t.add(name, value, showkey=showkey, preserve_spacing=False)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
          # Only the first matching prefix is considered.
          break
      else:
        # no break
        pagemsg("WARNING: Unable to match pronun template against any prefixes: %s" % unicode(t))

  secbody = unicode(parsed)
  sections[j] = secbody + sectail
  return "".join(sections), notes
コード例 #4
0
def process_page(page, index):
    """For each {{la-adv}} in the page's Latin section, infer the adverb's
    stem and try every plausible base-adjective page name via
    investigate_possible_adj()."""
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    if " " in pagetitle:
        pagemsg("WARNING: Space in page title, skipping")
        return
    pagemsg("Processing")

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return

    sections, j, secbody, sectail, has_non_latin = retval

    subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)

    for k in xrange(2, len(subsections), 2):
        for t in blib.parse_text(subsections[k]).filter_templates():
            origt = unicode(t)
            if tname(t) != "la-adv":
                continue
            adv = blib.remove_links(getparam(t, "1")) or pagetitle
            macron_stem, is_stem = lalib.infer_adv_stem(adv)
            if not is_stem:
                pagemsg(
                    "WARNING: Couldn't infer stem from adverb %s, not standard: %s"
                    % (adv, origt))
                continue
            adv_defns = lalib.find_defns(subsections[k])
            stem = lalib.remove_macrons(macron_stem)
            # Candidate adjective lemmas, from the most common declension
            # patterns for the inferred stem.
            candidates = [stem + "us", stem + "is"]
            if stem.endswith("nt"):
                candidates.append(stem[:-2] + "ns")
            if stem.endswith("plic"):
                candidates.append(stem[:-2] + "ex")
            if stem.endswith("c"):
                candidates.append(stem[:-1] + "x")
            if re.search("[aeiou]r$", stem):
                candidates.append(stem)
            elif stem.endswith("r"):
                candidates.append(stem[:-1] + "er")
            if adv.endswith(u"iē"):
                candidates.append(stem + "ius")
            for candidate in candidates:
                investigate_possible_adj(index, candidate, adv, adv_defns)
コード例 #5
0
def process_page(page, index, parsed):
    """Rename Latin ==Inflection== headers to ==Conjugation== (for verbs)
    or ==Declension== (anything else), based on the inflection templates
    found in each such section.  Returns (newtext, notes), or (None, None)
    if there is no Latin section."""
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    subsections = re.split("(^==.*==\n)", secbody, 0, re.M)

    notes = []

    for k in xrange(2, len(subsections), 2):
        if "==Inflection==" not in subsections[k - 1]:
            continue
        # Collect the parts of speech implied by the inflection templates
        # in this subsection.
        parsed = blib.parse_text(subsections[k])
        pos_set = set()
        for t in parsed.filter_templates():
            pos = lalib.la_infl_template_pos(t)
            if pos:
                pos_set.add(pos)
        poses = sorted(list(pos_set))
        if len(poses) > 1:
            pagemsg(
                "WARNING: Saw inflection templates for multiple parts of speech: %s"
                % ",".join(poses))
        elif len(poses) == 0:
            pagemsg(
                "WARNING: Saw no inflection templates in ==Inflection== section"
            )
        elif poses[0] == "verb":
            subsections[k - 1] = subsections[k - 1].replace(
                "Inflection", "Conjugation")
            notes.append(
                "convert Latin ==Inflection== header to ==Conjugation==")
        else:
            subsections[k - 1] = subsections[k - 1].replace(
                "Inflection", "Declension")
            notes.append(
                "convert Latin ==Inflection== header to ==Declension==")

    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
コード例 #6
0
def process_page(page, index, parsed):
    """In Latin link templates ({{l}}, {{m}}, {{alternative form of}},
    {{alt form}}), when the display (alt) param is just the link param
    plus macrons, move it into the link param and drop the alt slot.
    Returns (newtext, notes), or (None, None) if no Latin section."""
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    notes = []

    parsed = blib.parse_text(secbody)

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn not in ["l", "m", "alternative form of", "alt form"]:
            continue
        # Determine where the language code and the term live for this
        # particular template style.
        if tn in ["l", "m"]:
            lang, termparam = getparam(t, "1"), 2
        elif getparam(t, "lang"):
            lang, termparam = getparam(t, "lang"), 1
        else:
            lang, termparam = getparam(t, "1"), 2
        if lang != "la":
            #pagemsg("WARNING: Wrong language in template: %s" % unicode(t))
            continue
        term = getparam(t, str(termparam))
        alt = getparam(t, str(termparam + 1))
        gloss = getparam(t, str(termparam + 2))
        if not (alt and lalib.remove_macrons(alt) == term):
            continue
        origt = unicode(t)
        t.add(str(termparam), alt)
        if gloss:
            # A gloss follows positionally, so the alt slot must stay as
            # an empty placeholder rather than be removed.
            t.add(str(termparam + 1), "")
        else:
            rmparam(t, str(termparam + 1))
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("move alt param to link param in %s" % tn)

    secbody = unicode(parsed)
    sections[j] = secbody + sectail
    return "".join(sections), notes
コード例 #7
0
def process_page(page, index, parsed):
    """Clean up Latin non-lemma form subsections: strip extraneous leading
    newlines before {{la-*-form}} headwords, and insert a missing part-of-
    speech header (from module-level `tempname_to_header`) above any such
    headword that lacks one.  Returns (newtext, notes), or (None, None)
    if there is no Latin section."""
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    notes = []
    for k in xrange(2, len(subsections), 2):
        # Drop blank lines directly before a {{la-*-form}} headword.
        newtext = re.sub(r"^\n*(\{\{la-.*?-form)", r"\1", subsections[k])
        if newtext != subsections[k]:
            notes.append(
                "remove extraneous newlines before Latin non-lemma headword")
        # Header depth of this subsection = number of leading '='s.
        indent = len(re.sub("^(=+).*\n", r"\1", subsections[k - 1]))

        def add_header(m):
            # re.sub callback: build a POS header (at the same depth as
            # the enclosing subsection header) above the matched template.
            lastchar, tempname = m.groups()
            if tempname in tempname_to_header:
                header_pos = tempname_to_header[tempname]
            else:
                pagemsg("WARNING: Unrecognized template name: %s" % tempname)
                return m.group(0)
            header = "=" * indent + header_pos + "=" * indent
            # Ensure a blank line separates the header from what precedes.
            preceding_newline = "\n" if lastchar != "\n" else ""
            return lastchar + "\n" + preceding_newline + header + "\n{{" + tempname

        newnewtext = re.sub(r"([^=])\n\{\{(la-[a-z -]*?-form)", add_header,
                            newtext)
        if newnewtext != newtext:
            notes.append("add missing header before Latin non-lemma form")
        subsections[k] = newnewtext
    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
コード例 #8
0
def process_page(page, index, sectext, comment):
    """Replace the page's entire Latin section with sectext (minus any
    leading ==Latin== header line), recording `comment` as the change
    note.  Returns (newtext, notes), or (None, None) if there is no
    Latin section."""
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    retval = lalib.find_latin_section(unicode(page.text), pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    # Strip the header the caller included and make sure the replacement
    # section ends with a blank line.
    replacement = re.sub(r"^==Latin==\n", "", sectext) + "\n\n"
    sections[j] = replacement
    notes = [comment]
    return "".join(sections).rstrip("\n"), notes
コード例 #9
0
def process_text_on_page(index, pagetitle, text):
    """Sanity-check Latin non-lemma headword templates ({{la-noun-form}}
    and friends): warn when 1= is missing or an unexpected parameter is
    present.  Read-only; always returns (None, None)."""
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    if not args.stdin:
        pagemsg("Processing")

    # Greatly speed things up when --stdin by ignoring non-Latin pages
    if "==Latin==" not in text:
        return None, None

    if not re.search("la-(noun|proper noun|pronoun|verb|adj|num|suffix)-form",
                     text):
        return None, None

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    form_templates = [
        "la-noun-form", "la-proper noun-form", "la-pronoun-form",
        "la-verb-form", "la-adj-form", "la-num-form", "la-suffix-form"
    ]
    allowed = ["1", "g", "g2", "g3", "g4"]
    for t in blib.parse_text(secbody).filter_templates():
        if tname(t) not in form_templates:
            continue
        if not getparam(t, "1"):
            pagemsg("WARNING: Missing 1=: %s" % unicode(t))
        for param in t.params:
            pn = pname(param)
            if pn not in allowed:
                pagemsg("WARNING: Extraneous param %s=: %s" %
                        (pn, unicode(t)))
    return None, None
コード例 #10
0
def process_page(page, index, parsed):
    """Convert {{la-decl-3rd-I}} nouns ending in -polis to
    {{la-decl-3rd-polis}}, rename {{la-noun}} to {{la-proper noun}}, and
    retitle the ==Noun== header to ==Proper noun==.  Returns
    (newtext, notes), or (None, None) if there is no Latin section."""
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    notes = []

    parsed = blib.parse_text(secbody)

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "la-decl-3rd-I":
            stem = getparam(t, "1")
            if not stem.endswith("polis"):
                pagemsg(
                    "WARNING: Found la-decl-3rd-I without stem in -polis: %s" %
                    unicode(t))
            else:
                # {{la-decl-3rd-polis}} takes the stem without "polis".
                blib.set_template_name(t, "la-decl-3rd-polis")
                t.add("1", stem[:-5])
                notes.append("Fix noun in -polis to use {{la-decl-3rd-polis}}")
        elif tn == "la-noun":
            blib.set_template_name(t, "la-proper noun")

    secbody = unicode(parsed).replace("==Noun==", "==Proper noun==")

    sections[j] = secbody + sectail
    return "".join(sections), notes
コード例 #11
0
def process_page(page, index, parsed):
  """Convert old-style Latin noun headword templates to the new style.

  Handles three headword flavors found next to an {{la-ndecl}} declension
  template: {{head|la|noun}}/{{head|la|proper noun}}, {{la-location}}, and
  old-style {{la-noun}}/{{la-proper noun}}.  In each ===...=== subsection
  with exactly one headword template and one {{la-ndecl}}, the headword's
  params are replaced by the declension template's params (plus an
  explicit gender where needed), after cross-checking lemma, genitive,
  declension and gender between headword and declension.  Indeclinable
  old-style headwords without a declension template are also converted.
  Returns (newtext, notes), or (None, None) if there is no Latin section.
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  pagemsg("Processing")

  text = unicode(page.text)
  origtext = text

  notes = []

  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None

  sections, j, secbody, sectail, has_non_latin = retval

  subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)

  saw_a_template = False

  # Odd indices of subsections are headers; even indices (from 2) bodies.
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    # Find exactly one headword template and one declension template in
    # this subsection; skip the subsection on duplicates.
    la_noun_template = None
    la_ndecl_template = None
    must_continue = False
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "la-ndecl":
        if la_ndecl_template:
          pagemsg("WARNING: Saw multiple noun declension templates in subsection, %s and %s, skipping" % (
            unicode(la_ndecl_template), unicode(t)))
          must_continue = True
          break
        la_ndecl_template = t
        saw_a_template = True
      if tn in ["la-noun", "la-proper noun", "la-location"] or (
        tn == "head" and getparam(t, "1") == "la" and getparam(t, "2") in ["noun", "proper noun"]
      ):
        if la_noun_template:
          pagemsg("WARNING: Saw multiple noun headword templates in subsection, %s and %s, skipping" % (
            unicode(la_noun_template), unicode(t)))
          must_continue = True
          break
        la_noun_template = t
        saw_a_template = True
    if must_continue:
      continue
    if not la_noun_template and not la_ndecl_template:
      continue
    # New-style {{la-noun}}/{{la-proper noun}} carries no old-style
    # positional params 2/3/4 or decl/head2 params.
    new_style_headword_template = (
      la_noun_template and
      tname(la_noun_template) in ["la-noun", "la-proper noun"] and
      not getparam(la_noun_template, "head2") and
      not getparam(la_noun_template, "2") and
      not getparam(la_noun_template, "3") and
      not getparam(la_noun_template, "4") and
      not getparam(la_noun_template, "decl")
    )
    if la_noun_template and not la_ndecl_template:
      # No declension template: only indeclinable headwords can still be
      # converted in place.
      if (tname(la_noun_template) in ["la-noun", "la-proper noun"] and
          getparam(la_noun_template, "indecl")):
        if new_style_headword_template:
          pagemsg("Found new-style indeclinable noun headword template, skipping: %s" %
            unicode(la_noun_template))
          continue
        if (getparam(la_noun_template, "head2") or
            getparam(la_noun_template, "decl") or
            getparam(la_noun_template, "2") and
            getparam(la_noun_template, "2") != getparam(la_noun_template, "1") or
            not getparam(la_noun_template, "3")):
          pagemsg("WARNING: Found old-style indeclinable noun headword template and don't know how to convert: %s" %
              unicode(la_noun_template))
          continue
        # Keep only the first letter of the gender spec (e.g. "m-s" -> "m").
        gender = getparam(la_noun_template, "3")
        orig_la_noun_template = unicode(la_noun_template)
        la_noun_template.add("g", gender[0], before="3")
        rmparam(la_noun_template, "3")
        rmparam(la_noun_template, "2")
        pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
        notes.append("convert indeclinable {{la-noun}}/{{la-proper noun}} template to new style")
        subsections[k] = unicode(parsed)
        continue
      else:
        pagemsg("WARNING: Saw noun headword template but no declension template: %s" % unicode(la_noun_template))
        continue
    if la_ndecl_template and not la_noun_template:
      pagemsg("WARNING: Saw noun declension template but no headword template: %s" % unicode(la_ndecl_template))
      continue

    orig_la_noun_template = unicode(la_noun_template)
    if new_style_headword_template:
      pagemsg("Found new-style noun headword template, skipping: %s" %
        orig_la_noun_template)
      continue

    def render_headword_and_decl():
      # Diagnostic string showing both templates, used in log messages.
      return "headword template <from> %s <to> %s <end>, declension template <from> %s <to> %s <end>" % (
        orig_la_noun_template, orig_la_noun_template,
        unicode(la_ndecl_template), unicode(la_ndecl_template)
      )

    # Fetch the explicit head param(s) per headword flavor.
    if tname(la_noun_template) == "head":
      explicit_head_param_head = blib.fetch_param_chain(la_noun_template, ["head", "head1"], "head")
      # NOTE(review): this lemma assignment is redundant — it is repeated
      # unconditionally a few lines below; presumably left over from a
      # refactor.  Kept as-is since behavior is unaffected.
      lemma = explicit_head_param_head or [pagetitle]
    elif tname(la_noun_template) == "la-location":
      explicit_head_param_head = [getparam(la_noun_template, "1")]
    else:
      explicit_head_param_head = blib.fetch_param_chain(la_noun_template, ["1", "head", "head1"], "head")
    lemma = explicit_head_param_head or [pagetitle]
    if "[[" in lemma[0]:
      # The headword lemma contains wikilinks; try to carry them over
      # into the declension template's lemma.
      if len(lemma) > 1:
        pagemsg("WARNING: Multiple lemmas %s and lemmas with links in them, can't handle, skipping: %s" % (
          ",".join(lemma), render_headword_and_decl()
        ))
        continue
      ndecl_lemma = getparam(la_ndecl_template, "1")
      if "[[" not in ndecl_lemma:
        must_continue = False
        for m in re.finditer(r"(\[\[.*?\]\])", lemma[0]):
          link = m.group(1)
          plainlink = blib.remove_links(link)
          if plainlink not in ndecl_lemma:
            pagemsg("WARNING: Can't interpolate link %s into declension template, skipping: %s" % (
              link, render_headword_and_decl()))
            must_continue = True
            break
          ndecl_lemma = ndecl_lemma.replace(plainlink, link, 1)
        if must_continue:
          continue
        # Work on a detached copy so the on-page template is untouched.
        new_ndecl_template = blib.parse_text(unicode(la_ndecl_template)).filter_templates()[0]
        new_ndecl_template.add("1", ndecl_lemma)
        pagemsg("Adding links to decl template %s to produce %s" % (
          unicode(la_ndecl_template), unicode(new_ndecl_template)))
        la_ndecl_template = new_ndecl_template

    # Expand the declension template to get its generated forms/props.
    noun_props = new_generate_noun_forms(unicode(la_ndecl_template), errandpagemsg, expand_text, include_props=True)
    if noun_props is None:
      continue
    decl_gender = noun_props.get("g", None)

    # Fetch the headword gender(s) per headword flavor.
    if tname(la_noun_template) == "head":
      noun_gender = blib.fetch_param_chain(la_noun_template, ["g", "g1"], "g")
      if not noun_gender and not decl_gender:
        pagemsg("WARNING: No gender in {{head|la|...}} and no declension gender, can't proceed, skipping: %s" % render_headword_and_decl())
        continue
    elif tname(la_noun_template) == "la-location":
      noun_gender = [getparam(la_noun_template, "4")]
    else:
      noun_gender = blib.fetch_param_chain(la_noun_template, ["3", "g", "g1"], "g")
      if not noun_gender:
        pagemsg("WARNING: No gender in old-style headword, skipping: %s" % render_headword_and_decl())
        continue

    def do_compare_headword_decl_forms(id_slot, headword_forms, decl_slots,
        adjust_for_missing_gen_forms=False, remove_headword_links=False):
      # Thin wrapper binding this subsection's noun_props and diagnostics
      # into the module-level comparison helper.
      return compare_headword_decl_forms(id_slot, headword_forms, decl_slots,
        noun_props, render_headword_and_decl(), pagemsg,
        adjust_for_missing_gen_forms=adjust_for_missing_gen_forms,
        remove_headword_links=remove_headword_links)

    def check_headword_vs_decl_decls(regularized_noun_decl):
      # Compare the headword's declension spec(s) against the declensions
      # declared in the {{la-ndecl}} lemma spec; returns True to skip.
      must_continue = False
      decl_lemma = getparam(la_ndecl_template, "1") 
      if "((" in decl_lemma:
        pagemsg("WARNING: (( in decl_lemma, can't handle, skipping: %s" %
            render_headword_and_decl())
        must_continue = True
        return
      # Each "stem<decl.subtypes>" segment contributes its decl part.
      segments = re.split(r"([^<> -]+<[^<>]*>)", decl_lemma)
      decl_decls = []
      for i in xrange(1, len(segments) - 1, 2):
        m = re.search("^([^<> -]+)<([^<>]*)>$", segments[i])
        stem_spec, decl_and_subtype_spec = m.groups()
        decl_and_subtypes = decl_and_subtype_spec.split(".")
        decl_decl = decl_and_subtypes[0]
        decl_decls.append(decl_decl)
      if set(regularized_noun_decl) != set(decl_decls):
        if set(regularized_noun_decl) <= set(decl_decls):
          pagemsg("headword decl %s subset of declension decl %s, allowing: %s" % (
            ",".join(regularized_noun_decl), ",".join(decl_decls),
            render_headword_and_decl()))
        else:
          pagemsg("WARNING: headword decl %s not same as or subset of declension decl %s, skipping: %s" % (
            ",".join(regularized_noun_decl), ",".join(decl_decls),
            render_headword_and_decl()))
          must_continue = True
      return must_continue

    def check_headword_vs_decl_gender():
      # Decide whether an explicit g= is needed on the new headword and
      # whether headword/declension genders conflict (neuter mismatch).
      must_continue = False
      if len(noun_gender) == 1 and noun_gender[0] == decl_gender:
        need_explicit_gender = False
      else:
        need_explicit_gender = True
        if len(noun_gender) > 1:
          pagemsg("WARNING: Saw multiple headword genders %s, please verify: %s" % (
            ",".join(noun_gender), render_headword_and_decl()))
        elif (noun_gender and noun_gender[0].startswith("n") != (decl_gender == "n")):
          pagemsg("WARNING: Headword gender %s is neuter and decl gender %s isn't, or vice-versa, need to correct, skipping: %s" % (
          noun_gender[0], decl_gender, render_headword_and_decl()))
          must_continue = True
      return need_explicit_gender, must_continue

    def erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender):
      # Erase all params
      del la_noun_template.params[:]
      # Copy params from decl template
      for param in la_ndecl_template.params:
        pname = unicode(param.name)
        la_noun_template.add(pname, param.value, showkey=param.showkey, preserve_spacing=False)
      # Add explicit gender if needed
      if need_explicit_gender:
        explicit_genders = []
        for ng in noun_gender:
          ng = ng[0]
          if ng not in explicit_genders:
            explicit_genders.append(ng)
        blib.set_param_chain(la_noun_template, explicit_genders, "g", "g")

    if tname(la_noun_template) == "head":
      # ---- {{head|la|noun}} / {{head|la|proper noun}} conversion ----
      if explicit_head_param_head and not do_compare_headword_decl_forms("lemma", explicit_head_param_head, ["linked_nom_sg", "linked_nom_pl"]):
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue

      # Check for extraneous {{head|la|...}} parameters
      must_continue = False
      is_proper_noun = getparam(la_ndecl_template, "2") == "proper noun"
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2"] or re.search("^(head|g)[0-9]*$", pname.strip()):
          continue
        pagemsg("WARNING: Saw extraneous param %s in {{head}} template, skipping: %s" % (
          pname, render_headword_and_decl()))
        must_continue = True
        break
      if must_continue:
        continue
      # Copy params from decl template
      blib.set_template_name(la_noun_template,
        "la-proper noun" if is_proper_noun else "la-noun")
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{head|la|...}} to new-style {{la-noun}}/{{la-proper noun}} template")

    elif tname(la_noun_template) == "la-location":
      # ---- {{la-location}} conversion ----
      noun_decl = [getparam(la_noun_template, "6")]
      if not noun_decl:
        pagemsg("WARNING: No noun decl in {{la-location}}, skipping: %s" % render_headword_and_decl())
        continue
      genitive = [getparam(la_noun_template, "2")]
      if not do_compare_headword_decl_forms("lemma", lemma, ["linked_nom_sg", "linked_nom_pl"]):
        continue
      if not do_compare_headword_decl_forms("genitive", genitive, ["gen_sg", "gen_pl"],
          adjust_for_missing_gen_forms=True, remove_headword_links=True):
        continue
      # Map the old decl spec(s) through noun_decl_to_decl_type.
      regularized_noun_decl = []
      must_continue = False
      for nd in noun_decl:
        if nd not in noun_decl_to_decl_type:
          pagemsg("WARNING: Unrecognized noun decl=%s, skipping: %s" % (
            nd, render_headword_and_decl()))
          must_continue = True
          break
        regularized_noun_decl.append(noun_decl_to_decl_type[nd])
      if must_continue:
        continue
      must_continue = check_headword_vs_decl_decls(regularized_noun_decl)
      if must_continue:
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue

      # Check for extraneous {{la-location}} parameters
      must_continue = False
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2", "3", "4", "5", "6"]:
          continue
        pagemsg("WARNING: Saw extraneous param %s in {{la-location}} template, skipping: %s" % (
          pname, render_headword_and_decl()))
        must_continue = True
        break
      if must_continue:
        continue
      blib.set_template_name(la_noun_template, "la-proper noun")
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{la-location}} to new-style {{la-proper noun}} template")

    else:
      # old-style {{la-noun}} or {{la-proper noun}}
      noun_decl = blib.fetch_param_chain(la_noun_template, ["4", "decl", "decl1"], "decl")
      if not noun_decl:
        pagemsg("WARNING: No noun decl in old-style headword, skipping: %s" % render_headword_and_decl())
        continue
      genitive = blib.fetch_param_chain(la_noun_template, ["2", "gen", "gen1"], "gen")
      if not do_compare_headword_decl_forms("lemma", lemma, ["linked_nom_sg", "linked_nom_pl"]):
        continue
      if not do_compare_headword_decl_forms("genitive", genitive, ["gen_sg", "gen_pl"],
          adjust_for_missing_gen_forms=True, remove_headword_links=True):
        continue
      regularized_noun_decl = []
      must_continue = False
      for nd in noun_decl:
        if nd not in noun_decl_to_decl_type:
          pagemsg("WARNING: Unrecognized noun decl=%s, skipping: %s" % (
            nd, render_headword_and_decl()))
          must_continue = True
          break
        regularized_noun_decl.append(noun_decl_to_decl_type[nd])
      if must_continue:
        continue

      must_continue = check_headword_vs_decl_decls(regularized_noun_decl)
      if must_continue:
        continue
      need_explicit_gender, must_continue = check_headword_vs_decl_gender()
      if must_continue:
        continue

      # Fetch remaining params from headword template
      headword_params = []
      for param in la_noun_template.params:
        pname = unicode(param.name)
        if pname.strip() in ["1", "2", "3", "4"] or re.search("^(head|gen|g|decl)[0-9]*$", pname.strip()):
          continue
        headword_params.append((pname, param.value, param.showkey))
      erase_and_copy_params_and_add_gender(need_explicit_gender, noun_gender)
      # Copy remaining params from headword template
      for name, value, showkey in headword_params:
        la_noun_template.add(name, value, showkey=showkey, preserve_spacing=False)
      pagemsg("Replaced %s with %s" % (orig_la_noun_template, unicode(la_noun_template)))
      notes.append("convert {{la-noun}}/{{la-proper noun}} params to new style")

    subsections[k] = unicode(parsed)

  if not saw_a_template:
    pagemsg("WARNING: Saw no noun headword or declension templates")

  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
コード例 #12
0
def correct_nom_sg_n_participle(page, index, participle, lemma):
    """Replace (or add) the ===Participle=== subsection of the page's
    Latin section with an indeclinable perfect-passive-participle entry
    for `participle` of verb `lemma`.  If no participle subsection
    exists, one is inserted before the References or supine subsection,
    or appended at the end.  Returns (newtext, notes), or (None, None)
    on a page we can't handle (no Latin section, multiple etymologies,
    multiple participle subsections)."""
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    if "===Etymology 1===" in secbody:
        pagemsg("WARNING: Multiple etymologies, don't know what to do")
        return None, None

    notes = []

    subsections = re.split("(^===[^=\n]*===\n)", secbody, 0, re.M)

    # Replacement body for the participle subsection (header not included).
    participle_text = """{{head|la|participle|[[indeclinable]]|head=%s}}

# {{inflection of|la|%s||perf|pasv|part}}\n\n""" % (participle, lemma)
    saw_participle = False
    # First pass: overwrite an existing ===Participle=== subsection.
    for k in xrange(2, len(subsections), 2):
        if subsections[k - 1] == "===Participle===\n":
            if saw_participle:
                pagemsg("WARNING: Saw multiple participles, skipping")
                return None, None
            saw_participle = True
            subsections[k] = participle_text
            notes.append("correct participle %s of %s to be impersonal" %
                         (participle, lemma))
    secbody = "".join(subsections)
    if not saw_participle:
        # Second pass: insert a new subsection before References or before
        # a supine subsection; otherwise append at the end of the section.
        for k in xrange(2, len(subsections), 2):
            insert_before = False
            if subsections[k - 1] == "===References===\n":
                pagemsg(
                    "Inserting new participle subsection before references subsection"
                )
                insert_before = True
            elif re.search(r"\{\{inflection of.*\|sup", subsections[k]):
                pagemsg(
                    "Inserting new participle subsection before supine subsection"
                )
                insert_before = True
            if insert_before:
                # Splice header+body in just before subsection k's header.
                subsections[k - 1:k -
                            1] = ["===Participle===\n" + participle_text]
                secbody = "".join(subsections)
                break
        else:
            # no break
            if not secbody.endswith("\n\n"):
                secbody += "\n\n"
            secbody += "===Participle===\n" + participle_text
        notes.append("add impersonal participle %s of %s" %
                     (participle, lemma))

    sections[j] = secbody + sectail
    return "".join(sections), notes
コード例 #13
0
def process_page(page, index, parsed):
  # Convert a Latin non-lemma page whose Latin section consists of a single
  # ===Verb=== subsection holding one {{la-verb-form}} and one
  # {{inflection of}} into a full participle entry (forms in -us) or a
  # gerund + participle-form entry (forms in -um), replacing the whole
  # section body. Returns (newtext, comment) for blib.do_edit, or
  # (None, None) to skip the page.
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  origtext = text

  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None

  sections, j, secbody, sectail, has_non_latin = retval

  # Capturing split keeps the headers: [preamble, header, body, ...].
  # A single-subsection Latin section yields exactly 3 pieces.
  subsections = re.split("(^==.*==\n)", secbody, 0, re.M)

  if len(subsections) != 3:
    pagemsg("WARNING: Not right # of sections (expected 1): %s" %
        ",".join(subsections[k].strip() for k in xrange(1, len(subsections), 2)))
    return None, None

  if subsections[1] != "===Verb===\n":
    pagemsg("WARNING: Expected ===Verb=== in subsections[1] but saw %s" %
        subsections[1].strip())
    return None, None

  parsed = blib.parse_text(subsections[2])
  infl = None      # headword form from {{la-verb-form}} arg 1
  lemma = None     # lemma from {{inflection of}}
  infloft = None   # the {{inflection of}} template object itself
  for t in parsed.filter_templates():
    if tname(t) == "la-verb-form":
      if infl:
        pagemsg("WARNING: Saw more than one {{la-verb-form}} call: %s" %
            unicode(t))
        return None, None
      infl = getparam(t, "1")
    elif tname(t) == "inflection of":
      if lemma:
        pagemsg("WARNING: Saw more than one {{inflection of}} call: %s" %
            unicode(t))
        return None, None
      # Old-style calls put the language in lang= and the lemma in 1=;
      # new-style calls put the language in 1= and the lemma in 2=.
      if getparam(t, "lang"):
        lemma = getparam(t, "1")
      else:
        lemma = getparam(t, "2")
      infloft = t
    else:
      # Any other template means we don't understand the page; bail out.
      pagemsg("WARNING: Saw unexpected template: %s" % unicode(t))
      return None, None
  if not infl or not lemma:
    pagemsg("WARNING: Didn't find both inflection %s and lemma %s" % (
      infl, lemma))
    return None, None
  # Strip a trailing auxiliary (esse/īrī) so e.g. periphrastic infinitive
  # headwords reduce to the bare participle.
  infl = re.sub(u" (esse|īrī)$", "", infl)
  if infl.endswith(u"us"):
    # Participle in -us: future active (-ūrus) or perfect.
    if infl.endswith(u"ūrus"):
      partdesc = "Future active participle"
      head_template = "{{la-future participle|%s}}" % infl[:-2]
      infl_template = "{{la-decl-1&2|%s}}" % infl[:-2]
    else:
      # Deponent verbs yield perfect *active* participles; detect via the
      # tags of the existing {{inflection of}} call.
      if "perf|act" in unicode(infloft):
        partdesc = "Perfect active participle"
      else:
        partdesc = "Perfect passive participle"
      head_template = "{{la-perfect participle|%s}}" % infl[:-2]
      infl_template = "{{la-decl-1&2|%s}}" % infl[:-2]
    sectext = """
===Etymology===
%s of {{m|la|%s}}.

===Pronunciation===
* {{la-IPA|%s}}

===Participle===
%s

# {{rfdef|la}}

====Declension====
%s""" % (partdesc, lemma, infl, head_template, infl_template)
    comment = "correct Latin form to participle"
  elif infl.endswith("um"):
    # Form in -um: treat as gerund plus the matching participle form.
    sectext = """
===Etymology===
From {{m|la|%s}}.

===Pronunciation===
* {{la-IPA|%s}}

===Gerund===
{{la-gerund|%s}}

# {{rfdef|la}}

====Declension====
{{la-decl-gerund|%s}}

===Participle===
{{la-part-form|%s}}

# {{inflection of|la|%s||acc|m|s|;|nom//acc//voc|n|s}}""" % (
    lemma, infl, infl[:-2], infl[:-2], infl, infl[:-2] + "us"
  )
    comment = "correct Latin form to gerund/participle form"
  else:
    pagemsg("WARNING: Unrecognized ending for participle/gerund %s" % infl)
    return None, None

  # Note: sectext replaces the entire section body, so anything that
  # preceded ===Verb=== (subsections[0]) is discarded.
  sections[j] = sectext + sectail
  return "".join(sections), comment
コード例 #14
0
def process_page(page, index):
    """Find the single (proper) noun headword template in a page's Latin
    section, expand the matching form-generator template, and dispatch
    process_form() on every generated non-lemma form page."""
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return

    sections, j, secbody, sectail, has_non_latin = retval

    parsed = blib.parse_text(secbody)
    noun_template = None
    proper_noun_template = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "la-noun":
            if noun_template:
                pagemsg(
                    "WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping"
                    % (unicode(noun_template), unicode(t)))
                return
            noun_template = t
        elif tn == "la-proper noun":
            if proper_noun_template:
                pagemsg(
                    "WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping"
                    % (unicode(proper_noun_template), unicode(t)))
                return
            proper_noun_template = t
    # Exactly one of the two headword kinds must be present.
    if noun_template and proper_noun_template:
        pagemsg(
            "WARNING: Saw both noun and proper noun, can't correct header/headword"
        )
        return
    if not noun_template and not proper_noun_template:
        pagemsg(
            "WARNING: Saw neither noun nor proper noun, can't correct header/headword"
        )
        return
    pos = "pn" if proper_noun_template else "n"
    headword_template = proper_noun_template or noun_template
    if getparam(headword_template, "indecl"):
        pagemsg("Noun is indeclinable, skipping: %s" %
                unicode(headword_template))
        return
    # Clone the headword template and rewrite it into a form-generator call,
    # dropping params the generator doesn't accept.
    generate_template = blib.parse_text(
        unicode(headword_template)).filter_templates()[0]
    blib.set_template_name(generate_template, "la-generate-noun-forms")
    for chain in ("lemma", "m", "f", "g"):
        blib.remove_param_chain(generate_template, chain, chain)
    for pname in ("type", "indecl", "id", "pos"):
        rmparam(generate_template, pname)
    result = expand_text(unicode(generate_template))
    if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        return
    tempargs = blib.split_generate_args(result)
    # Deduplicate by macronless spelling and skip the lemma page itself,
    # as well as forms containing links or multiple variants.
    seen_macronless = set()
    slots_and_forms_to_process = []
    for slot, formarg in tempargs.iteritems():
        for form in formarg.split(","):
            if "[" in form or "|" in form:
                continue
            macronless = lalib.remove_macrons(form)
            if macronless == pagetitle or macronless in seen_macronless:
                continue
            seen_macronless.add(macronless)
            slots_and_forms_to_process.append((slot, form))
    ordered = sorted(slots_and_forms_to_process,
                     key=lambda pair: lalib.remove_macrons(pair[1]))
    for index, (slot, form) in blib.iter_items(ordered):

        def handler(page, index, parsed):
            return process_form(page, index, slot, form, pos)

        blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
                     index,
                     handler,
                     save=args.save,
                     verbose=args.verbose,
                     diff=args.diff)
コード例 #15
0
def process_form(page, index, slot, form, pos):
    """Flip a Latin non-lemma page between noun form and proper noun form.

    pos="pn" converts noun-form headers/headwords to proper-noun-form;
    pos="n" converts in the opposite direction. Returns (newtext, notes),
    or (None, None) if the page is missing or has no Latin section."""

    def pagemsg(txt):
        msg("Page %s %s %s: %s" % (index, slot, form, txt))

    notes = []

    pagemsg("Processing")

    if not page.exists():
        pagemsg("Skipping form value %s, page doesn't exist" % form)
        return None, None

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    # Conversion table keyed by target POS: (header, headword template,
    # {{head}} POS, lemma POS) pairs, source first then destination.
    conversions = {
        "pn": ("==Noun==", "==Proper noun==",
               "la-noun-form", "la-proper noun-form",
               "noun form", "proper noun form",
               "noun", "proper noun"),
        "n": ("==Proper noun==", "==Noun==",
              "la-proper noun-form", "la-noun-form",
              "proper noun form", "noun form",
              "proper noun", "noun"),
    }
    if pos not in conversions:
        raise ValueError("Unrecognized POS %s" % pos)
    (from_header, to_header, from_headword_template, to_headword_template,
     from_pos, to_pos, from_lemma_pos, to_lemma_pos) = conversions[pos]

    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
    for k in xrange(2, len(subsections), 2):
        headword_re = r"\{\{%s([|}])" % from_headword_template
        head_re = r"\{\{head\|la\|%s([|}])" % from_pos
        if not (re.search(headword_re, subsections[k])
                or re.search(head_re, subsections[k])):
            continue
        newsubsec = re.sub(headword_re, r"{{%s\1" % to_headword_template,
                           subsections[k])
        newsubsec = re.sub(head_re, r"{{head|la|%s\1" % to_pos, newsubsec)
        newheadersubsec = subsections[k - 1].replace(from_header, to_header)
        if (newsubsec != subsections[k]
                or newheadersubsec != subsections[k - 1]):
            notes.append("non-lemma %s -> %s in header and headword" %
                         (from_lemma_pos, to_lemma_pos))
        subsections[k] = newsubsec
        subsections[k - 1] = newheadersubsec

    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    text = "".join(sections)
    return text, notes
コード例 #16
0
def process_page(page, index, parsed):
    """Convert old-style {{la-verb}} headword templates to the new style.

    For each Latin subsection containing both a {{la-verb}} headword and a
    {{la-conj}} conjugation template, verify that the two agree (lemma,
    perfect, supine/future participle, conjugation class, subtype pattern),
    then replace the headword's parameters with those of the conjugation
    template, keeping any headword-only extra parameters.

    Returns (newtext, notes) for blib.do_edit, or (None, None) if no Latin
    section is found.

    Fix: the "unrecognized conj_subtype" warning used a format string with
    one %s but a two-element argument tuple, raising TypeError whenever
    that path was hit; the message now matches the parallel conj_type
    warning."""
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    notes = []

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)

    saw_a_template = False

    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        la_verb_template = None
        la_conj_template = None
        must_continue = False
        # Locate exactly one headword template and one conjugation template
        # in this subsection; bail on duplicates.
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "la-conj":
                if la_conj_template:
                    pagemsg(
                        "WARNING: Saw multiple verb conjugation templates in subsection, %s and %s, skipping"
                        % (unicode(la_conj_template), unicode(t)))
                    must_continue = True
                    break
                la_conj_template = t
                saw_a_template = True
            if tn == "la-verb":
                if la_verb_template:
                    pagemsg(
                        "WARNING: Saw multiple verb headword templates in subsection, %s and %s, skipping"
                        % (unicode(la_verb_template), unicode(t)))
                    must_continue = True
                    break
                la_verb_template = t
                saw_a_template = True
        if must_continue:
            continue
        if not la_verb_template and not la_conj_template:
            continue
        if la_verb_template and not la_conj_template:
            pagemsg(
                "WARNING: Saw verb headword template but no conjugation template: %s"
                % unicode(la_verb_template))
            continue
        if la_conj_template and not la_verb_template:
            pagemsg(
                "WARNING: Saw verb conjugation template but no headword template: %s"
                % unicode(la_conj_template))
            continue

        orig_la_verb_template = unicode(la_verb_template)
        # New-style headwords have 1=irreg or 1=<conjugation number> with
        # optional "+"/"." suffixes; nothing to convert in that case.
        if re.search(r"^(irreg|[0-9]\+*)(\..*)?$",
                     getparam(la_verb_template, "1")):
            pagemsg("Found new-style verb headword template, skipping: %s" %
                    orig_la_verb_template)
            continue

        def render_headword_and_conj():
            # Used in warnings; shown twice so a postprocessing script can
            # pick out the <from>/<to> spans.
            return "headword template <from> %s <to> %s <end>, conjugation template <from> %s <to> %s <end>" % (
                orig_la_verb_template, orig_la_verb_template,
                unicode(la_conj_template), unicode(la_conj_template))

        verb_props = new_generate_verb_forms(unicode(la_conj_template),
                                             errandpagemsg,
                                             expand_text,
                                             include_props=True)
        if verb_props is None:
            continue
        subtypes = [
            x.replace("-", "") for x in safe_split(verb_props["subtypes"], ".")
        ]
        conj_type = verb_props["conj_type"]
        conj_subtype = verb_props.get("conj_subtype", None)

        def compare_headword_conj_forms(id_slot,
                                        headword_forms,
                                        conj_slots,
                                        adjust_for_missing_perf_forms=False,
                                        remove_conj_links=False):
            # Check that the headword's forms for id_slot agree (up to
            # n/f-lengthening normalization) with the conjugation's forms
            # from the first matching slot in conj_slots. Returns False
            # (after logging a warning) on disagreement.
            conj_forms = ""
            for slot in conj_slots:
                if slot in verb_props:
                    conj_forms = verb_props[slot]
                    break
            conj_forms = safe_split(conj_forms, ",")
            if remove_conj_links:
                conj_forms = [blib.remove_links(x) for x in conj_forms]
            corrected_headword_forms = [
                lengthen_ns_nf(x) for x in headword_forms
            ]
            corrected_conj_forms = [lengthen_ns_nf(x) for x in conj_forms]
            if adjust_for_missing_perf_forms:
                # There are several instances of 4++ verbs where only the -īvī variant,
                # not the -iī variant, is listed in the headword. Don't get tripped up
                # by that.
                ivi_conj_forms = [
                    x for x in corrected_conj_forms if x.endswith(u"īvī")
                ]
                for ivi_conj_form in ivi_conj_forms:
                    ii_conj_form = re.sub(u"īvī$", u"iī", ivi_conj_form)
                    if ii_conj_form in corrected_conj_forms and ii_conj_form not in corrected_headword_forms:
                        corrected_headword_forms.append(ii_conj_form)
            if set(corrected_headword_forms) != set(corrected_conj_forms):
                macronless_headword_forms = set(
                    lalib.remove_macrons(x) for x in corrected_headword_forms)
                macronless_conj_forms = set(
                    lalib.remove_macrons(x) for x in corrected_conj_forms)
                if macronless_headword_forms == macronless_conj_forms:
                    pagemsg(
                        "WARNING: Headword %s=%s different from conj %s=%s in macrons only, skipping: %s"
                        % (id_slot, ",".join(headword_forms), id_slot,
                           ",".join(conj_forms), render_headword_and_conj()))
                else:
                    pagemsg(
                        "WARNING: Headword %s=%s different from conj %s=%s in more than just macrons, skipping: %s"
                        % (id_slot, ",".join(headword_forms), id_slot,
                           ",".join(conj_forms), render_headword_and_conj()))
                return False
            return True

        verb_conj = getparam(la_verb_template, "conj") or getparam(
            la_verb_template, "c")
        pattern = getparam(la_verb_template, "pattern")
        lemma = blib.fetch_param_chain(la_verb_template,
                                       ["1", "head", "head1"], "head")
        inf = blib.fetch_param_chain(la_verb_template, ["2", "inf", "inf1"],
                                     "inf")
        perf = blib.fetch_param_chain(la_verb_template, ["3", "perf", "perf1"],
                                      "perf")
        sup = blib.fetch_param_chain(la_verb_template, ["4", "sup", "sup1"],
                                     "sup")
        # Hack to handle cases like abeō where the headword normally lists perfect
        # abiī but the conj lists abiī, abīvī.
        if verb_conj == "irreg" and len(lemma) > 0 and lemma[0].endswith(
                u"eō"):
            ivi = re.sub(u"eō$", u"īvī", lemma[0])
            if ivi not in perf:
                perf.append(ivi)
        if not compare_headword_conj_forms("lemma", lemma, [
                "1s_pres_actv_indc", "3s_pres_actv_indc", "1s_perf_actv_indc",
                "3s_perf_actv_indc"
        ]):
            continue
        if "depon" in subtypes or "semidepon" in subtypes:
            if sup:
                pagemsg(
                    "WARNING: Saw supine in conjunction with deponent verb, skipping: %s"
                    % render_headword_and_conj())
                continue
            # Deponent headwords list the perfect (e.g. "secūtus sum");
            # derive the supine stem from it.
            sup = [re.sub("[sm]( (sum|est))?$", "m", x) for x in perf]
        else:
            if not compare_headword_conj_forms(
                    "perfect",
                    perf,
                ["1s_perf_actv_indc", "3s_perf_actv_indc"],
                    adjust_for_missing_perf_forms=True,
                    # Remove links from perfect to handle cases like adsoleō where the
                    # perfect is adsoluī,[[adsolitus]] [[sum]] and the headword says
                    # adsoluī,adsolitus sum.
                    remove_conj_links=True):
                continue
        if len(sup) > 0 and sup[0].endswith(u"ūrus"):
            # Slot 4 actually holds a future active participle, not a supine.
            if not compare_headword_conj_forms("future participle", sup,
                                               ["futr_actv_ptc"]):
                continue
            if "supfutractvonly" not in subtypes:
                if len(lemma) > 0 and lemma[0].endswith("sum"):
                    pass
                else:
                    pagemsg(
                        "WARNING: Expected supfutractvonly in subtypes=%s, skipping: %s"
                        % (".".join(
                            sorted(subtypes)), render_headword_and_conj()))
                    continue
        else:
            if not compare_headword_conj_forms("supine", sup, ["sup_acc"]):
                continue
        if not verb_conj:
            pagemsg("WARNING: No conj in headword template: %s" %
                    render_headword_and_conj())
        else:
            conj_type_to_verb_conj = {
                "1st": "1",
                "2nd": "2",
                "3rd": "3",
                "3rd-io": "io",
                "4th": "4",
                "irreg": "irreg",
            }
            if conj_type not in conj_type_to_verb_conj:
                pagemsg(
                    "WARNING: Something wrong, saw unrecognized conj_type=%s: %s"
                    % (conj_type, render_headword_and_conj()))
                continue
            conj_type = conj_type_to_verb_conj[conj_type]
            if conj_subtype:
                if conj_subtype not in conj_type_to_verb_conj:
                    # BUGFIX: the format string previously had only one %s
                    # for a two-element tuple, which raised TypeError.
                    pagemsg(
                        "WARNING: Something wrong, saw unrecognized conj_subtype=%s: %s"
                        % (conj_subtype, render_headword_and_conj()))
                    continue
                conj_subtype = conj_type_to_verb_conj[conj_subtype]
            if verb_conj != conj_type and verb_conj != conj_subtype:
                pagemsg(
                    "WARNING: Conjugation template has conj=%s, subconj=%s but headword template has conj=%s, skipping: %s"
                    % (conj_type, conj_subtype, verb_conj,
                       render_headword_and_conj()))
                continue
        # Normalize hyphenated pattern names to the dotted-subtype spelling.
        pattern = pattern.replace("opt-semi-depon", "optsemidepon")
        pattern = pattern.replace("semi-depon", "semidepon")
        pattern = pattern.replace("pass-3only", "pass3only")
        pattern = pattern.replace("pass-impers", "passimpers")
        pattern = pattern.replace("no-actv-perf", "noactvperf")
        pattern = pattern.replace("no-pasv-perf", "nopasvperf")
        pattern = pattern.replace("perf-as-pres", "perfaspres")
        pattern = pattern.replace("short-imp", "shortimp")
        pattern = pattern.replace("sup-futr-actv-only", "supfutractvonly")
        pattern = safe_split(pattern, "-")
        # Drop subtypes that are implicit or expressed elsewhere before
        # comparing pattern (headword) against subtypes (conjugation).
        pattern = [
            x for x in pattern if x not in
            ["noperf", "nosup", "irreg", "def", "facio", "shortimp", "depon"]
        ]
        subtypes = [
            x for x in subtypes
            if x not in ["I", "noperf", "nosup", "irreg", "depon"]
        ]
        if len(lemma) > 0 and lemma[0].endswith("sum"):
            # This is added automatically by [[sum]]
            subtypes = [x for x in subtypes if x != "supfutractvonly"]
        if set(pattern) != set(subtypes):
            if set(subtypes) >= set(pattern) and (
                    set(subtypes) - set(pattern) <= {
                        "nopass", "p3inf", "poetsyncperf", "optsyncperf",
                        "alwayssyncperf"
                    }):
                pagemsg(
                    "Subtypes=%s of conjugation template have extra, ignorable subtypes %s compared with pattern=%s of headword template: %s"
                    % (".".join(sorted(subtypes)), ".".join(
                        sorted(list(set(subtypes) - set(pattern)))), ".".join(
                            sorted(pattern)), render_headword_and_conj()))
            else:
                pagemsg(
                    "WARNING: Conjugation template has subtypes=%s but headword template has pattern=%s, skipping: %s"
                    % (".".join(sorted(subtypes)), ".".join(
                        sorted(pattern)), render_headword_and_conj()))
                continue

        # Fetch remaining params from headword template
        headword_params = []
        for param in la_verb_template.params:
            pname = unicode(param.name)
            if pname.strip() in [
                    "1", "2", "3", "4", "44", "conj", "c", "pattern"
            ] or re.search("^(head|inf|perf|sup)[0-9]*$", pname.strip()):
                continue
            headword_params.append((pname, param.value, param.showkey))
        # Erase all params
        del la_verb_template.params[:]
        # Copy params from conj template
        for param in la_conj_template.params:
            pname = unicode(param.name)
            la_verb_template.add(pname,
                                 param.value,
                                 showkey=param.showkey,
                                 preserve_spacing=False)
        # Copy remaining params from headword template
        for name, value, showkey in headword_params:
            la_verb_template.add(name,
                                 value,
                                 showkey=showkey,
                                 preserve_spacing=False)
        pagemsg("Replaced %s with %s" %
                (orig_la_verb_template, unicode(la_verb_template)))
        notes.append("convert {{la-verb}} params to new style")
        subsections[k] = unicode(parsed)

    if not saw_a_template:
        pagemsg("WARNING: Saw no verb headword or conjugation templates")

    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
コード例 #17
0
def delete_form_1(page, index, lemma, formind, formval, pos,
                  tag_sets_to_delete, preserve_diaeresis):
    notes = []

    tag_sets_to_delete = True if tag_sets_to_delete is True else (
        sorted(tag_sets_to_delete))
    frozenset_tag_sets_to_delete = True if tag_sets_to_delete is True else set(
        frozenset(tag_set) for tag_set in tag_sets_to_delete)

    def pagemsg(txt):
        msg("Page %s %s: form %s %s: %s" %
            (index, lemma, formind, formval, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: form %s %s: %s" %
                  (index, lemma, formind, formval, txt))

    if pos == "verbform":
        expected_head_template = "la-verb-form"
        expected_header_pos = "Verb"
        expected_head_pos = "verb form"
    elif pos == "nounform":
        expected_head_template = "la-noun-form"
        expected_header_pos = "Noun"
        expected_head_pos = "noun form"
    elif pos == "adjform":
        expected_head_template = "la-adj-form"
        expected_header_pos = "Adjective"
        expected_head_pos = "adjective form"
    elif pos == "partform":
        expected_head_template = "la-part-form"
        expected_header_pos = "Participle"
        expected_head_pos = "participle form"
    elif pos == "numform":
        expected_head_template = "la-num-form"
        expected_header_pos = "Numeral"
        expected_head_pos = "numeral form"
    else:
        raise ValueError("Unrecognized part of speech %s" % pos)

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    # FIXME!

    #if "==Etymology 1==" in secbody:
    #  etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
    #  for k in xrange(2, len(etym_sections), 2):
    #    etym_sections[k] = fix_up_section(etym_sections[k], warn_on_multiple_heads=True)
    #  secbody = "".join(etym_sections)

    subsections_to_delete = []
    subsections_to_remove_inflections_from = []

    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        saw_head = False
        saw_infl = False
        saw_other_infl = False
        remove_deletable_tag_sets_from_subsection = False
        saw_bad_template = False
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == expected_head_template:
                saw_head = True
            elif tn == "head" and getparam(t, "1") == "la" and getparam(
                    t, "2") == expected_head_pos:
                saw_head = True
            elif tn == "inflection of":
                lang = getparam(t, "lang")
                if lang:
                    lemma_param = 1
                else:
                    lang = getparam(t, "1")
                    lemma_param = 2
                if lang != "la":
                    errandpagemsg(
                        "WARNING: In Latin section, found {{inflection of}} for different language %s: %s"
                        % (lang, unicode(t)))
                    return None, None
                actual_lemma = getparam(t, str(lemma_param))
                # Allow mismatch in macrons, which often happens, e.g. because
                # a macron was added to the lemma page but not to the inflections
                if remove_macrons(actual_lemma,
                                  preserve_diaeresis) == remove_macrons(
                                      lemma, preserve_diaeresis):
                    # fetch tags
                    tags = []
                    for param in t.params:
                        pname = unicode(param.name).strip()
                        pval = unicode(param.value).strip()
                        if re.search("^[0-9]+$", pname):
                            if int(pname) >= lemma_param + 2:
                                if pval:
                                    tags.append(pval)
                    for tag in tags:
                        if "//" in tag:
                            pagemsg(
                                "WARNING: Don't know how to handle multipart tags yet: %s"
                                % unicode(t))
                            saw_other_infl = True
                            break
                    else:
                        # no break
                        tag_sets = lalib.split_tags_into_tag_sets(tags)
                        for tag_set in tag_sets:
                            if tag_sets_to_delete is True or frozenset(
                                    lalib.canonicalize_tag_set(tag_set)
                            ) in frozenset_tag_sets_to_delete:
                                saw_infl = True
                            else:
                                pagemsg(
                                    "Found {{inflection of}} for correct lemma but wrong tag set %s, expected one of %s: %s"
                                    % ("|".join(tag_set), ",".join(
                                        "|".join(x)
                                        for x in tag_sets_to_delete),
                                       unicode(t)))
                                saw_other_infl = True
                else:
                    pagemsg(
                        "Found {{inflection of}} for different lemma %s: %s" %
                        (actual_lemma, unicode(t)))
                    saw_other_infl = True
        if saw_head and saw_infl:
            if saw_other_infl:
                pagemsg(
                    "Found subsection #%s to delete but has inflection-of template for different lemma or nondeletable tag set, will remove only deletable tag sets"
                    % (k // 2))
                remove_deletable_tag_sets_from_subsection = True
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn not in [
                        expected_head_template, "inflection of"
                ] and not (tn == "head" and getparam(t, "1") == "la"
                           and getparam(t, "2") == expected_head_pos):
                    pagemsg(
                        "WARNING: Saw unrecognized template in otherwise deletable subsection #%s: %s"
                        % (k // 2, unicode(t)))
                    saw_bad_template = True
                    break
            else:
                # No break
                if "===%s===" % expected_header_pos in subsections[k - 1]:
                    if remove_deletable_tag_sets_from_subsection:
                        subsections_to_remove_inflections_from.append(k)
                    else:
                        subsections_to_delete.append(k)
                else:
                    pagemsg(
                        "WARNING: Wrong header in otherwise deletable subsection #%s: %s"
                        % (k // 2, subsections[k - 1].strip()))

    if not subsections_to_delete and not subsections_to_remove_inflections_from:
        pagemsg(
            "Found Latin section but no deletable or excisable subsections")
        return None, None

    #### Now, we can delete an inflection, a subsection or the whole section or page

    for k in subsections_to_remove_inflections_from:
        newsubsec = subsections[k]
        if not newsubsec.endswith("\n"):
            # This applies to the last subsection on the page
            newsubsec += "\n"

        def remove_inflections(m):
            parsed = blib.parse_text(m.group(0))
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn == "inflection of":
                    lang = getparam(t, "lang")
                    if lang:
                        lemma_param = 1
                    else:
                        lang = getparam(t, "1")
                        lemma_param = 2
                    assert lang == "la"
                    actual_lemma = getparam(t, str(lemma_param))
                    # Allow mismatch in macrons, which often happens, e.g. because
                    # a macron was added to the lemma page but not to the inflections
                    if remove_macrons(actual_lemma,
                                      preserve_diaeresis) == remove_macrons(
                                          lemma, preserve_diaeresis):
                        tr = getparam(t, "tr")
                        alt = getparam(t, "alt") or getparam(
                            t, str(lemma_param + 1))
                        # fetch tags
                        tags = []
                        params = []
                        for param in t.params:
                            pname = unicode(param.name).strip()
                            pval = unicode(param.value).strip()
                            if re.search("^[0-9]+$", pname):
                                if int(pname) >= lemma_param + 2:
                                    if pval:
                                        tags.append(pval)
                            elif pname not in ["lang", "tr", "alt"]:
                                params.append((pname, pval, param.showkey))
                        tag_sets = lalib.split_tags_into_tag_sets(tags)
                        filtered_tag_sets = []
                        for tag_set in tag_sets:
                            if tag_sets_to_delete is not True and frozenset(
                                    lalib.canonicalize_tag_set(tag_set)
                            ) not in frozenset_tag_sets_to_delete:
                                filtered_tag_sets.append(tag_set)
                        if not filtered_tag_sets:
                            return ""

                        # Erase all params.
                        del t.params[:]
                        # Put back new params.
                        t.add("1", lang)
                        t.add("2", actual_lemma)
                        if tr:
                            t.add("tr", tr)
                        t.add("3", alt)
                        next_tag_param = 4
                        for tag in lalib.combine_tag_set_group(
                                filtered_tag_sets):
                            t.add(str(next_tag_param), tag)
                            next_tag_param += 1
            return unicode(parsed)

        newnewsubsec = re.sub(r"^# \{\{inflection of\|[^{}\n]*\}\}\n",
                              remove_inflections, newsubsec, 0, re.M)
        if newnewsubsec != newsubsec:
            notes.append("removed inflection(s) for bad Latin form(s)")
            subsections[k] = newnewsubsec

    for k in reversed(subsections_to_delete):
        # Do in reverse order so indices don't change
        del subsections[k]
        del subsections[k - 1]

    if len(subsections) == 1 or len(subsections) == 3 and re.search(
            "^==+References==+$", subsections[1].strip()):
        # Whole section deletable
        if subsections[0].strip():
            pagemsg(
                "WARNING: Whole Latin section deletable except that there's text above all subsections: <%s>"
                % subsections[0].strip())
            return None, None
        if "[[Category:" in sectail:
            pagemsg(
                "WARNING: Whole Latin section deletable except that there's a category at the end: <%s>"
                % sectail.strip())
            return None, None
        if not has_non_latin:
            # Can delete the whole page, but check for non-blank section 0
            cleaned_sec0 = re.sub("^\{\{also\|.*?\}\}\n", "", sections[0])
            if cleaned_sec0.strip():
                pagemsg(
                    "WARNING: Whole page deletable except that there's text above all sections: <%s>"
                    % cleaned_sec0.strip())
                return None, None
            pagetitle = unicode(page.title())
            pagemsg("Page %s should be deleted" % pagetitle)
            pages_to_delete.append(pagetitle)
            return None, None
        del sections[j]
        del sections[j - 1]
        notes.append(
            "excised %s subsection%s for bad Latin forms, leaving no Latin section"
            % (len(subsections_to_delete),
               "" if len(subsections_to_delete) == 1 else "s"))
        if j > len(sections):
            # We deleted the last section, remove the separator at the end of the
            # previous section.
            sections[-1] = re.sub(r"\n+--+\n*\Z", "", sections[-1])
        text = "".join(sections)

    else:
        # Some but not all subsections remain
        secbody = "".join(subsections)
        sections[j] = secbody + sectail
        if subsections_to_delete and subsections_to_remove_inflections_from:
            deletable_subsec_text = "Subsection(s) %s deletable and subsection(s) %s excisable" % (
                ",".join(str(k // 2) for k in subsections_to_delete), ",".join(
                    str(k // 2)
                    for k in subsections_to_remove_inflections_from))
            deletable_subsec_note_text = "deleted %s subsection%s and partly excised %s subsection%s" % (
                len(subsections_to_delete), "" if len(subsections_to_delete)
                == 1 else "s", len(subsections_to_remove_inflections_from), ""
                if len(subsections_to_remove_inflections_from) == 1 else "s")
        elif subsections_to_delete:
            deletable_subsec_text = "Subsection(s) %s deletable" % (",".join(
                str(k // 2) for k in subsections_to_delete))
            deletable_subsec_note_text = "deleted %s subsection%s" % (
                len(subsections_to_delete),
                "" if len(subsections_to_delete) == 1 else "s")
        else:
            deletable_subsec_text = "Subsection(s) %s excisable" % (",".join(
                str(k // 2) for k in subsections_to_remove_inflections_from))
            deletable_subsec_note_text = "partly excised %s subsection%s" % (
                len(subsections_to_remove_inflections_from), ""
                if len(subsections_to_remove_inflections_from) == 1 else "s")

        if "==Etymology" in sections[j]:
            pagemsg(
                "WARNING: %s but found Etymology subsection, don't know how to handle"
                % deletable_subsec_text)
            return None, None
        if "==Pronunciation" in sections[j]:
            pagemsg(
                "WARNING: %s but found Pronunciation subsection, don't know how to handle"
                % deletable_subsec_text)
            return None, None

        notes.append(
            "%s for bad Latin forms, leaving some subsections remaining" %
            deletable_subsec_note_text)
        text = "".join(sections)

    return text, notes
コード例 #18
0
def process_page(page, index, headword_template, decl_template):
    """Insert a Declension subsection containing decl_template right after
    the subsection that holds headword_template, in the page's Latin entry.

    Skips the page (returning (None, None)) when there are already at least
    as many declension templates, or Declension/Inflection headers, as noun
    headword templates, or when the headword template cannot be located.
    On success returns (newtext, notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    notes = []

    # Tally noun headword templates against existing declension templates so
    # we don't add a declension where one already exists.
    parsed = blib.parse_text(secbody)
    headword_count = 0
    ndecl_count = 0
    adecl_count = 0
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["la-noun", "la-proper noun"]:
            headword_count += 1
        elif tn == "la-ndecl":
            ndecl_count += 1
        elif tn == "la-adecl":
            adecl_count += 1
        # FIXME, also add something for manually-specified declensions (synaeresis?)
    if "\n===Declension===\n" in secbody:
        pagemsg("WARNING: Saw misindented Declension header")
    if adecl_count >= 1:
        pagemsg("WARNING: Saw {{la-adecl}} in noun section")
    if ndecl_count + adecl_count >= headword_count:
        pagemsg(
            "WARNING: Already seen %s decl template(s) >= %s headword template(s), skipping"
            % (ndecl_count + adecl_count, headword_count))
        return None, None

    # Odd indices of the split are the ==...== headers themselves.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

    decl_header_count = sum(
        1 for k in xrange(1, len(subsections), 2)
        if "Declension" in subsections[k] or "Inflection" in subsections[k])
    if decl_header_count >= headword_count:
        pagemsg(
            "WARNING: Already seen %s Declension/Inflection header(s) >= %s headword template(s), skipping"
            % (decl_header_count, headword_count))
        return None, None

    inserted = False
    for k in xrange(2, len(subsections), 2):
        if headword_template not in subsections[k]:
            continue
        pagemsg("Inserting declension section after subsection %s" % k)
        subsections[k] = subsections[k].rstrip('\n') + "\n\n"
        # New header sits one level deeper than the header introducing the
        # subsection that holds the headword template.
        equal_signs = "=" * (
            len(re.sub("^(=+).*", r"\1", subsections[k - 1].strip())) + 1)
        subsections.insert(
            k + 1,
            "%sDeclension%s\n%s\n\n" % (equal_signs, equal_signs,
                                        decl_template))
        notes.append("add section for Latin declension %s" % decl_template)
        inserted = True
        break
    if not inserted:
        pagemsg("WARNING: Couldn't locate headword template, skipping: %s" %
                headword_template)
        return None, None
    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    text = "".join(sections)
    # Collapse runs of 3+ newlines left over from the edit.
    text = re.sub("\n\n\n+", "\n\n", text)
    if not notes:
        notes.append("convert 3+ newlines to 2")
    return text, notes
コード例 #19
0
def process_page(page, index, parsed):
    """Fold manually-written comparative/superlative lines into {{la-adj}}.

    For each ===...=== subsection of the Latin section that contains exactly
    one {{la-adj}} headword template, look for a wikitext line of the form
    "comparative: FORM, superlative: FORM" (with flexible quoting), copy the
    two forms into the template's |comp= and |sup= parameters, and delete
    the manual line.

    Returns (newtext, notes), or (None, None) if no Latin section is found.
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    notes = []

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    # Odd indices of the split are the ===...=== headers, even indices the
    # subsection bodies.
    subsections = re.split("(^===[^=]*===\n)", secbody, 0, re.M)

    saw_a_template = False

    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        la_adj_template = None
        must_continue = False
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "la-adj":
                # Require exactly one headword template per subsection so we
                # know which template should receive comp=/sup=.
                if la_adj_template:
                    pagemsg(
                        "WARNING: Saw multiple adjective headword templates in subsection, %s and %s, skipping"
                        % (unicode(la_adj_template), unicode(t)))
                    must_continue = True
                    break
                la_adj_template = t
                saw_a_template = True
        if must_continue:
            continue
        if not la_adj_template:
            continue
        # Match e.g. "'''comparative''': {{l|la|X}}, '''superlative''': {{l|la|Y}}"
        # while tolerating varying amounts of bold/italic quoting.
        m = re.search(
            r"'*comparative'*: '*(.*?)'+,* *'*superlative'*: '*(.*?)'+",
            subsections[k])
        if m:
            comp, sup = m.groups()

            def parse_comp_sup(cs):
                # Extract the bare form from {{l|la|...}}/{{m|la|...}} or a
                # raw [[link]]/[[target|display]]; warn and return None when
                # the format isn't recognized.
                m = re.search(r"^\{\{[lm]\|la\|(.*?)\}\}$", cs)
                if m:
                    return m.group(1)
                m = re.search(r"^\[\[.*?\|(.*?)\]\]$", cs)
                if m:
                    return m.group(1)
                m = re.search(r"^\[\[(.*?)\]\]$", cs)
                if m:
                    return m.group(1)
                pagemsg("WARNING: Can't parse comp/sup %s" % cs)
                return None

            comp = parse_comp_sup(comp)
            sup = parse_comp_sup(sup)
            if comp and sup:
                orig_la_adj_template = unicode(la_adj_template)
                la_adj_template.add("comp", comp)
                la_adj_template.add("sup", sup)
                pagemsg("Replaced %s with %s" %
                        (orig_la_adj_template, unicode(la_adj_template)))
                # NOTE(review): "superative" typo below is in the changelog
                # note recorded on-wiki; left byte-identical deliberately.
                notes.append(
                    "move comparative/superative to {{la-adj}} headword params"
                )
                # Re-serialize with the modified template, then strip the
                # now-redundant manual comparative/superlative line.
                subsections[k] = unicode(parsed)
                subsections[k] = re.sub(
                    r"\n+\* *'*comparative'*: '*(.*?)'+,* *'*superlative'*: '*(.*?)'+\n+",
                    "\n\n", subsections[k], 1)

    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
コード例 #20
0
def process_page(page, index, parsed):
    """Merge paired ===Pronunciation 1===/===Pronunciation 2=== sections.

    When the two pronunciation sections share the same part-of-speech header
    and the same set of lemmas, they are combined into a single
    ===Pronunciation=== section (each {{la-IPA}} gains |ann=1). When the POS
    headers differ and no Etymology structure is in the way, the sections
    are instead converted into ===Etymology 1=== / ===Etymology 2===.
    Operates either on the bare Latin section or within each numbered
    Etymology section.

    Returns (newtext, notes), or (None, None) if no Latin section is found.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    notes = []

    def process_etym_section(sectext, is_etym_section):
        # Process one chunk of wikitext (either the whole Latin section or a
        # single numbered Etymology section); returns the modified text, or
        # sectext unchanged when the layout doesn't match expectations.
        if "==Pronunciation 1==" not in sectext:
            pagemsg("No ==Pronunciation 1== in %s" %
                    ("etym section" if is_etym_section else "text"))
            return sectext

        # Headers sit one level deeper inside an etym section.
        if is_etym_section:
            equalsigns = "===="
        else:
            equalsigns = "==="
        # Odd indices are the ==...== headers, even indices the bodies.
        subsections = re.split("(^==.*==\n)", sectext, 0, re.M)

        if len(subsections) > 2 and subsections[1] == "===Etymology===\n":
            # Allow for an Etymology section at the beginning (many examples have one,
            # saying e.g. "Inflected form of {{m|la|pulchellus||beautiful little}}.".
            offset = 2
        else:
            offset = 0
        # Expect exactly: Pron 1, POS, Pron 2, POS (9 split pieces), with an
        # optional trailing References section (11 pieces).
        if not (len(subsections) == 9 + offset or
                (len(subsections) == 11 + offset
                 and subsections[9 + offset] == "===References===\n")):
            pagemsg(
                "WARNING: Not right # of sections (normally four, potentially five or six with ===Etymology=== and/or ===References===): %s"
                % (",".join(subsections[k].strip()
                            for k in xrange(1, len(subsections), 2))))
            return sectext
        if (subsections[1 + offset] != "%sPronunciation 1%s\n" %
            (equalsigns, equalsigns)
                or subsections[5 + offset] != "%sPronunciation 2%s\n" %
            (equalsigns, equalsigns)):
            pagemsg(
                "WARNING: Expected %sPronunciation N%s headers but saw %s and %s"
                % (equalsigns, equalsigns, subsections[1 + offset].strip(),
                   subsections[5 + offset].strip()))
            return sectext
        if subsections[3 + offset] != subsections[7 + offset]:
            # Different POS headers: the only fix is converting to numbered
            # Etymology sections, which requires we're not already inside one
            # and there's no leading ===Etymology=== section.
            if is_etym_section:
                pagemsg(
                    "WARNING: Already in etym section and saw different POS headers %s and %s, can't convert to etym sections"
                    % (subsections[3 + offset].strip(),
                       subsections[7 + offset].strip()))
                return sectext
            elif offset > 0:
                pagemsg(
                    "WARNING: Already have ===Etymology=== section and saw different POS headers %s and %s, can't convert to etym sections"
                    % (subsections[3 + offset].strip(),
                       subsections[7 + offset].strip()))
                return sectext
            else:
                pagemsg("Saw different POS headers %s and %s" %
                        (subsections[3 + offset].strip(),
                         subsections[7 + offset].strip()))
                # Turn each "Pronunciation N" header into an "Etymology N"
                # header with a nested Pronunciation subheader, dropping any
                # {{rfc-pron-n}} cleanup request in the first body.
                subsections[
                    1 +
                    offset] = "===Etymology 1===\n\n====Pronunciation====\n"
                subsections[2 + offset] = re.sub(r"^\{\{rfc-pron-n\|.*?\}\}\n",
                                                 "", subsections[2 + offset],
                                                 0, re.M)
                subsections[
                    5 +
                    offset] = "===Etymology 2===\n\n====Pronunciation====\n"
                notes.append(
                    "Combined ===Pronunciation 1=== and ===Pronunciation 2=== to ===Etymology 1=== and ===Etymology 2=== because different parts of speech/lemmas"
                )
                return "".join(subsections)

        else:

            def find_lemmas(text):
                # Collect lemmas from {{inflection of}} templates; the lemma
                # is in |1= with old-style lang=, else in |2=.
                lemmas = set()
                parsed = blib.parse_text(text)
                for t in parsed.filter_templates():
                    if tname(t) == "inflection of":
                        if getparam(t, "lang"):
                            lemmas.add(getparam(t, "1"))
                        else:
                            lemmas.add(getparam(t, "2"))
                return lemmas

            first_lemmas = find_lemmas(subsections[4 + offset])
            second_lemmas = find_lemmas(subsections[8 + offset])
            if first_lemmas != second_lemmas:
                pagemsg(
                    "WARNING: Different lemmas in two POS sections: %s and %s"
                    % (",".join(first_lemmas), ",".join(second_lemmas)))
                return sectext

            # For verbs with the infinitive in the second section, swap the
            # sections to put the infinitive first.
            if re.search(r"\|inf[|}]", subsections[8 + offset]):
                # Preserve the newlines at the end of each section; only swap the text.
                m = re.match(r"\A(.*?)(\n*)\Z", subsections[4 + offset], re.S)
                text4, newlines4 = m.groups()
                m = re.search(r"\A(.*?)(\n*)\Z", subsections[8 + offset], re.S)
                text8, newlines8 = m.groups()
                subsections[4 + offset] = text8 + newlines4
                subsections[8 + offset] = text4 + newlines8
                temptext = subsections[2 + offset]
                subsections[2 + offset] = subsections[6 + offset]
                subsections[6 + offset] = temptext
                notes.append("swap non-lemma sections to put infinitive first")

            # Merge: single Pronunciation header, demote both POS headers one
            # level, concatenate the two pronunciation bodies, annotate each
            # {{la-IPA}} with |ann=1, and drop the now-moot {{rfc-pron-n}}.
            subsections[1 + offset] = "%sPronunciation%s\n" % (equalsigns,
                                                               equalsigns)
            subsections[3 + offset] = re.sub(
                "^=+", equalsigns,
                re.sub("=+\n$", equalsigns + "\n", subsections[3 + offset]))
            subsections[7 + offset] = re.sub(
                "^=+", equalsigns,
                re.sub("=+\n$", equalsigns + "\n", subsections[7 + offset]))
            subsections[2 + offset] = subsections[2 + offset].strip(
            ) + "\n" + subsections[6 + offset].strip() + "\n\n"
            parsed = blib.parse_text(subsections[2 + offset])
            for t in parsed.filter_templates():
                if tname(t) == "la-IPA":
                    t.add("ann", "1")
            subsections[2 + offset] = unicode(parsed)
            subsections[2 + offset] = re.sub(r"^\{\{rfc-pron-n\|.*?\}\}\n", "",
                                             subsections[2 + offset], 0, re.M)
            # Delete the second pronunciation header and its (now-merged) body.
            del subsections[6 + offset]
            del subsections[5 + offset]
            notes.append(
                "combine %sPronunciation 1%s and %sPronunciation 2%s" %
                (equalsigns, equalsigns, equalsigns, equalsigns))
            return "".join(subsections)

    has_etym_1 = "==Etymology 1==" in secbody
    if not has_etym_1:
        secbody = process_etym_section(secbody, is_etym_section=False)
    else:
        # Process each numbered Etymology section independently.
        etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0,
                                 re.M)
        for k in xrange(2, len(etym_sections), 2):
            etym_sections[k] = process_etym_section(etym_sections[k],
                                                    is_etym_section=True)
        secbody = "".join(etym_sections)

    sections[j] = secbody + sectail
    return "".join(sections), notes
コード例 #21
0
def process_page(page, index, parsed):
    """Replace {{head|la|adverb...}} in comparative/superlative adverb
    entries with {{la-adv-comp}}/{{la-adv-sup}}.

    In each ==Adverb== subsection that contains exactly one of
    {{comparative of}} or {{superlative of}}, the positive degree is looked
    up via find_head_comp_sup() to recover macron-marked forms; the positive
    in the inflection-of template and the headword itself are then upgraded
    to the real (macron-bearing) forms where safe, and the generic {{head}}
    template is replaced by the dedicated headword template.

    Returns (newtext, notes), or (None, None) if no Latin section is found.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    notes = []

    # Odd indices of the split are the ===...=== headers, even the bodies.
    subsections = re.split("(^===[^=\n]*===\n)", secbody, 0, re.M)

    for k in xrange(2, len(subsections), 2):
        if "==Adverb==" in subsections[k - 1]:
            parsed = blib.parse_text(subsections[k])
            # posdeg: positive degree extracted from the inflection-of
            # template; compt/supt: the {{comparative of}}/{{superlative of}}
            # template objects (at most one each expected).
            posdeg = None
            compt = None
            supt = None
            for t in parsed.filter_templates():
                if tname(t) == "comparative of":
                    if compt:
                        pagemsg(
                            "WARNING: Saw multiple {{comparative of}}: %s and %s"
                            % (unicode(compt), unicode(t)))
                    else:
                        compt = t
                        posdeg = blib.remove_links(getparam(t, "1"))
                        if not posdeg:
                            pagemsg(
                                "WARNING: Didn't see positive degree in {{comparative of}}: %s"
                                % unicode(t))
                elif tname(t) == "superlative of":
                    if supt:
                        pagemsg(
                            "WARNING: Saw multiple {{superlative of}}: %s and %s"
                            % (unicode(supt), unicode(t)))
                    else:
                        supt = t
                        posdeg = blib.remove_links(getparam(t, "1"))
                        if not posdeg:
                            pagemsg(
                                "WARNING: Didn't see positive degree in {{superlative of}}: %s"
                                % unicode(t))
            # Need exactly one of comparative/superlative to know which
            # headword template to use.
            if compt and supt:
                pagemsg(
                    "WARNING: Saw both comparative and superlative, skipping: %s and %s"
                    % (unicode(compt), unicode(supt)))
                continue
            if not compt and not supt:
                pagemsg(
                    "WARNING: Didn't see {{comparative of}} or {{superlative of}} in section %s"
                    % k)
                continue
            for t in parsed.filter_templates():
                tn = tname(t)
                if tn in ["la-adv-comp", "la-adv-sup"]:
                    # Already converted; stop scanning this subsection.
                    pagemsg("Already saw fixed headword: %s" % unicode(t))
                    break
                if tn == "head":
                    if not getparam(t, "1") == "la":
                        pagemsg("WARNING: Saw wrong language in {{head}}: %s" %
                                unicode(t))
                    else:
                        pos = getparam(t, "2")
                        head = blib.remove_links(getparam(t,
                                                          "head")) or pagetitle
                        if pos not in [
                                "adverb",
                                "adverbs",
                                "adverb form",
                                "adverb forms",
                                "adverb comparative form",
                                "adverb comparative forms",
                                "adverb superlative form",
                                "adverb superlative forms",
                        ]:
                            pagemsg(
                                "WARNING: Unrecognized part of speech '%s': %s"
                                % (pos, unicode(t)))
                        else:
                            # Look up the canonical (macron-marked) positive,
                            # comparative and superlative for this adverb.
                            real_head, real_comp, real_sup = find_head_comp_sup(
                                lalib.remove_macrons(posdeg), pagemsg)
                            if real_head:
                                # Only substitute when the two forms differ at
                                # most in macrons.
                                if lalib.remove_macrons(
                                        real_head) != lalib.remove_macrons(
                                            posdeg):
                                    pagemsg(
                                        "WARNING: Can't replace positive degree %s with %s because they differ when macrons are removed"
                                        % (posdeg, real_head))
                                else:
                                    pagemsg(
                                        "Using real positive degree %s instead of %s"
                                        % (real_head, posdeg))
                                    inflt = compt or supt
                                    origt = unicode(inflt)
                                    inflt.add("1", real_head)
                                    pagemsg("Replaced %s with %s" %
                                            (origt, unicode(inflt)))
                            if compt:
                                newname = "la-adv-comp"
                                infldeg = "comparative"
                                # "-" means the form doesn't exist.
                                if real_comp and real_comp != "-":
                                    if lalib.remove_macrons(
                                            real_comp) != lalib.remove_macrons(
                                                head):
                                        pagemsg(
                                            "WARNING: Can't replace comparative degree %s with %s because they differ when macrons are removed"
                                            % (head, real_comp))
                                    else:
                                        pagemsg(
                                            "Using real comparative degree %s instead of %s"
                                            % (real_comp, head))
                                        head = real_comp
                                else:
                                    pagemsg(
                                        "WARNING: Couldn't retrieve real comparative for positive degree %s"
                                        % real_head)
                            else:
                                newname = "la-adv-sup"
                                infldeg = "superlative"
                                if real_sup and real_sup != "-":
                                    if lalib.remove_macrons(
                                            real_sup) != lalib.remove_macrons(
                                                head):
                                        pagemsg(
                                            "WARNING: Can't replace superlative degree %s with %s because they differ when macrons are removed"
                                            % (head, real_sup))
                                    else:
                                        pagemsg(
                                            "Using real superlative degree %s instead of %s"
                                            % (real_sup, head))
                                        head = real_sup
                                else:
                                    pagemsg(
                                        "WARNING: Couldn't retrieve real superlative for positive degree %s"
                                        % real_head)
                            # Rewrite {{head|la|...}} in place as the
                            # dedicated headword template.
                            origt = unicode(t)
                            rmparam(t, "head")
                            rmparam(t, "2")
                            rmparam(t, "1")
                            blib.set_template_name(t, newname)
                            t.add("1", head)
                            pagemsg("Replaced %s with %s" %
                                    (origt, unicode(t)))
                            notes.append(
                                "replace {{head|la|...}} with {{%s}} and fix up positive/%s"
                                % (newname, infldeg))

            subsections[k] = unicode(parsed)

    secbody = "".join(subsections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
コード例 #22
0
def process_page(page, index, parsed):
  """Replace manual Latin IPA pronunciations with {{la-IPA}}.

  Converts "{{a|Classical}} {{IPA...}}" lines (and bare "* {{IPA...|lang=la}}"
  lines) to {{la-IPA|HEAD}} using the single headword found in the section,
  and folds a manual "{{a|Ecclesiastical}} {{IPA...}}" line into |eccl=yes
  on the resulting {{la-IPA}}. Tries the whole Latin section first; if
  nothing changes and numbered Etymology sections exist, retries per section.

  Returns (newtext, notes), or (None, None) if no Latin section is found.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  origtext = text

  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None

  sections, j, secbody, sectail, has_non_latin = retval

  notes = []

  def fix_up_section(sectext, warn_on_multiple_heads):
    # Rewrite pronunciations in one chunk of wikitext; needs exactly one
    # headword in the chunk to know what to feed {{la-IPA}}.
    parsed = blib.parse_text(sectext)

    heads = set()
    pronun_templates = []
    for t in parsed.filter_templates():
      tn = tname(t)
      if lalib.la_template_is_head(t):
        heads |= set(blib.remove_links(x) for x in lalib.la_get_headword_from_template(t, pagetitle, pagemsg))
      elif tn == "la-IPA":
        pronun_templates.append(t)
    if len(heads) > 1:
      if warn_on_multiple_heads:
        pagemsg("WARNING: Found multiple possible heads, not modifying: %s" % ",".join(heads))
      return sectext
    if len(heads) == 0:
      pagemsg("WARNING: Found no possible heads, not modifying: %s" % ",".join(heads))
      return sectext
    # Replace both accent-qualified Classical lines and bare lang=la lines.
    newsectext = re.sub(r"\{\{a\|Classical\}\} \{\{IPA(char)?\|.*?\}\}", "{{la-IPA|%s}}" % list(heads)[0], sectext)
    newsectext = re.sub(r"^\* \{\{IPA(char)?\|.*?\|lang=la\}\}", "{{la-IPA|%s}}" % list(heads)[0], newsectext, 0, re.M)
    if newsectext != sectext:
      notes.append("replaced manual Latin pronun with {{la-IPA|%s}}" % list(heads)[0])
      sectext = newsectext
    # Recompute pronun templates as we may have added one.
    parsed = blib.parse_text(sectext)
    pronun_templates = []
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "la-IPA":
        pronun_templates.append(t)
    if "{{a|Ecclesiastical}} {{IPA" in sectext:
      if len(pronun_templates) == 0:
        pagemsg("WARNING: Found manual Ecclesiastical pronunciation but not {{la-IPA}} template")
      elif len(pronun_templates) > 1:
        pagemsg("WARNING: Found manual Ecclesiastical pronunciation and multiple {{la-IPA}} templates: %s" %
          ",".join(unicode(tt) for tt in pronun_templates))
      else:
        # Exactly one {{la-IPA}}: mark it Ecclesiastical and drop the manual line.
        origt = unicode(pronun_templates[0])
        pronun_templates[0].add("eccl", "yes")
        pagemsg("Replaced %s with %s" % (origt, unicode(pronun_templates[0])))
        newsectext = re.sub(r"^\* \{\{a\|Ecclesiastical\}\} \{\{IPA(char)?\|.*?\}\}\n", "",
            sectext, 0, re.M)
        if newsectext == sectext:
          pagemsg("WARNING: Unable to remove manual Ecclesiastical prounciation")
        else:
          notes.append("removed manual Ecclesiastical pronunciation and added |eccl=yes to {{la-IPA}}")
          sectext = newsectext
    return sectext

  # If there are multiple Etymology sections, the pronunciation may be above all of
  # them if all have the same pronunciation, else it will be within each section.
  # Cater to both situations. We first try without splitting on etym sections; if that
  # doesn't change anything, it may be because there were multiple heads found and
  # separate pronunciation sections, so we then try splitting on etym sections.
  has_etym_1 = "==Etymology 1==" in secbody
  newsecbody = fix_up_section(secbody, warn_on_multiple_heads=not has_etym_1)
  if newsecbody != secbody:
    secbody = newsecbody
  elif has_etym_1:
    etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
    for k in xrange(2, len(etym_sections), 2):
      etym_sections[k] = fix_up_section(etym_sections[k], warn_on_multiple_heads=True)
    secbody = "".join(etym_sections)

  sections[j] = secbody + sectail
  return "".join(sections), notes
コード例 #23
0
def delete_term(index, term, expected_head_templates, save, verbose):
    """Remove the Latin entry for a bad (bot-created) non-lemma term.

    The Latin section of page `term` is only removed when every template in
    it is either one of `expected_head_templates` or a known-harmless
    template ({{inflection of}}, {{rfdef}}, {{la-IPA}}).  If Latin is the
    only language on the page, the whole page is queued in the module-level
    `pages_to_delete` list instead of being edited.

    Returns (new_text, notes) when the section was removed; otherwise
    returns None (implicitly) after logging why nothing was done.

    `save` and `verbose` are accepted for interface compatibility with the
    other delete_* functions but are not used here.
    """
    notes = []

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, term, txt))

    page = pywikibot.Page(site, term)
    if not page.exists():
        pagemsg("Skipping form value %s, page doesn't exist" % term)
        return

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return

    sections, j, secbody, sectail, has_non_latin = retval

    # Split into alternating chunks: subsections[0] is any text before the
    # first header, odd indices are the "==...==" headers themselves, and
    # even indices >= 2 are the corresponding subsection bodies.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
    saw_head = False
    saw_bad_template = False
    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn in expected_head_templates:
                saw_head = True
            elif tn in ["inflection of", "rfdef", "la-IPA"]:
                # Known-harmless templates; these don't block deletion.
                pass
            else:
                pagemsg(
                    "WARNING: Saw unrecognized template in subsection #%s %s: %s"
                    % (k // 2, subsections[k - 1].strip(), unicode(t)))
                saw_bad_template = True

    # Guard clauses: only delete when an expected head template was seen and
    # nothing unrecognized appeared anywhere in the section.
    if not saw_head:
        return
    if saw_bad_template:
        pagemsg(
            "WARNING: Would delete but saw unrecognized template, not deleting"
        )
        return

    if "==Etymology" in sections[j]:
        pagemsg(
            "WARNING: Found Etymology subsection, don't know how to handle")
        return
    if "==Pronunciation " in sections[j]:
        pagemsg(
            "WARNING: Found Pronunciation N subsection, don't know how to handle"
        )
        return

    #### Now, we can maybe delete the whole section or page

    if subsections[0].strip():
        pagemsg(
            "WARNING: Whole Latin section deletable except that there's text above all subsections: <%s>"
            % subsections[0].strip())
        return
    if "[[Category:" in sectail:
        pagemsg(
            "WARNING: Whole Latin section deletable except that there's a category at the end: <%s>"
            % sectail.strip())
        return
    if not has_non_latin:
        # Can delete the whole page, but check for non-blank section 0;
        # a {{also|...}} hatnote alone doesn't count, as it disappears
        # with the page.
        cleaned_sec0 = re.sub(r"^\{\{also\|.*?\}\}\n", "", sections[0])
        if cleaned_sec0.strip():
            pagemsg(
                "WARNING: Whole page deletable except that there's text above all sections: <%s>"
                % cleaned_sec0.strip())
            return
        pagetitle = unicode(page.title())
        pagemsg("Page %s should be deleted" % pagetitle)
        pages_to_delete.append(pagetitle)
        return
    # Drop the Latin section and the language separator preceding it.
    del sections[j]
    del sections[j - 1]
    notes.append("removed Latin section for bad term")
    if j > len(sections):
        # We deleted the last section, remove the separator at the end of the
        # previous section.
        sections[-1] = re.sub(r"\n+--+\n*\Z", "", sections[-1])
    text = "".join(sections)

    return text, notes
コード例 #24
0
def delete_participle_1(page, index, lemma, formind, formval, pos,
                        preserve_diaeresis, save, verbose, diff):
    """Delete the Latin entry for a bad participle, plus its inflected forms.

    The section is only considered deletable when it contains exactly the
    expected pieces: an {{la-part}} head, a single {{la-adecl}} inflection
    template, an Etymology {{m}} link that points back at the correct
    `lemma`, and otherwise only known-harmless reference/definition
    templates.  If deletable, all of the participle's own inflected forms
    (expanded from the {{la-adecl}} template) are first deleted via
    delete_form(), then the Latin section itself is removed — or, when
    Latin is the only language on the page, the page title is queued in
    the module-level `pages_to_delete` list instead.

    Returns (new_text, notes) when the section was removed; otherwise
    (None, None).
    """
    notes = []

    def pagemsg(txt):
        msg("Page %s %s: form %s %s: %s" %
            (index, lemma, formind, formval, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: form %s %s: %s" %
                  (index, lemma, formind, formval, txt))

    def expand_text(tempcall):
        # Expansion happens in the context of the macron-less form page title.
        return blib.expand_text(tempcall,
                                remove_macrons(formval, preserve_diaeresis),
                                pagemsg, verbose)

    expected_head_template = "la-part"

    text = unicode(page.text)
    origtext = text

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None

    sections, j, secbody, sectail, has_non_latin = retval

    # Split into alternating chunks: subsections[0] is pre-header text, odd
    # indices are "==...==" headers, even indices >= 2 are subsection bodies.
    subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
    saw_lemma_in_etym = False
    saw_wrong_lemma_in_etym = False
    saw_head = False
    infl_template = None
    saw_bad_template = False
    for k in xrange(2, len(subsections), 2):
        parsed = blib.parse_text(subsections[k])
        for t in parsed.filter_templates():
            tn = tname(t)
            # An {{m|...}} link under ==Etymology== should reference the lemma
            # this participle belongs to; compare macron-insensitively.
            if tn == "m" and "==Etymology==" in subsections[k - 1]:
                actual_lemma = getparam(t, "2")
                if remove_macrons(lemma, preserve_diaeresis) == remove_macrons(
                        actual_lemma, preserve_diaeresis):
                    saw_lemma_in_etym = True
                else:
                    pagemsg(
                        "WARNING: Saw wrong lemma %s != %s in Etymology section: %s"
                        % (actual_lemma, lemma, unicode(t)))
                    saw_wrong_lemma_in_etym = True
            elif tn == expected_head_template:
                saw_head = True
            elif tn == "la-adecl":
                # Participles decline like adjectives; remember the (single)
                # declension template so its forms can be expanded below.
                if not saw_head:
                    pagemsg(
                        "WARNING: Saw inflection template without (or before) head template, skipping: %s"
                        % unicode(t))
                elif infl_template:
                    pagemsg(
                        "WARNING: Saw two possible inflection templates: first %s, second %s"
                        % (infl_template, unicode(t)))
                else:
                    infl_template = unicode(t)
            elif tn in [
                    "rfdef", "R:L&S", "R:Elementary Lewis", "R:du Cange",
                    "R:Gaffiot", "R:NLW", "alternative form of", "la-IPA"
            ]:
                # Known-harmless templates; these don't block deletion.
                pass
            else:
                pagemsg(
                    "WARNING: Saw unrecognized template in subsection #%s %s: %s"
                    % (k // 2, subsections[k - 1].strip(), unicode(t)))
                saw_bad_template = True

    # Only delete when everything checked out: head + inflection template
    # present, the Etymology points at the right lemma (and only at it),
    # and nothing unrecognized appeared.
    delete = False
    if saw_head and infl_template:
        if not saw_lemma_in_etym:
            pagemsg(
                "WARNING: Would delete but didn't see reference to correct lemma %s in Etymology section, not deleting"
                % lemma)
        elif saw_wrong_lemma_in_etym:
            pagemsg(
                "WARNING: Would delete but saw reference to wrong lemma in Etymology section, not deleting"
            )
        elif saw_bad_template:
            pagemsg(
                "WARNING: Would delete but saw unrecognized template, not deleting"
            )
        else:
            delete = True

    if not delete:
        return None, None

    # Expand the {{la-adecl}} template into all its inflected forms and
    # delete each one (forms may be comma-separated alternants).
    # NOTE(review): this local `args` shadows the module-level `args`
    # used elsewhere in the file — confirm this is intentional.
    args = lalib.generate_adj_forms(infl_template, errandpagemsg, expand_text)
    if args is None:
        return None, None
    single_forms_to_delete = []
    for key, form in args.iteritems():
        single_forms_to_delete.extend(form.split(","))
    for formformind, formformval in blib.iter_items(single_forms_to_delete):
        delete_form(index, formval, formformind, formformval, "partform", True,
                    preserve_diaeresis, save, verbose, diff)

    #### Now, we can maybe delete the whole section or page

    if subsections[0].strip():
        pagemsg(
            "WARNING: Whole Latin section deletable except that there's text above all subsections: <%s>"
            % subsections[0].strip())
        return None, None
    if "[[Category:" in sectail:
        pagemsg(
            "WARNING: Whole Latin section deletable except that there's a category at the end: <%s>"
            % sectail.strip())
        return None, None
    if not has_non_latin:
        # Can delete the whole page, but check for non-blank section 0
        # (a {{also|...}} hatnote alone doesn't count).
        cleaned_sec0 = re.sub("^\{\{also\|.*?\}\}\n", "", sections[0])
        if cleaned_sec0.strip():
            pagemsg(
                "WARNING: Whole page deletable except that there's text above all sections: <%s>"
                % cleaned_sec0.strip())
            return None, None
        pagetitle = unicode(page.title())
        pagemsg("Page %s should be deleted" % pagetitle)
        pages_to_delete.append(pagetitle)
        return None, None
    # Drop the Latin section and the language separator preceding it.
    del sections[j]
    del sections[j - 1]
    notes.append("removed Latin section for bad participle")
    if j > len(sections):
        # We deleted the last section, remove the separator at the end of the
        # previous section.
        sections[-1] = re.sub(r"\n+--+\n*\Z", "", sections[-1])
    text = "".join(sections)

    return text, notes