def remove_inflections(m):
  """Regex-substitution callback: given a match over a chunk of wikitext,
  filter the tag sets of any Latin {{inflection of}} templates pointing at
  `lemma` (from the enclosing scope), dropping the tag sets selected by
  `tag_sets_to_delete`/`frozenset_tag_sets_to_delete`.  Returns the rewritten
  wikitext, or "" if no tag sets survive (i.e. the whole chunk should go)."""
  parsed = blib.parse_text(m.group(0))
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "inflection of":
      # Support both calling conventions:
      # {{inflection of|lang=la|LEMMA|...}} and {{inflection of|la|LEMMA|...}}.
      lang = getparam(t, "lang")
      if lang:
        lemma_param = 1
      else:
        lang = getparam(t, "1")
        lemma_param = 2
      # Caller guarantees only Latin templates reach here.
      assert lang == "la"
      actual_lemma = getparam(t, str(lemma_param))
      # Allow mismatch in macrons, which often happens, e.g. because
      # a macron was added to the lemma page but not to the inflections
      if remove_macrons(actual_lemma, preserve_diaeresis) == remove_macrons(
          lemma, preserve_diaeresis):
        tr = getparam(t, "tr")
        alt = getparam(t, "alt") or getparam(t, str(lemma_param + 1))
        # fetch tags
        tags = []
        params = []
        for param in t.params:
          pname = unicode(param.name).strip()
          pval = unicode(param.value).strip()
          if re.search("^[0-9]+$", pname):
            # Numbered params past lang/lemma/alt are inflection tags.
            if int(pname) >= lemma_param + 2:
              if pval:
                tags.append(pval)
          elif pname not in ["lang", "tr", "alt"]:
            params.append((pname, pval, param.showkey))
        # NOTE(review): `params` collects the remaining named params but is
        # never re-added after `del t.params[:]` below, so any such params
        # are silently dropped — confirm this is intended.
        tag_sets = lalib.split_tags_into_tag_sets(tags)
        filtered_tag_sets = []
        for tag_set in tag_sets:
          # Keep only tag sets NOT slated for deletion; if
          # tag_sets_to_delete is True, everything is deleted.
          if tag_sets_to_delete is not True and frozenset(
              lalib.canonicalize_tag_set(tag_set)
              ) not in frozenset_tag_sets_to_delete:
            filtered_tag_sets.append(tag_set)
        if not filtered_tag_sets:
          return ""
        # Erase all params.
        del t.params[:]
        # Put back new params.
        t.add("1", lang)
        t.add("2", actual_lemma)
        if tr:
          t.add("tr", tr)
        t.add("3", alt)
        next_tag_param = 4
        for tag in lalib.combine_tag_set_group(filtered_tag_sets):
          t.add(str(next_tag_param), tag)
          next_tag_param += 1
  return unicode(parsed)
def process_text_on_page(index, pagetitle, text):
  """Check the Latin section of a page: warn about any headword whose
  macron-less form doesn't match the page title.  Read-only: always
  returns (None, None) (no page change, no notes)."""
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  if not args.stdin:
    pagemsg("Processing")
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in lalib.la_headword_templates:
      for head in lalib.la_get_headword_from_template(t, pagetitle, pagemsg):
        no_macrons_head = remove_macrons(blib.remove_links(head))
        # Reconstruction pages are titled "Reconstruction:Latin/FOO" but the
        # headword appears as "*FOO".
        if pagetitle.startswith("Reconstruction"):
          unprefixed_title = "*" + re.sub(".*/", "", pagetitle)
        else:
          unprefixed_title = pagetitle
        if no_macrons_head != unprefixed_title:
          pagemsg("WARNING: Bad Latin head: %s" % unicode(t))
  return None, None
def delete_form(index, lemma, formind, formval, pos, tag_sets_to_delete, preserve_diaeresis, save, verbose, diff):
  """Queue deletion work for one non-lemma form of `lemma`.

  Skips form values containing links and form pages that don't exist;
  otherwise delegates the actual edit to delete_form_1() via blib.do_edit().
  """
  def pagemsg(txt):
    msg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval, txt))
  # A bracketed form value embeds a wikilink and can't name a page directly.
  if "[" in formval:
    pagemsg("Skipping form value %s with link in it" % formval)
    return
  form_page = pywikibot.Page(site, remove_macrons(formval, preserve_diaeresis))
  # Nothing to delete if the form page was never created.
  if not form_page.exists():
    pagemsg("Skipping form value %s, page doesn't exist" % formval)
    return
  # Adapter with the (page, index, parsed) signature blib.do_edit expects.
  def do_delete_form_1(page, index, parsed):
    return delete_form_1(page, index, lemma, formind, formval, pos, tag_sets_to_delete, preserve_diaeresis)
  blib.do_edit(form_page, index, do_delete_form_1, save=save, verbose=verbose, diff=diff)
def compare_headword_conj_forms(id_slot, headword_forms, conj_slots, adjust_for_missing_perf_forms=False, remove_conj_links=False):
  """Compare headword forms against the conjugation-table forms for the
  first of `conj_slots` found in verb_props (enclosing scope).  Returns True
  if the two sets agree after normalization; otherwise emits a warning (via
  the enclosing pagemsg) distinguishing macron-only mismatches from real
  mismatches, and returns False."""
  conj_forms = ""
  # Use the first listed slot that the conjugation actually produced.
  for slot in conj_slots:
    if slot in verb_props:
      conj_forms = verb_props[slot]
      break
  conj_forms = safe_split(conj_forms, ",")
  if remove_conj_links:
    conj_forms = [blib.remove_links(x) for x in conj_forms]
  # Normalize both sides: vowels before ns/nf are always long in Latin.
  corrected_headword_forms = [lengthen_ns_nf(x) for x in headword_forms]
  corrected_conj_forms = [lengthen_ns_nf(x) for x in conj_forms]
  if adjust_for_missing_perf_forms:
    # There are several instances of 4++ verbs where only the -īvī variant,
    # not the -iī variant, is listed in the headword. Don't get tripped up
    # by that.
    ivi_conj_forms = [x for x in corrected_conj_forms if x.endswith(u"īvī")]
    for ivi_conj_form in ivi_conj_forms:
      ii_conj_form = re.sub(u"īvī$", u"iī", ivi_conj_form)
      if ii_conj_form in corrected_conj_forms and ii_conj_form not in corrected_headword_forms:
        corrected_headword_forms.append(ii_conj_form)
  if set(corrected_headword_forms) != set(corrected_conj_forms):
    # Distinguish "differs only in macrons" from a genuine form mismatch.
    macronless_headword_forms = set(lalib.remove_macrons(x) for x in corrected_headword_forms)
    macronless_conj_forms = set(lalib.remove_macrons(x) for x in corrected_conj_forms)
    if macronless_headword_forms == macronless_conj_forms:
      pagemsg("WARNING: Headword %s=%s different from conj %s=%s in macrons only, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(conj_forms), render_headword_and_conj()))
    else:
      pagemsg("WARNING: Headword %s=%s different from conj %s=%s in more than just macrons, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(conj_forms), render_headword_and_conj()))
    return False
  return True
def process_page(page, index):
  """For each {{la-adv}} on the page, infer the possible positive-degree
  adjectives that the adverb could derive from and hand each candidate to
  investigate_possible_adj()."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  if " " in pagetitle:
    pagemsg("WARNING: Space in page title, skipping")
    return
  pagemsg("Processing")
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  # Split on ===...=== headers; odd indices are headers, even are bodies.
  subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    for t in parsed.filter_templates():
      origt = unicode(t)
      tn = tname(t)
      if tn == "la-adv":
        adv = blib.remove_links(getparam(t, "1")) or pagetitle
        macron_stem, is_stem = lalib.infer_adv_stem(adv)
        if not is_stem:
          pagemsg("WARNING: Couldn't infer stem from adverb %s, not standard: %s" % (adv, origt))
          continue
        adv_defns = lalib.find_defns(subsections[k])
        # Generate candidate adjective lemmas from the (macron-less) stem.
        possible_adjs = []
        stem = lalib.remove_macrons(macron_stem)
        possible_adjs.append(stem + "us")
        possible_adjs.append(stem + "is")
        if stem.endswith("nt"):
          possible_adjs.append(stem[:-2] + "ns")
        # NOTE(review): a stem ending in "plic" matches both this branch and
        # the following "c" branch, yielding both -ex and -x candidates —
        # presumably harmless since candidates are just investigated, but
        # verify an elif wasn't intended.
        if stem.endswith("plic"):
          possible_adjs.append(stem[:-2] + "ex")
        if stem.endswith("c"):
          possible_adjs.append(stem[:-1] + "x")
        if re.search("[aeiou]r$", stem):
          possible_adjs.append(stem)
        elif stem.endswith("r"):
          possible_adjs.append(stem[:-1] + "er")
        if adv.endswith(u"iē"):
          possible_adjs.append(stem + "ius")
        for possible_adj in possible_adjs:
          investigate_possible_adj(index, possible_adj, adv, adv_defns)
def get_lemmas_of_form_page(parsed):
  """Return the set of macron-less lemmas that the given parsed non-lemma
  page points to through {{inflection of}}, {{comparative of}} or
  {{superlative of}} templates.

  parsed: a mwparserfromhell-style parse tree (anything with
  filter_templates()).
  """
  lemmas = set()
  for t in parsed.filter_templates():
    # FIX: this local was previously named `tname`, shadowing the
    # module-level tname() helper used everywhere else in this file.
    tn = unicode(t.name)
    first_param = None
    if tn in ["inflection of", "comparative of", "superlative of"]:
      first_param = get_first_param(t)
    if first_param:
      # Normalize away links and macrons so lemmas compare by page title.
      lemma = lalib.remove_macrons(blib.remove_links(getparam(t, first_param)))
      lemmas.add(lemma)
  return lemmas
def process_page(page, index, parsed):
  """In the page's Latin section, for {{l}}/{{m}}/{{alternative form of}}/
  {{alt form}} templates whose alt text is just the macronned version of the
  link target, move the alt text into the link param.  Returns (newtext,
  notes)."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["l", "m", "alternative form of", "alt form"]:
      # Work out where the term param lives: {{l}}/{{m}} always take lang
      # as |1=; the alt-form templates may use lang= or |1=.
      if tn in ["l", "m"]:
        lang = getparam(t, "1")
        termparam = 2
      elif getparam(t, "lang"):
        lang = getparam(t, "lang")
        termparam = 1
      else:
        lang = getparam(t, "1")
        termparam = 2
      if lang != "la":
        #pagemsg("WARNING: Wrong language in template: %s" % unicode(t))
        continue
      term = getparam(t, str(termparam))
      alt = getparam(t, str(termparam + 1))
      gloss = getparam(t, str(termparam + 2))
      # Only fold alt into the link when it differs solely by macrons.
      if alt and lalib.remove_macrons(alt) == term:
        origt = unicode(t)
        t.add(str(termparam), alt)
        if gloss:
          # Keep an empty placeholder so the gloss stays at its position.
          t.add(str(termparam + 1), "")
        else:
          rmparam(t, str(termparam + 1))
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("move alt param to link param in %s" % tn)
  secbody = unicode(parsed)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
  """Fold the alt param of Latin {{inflection of}} templates into the term
  param when the two differ only in macrons.  Returns (newtext, notes)."""
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  for template in parsed.filter_templates():
    if tname(template) != "inflection of":
      continue
    # Old-style calls put the language in lang=; new-style in |1=.
    lang = getparam(template, "lang")
    term_param = 1 if lang else 2
    if not lang:
      lang = getparam(template, "1")
    if lang != "la":
      continue
    term = getparam(template, str(term_param))
    alt = getparam(template, str(term_param + 1))
    if not alt:
      continue
    if lalib.remove_macrons(alt) != lalib.remove_macrons(term):
      pagemsg("WARNING: alt not same as term modulo macrons: %s" % unicode(template))
      continue
    origt = unicode(template)
    # Promote the (macronned) alt text into the term slot and blank alt.
    template.add(str(term_param), alt)
    template.add(str(term_param + 1), "")
    pagemsg("Replaced %s with %s" % (origt, unicode(template)))
    notes.append("move alt param to term param in Latin {{inflection of}}")
  return unicode(parsed), notes
def compare_headword_decl_forms(id_slot, headword_forms, decl_slots, noun_props, headword_and_decl_text, pagemsg, adjust_for_missing_gen_forms=False, adjust_for_e_ae_gen=False, remove_headword_links=False):
  """Compare headword forms against the declension-table forms for the first
  of `decl_slots` present in noun_props.

  Returns True if the two sets agree after normalization; otherwise warns
  (distinguishing macron-only mismatches from real mismatches) and returns
  False.

  adjust_for_missing_gen_forms: tolerate headwords missing the shortened
    -ī genitive of -ius/-ium nouns.
  adjust_for_e_ae_gen: treat a headword genitive in -ē as equivalent to -ae.
  remove_headword_links: strip wikilinks from headword forms first.
  """
  decl_forms = ""
  # Use the first listed slot that the declension actually produced.
  for slot in decl_slots:
    if slot in noun_props:
      decl_forms = noun_props[slot]
      break
  decl_forms = safe_split(decl_forms, ",")
  if remove_headword_links:
    headword_forms = [blib.remove_links(x) for x in headword_forms]
  # Normalize both sides: vowels before ns/nf are always long in Latin.
  corrected_headword_forms = [lengthen_ns_nf(x) for x in headword_forms]
  corrected_decl_forms = [lengthen_ns_nf(x) for x in decl_forms]
  if adjust_for_e_ae_gen:
    # FIX: apply the ē→ae substitution on top of corrected_headword_forms;
    # previously this rebuilt the list from raw headword_forms, silently
    # discarding the lengthen_ns_nf() normalization above (compare the
    # parallel compare_headword_conj_forms()).
    corrected_headword_forms = [re.sub(u"ē$", "ae", x) for x in corrected_headword_forms]
  if adjust_for_missing_gen_forms:
    # Nouns in -ius and -ium are commonly missing the shortened genitive
    # variants. Don't get tripped up by that.
    ii_decl_forms = [x for x in corrected_decl_forms if x.endswith(u"iī")]
    for ii_decl_form in ii_decl_forms:
      i_decl_form = re.sub(u"iī$", u"ī", ii_decl_form)
      if i_decl_form in corrected_decl_forms and i_decl_form not in corrected_headword_forms:
        corrected_headword_forms.append(i_decl_form)
  if set(corrected_headword_forms) != set(corrected_decl_forms):
    # Distinguish "differs only in macrons" from a genuine form mismatch.
    macronless_headword_forms = set(lalib.remove_macrons(x) for x in corrected_headword_forms)
    macronless_decl_forms = set(lalib.remove_macrons(x) for x in corrected_decl_forms)
    if macronless_headword_forms == macronless_decl_forms:
      pagemsg("WARNING: Headword %s=%s different from decl %s=%s in macrons only, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(decl_forms), headword_and_decl_text))
    else:
      pagemsg("WARNING: Headword %s=%s different from decl %s=%s in more than just macrons, skipping: %s" % (
        id_slot, ",".join(headword_forms), id_slot, ",".join(decl_forms), headword_and_decl_text))
    return False
  return True
def process_lemma(index, pagetitle, slots, program_args):
  """For the lemma page `pagetitle`, generate all inflected forms from its
  inflection templates, and run process_page() (via blib.do_edit) on each
  existing form page whose slot matches one of `slots`."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose)
  page = pywikibot.Page(site, pagetitle)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    tn = tname(t)
    # Map the inflection template to its part of speech.
    pos = None
    if tn == "la-conj":
      pos = "verb"
    elif tn == "la-ndecl":
      pos = "noun"
    elif tn == "la-adecl":
      pos = "adj"
    if pos:
      args = lalib.generate_infl_forms(pos, unicode(t), errandpagemsg, expand_text)
      for slot in args:
        # A slot matches either literally or through a slot spec pattern.
        matches = False
        for spec in slots:
          if spec == slot:
            matches = True
            break
          if lalib.slot_matches_spec(slot, spec):
            matches = True
            break
        if matches:
          for formpagename in re.split(",", args[slot]):
            if "[" in formpagename or "|" in formpagename:
              pagemsg("WARNING: Skipping page %s with links in it" % formpagename)
            else:
              formpagename = lalib.remove_macrons(formpagename)
              formpage = pywikibot.Page(site, formpagename)
              if not formpage.exists():
                pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
              elif formpagename == pagetitle:
                pagemsg("WARNING: Skipping dictionary form")
              else:
                # Adapter with the signature blib.do_edit expects; called
                # immediately by do_edit, so the closure is safe in-loop.
                def do_process_page(page, index, parsed):
                  return process_page(index, page, program_args)
                blib.do_edit(formpage, index, do_process_page, save=program_args.save, verbose=program_args.verbose, diff=program_args.diff)
def merge_forms_for_slot(slot, this_inflargs):
  # Merge the forms of all inflection templates under the given
  # lemma headword
  #
  # Returns a 4-tuple: (all valid forms, ditto plus syncopated verb
  # variants, the subset of each matching the current pagetitle modulo
  # macrons).  `pos` and `pagetitle` come from the enclosing scope.
  all_valid_forms = []
  all_valid_forms_with_syncopated = []
  for inflargs in this_inflargs:
    if slot not in inflargs:
      continue
    # NOTE(review): in Python 2 there is no `nonlocal`, so this assignment
    # creates a local rather than updating the enclosing function's
    # saw_slot_in_inflargs flag — presumably the flag was meant to be
    # shared; verify against the caller.
    saw_slot_in_inflargs = True
    forms = inflargs[slot].split(",")
    # Forms containing links can't be matched against page titles.
    valid_forms = [form for form in forms if "[" not in form and "|" not in form]
    for form in valid_forms:
      if form not in all_valid_forms:
        all_valid_forms.append(form)
      if form not in all_valid_forms_with_syncopated:
        all_valid_forms_with_syncopated.append(form)
      # For verbs, also admit the syncopated perfect variant (e.g.
      # amāvisse → amāsse) as a possible match.
      if pos == "verb" and re.search(u"v[eiē]", form):
        syncopated_form = re.sub(u"^(.*)v[eiē]", r"\1", form)
        if syncopated_form not in all_valid_forms_with_syncopated:
          all_valid_forms_with_syncopated.append(syncopated_form)
  all_matchable_forms = [
    form for form in all_valid_forms
    if lalib.remove_macrons(form) == pagetitle
  ]
  all_matchable_forms_with_syncopated = [
    form for form in all_valid_forms_with_syncopated
    if lalib.remove_macrons(form) == pagetitle
  ]
  return (all_valid_forms, all_valid_forms_with_syncopated, all_matchable_forms, all_matchable_forms_with_syncopated)
def yield_infl_of_templates_and_properties():
  """Generator over the {{inflection of}} templates of the current headword
  (from the enclosing scope), yielding (template, lemma_param, lemma,
  inflargs_sets, tag_sets) for each Latin template whose lemma can be looked
  up.  Skips and warns on non-Latin templates, linked lemmas, and lemmas
  with no usable heads."""
  for t in headword['infl_of_templates']:
    # Support both {{inflection of|lang=la|...}} and {{inflection of|la|...}}.
    lang = getparam(t, "lang")
    if lang:
      lemma_param = 1
    else:
      lang = getparam(t, "1")
      lemma_param = 2
    if lang != "la":
      errandstagemsg("WARNING: In Latin section, found {{inflection of}} for different language %s: %s" % (lang, unicode(t)))
      continue
    lemma = getparam(t, str(lemma_param))
    if "[" in lemma or "|" in lemma:
      stagemsg("WARNING: Link in lemma %s, skipping: %s" % (lemma, unicode(t)))
      continue
    inflargs_sets = lookup_inflection(lalib.remove_macrons(lemma), pos,
      expected_headtemps, expected_infltemps, stagemsg, errandstagemsg)
    if inflargs_sets is None:
      stagemsg("WARNING: Lemma %s doesn't exist or has no %s heads" % (lemma, pos))
      continue
    # fetch tags
    tags = []
    for param in t.params:
      pname = unicode(param.name).strip()
      pval = unicode(param.value).strip()
      if re.search("^[0-9]+$", pname):
        # Numbered params past lang/lemma/alt are inflection tags.
        if int(pname) >= lemma_param + 2:
          if pval:
            tags.append(pval)
    # split tags into tag sets (which may be multipart) and further
    # split any multipart tag sets into component tag sets
    tag_sets = [
      tag_set
      for maybe_multipart_tag_set in lalib.split_tags_into_tag_sets(tags)
      for tag_set in lalib.split_multipart_tag_set(maybe_multipart_tag_set)
    ]
    yield t, lemma_param, lemma, inflargs_sets, tag_sets
def check_participle(form, pagemsg):
  """Check that the page for `form` carries a {{la-part}} headword whose
  participle matches `form`; warn through `pagemsg` on mismatch.  Skips
  forms containing link syntax and nonexistent pages."""
  orig_pagemsg = pagemsg
  def pagemsg(txt):
    # Prefix every message with the form being checked.
    orig_pagemsg("%s: %s" % (form, txt))
  if "[" in form or "|" in form:
    pagemsg("Skipping form with brackets or vertical bar")
    return
  page = pywikibot.Page(site, lalib.remove_macrons(form))
  if not blib.safe_page_exists(page, pagemsg):
    pagemsg("Skipping nonexistent page")
    # FIX: previously control fell through here and the nonexistent page
    # was parsed anyway, contradicting the "Skipping" message (compare the
    # bracket guard above, which does return).
    return
  parsed = blib.parse_text(unicode(page.text))
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-part":
      # |1= may carry a "/..." suffix (e.g. declension info); strip it.
      actual_part = re.sub("/.*", "", getparam(t, "1"))
      if actual_part != form:
        pagemsg("WARNING: Found actual participle %s, expected %s" % (actual_part, form))
def process_non_lemma_page(page, index):
  """For each {{la-adj-comp}}/{{la-adj-sup}} on the page, edit the page of
  the positive-degree adjective named by its pos= param (via blib.do_edit
  and process_lemma_page); warn when pos= is missing."""
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  for template in blib.parse_text(unicode(page.text)).filter_templates():
    template_name = tname(template)
    if template_name not in ["la-adj-comp", "la-adj-sup"]:
      continue
    lemma = getparam(template, "1") or pagetitle
    positive = getparam(template, "pos")
    if not positive:
      pagemsg("WARNING: Didn't see positive degree: %s" % unicode(template))
      continue
    # Adapter with the signature blib.do_edit expects; do_edit calls it
    # immediately, so capturing loop variables here is safe.
    def do_process(page, index, parsed):
      return process_lemma_page(page, index, template_name == "la-adj-comp", lemma)
    blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(positive)), index,
      do_process, save=args.save, verbose=args.verbose, diff=args.diff)
def process_page(index, page, save, verbose, diff):
  """For each {{la-conj}} on the page, derive the non-impersonal participle
  from every accusative supine form, log a deletion line for it, and run
  correct_nom_sg_n_participle() over the supine's form page."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    if tname(t) == "la-conj":
      # NOTE(review): generate_verb_forms() appears able to return None
      # elsewhere in this codebase; no None check here before .get() —
      # verify this cannot crash.
      args = lalib.generate_verb_forms(unicode(t), errandpagemsg, expand_text)
      supforms = args.get("sup_acc", "")
      if supforms:
        supforms = supforms.split(",")
        for supform in supforms:
          # Supine in -um → masculine participle in -us.
          non_impers_part = re.sub("um$", "us", supform)
          pagemsg("Line to delete: part %s allbutnomsgn {{la-adecl|%s}}" % (non_impers_part, non_impers_part))
          # Adapter for blib.do_edit; called immediately, so capturing the
          # loop variable supform is safe.
          def do_correct_nom_sg_n_participle(page, index, parsed):
            return correct_nom_sg_n_participle(page, index, supform, args["1s_pres_actv_indc"])
          blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(supform)), index,
            do_correct_nom_sg_n_participle, save=save, verbose=verbose, diff=diff)
def process_page(index, pos, lemma, subs, infl, save, verbose):
  """Generate all inflected forms of `lemma` from the inflection template
  call `infl` and run process_form() (via blib.do_edit) over each
  corresponding form page."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, lemma, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, lemma, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, remove_macrons(lemma), pagemsg, verbose)
  pagemsg("Processing")
  args = lalib.generate_infl_forms(pos, infl, errandpagemsg, expand_text)
  if args is None:
    return
  # Flatten the comma-separated form values of every slot into one list.
  forms_to_delete = [
    single_form
    for key, form in args.iteritems()
    for single_form in form.split(",")
  ]
  for formind, form in blib.iter_items(forms_to_delete):
    # Adapter for blib.do_edit; called immediately, so capturing the loop
    # variable `form` is safe.
    def handler(page, formind, parsed):
      return process_form(index, page, lemma, formind, form, subs)
    blib.do_edit(pywikibot.Page(site, remove_macrons(form)), formind, handler,
      save=save, verbose=verbose)
pagemsg("Replaced %s with %s" % (origt, unicode(template_to_fix))) return unicode(parsed), notes parser = blib.create_argparser( "Add Latin adverbs to adjectives based on the output of find_latin_adj_for_adv.py" ) parser.add_argument("--direcfile", required=True) args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) lines = [x.rstrip('\n') for x in codecs.open(args.direcfile, "r", "utf-8")] for i, line in blib.iter_items(lines, start, end): m = re.search("^(.*?) /// (.*?) /// .*? /// .*?$", line) if not m: msg("Page %s: Unrecognized line: %s" % (i, line)) continue adv, adj = m.groups() def do_process_page(page, index, parsed): return process_page(page, index, adv) blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(adj)), i, do_process_page, save=args.save, verbose=args.verbose, diff=args.diff) blib.elapsed_time()
def check_for_tag_set_match(tag_set, allow_lemma_mismatch):
  """For one tag set of the current {{inflection of}} template, check whether
  the headword forms match the forms the lemma's inflection tables generate
  for the corresponding slot.  Returns the list of actual lemma forms whose
  tables matched (empty on no match).  Uses many names from the enclosing
  scope: t, lemma, headword_forms, inflargs_sets, tag_set_groups,
  possible_slots, stagemsg, merge_forms_for_slot."""
  slot = lalib.tag_set_to_slot(tag_set, tag_set_groups, stagemsg)
  if slot is None:
    # Already issued warning
    return []
  if slot not in possible_slots:
    stagemsg("WARNING: Unrecognized slot %s from tag set: %s" % (slot, unicode(t)))
    return []
  # NOTE(review): this flag is only ever assigned here; the assignment in
  # merge_forms_for_slot() creates a local there (Python 2 has no nonlocal),
  # so the "didn't see slot" warning below presumably fires more often than
  # intended — verify.
  saw_slot_in_inflargs = False
  matching_actual_lemmas = []
  for actual_lemmas, this_inflargs in inflargs_sets:
    # Only consider inflection sets whose lemma matches ours (exactly, or
    # modulo macrons when allow_lemma_mismatch).
    saw_matching_lemma = False
    for actual_lemma in actual_lemmas:
      actual_lemma = blib.remove_links(actual_lemma)
      if (lalib.remove_macrons(lemma) == lalib.remove_macrons(actual_lemma)
          if allow_lemma_mismatch else lemma == actual_lemma):
        saw_matching_lemma = True
    if not saw_matching_lemma:
      continue
    (all_valid_forms, all_valid_forms_with_syncopated, all_matchable_forms,
      all_matchable_forms_with_syncopated) = (
      merge_forms_for_slot(slot, this_inflargs))
    # Try progressively looser matches: exact, subset, then the same two
    # with syncopated verb variants admitted.
    matched_form = False
    if set(headword_forms) == set(all_matchable_forms):
      stagemsg("Matched headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) %s)" % (
        ",".join(headword_forms), slot, lemma, ",".join(all_valid_forms)))
      matched_form = True
    elif set(headword_forms) <= set(all_matchable_forms):
      stagemsg("Matched headword form(s) %s as subset of all matchable slot form(s) %s (slot %s, lemma %s, all valid slot forms(s) %s)" % (
        ",".join(headword_forms), ",".join(all_matchable_forms), slot, lemma, ",".join(all_valid_forms)))
      matched_form = True
    elif set(headword_forms) == set(all_matchable_forms_with_syncopated):
      stagemsg("Matched syncopated headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)" % (
        ",".join(headword_forms), slot, lemma, ",".join(all_valid_forms_with_syncopated)))
      matched_form = True
    elif set(headword_forms) <= set(all_matchable_forms_with_syncopated):
      stagemsg("Matched syncopated headword form(s) %s as subset of all matchable slot form(s) + syncopation %s (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)" % (
        ",".join(headword_forms),
        ",".join(all_matchable_forms_with_syncopated), slot, lemma,
        ",".join(all_valid_forms_with_syncopated)))
      matched_form = True
    if matched_form:
      for actual_lemma in actual_lemmas:
        if actual_lemma not in matching_actual_lemmas:
          matching_actual_lemmas.append(actual_lemma)
  if not matching_actual_lemmas:
    if not saw_slot_in_inflargs:
      if "pasv" in slot:
        stagemsg("WARNING: For headword forms %s, didn't see passive slot %s in inflections of lemma %s, probably need to delete passive forms of verb" % (
          ",".join(headword_forms), slot, lemma))
      else:
        stagemsg("WARNING: For headword forms %s, didn't see slot %s in inflections of lemma %s" % (
          ",".join(headword_forms), slot, lemma))
  return matching_actual_lemmas
def process_form(index, page, lemma, formind, formval, subs):
  """On a non-lemma form page, correct bad stems in headword templates and in
  Latin inflection-of templates, using the (badstem, goodstem) pairs in
  `subs`.  Returns (newtext, notes)."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval, txt))
  notes = []
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    # Replace a bad stem prefix of `head` per `subs`, updating the template
    # param in place.  Returns the new head, or False if no sub applied.
    def fix_head(headparam, head, tn):
      for badstem, goodstem in subs:
        if head.startswith(badstem):
          newhead = goodstem + head[len(badstem):]
          t.add(headparam, newhead)
          notes.append("correct stem %s -> %s in {{%s}}" % (badstem, goodstem, tn))
          return newhead
      else: # no break
        pagemsg("WARNING: Head %s not same as page title and doesn't begin with bad stem %s: %s" % (
          head, " or ".join(badstem for badstem, goodstem in subs), unicode(t)))
        return False
    # la-suffix-form has its own format, don't handle
    if tn in lalib.la_nonlemma_headword_templates and tn != "la-suffix-form":
      # The head may live in head= or in |1=.
      headparam = "head"
      head = getparam(t, headparam)
      if not head:
        headparam = "1"
        head = getparam(t, headparam)
      if remove_macrons(head) != pagetitle:
        newhead = fix_head(headparam, head, tn)
        if newhead and remove_macrons(newhead) != pagetitle:
          pagemsg("WARNING: Replacement head %s not same as page title: %s" % (newhead, unicode(t)))
    elif tn in lalib.la_infl_of_templates:
      # Support both lang=la and |1=la calling conventions.
      langparam = "lang"
      headparam = "1"
      altparam = "2"
      lang = getparam(t, langparam)
      if not lang:
        langparam = "1"
        headparam = "2"
        altparam = "3"
        lang = getparam(t, langparam)
      if lang == "la":
        link = getparam(t, headparam)
        alt = getparam(t, altparam)
        # Display text takes precedence over the link target.
        head = alt or link
        if remove_macrons(head) != remove_macrons(lemma):
          if subs:
            newhead = fix_head(headparam, head, tn + "|la")
            if newhead:
              t.add(altparam, "")
              if remove_macrons(newhead) != remove_macrons(lemma):
                pagemsg("WARNING: Replacement lemma %s not same as lemma %s: %s" % (newhead, lemma, unicode(t)))
          else:
            if link != lemma or alt != "":
              t.add(headparam, lemma)
              t.add(altparam, "")
              notes.append("correct lemma and/or move alt text to link text in {{%s|la}}" % tn)
    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def lookup_inflection(lemma_no_macrons, pos, expected_headtemps, expected_infltemps, pagemsg, errandpagemsg):
  """Fetch and expand the inflections of the given (macron-less) lemma.

  Returns a list of (heads, this_inflargs) pairs, one per headword whose
  template is in `expected_headtemps` and whose head matches the lemma
  modulo macrons, where this_inflargs is the list of expanded inflection
  form dicts; or None if the page doesn't exist, can't be parsed, or no
  head matches.  Results are memoized in the module-level
  heads_and_defns_cache / expand_text_cache."""
  global args
  lemma_pagetitle = lemma_no_macrons
  # Reconstructed lemmas ("*foo") live under the Reconstruction namespace.
  if lemma_pagetitle.startswith("*"):
    lemma_pagetitle = "Reconstruction:Latin/" + lemma_pagetitle[1:]
  orig_pagemsg = pagemsg
  orig_errandpagemsg = errandpagemsg
  def pagemsg(txt):
    orig_pagemsg("%s: %s" % (lemma_no_macrons, txt))
  def errandpagemsg(txt):
    orig_errandpagemsg("%s: %s" % (lemma_no_macrons, txt))
  def expand_text(tempcall):
    # Memoized template expansion, keyed on (template call, page title).
    cache_key = (tempcall, lemma_pagetitle)
    if cache_key in expand_text_cache:
      retval = expand_text_cache[cache_key]
      if args.verbose:
        pagemsg("Found (%s, %s)=%s in expand_text_cache" % (tempcall, lemma_pagetitle, retval))
      return retval
    if args.verbose:
      pagemsg("Couldn't find (%s, %s) in expand_text_cache" % (tempcall, lemma_pagetitle))
    result = blib.expand_text(tempcall, lemma_pagetitle, pagemsg, args.verbose)
    expand_text_cache[cache_key] = result
    return result
  if lemma_pagetitle in heads_and_defns_cache:
    if args.verbose:
      pagemsg("Found %s in heads_and_defns_cache" % lemma_pagetitle)
    retval = heads_and_defns_cache[lemma_pagetitle]
  else:
    if args.verbose:
      pagemsg("Couldn't find %s in heads_and_defns_cache" % lemma_pagetitle)
    page = pywikibot.Page(site, lemma_pagetitle)
    try:
      exists = blib.try_repeatedly(lambda: page.exists(), pagemsg,
        "determine if page exists")
    except pywikibot.exceptions.InvalidTitle as e:
      pagemsg("WARNING: Invalid title %s, skipping" % lemma_pagetitle)
      heads_and_defns_cache[lemma_pagetitle] = "nonexistent"
      traceback.print_exc(file=sys.stdout)
      return None
    if not exists:
      pagemsg("WARNING: Lemma %s doesn't exist" % lemma_no_macrons)
      heads_and_defns_cache[lemma_pagetitle] = "nonexistent"
      return None
    retval = lalib.find_heads_and_defns(unicode(page.text), pagemsg)
    heads_and_defns_cache[lemma_pagetitle] = retval
  if retval == "nonexistent":
    pagemsg("WARNING: Lemma %s doesn't exist (cached)" % lemma_no_macrons)
    return None
  if retval is None:
    return None
  (sections, j, secbody, sectail, has_non_latin, subsections,
    parsed_subsections, headwords, pronun_sections, etym_sections) = retval
  matched_head = False
  inflargs_sets = []
  seen_heads = []
  seen_infltns = []
  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    heads = lalib.la_get_headword_from_template(ht, lemma_pagetitle, pagemsg, expand_text)
    # Track everything we saw, for the diagnostic at the bottom.
    for head in heads:
      if head not in seen_heads:
        seen_heads.append(head)
    for inflt in headword['infl_templates']:
      infltn = tname(inflt)
      if infltn not in seen_infltns:
        seen_infltns.append(infltn)
    if tn in expected_headtemps:
      oright = unicode(ht)
      # Require at least one head to match the lemma modulo macrons.
      for head in heads:
        head_no_links = blib.remove_links(head)
        if lalib.remove_macrons(head_no_links) == lemma_no_macrons:
          break
      else: # no break
        continue
      this_inflargs = []
      for inflt in headword['infl_templates']:
        infltn = tname(inflt)
        if infltn not in expected_infltemps:
          # NOTE(review): the format string already wraps %s in {{...}} and
          # the joined items are wrapped again, so this prints
          # {{{{name}}}} — cosmetic; confirm before changing.
          pagemsg("WARNING: Saw bad declension template for %s, expected one of {{%s}}: %s" % (
            pos, ",".join("{{%s}}" % temp for temp in expected_infltemps), unicode(inflt)))
          continue
        originflt = unicode(inflt)
        inflargs = lalib.generate_infl_forms(pos, originflt, errandpagemsg, expand_text)
        if inflargs is None:
          continue
        this_inflargs.append(inflargs)
      matched_head = True
      inflargs_sets.append((heads, this_inflargs))
  if not matched_head:
    pagemsg("WARNING: Couldn't find any matching heads, even allowing macron differences (seen heads %s, seen infl template names %s)" % (
      ",".join(seen_heads), ",".join(seen_infltns)))
    return None
  return inflargs_sets
def process_text_on_page(index, pagetitle, text):
  """Fix macron mismatches on a Latin non-lemma page.

  For each non-lemma headword on the page, runs up to three stages of
  matching of the page's {{inflection of}} templates against the actual
  inflections generated from the lemma page(s); see the long comment below
  for the algorithm.  Returns (newtext, notes) or (None, None) if nothing
  applies."""
  global args
  # Pages under Reconstruction:Latin/ are referred to internally as "*foo".
  if pagetitle.startswith("Reconstruction:Latin/"):
    pagetitle = re.sub("^Reconstruction:Latin/", "*", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  if not args.stdin:
    pagemsg("Processing")
  # Greatly speed things up when --stdin by ignoring non-Latin pages
  if "==Latin==" not in text:
    return None, None
  retval = lalib.find_heads_and_defns(text, pagemsg)
  if retval is None:
    return None, None
  (sections, j, secbody, sectail, has_non_latin, subsections,
   parsed_subsections, headwords, pronun_sections, etym_sections) = retval
  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    # Map the headword template to a part of speech plus the tag groups,
    # slots, head templates and inflection templates expected for it.
    if tn == "la-noun-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "noun form":
      pos = "noun"
      tag_set_groups = lalib.noun_tag_groups
      possible_slots = lalib.la_noun_decl_overrides
      expected_headtemps = ["la-noun"]
      expected_infltemps = ["la-ndecl"]
    elif tn == "la-proper noun-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "proper noun form":
      pos = "pn"
      tag_set_groups = lalib.noun_tag_groups
      possible_slots = lalib.la_noun_decl_overrides
      expected_headtemps = ["la-proper noun"]
      expected_infltemps = ["la-ndecl"]
    #elif tn == "la-pronoun-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "pronoun form":
    #  pos = "pronoun"
    #  tag_set_groups = lalib.adj_tag_groups
    #  possible_slots = lalib.la_adj_decl_overrides
    #  expected_headtemp = ???
    elif tn == "la-verb-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "verb form":
      pos = "verb"
      tag_set_groups = lalib.verb_tag_groups
      possible_slots = lalib.la_verb_overrides
      expected_headtemps = ["la-verb"]
      expected_infltemps = ["la-conj"]
    elif tn == "la-adj-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "adjective form":
      pos = "adj"
      tag_set_groups = lalib.adj_tag_groups
      possible_slots = lalib.la_adj_decl_overrides
      expected_headtemps = ["la-adj", "la-adj-comp", "la-adj-sup"]
      expected_infltemps = ["la-adecl"]
    elif tn == "la-part-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "participle form":
      pos = "part"
      tag_set_groups = lalib.adj_tag_groups
      possible_slots = lalib.la_adj_decl_overrides
      expected_headtemps = ["la-part"]
      expected_infltemps = ["la-adecl"]
    #elif tn == "la-suffix-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "suffix form":
    #  pos = "suffix"
    elif tn == "la-num-form" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") == "numeral form":
      pos = "numadj"
      tag_set_groups = lalib.adj_tag_groups
      possible_slots = lalib.la_adj_decl_overrides
      expected_headtemps = ["la-num-adj"]
      expected_infltemps = ["la-adecl"]
    else:
      continue
    #
    # We have the following:
    #
    # 1. The non-lemma headword, with one or (potentially but unlikely) more
    #    than one headword form.
    # 2. Under the headword, multiple {{inflection of}} templates, each of
    #    which specifies a single lemma under which the non-lemma form
    #    belongs, and one or more corresponding tag sets.
    # 3. The lemma page corresponding to the lemma specified in an
    #    {{inflection of}} template may have one or more lemmas of the right
    #    part of speech. Each lemma specifies one or (potentially but
    #    unlikely) more than one lemma form. Some, all or none of the lemmas
    #    might match the lemma specified in the {{inflection of}} template
    #    in macrons (i.e. there's an exact match between the lemma in the
    #    {{inflection of}} template and one of the actual lemma forms of a
    #    lemma on the page).
    # 4. Under each lemma on the lemma page is one or more inflection
    #    templates specifying the inflections of the lemma. Each inflection
    #    template specifies the non-lemma form(s) (potentially more than one)
    #    for each slot.
    #
    # When looking up a given {{inflection of}} template, the ideal case is
    # that the specified lemma matches one of the actual lemmas, and all
    # corresponding specified non-lemma forms match the corresponding actual
    # non-lemma form(s) for all tag sets. (If there are multiple specified
    # non-lemma forms, they may match across inflection templates if there's
    # more than one, e.g. the first matches the first inflection template and
    # the second matches the second inflection template.)
    #
    # What if there are mismatches?
    #
    # 1. If the specified non-lemma forms are a subset of the actual
    #    non-lemma forms for a given {{inflection of}} template and lemma,
    #    this is still considered a match but we make a note of it (not a
    #    warning).
    # 2. If a single {{inflection of}} template has multiple tag sets in it
    #    and for some but not all tag sets the specified non-lemma forms
    #    match, we consider this a match but issue a warning. (In the future,
    #    we might consider removing the bad tag sets, conditioned on a
    #    separate command-line flag.)
    # 3. If the specified lemma of a given {{inflection of}} template
    #    doesn't match any actual lemmas, we look at all actual lemmas that
    #    match except for macrons and see if, for any of them, the specified
    #    non-lemma forms match the actual non-lemma forms per (1) and (2).
    #    If so, we gather the set of lemma forms for all such lemmas. If
    #    there's only one, we can update the specified lemma in the
    #    {{inflection of}} template (and issue a warning). If there are
    #    multiple, we issue a warning and don't update the specified lemma.
    # 4. We first loop through all {{inflection of}} templates for the given
    #    specified non-lemma forms and check for matches according to
    #    (1), (2) and (3). If some but not all templates match, we issue
    #    a warning and we're done with this non-lemma headword.
    # 5. If there are no matches per (4), we look for the set of actual forms
    #    that match all tag sets of all {{inflection of}} templates when
    #    ignoring macron differences. If there is such a non-empty set,
    #    we can update the specified non-lemma forms in the non-lemma
    #    headword (and issue a warning). When doing so, we may need to
    #    update the corresponding pronunciation template(s), according to
    #    logic still to be determined (FIXME), but similar to or identical to
    #    existing logic in clean_latin_long_vowels.py.
    # 6. If there are no matches per (5), we first look at the possible
    #    assignments of actual lemmas to each possible {{inflection of}}
    #    template (ignoring macron differences). If there's only one such
    #    assignment (i.e. each {{inflection of}} template can be assigned to
    #    only one actual lemma), then for that assignment, we find the
    #    actual forms that match the non-lemma pagename except in macrons and
    #    are common among all the sets of inflections, and update the
    #    specified non-lemma forms in the non-lemma headword using those
    #    forms (and issue a warning). When doing so, we may need to update
    #    the corresponding pronunciation template(s) as in (5). If there are
    #    no forms in common, issue a warning and do nothing.
    # 7. If there are multiple assignments of actual lemmas to
    #    {{inflection of}} templates, we loop over all possible assignments.
    #    For each assignment, we find the set of actual common non-lemma
    #    forms as in (6). If there is more than one assignment with a
    #    non-empty set of actual common non-lemma forms, or no assignment,
    #    we issue a warning and do nothing. Otherwise, we update the
    #    specified non-lemma forms in the non-lemma headword (and
    #    corresponding pronunciation template(s)) as in (6).
    headword_forms = lalib.la_get_headword_from_template(
      ht, pagetitle, pagemsg)
    # Sanity-check the headword forms against the page title; keep only
    # well-formed, non-duplicate forms.
    matching_headword_forms = []
    for headword_form in headword_forms:
      if "[" in headword_form or "|" in headword_form:
        pagemsg(
          "WARNING: Bracket or pipe symbol in non-lemma headword form, should not happen: %s"
          % unicode(ht))
      headword_form = blib.remove_links(headword_form)
      if lalib.remove_macrons(headword_form) != pagetitle:
        pagemsg("WARNING: Bad headword form %s, doesn't match page title: %s"
          % (headword_form, unicode(ht)))
      elif headword_form in matching_headword_forms:
        pagemsg("WARNING: Duplicate headword form %s: %s" % (headword_form,
          unicode(ht)))
      else:
        matching_headword_forms.append(headword_form)
    headword_forms = matching_headword_forms
    # Stages 1, 2, 3 implement points (4), (5) and (6)/(7) above; as soon
    # as one stage succeeds we break out.
    for stage in [1, 2, 3]:
      def stagemsg(txt):
        pagemsg("Stage %s: %s" % (stage, txt))
      def errandstagemsg(txt):
        errandpagemsg("Stage %s: %s" % (stage, txt))
      def yield_infl_of_templates_and_properties():
        # Yield (template, lemma_param, lemma, inflargs_sets, tag_sets) for
        # every well-formed Latin {{inflection of}} under this headword.
        for t in headword['infl_of_templates']:
          lang = getparam(t, "lang")
          if lang:
            lemma_param = 1
          else:
            lang = getparam(t, "1")
            lemma_param = 2
          if lang != "la":
            errandstagemsg(
              "WARNING: In Latin section, found {{inflection of}} for different language %s: %s"
              % (lang, unicode(t)))
            continue
          lemma = getparam(t, str(lemma_param))
          if "[" in lemma or "|" in lemma:
            stagemsg("WARNING: Link in lemma %s, skipping: %s" % (lemma,
              unicode(t)))
            continue
          inflargs_sets = lookup_inflection(lalib.remove_macrons(lemma),
            pos, expected_headtemps, expected_infltemps, stagemsg,
            errandstagemsg)
          if inflargs_sets is None:
            stagemsg("WARNING: Lemma %s doesn't exist or has no %s heads"
              % (lemma, pos))
            continue
          # fetch tags
          tags = []
          for param in t.params:
            pname = unicode(param.name).strip()
            pval = unicode(param.value).strip()
            if re.search("^[0-9]+$", pname):
              if int(pname) >= lemma_param + 2:
                if pval:
                  tags.append(pval)
          # split tags into tag sets (which may be multipart) and further
          # split any multipart tag sets into component tag sets
          tag_sets = [
            tag_set
            for maybe_multipart_tag_set in lalib.split_tags_into_tag_sets(tags)
            for tag_set in lalib.split_multipart_tag_set(
              maybe_multipart_tag_set)
          ]
          yield t, lemma_param, lemma, inflargs_sets, tag_sets
      def merge_forms_for_slot(slot, this_inflargs):
        # Merge the forms of all inflection templates under the given
        # lemma headword
        all_valid_forms = []
        all_valid_forms_with_syncopated = []
        for inflargs in this_inflargs:
          if slot not in inflargs:
            continue
          # NOTE(review): this assignment creates a local variable; the
          # saw_slot_in_inflargs flag in check_for_tag_set_match (stage 1)
          # is never set to True by it, so the "didn't see slot" warnings
          # there fire whenever no lemma matches -- looks like a bug;
          # TODO confirm intent.
          saw_slot_in_inflargs = True
          forms = inflargs[slot].split(",")
          valid_forms = [
            form for form in forms if "[" not in form and "|" not in form
          ]
          for form in valid_forms:
            if form not in all_valid_forms:
              all_valid_forms.append(form)
            if form not in all_valid_forms_with_syncopated:
              all_valid_forms_with_syncopated.append(form)
            # Latin perfect forms in -vi- etc. have syncopated variants
            # with the -v- syllable dropped; consider those too for verbs.
            if pos == "verb" and re.search(u"v[eiē]", form):
              syncopated_form = re.sub(u"^(.*)v[eiē]", r"\1", form)
              if syncopated_form not in all_valid_forms_with_syncopated:
                all_valid_forms_with_syncopated.append(syncopated_form)
        all_matchable_forms = [
          form for form in all_valid_forms
          if lalib.remove_macrons(form) == pagetitle
        ]
        all_matchable_forms_with_syncopated = [
          form for form in all_valid_forms_with_syncopated
          if lalib.remove_macrons(form) == pagetitle
        ]
        return (all_valid_forms, all_valid_forms_with_syncopated,
          all_matchable_forms, all_matchable_forms_with_syncopated)
      if stage == 1:
        # Stage 1: per-template matching, first with exact lemma match,
        # then allowing macron differences in the lemma (point (4)).
        matched_infl_of_templates = False
        for t, lemma_param, lemma, inflargs_sets, tag_sets in \
            yield_infl_of_templates_and_properties():
          def check_for_tag_set_match(tag_set, allow_lemma_mismatch):
            # Return the list of actual lemmas whose inflections for this
            # tag set's slot cover the headword forms; [] on no match.
            slot = lalib.tag_set_to_slot(tag_set, tag_set_groups, stagemsg)
            if slot is None:
              # Already issued warning
              return []
            if slot not in possible_slots:
              stagemsg("WARNING: Unrecognized slot %s from tag set: %s"
                % (slot, unicode(t)))
              return []
            saw_slot_in_inflargs = False
            matching_actual_lemmas = []
            for actual_lemmas, this_inflargs in inflargs_sets:
              saw_matching_lemma = False
              for actual_lemma in actual_lemmas:
                actual_lemma = blib.remove_links(actual_lemma)
                if (lalib.remove_macrons(lemma) ==
                    lalib.remove_macrons(actual_lemma)
                    if allow_lemma_mismatch else lemma == actual_lemma):
                  saw_matching_lemma = True
              if not saw_matching_lemma:
                continue
              (all_valid_forms, all_valid_forms_with_syncopated,
               all_matchable_forms, all_matchable_forms_with_syncopated) = (
                merge_forms_for_slot(slot, this_inflargs))
              matched_form = False
              # Exact match or subset (point (1)), with or without
              # syncopated variants.
              if set(headword_forms) == set(all_matchable_forms):
                stagemsg(
                  "Matched headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) %s)"
                  % (",".join(headword_forms), slot, lemma,
                    ",".join(all_valid_forms)))
                matched_form = True
              elif set(headword_forms) <= set(all_matchable_forms):
                stagemsg(
                  "Matched headword form(s) %s as subset of all matchable slot form(s) %s (slot %s, lemma %s, all valid slot forms(s) %s)"
                  % (",".join(headword_forms),
                    ",".join(all_matchable_forms), slot, lemma,
                    ",".join(all_valid_forms)))
                matched_form = True
              elif set(headword_forms) == set(
                  all_matchable_forms_with_syncopated):
                stagemsg(
                  "Matched syncopated headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)"
                  % (",".join(headword_forms), slot, lemma,
                    ",".join(all_valid_forms_with_syncopated)))
                matched_form = True
              elif set(headword_forms) <= set(
                  all_matchable_forms_with_syncopated):
                stagemsg(
                  "Matched syncopated headword form(s) %s as subset of all matchable slot form(s) + syncopation %s (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)"
                  % (",".join(headword_forms),
                    ",".join(all_matchable_forms_with_syncopated), slot,
                    lemma, ",".join(all_valid_forms_with_syncopated)))
                matched_form = True
              if matched_form:
                for actual_lemma in actual_lemmas:
                  if actual_lemma not in matching_actual_lemmas:
                    matching_actual_lemmas.append(actual_lemma)
            if not matching_actual_lemmas:
              if not saw_slot_in_inflargs:
                if "pasv" in slot:
                  stagemsg(
                    "WARNING: For headword forms %s, didn't see passive slot %s in inflections of lemma %s, probably need to delete passive forms of verb"
                    % (",".join(headword_forms), slot, lemma))
                else:
                  stagemsg(
                    "WARNING: For headword forms %s, didn't see slot %s in inflections of lemma %s"
                    % (",".join(headword_forms), slot, lemma))
            return matching_actual_lemmas
          saw_matching_lemma = False
          for actual_lemmas, this_inflargs in inflargs_sets:
            if lemma in [blib.remove_links(x) for x in actual_lemmas]:
              saw_matching_lemma = True
              break
          if saw_matching_lemma:
            # Exact lemma present: match tag sets with strict lemma
            # comparison.
            tag_set_matches = []
            tag_set_mismatches = []
            for tag_set in tag_sets:
              matching_lemmas = check_for_tag_set_match(tag_set,
                allow_lemma_mismatch=False)
              if matching_lemmas:
                tag_set_matches.append(tag_set)
              else:
                tag_set_mismatches.append(tag_set)
            if len(tag_set_matches) > 0:
              matched_infl_of_templates = True
              if len(tag_set_mismatches) > 0:
                stagemsg(
                  "WARNING: Matched tag sets %s but not %s, counting as a match: %s"
                  % (",".join("|".join(tag_set)
                      for tag_set in tag_set_matches),
                    ",".join("|".join(tag_set)
                      for tag_set in tag_set_mismatches), unicode(t)))
            else:
              stagemsg("WARNING: Couldn't match any tag sets: %s"
                % unicode(t))
          else:
            # No exact lemma: retry allowing macron mismatches and maybe
            # fix the lemma spelling in the template (point (3)).
            stagemsg(
              "WARNING: Couldn't match lemma %s among potential lemmas %s, trying without lemma matches: %s"
              % (lemma,
                ",".join(actual_lemma
                  for actual_lemmas, this_inflargs in inflargs_sets
                  for actual_lemma in actual_lemmas), unicode(t)))
            tag_set_matches = []
            tag_set_mismatches = []
            all_matching_lemmas = []
            for tag_set in tag_sets:
              matching_lemmas = check_for_tag_set_match(tag_set,
                allow_lemma_mismatch=True)
              if matching_lemmas:
                tag_set_matches.append(tag_set)
                for matching_lemma in matching_lemmas:
                  if matching_lemma not in all_matching_lemmas:
                    all_matching_lemmas.append(matching_lemma)
              else:
                tag_set_mismatches.append(tag_set)
            if len(tag_set_matches) > 0:
              matched_infl_of_templates = True
              if len(all_matching_lemmas) == 1:
                # Unambiguous: rewrite the lemma param in place.
                notes.append(
                  "fix macrons in lemma of '%s' (stage 1): %s -> %s"
                  % (tname(t), lemma, all_matching_lemmas[0]))
                if len(tag_set_mismatches) > 0:
                  stagemsg(
                    "WARNING: Fixing macrons in lemma %s -> %s despite only some tag sets %s but not %s matching, counting as a match: %s"
                    % (lemma, all_matching_lemmas[0],
                      ",".join("|".join(tag_set)
                        for tag_set in tag_set_matches),
                      ",".join("|".join(tag_set)
                        for tag_set in tag_set_mismatches), unicode(t)))
                else:
                  stagemsg(
                    "WARNING: Fixing macrons in lemma %s -> %s; all tag sets match: %s"
                    % (lemma, all_matching_lemmas[0], unicode(t)))
                origt = unicode(t)
                t.add(str(lemma_param), all_matching_lemmas[0])
                stagemsg("Replaced %s with %s" % (origt, unicode(t)))
              else:
                if len(tag_set_mismatches) > 0:
                  stagemsg(
                    "WARNING: Multiple possible lemmas %s match some tag sets %s but not %s, counting as a match but not updating lemma %s: %s"
                    % (",".join(all_matching_lemmas),
                      ",".join("|".join(tag_set)
                        for tag_set in tag_set_matches),
                      ",".join("|".join(tag_set)
                        for tag_set in tag_set_mismatches), lemma,
                      unicode(t)))
                else:
                  stagemsg(
                    "WARNING: Multiple possible lemmas %s match tag sets, with all tag sets matching, counting as a match but not updating lemma %s: %s"
                    % (",".join(all_matching_lemmas), lemma, unicode(t)))
            else:
              stagemsg(
                "WARNING: Couldn't match any tag sets even when allowing macron mismatches with lemma %s: %s"
                % (lemma, unicode(t)))
        if matched_infl_of_templates:
          break
      elif stage == 2:
        # Stage 2: intersect the matchable actual forms across all tag
        # sets of all templates (exact lemma match) and rewrite the
        # headword form(s) to the common set (point (5)).
        common_forms = None
        no_common_forms = False
        for t, lemma_param, lemma, inflargs_sets, tag_sets in \
            yield_infl_of_templates_and_properties():
          for tag_set in tag_sets:
            slot = lalib.tag_set_to_slot(tag_set, tag_set_groups, stagemsg)
            if slot is None or slot not in possible_slots:
              # Already issued warning
              no_common_forms = True
              break
            this_tag_set_matching_forms = []
            combined_this_inflargs = []
            for actual_lemmas, this_inflargs in inflargs_sets:
              for actual_lemma in actual_lemmas:
                actual_lemma = blib.remove_links(actual_lemma)
                if lemma == actual_lemma:
                  combined_this_inflargs.extend(this_inflargs)
                  break
            if not combined_this_inflargs:
              continue
            (all_valid_forms, all_valid_forms_with_syncopated,
             all_matchable_forms, all_matchable_forms_with_syncopated) = (
              merge_forms_for_slot(slot, combined_this_inflargs))
            for form in all_matchable_forms:
              if form not in this_tag_set_matching_forms:
                this_tag_set_matching_forms.append(form)
            if common_forms is None:
              common_forms = this_tag_set_matching_forms
              if len(common_forms) == 0:
                no_common_forms = True
                break
            else:
              new_common_forms = []
              for form in common_forms:
                if form in this_tag_set_matching_forms:
                  new_common_forms.append(form)
              common_forms = new_common_forms
              if len(common_forms) == 0:
                no_common_forms = True
                break
          if no_common_forms:
            break
        if no_common_forms or common_forms is None:
          stagemsg(
            "WARNING: No forms match pagetitle %s across all {{inflection of}} tags and tag sets, not changing headword form(s) but trying again allowing macron differences in lemmas: %s"
            % (pagetitle, unicode(ht)))
        else:
          notes.append("fix macrons in forms of '%s' (stage 2): %s -> %s"
            % (tname(ht), ",".join(headword_forms),
              ",".join(common_forms)))
          oright = unicode(ht)
          if tname(ht) == "head":
            blib.set_param_chain(ht, common_forms, "head", "head")
          else:
            blib.set_param_chain(ht, common_forms, "1", "head")
          stagemsg("Replaced %s with %s" % (oright, unicode(ht)))
          if len(common_forms) > 1:
            stagemsg(
              "WARNING: FIXME: No support yet for pronunciation for multiple headword forms %s"
              % ",".join(common_forms))
          else:
            assert len(common_forms) == 1
            clean_latin_long_vowels.process_pronun_templates(
              headword['pronun_section'], common_forms[0], stagemsg, notes,
              "fix macrons in pronun of '%%s' (stage 2): %s -> %s"
              % (",".join(headword_forms), ",".join(common_forms)))
          break
      else:
        assert stage == 3
        # Stage 3: allow macron differences in lemmas; enumerate all
        # possible assignments of actual lemmas to {{inflection of}}
        # templates and accept only a unique workable one (points (6)/(7)).
        multiple_assignments = False
        infl_of_assignments = []
        for t, lemma_param, lemma, inflargs_sets, tag_sets in \
            yield_infl_of_templates_and_properties():
          matching_lemmas = []
          for actual_lemmas, this_inflargs in inflargs_sets:
            for actual_lemma in actual_lemmas:
              actual_lemma = blib.remove_links(actual_lemma)
              if lalib.remove_macrons(lemma) == lalib.remove_macrons(
                  actual_lemma):
                if actual_lemma not in matching_lemmas:
                  matching_lemmas.append(actual_lemma)
          if len(matching_lemmas) > 1:
            stagemsg(
              "WARNING: Multiple actual lemmas %s match {{inflection of}} lemma %s, hence multiple assignments, doing things the hard way: %s"
              % (",".join(matching_lemmas), lemma, unicode(t)))
            multiple_assignments = True
          infl_of_assignments.append(matching_lemmas)
        cur_assignment = None
        cur_common_forms = None
        for assignment in itertools.product(*infl_of_assignments):
          common_forms = None
          no_common_forms = False
          for actual_lemma, (t, lemma_param, lemma, inflargs_sets,
              tag_sets) in zip(assignment,
                yield_infl_of_templates_and_properties()):
            for tag_set in tag_sets:
              slot = lalib.tag_set_to_slot(tag_set, tag_set_groups,
                stagemsg)
              if slot is None or slot not in possible_slots:
                # Already issued warning
                no_common_forms = True
                break
              this_tag_set_matching_forms = []
              combined_this_inflargs = []
              for actual_lemmas, this_inflargs in inflargs_sets:
                if actual_lemma in actual_lemmas:
                  combined_this_inflargs.extend(this_inflargs)
              (all_valid_forms, all_valid_forms_with_syncopated,
               all_matchable_forms,
               all_matchable_forms_with_syncopated) = (
                merge_forms_for_slot(slot, combined_this_inflargs))
              for form in all_matchable_forms:
                if form not in this_tag_set_matching_forms:
                  this_tag_set_matching_forms.append(form)
              if common_forms is None:
                common_forms = this_tag_set_matching_forms
                if len(common_forms) == 0:
                  no_common_forms = True
                  break
              else:
                new_common_forms = []
                for form in common_forms:
                  if form in this_tag_set_matching_forms:
                    new_common_forms.append(form)
                common_forms = new_common_forms
                if len(common_forms) == 0:
                  no_common_forms = True
                  break
            if no_common_forms:
              break
          if not no_common_forms and common_forms is not None:
            if cur_assignment:
              stagemsg(
                "WARNING: Multiple assignments of lemmas have common forms, at least %s -> %s and %s -> %s, not changing: %s"
                % (",".join(cur_assignment), ",".join(cur_common_forms),
                  ",".join(assignment), ",".join(common_forms),
                  unicode(ht)))
            else:
              cur_assignment = assignment
              cur_common_forms = common_forms
        if cur_assignment is None:
          stagemsg(
            "WARNING: No forms match pagetitle %s across all {{inflection of}} tags and tag sets when allowing macron differences in lemmas, not changing headword form(s): %s"
            % (pagetitle, unicode(ht)))
        else:
          # Unique workable assignment: rewrite lemmas and headword forms.
          for actual_lemma, (t, lemma_param, lemma, inflargs_sets,
              tag_sets) in zip(cur_assignment,
                yield_infl_of_templates_and_properties()):
            notes.append("fix macrons in lemma of '%s' (stage 3): %s -> %s"
              % (tname(t), lemma, actual_lemma))
            stagemsg(
              "WARNING: found common forms %s, updating lemma %s to %s: %s"
              % (",".join(cur_common_forms), lemma, actual_lemma,
                unicode(t)))
            origt = unicode(t)
            t.add(str(lemma_param), actual_lemma)
            stagemsg("Replaced %s with %s" % (origt, unicode(t)))
          notes.append("fix macrons in forms of '%s' (stage 3): %s -> %s"
            % (tname(ht), ",".join(headword_forms),
              ",".join(cur_common_forms)))
          oright = unicode(ht)
          if tname(ht) == "head":
            blib.set_param_chain(ht, cur_common_forms, "head", "head")
          else:
            blib.set_param_chain(ht, cur_common_forms, "1", "head")
          stagemsg("Replaced %s with %s" % (oright, unicode(ht)))
          if len(cur_common_forms) > 1:
            stagemsg(
              "WARNING: FIXME: No support yet for pronunciation for multiple headword forms %s"
              % ",".join(cur_common_forms))
          else:
            assert len(cur_common_forms) == 1
            clean_latin_long_vowels.process_pronun_templates(
              headword['pronun_section'], cur_common_forms[0], stagemsg,
              notes, "fix macrons in pronun of '%%s' (stage 3): %s -> %s"
              % (",".join(headword_forms), ",".join(cur_common_forms)))
          break
  # Reassemble the Latin section from the (possibly modified) parsed
  # subsections.
  secbody = "".join(unicode(x) for x in parsed_subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Read lines from stdin and echo each one with its Latin macrons removed.

import blib
from blib import msg
import sys
import lalib

parser = blib.create_argparser("Remove Latin macrons from input",
    no_beginning_line=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for line_index, raw_line in blib.iter_items(sys.stdin, start, end):
  # Decode from UTF-8 (Python 2 byte string) before stripping macrons.
  cleaned = raw_line.strip().decode('utf-8')
  msg(lalib.remove_macrons(cleaned))
# Command-line setup: either process an explicit list of lemmas (from a
# file or a comma-separated option) or fall back to iterating over pages
# in the default Latin categories.
parser.add_argument('--lemma-file',
    help="File containing lemmas to process, one per line; non-lemma forms will be done")
parser.add_argument('--lemmas',
    help="List of comma-separated lemmas to process; non-lemma forms will be done")
parser.add_argument("--slots",
    help="Slots to process in conjunction with --lemmas and --lemma-file.")
parser.add_argument('--override-pronun', action="store_true",
    help="Override existing pronunciations")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.lemma_file or args.lemmas:
  slots = args.slots.split(",")
  if args.lemma_file:
    lemmas = read_pages(args.lemma_file, start, end)
  else:
    lemmas = blib.iter_items(re.split(",", args.lemmas.decode("utf-8")),
        start, end)
  for i, lemma in lemmas:
    process_lemma(i, lalib.remove_macrons(lemma), slots, args)
else:
  def do_process_page(page, index, parsed):
    # NOTE(review): argument order here is (index, page) -- looks like this
    # script's process_page takes (index, page, args); confirm against its
    # definition elsewhere in the file.
    return process_page(index, page, args)
  blib.do_pagefile_cats_refs(args, start, end, do_process_page,
      default_cats=["Latin lemmas", "Latin non-lemma forms"], edit=True)

def subval_to_string(subval):
  # Render a manual-pronunciation substitution value: either a
  # (pron, extra_params, pre, post) tuple (formatted via FoundPronun) or a
  # plain string returned unchanged.
  if type(subval) is tuple:
    pron, extra_params, pre, post = subval
    return unicode(FoundPronun(pron, extra_params, pre, post))
  else:
    return subval

# Loop body lies beyond this chunk of the file.
for regex, subvals in manual_pronun_mapping:
def expand_text(tempcall):
  # Expand a template call against the macron-less lemma page title.
  # NOTE(review): closure fragment -- `lemma`, `pagemsg` and `verbose` are
  # free variables captured from an enclosing scope not visible here.
  return blib.expand_text(tempcall, remove_macrons(lemma), pagemsg, verbose)
def process_page(page, index):
  """For a participle or proper-noun lemma page, generate all of its
  non-lemma inflected forms and edit each form's page via process_form.

  Only acts when the page has participle headwords but no adjective ones,
  or proper-noun headwords but no common-noun ones (to avoid ambiguous
  part-of-speech cases).  Returns (None, None) or falls through with no
  return value after dispatching the per-form edits."""
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  text = unicode(page.text)
  retval = lalib.find_heads_and_defns(text, pagemsg)
  if retval is None:
    return None, None
  (sections, j, secbody, sectail, has_non_latin, subsections,
   parsed_subsections, headwords, pronun_sections, etym_sections) = retval
  # Bucket the page's headwords by part of speech.
  part_headwords = []
  adj_headwords = []
  pn_headwords = []
  noun_headwords = []
  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    if tn == "la-part" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(
        ht, "2") in ["participle", "participles"]:
      part_headwords.append(headword)
    elif tn == "la-adj" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(
        ht, "2") in ["adjective", "adjectives"]:
      adj_headwords.append(headword)
    elif tn == "la-proper noun" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(
        ht, "2") in ["proper noun", "proper nouns"]:
      pn_headwords.append(headword)
    elif tn == "la-noun" or tn == "head" and getparam(
        ht, "1") == "la" and getparam(ht, "2") in ["noun", "nouns"]:
      noun_headwords.append(headword)
  headwords_to_do = None
  if part_headwords and not adj_headwords:
    pos = "part"
    headwords_to_do = part_headwords
    expected_inflt = "la-adecl"
  elif pn_headwords and not noun_headwords:
    pos = "pn"
    headwords_to_do = pn_headwords
    expected_inflt = "la-ndecl"
  if not headwords_to_do:
    return None, None
  for headword in headwords_to_do:
    for inflt in headword['infl_templates']:
      infltn = tname(inflt)
      if infltn != expected_inflt:
        pagemsg(
          "WARNING: Saw bad declension template for %s, expected {{%s}}: %s"
          % (pos, expected_inflt, unicode(inflt)))
        continue
      inflargs = lalib.generate_infl_forms(pos, unicode(inflt),
        errandpagemsg, expand_text)
      # Collect each distinct non-lemma form (deduplicated on the
      # macron-less spelling, and skipping the lemma page itself).
      forms_seen = set()
      slots_and_forms_to_process = []
      for slot, formarg in inflargs.iteritems():
        forms = formarg.split(",")
        for form in forms:
          if "[" in form or "|" in form:
            continue
          form_no_macrons = lalib.remove_macrons(form)
          if form_no_macrons == pagetitle:
            continue
          if form_no_macrons in forms_seen:
            continue
          forms_seen.add(form_no_macrons)
          slots_and_forms_to_process.append((slot, form))
      for formindex, (slot, form) in blib.iter_items(
          sorted(slots_and_forms_to_process,
            key=lambda x: lalib.remove_macrons(x[1]))):
        # NOTE(review): `handler` closes over `slot`/`form`; this is safe
        # only if blib.do_edit invokes it synchronously within this
        # iteration -- presumably it does; confirm in blib.
        def handler(page, formindex, parsed):
          return process_form(page, formindex, slot, form, pos, pagemsg)
        blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
          "%s.%s" % (index, formindex), handler, save=args.save,
          verbose=args.verbose, diff=args.diff)
def process_page(page, index, parsed):
  """Convert old per-participle headword templates ({{la-future participle}},
  {{la-perfect participle}}, {{la-gerundive}}, {{la-present participle}})
  on a page into the unified {{la-part}} template.

  Returns (newtext, notes)."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  pagemsg("Processing")
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    allow_2 = False
    lemma = None
    if tn in ["la-future participle", "la-perfect participle",
        "la-gerundive"]:
      # These templates hold the participle stem in param 1; the lemma is
      # stem + "us".
      base = getparam(t, "1")
      if tn == "la-gerundive":
        param2 = getparam(t, "2")
        if param2:
          if lalib.remove_macrons(base) != lalib.remove_macrons(param2):
            pagemsg("WARNING: Unrecognized param 2: %s" % origt)
            continue
          # Param 2 is the macron-bearing variant of the stem; prefer it.
          allow_2 = True
          base = param2
      if not base:
        pagemsg("WARNING: Empty param 1: %s" % origt)
        continue
      lemma = base + "us"
    elif tn == "la-present participle":
      base = getparam(t, "1")
      ending = getparam(t, "2")
      if not base:
        pagemsg("WARNING: Empty param 1: %s" % origt)
        continue
      if not ending:
        pagemsg("WARNING: Empty param 2: %s" % origt)
        continue
      # Build the nominative singular from the stem and ending class.
      if ending == "iens":
        lemma = u"%siēns/%seunt" % (base, base)
      elif ending in ("ans", "ens"):
        lemma = base + (u"āns" if ending == "ans" else u"ēns")
      else:
        pagemsg("WARNING: Unrecognized param 2: %s" % origt)
        continue
      allow_2 = True
    if lemma:
      # Refuse to convert if the template carries any parameter beyond
      # those we know how to translate.
      bad_param = False
      for param in t.params:
        pname = unicode(param.name)
        stripped = pname.strip()
        if stripped == "1" or (allow_2 and stripped == "2"):
          continue
        pagemsg("WARNING: Unrecognized param %s=%s: %s"
          % (pname, param.value, origt))
        bad_param = True
      if bad_param:
        continue
      rmparam(t, "2")
      t.add("1", lemma)
      blib.set_template_name(t, "la-part")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append(u"convert {{%s}} to {{la-part}}" % tn)
  return unicode(parsed), notes
def expand_text(tempcall):
  # Expand a template call against the lemma's page title with macrons
  # removed but diaereses preserved.
  # NOTE(review): closure fragment -- `lemma`, `preserve_diaeresis`,
  # `pagemsg` and `verbose` are free variables from an enclosing scope
  # not visible here.
  return blib.expand_text(tempcall,
    remove_macrons(lemma, preserve_diaeresis), pagemsg, verbose)
def process_page(page, index):
  """For a Latin noun or proper-noun lemma page, generate all declined
  forms (via a synthesized {{la-generate-noun-forms}} call) and edit each
  form's page via process_form.

  Skips pages with ambiguous headwords (multiple nouns, both noun and
  proper noun, or neither) and indeclinable nouns.  Returns None."""
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  parsed = blib.parse_text(secbody)
  # Locate exactly one {{la-noun}} or {{la-proper noun}} headword.
  saw_noun = None
  saw_proper_noun = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-noun":
      if saw_noun:
        pagemsg(
          "WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping"
          % (unicode(saw_noun), unicode(t)))
        return
      saw_noun = t
    elif tn == "la-proper noun":
      if saw_proper_noun:
        pagemsg(
          "WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping"
          % (unicode(saw_proper_noun), unicode(t)))
        return
      saw_proper_noun = t
  if saw_noun and saw_proper_noun:
    pagemsg(
      "WARNING: Saw both noun and proper noun, can't correct header/headword"
    )
    return
  if not saw_noun and not saw_proper_noun:
    pagemsg(
      "WARNING: Saw neither noun nor proper noun, can't correct header/headword"
    )
    return
  pos = "pn" if saw_proper_noun else "n"
  ht = saw_proper_noun or saw_noun
  if getparam(ht, "indecl"):
    pagemsg("Noun is indeclinable, skipping: %s" % unicode(ht))
    return
  # Clone the headword template, rename it to the forms-generating module
  # invocation and strip the params irrelevant to declension.
  generate_template = blib.parse_text(unicode(ht)).filter_templates()[0]
  blib.set_template_name(generate_template, "la-generate-noun-forms")
  blib.remove_param_chain(generate_template, "lemma", "lemma")
  blib.remove_param_chain(generate_template, "m", "m")
  blib.remove_param_chain(generate_template, "f", "f")
  blib.remove_param_chain(generate_template, "g", "g")
  rmparam(generate_template, "type")
  rmparam(generate_template, "indecl")
  rmparam(generate_template, "id")
  rmparam(generate_template, "pos")
  result = expand_text(unicode(generate_template))
  if not result:
    pagemsg("WARNING: Error generating forms, skipping")
    return
  tempargs = blib.split_generate_args(result)
  # Collect each distinct non-lemma form (deduplicated on the macron-less
  # spelling, skipping the lemma page itself and linked/piped forms).
  forms_seen = set()
  slots_and_forms_to_process = []
  for slot, formarg in tempargs.iteritems():
    forms = formarg.split(",")
    for form in forms:
      if "[" in form or "|" in form:
        continue
      form_no_macrons = lalib.remove_macrons(form)
      if form_no_macrons == pagetitle:
        continue
      if form_no_macrons in forms_seen:
        continue
      forms_seen.add(form_no_macrons)
      slots_and_forms_to_process.append((slot, form))
  # NOTE(review): this loop variable shadows the function parameter
  # `index`; harmless here since the parameter isn't used afterwards, but
  # worth renaming.
  for index, (slot, form) in blib.iter_items(
      sorted(slots_and_forms_to_process,
        key=lambda x: lalib.remove_macrons(x[1]))):
    def handler(page, index, parsed):
      return process_form(page, index, slot, form, pos)
    blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)), index,
      handler, save=args.save, verbose=args.verbose, diff=args.diff)
def process_page(page, index, parsed):
  """Fix ===Adverb=== subsections of a Latin entry that use a generic
  {{head|la|...}} headword together with {{comparative of}}/{{superlative of}}:
  replace the {{head}} with {{la-adv-comp}}/{{la-adv-sup}} and correct the
  positive/comparative/superlative degrees using find_head_comp_sup().

  Returns (newtext, notes) on success, (None, None) if no Latin section.

  NOTE(review): this source was recovered from a whitespace-mangled copy; the
  nesting below the find_head_comp_sup() call was reconstructed and should be
  verified against the original script.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []
  # Split on L3 headers; odd indexes are the "===...===" header lines and the
  # following even index is that subsection's body.
  subsections = re.split("(^===[^=\n]*===\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsections), 2):
    if "==Adverb==" in subsections[k - 1]:
      parsed = blib.parse_text(subsections[k])
      # First pass: find exactly one {{comparative of}} or {{superlative of}}
      # and pull out the positive degree from its arg 1.
      posdeg = None
      compt = None
      supt = None
      for t in parsed.filter_templates():
        if tname(t) == "comparative of":
          if compt:
            pagemsg("WARNING: Saw multiple {{comparative of}}: %s and %s" % (unicode(compt), unicode(t)))
          else:
            compt = t
            posdeg = blib.remove_links(getparam(t, "1"))
            if not posdeg:
              pagemsg("WARNING: Didn't see positive degree in {{comparative of}}: %s" % unicode(t))
        elif tname(t) == "superlative of":
          if supt:
            pagemsg("WARNING: Saw multiple {{superlative of}}: %s and %s" % (unicode(supt), unicode(t)))
          else:
            supt = t
            posdeg = blib.remove_links(getparam(t, "1"))
            if not posdeg:
              pagemsg("WARNING: Didn't see positive degree in {{superlative of}}: %s" % unicode(t))
      if compt and supt:
        pagemsg("WARNING: Saw both comparative and superlative, skipping: %s and %s" % (unicode(compt), unicode(supt)))
        continue
      if not compt and not supt:
        pagemsg("WARNING: Didn't see {{comparative of}} or {{superlative of}} in section %s" % k)
        continue
      # Second pass: rewrite the {{head|la|adverb ...}} headword template.
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["la-adv-comp", "la-adv-sup"]:
          # Already converted by a previous run; nothing to do here.
          pagemsg("Already saw fixed headword: %s" % unicode(t))
          break
        if tn == "head":
          if not getparam(t, "1") == "la":
            pagemsg("WARNING: Saw wrong language in {{head}}: %s" % unicode(t))
          else:
            pos = getparam(t, "2")
            head = blib.remove_links(getparam(t, "head")) or pagetitle
            if pos not in ["adverb", "adverbs", "adverb form", "adverb forms", "adverb comparative form", "adverb comparative forms", "adverb superlative form", "adverb superlative forms",]:
              pagemsg("WARNING: Unrecognized part of speech '%s': %s" % (pos, unicode(t)))
            else:
              # Look up the canonical (macron-bearing) positive, comparative
              # and superlative for this adverb's positive degree.
              real_head, real_comp, real_sup = find_head_comp_sup(lalib.remove_macrons(posdeg), pagemsg)
              if real_head:
                if lalib.remove_macrons(real_head) != lalib.remove_macrons(posdeg):
                  pagemsg("WARNING: Can't replace positive degree %s with %s because they differ when macrons are removed" % (posdeg, real_head))
                else:
                  # Fix arg 1 of the {{comparative of}}/{{superlative of}}.
                  pagemsg("Using real positive degree %s instead of %s" % (real_head, posdeg))
                  inflt = compt or supt
                  origt = unicode(inflt)
                  inflt.add("1", real_head)
                  pagemsg("Replaced %s with %s" % (origt, unicode(inflt)))
                # Pick the replacement headword template name, and correct
                # the head itself against the canonical comp/sup when known.
                if compt:
                  newname = "la-adv-comp"
                  infldeg = "comparative"
                  if real_comp and real_comp != "-":
                    if lalib.remove_macrons(real_comp) != lalib.remove_macrons(head):
                      pagemsg("WARNING: Can't replace comparative degree %s with %s because they differ when macrons are removed" % (head, real_comp))
                    else:
                      pagemsg("Using real comparative degree %s instead of %s" % (real_comp, head))
                      head = real_comp
                  else:
                    pagemsg("WARNING: Couldn't retrieve real comparative for positive degree %s" % real_head)
                else:
                  newname = "la-adv-sup"
                  infldeg = "superlative"
                  if real_sup and real_sup != "-":
                    if lalib.remove_macrons(real_sup) != lalib.remove_macrons(head):
                      pagemsg("WARNING: Can't replace superlative degree %s with %s because they differ when macrons are removed" % (head, real_sup))
                    else:
                      pagemsg("Using real superlative degree %s instead of %s" % (real_sup, head))
                      head = real_sup
                  else:
                    pagemsg("WARNING: Couldn't retrieve real superlative for positive degree %s" % real_head)
                # Swap {{head|la|...}} for the dedicated headword template.
                origt = unicode(t)
                rmparam(t, "head")
                rmparam(t, "2")
                rmparam(t, "1")
                blib.set_template_name(t, newname)
                t.add("1", head)
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                notes.append("replace {{head|la|...}} with {{%s}} and fix up positive/%s" % (newname, infldeg))
      subsections[k] = unicode(parsed)
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, adverb):
  """Add `adverb` to the adv= parameter chain of the Latin adjective
  ({{la-adj}}) or participle ({{la-part}}) headword template on `page`.

  If an adverb matching `adverb` up to macrons is already listed, its macrons
  are updated in place instead of appending a duplicate.

  Returns (newtext, notes) if a suitable template was found, else (None, None).
  """
  # Bug fix: the body referred to `adv`, which was never defined (the
  # parameter is named `adverb`). Alias it locally to keep the external
  # parameter name stable for callers.
  adv = adverb
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []
  pagemsg("Processing")
  text = unicode(page.text)
  parsed = blib.parse_text(text)
  # Find the (single) adjective and/or participle headword template.
  adj_template = None
  part_template = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-adj":
      if adj_template:
        pagemsg("WARNING: Saw multiple adjective templates: %s and %s" % (unicode(adj_template), unicode(t)))
      else:
        adj_template = t
    if tn == "la-part":
      if part_template:
        pagemsg("WARNING: Saw multiple participle templates: %s and %s" % (unicode(part_template), unicode(t)))
      else:
        part_template = t
  if adj_template and part_template:
    pagemsg("Saw both %s and %s, modifying adjective" % (unicode(adj_template), unicode(part_template)))
  # Prefer the adjective when both exist.
  if adj_template:
    template_to_fix = adj_template
  elif part_template:
    template_to_fix = part_template
  else:
    pagemsg("WARNING: Didn't see adjective or participle template")
    return None, None
  existing_advs = blib.fetch_param_chain(template_to_fix, "adv", "adv")
  changed = False
  for i in xrange(len(existing_advs)):
    if lalib.remove_macrons(existing_advs[i]) == lalib.remove_macrons(adv):
      if existing_advs[i] != adv:
        pagemsg("Updating macrons of %s -> %s in %s" % (existing_advs[i], adv, unicode(template_to_fix)))
        # Bug fix: record the note *before* overwriting the slot, so the
        # changelog shows old -> new rather than new -> new.
        notes.append("update macrons of adv=, changing %s -> %s" % (existing_advs[i], adv))
        existing_advs[i] = adv
        changed = True
      else:
        pagemsg("Already saw %s: %s" % (adv, unicode(template_to_fix)))
      break
  else: # no break: no existing adverb matches even up to macrons; append it.
    existing_advs.append(adv)
    changed = True
    notes.append("add adv %s to adjective" % adv)
  if changed:
    origt = unicode(template_to_fix)
    blib.set_param_chain(template_to_fix, existing_advs, "adv", "adv")
    pagemsg("Replaced %s with %s" % (origt, unicode(template_to_fix)))
  return unicode(parsed), notes