Esempio n. 1
0
def process_lemma(index, pagetitle, slots, program_args):
  """Run `process_page` on the form pages generated by a lemma's inflections.

  Parses the lemma page `pagetitle`, and for each {{la-conj}}, {{la-ndecl}}
  or {{la-adecl}} template found on it generates the inflected forms. Every
  form belonging to a slot selected by `slots` is then handed to
  `blib.do_edit`, which is given a callback that invokes `process_page`.

  index: index of the lemma; used in log messages and passed to
    `blib.do_edit`.
  pagetitle: title of the lemma page.
  slots: iterable of slot specs. A slot is selected when it equals a spec or
    when `lalib.slot_matches_spec(slot, spec)` is true.
  program_args: object providing the `verbose`, `save` and `diff` attributes.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose)

  # The callback does not depend on any loop variable, so define it once.
  def do_process_page(page, index, parsed):
    return process_page(index, page, program_args)

  # Inflection template name -> part of speech given to generate_infl_forms().
  template_to_pos = {
    "la-conj": "verb",
    "la-ndecl": "noun",
    "la-adecl": "adj",
  }

  page = pywikibot.Page(site, pagetitle)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    pos = template_to_pos.get(tname(t))
    if not pos:
      continue
    args = lalib.generate_infl_forms(pos, unicode(t), errandpagemsg, expand_text)
    if args is None:
      # generate_infl_forms() can return None (the other callers in this file
      # guard against it); skip this template instead of crashing on it.
      continue
    for slot in args:
      selected = any(
        spec == slot or lalib.slot_matches_spec(slot, spec) for spec in slots)
      if not selected:
        continue
      # A slot value may hold several comma-separated forms.
      for formpagename in args[slot].split(","):
        if "[" in formpagename or "|" in formpagename:
          pagemsg("WARNING: Skipping page %s with links in it" % formpagename)
          continue
        formpagename = lalib.remove_macrons(formpagename)
        formpage = pywikibot.Page(site, formpagename)
        if not formpage.exists():
          pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
        elif formpagename == pagetitle:
          pagemsg("WARNING: Skipping dictionary form")
        else:
          blib.do_edit(formpage, index, do_process_page,
              save=program_args.save, verbose=program_args.verbose,
              diff=program_args.diff)
def process_page(index, pos, lemma, subs, infl, save, verbose):
  """Run `process_form` on the page of every form generated from `infl`.

  The forms are generated with `lalib.generate_infl_forms(pos, infl, ...)`.
  Each slot value is split on commas and every resulting form, with macrons
  removed, names a page that is passed to `blib.do_edit`. Returns None
  without editing anything when form generation returns None.

  index, lemma: used in log messages and forwarded to `process_form`.
  subs: forwarded unchanged to `process_form`.
  save, verbose: forwarded to `blib.do_edit` (`verbose` also to
    `blib.expand_text`).
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, lemma, txt))

  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, lemma, txt))

  def expand_text(tempcall):
    lemma_title = remove_macrons(lemma)
    return blib.expand_text(tempcall, lemma_title, pagemsg, verbose)

  pagemsg("Processing")

  generated = lalib.generate_infl_forms(pos, infl, errandpagemsg, expand_text)
  if generated is None:
    return

  # Flatten every slot's comma-separated value into one list of forms.
  all_forms = []
  for _slot, formspec in generated.iteritems():
    all_forms += formspec.split(",")

  for formind, form in blib.iter_items(all_forms):
    def handler(page, formind, parsed):
      return process_form(index, page, lemma, formind, form, subs)
    formpage = pywikibot.Page(site, remove_macrons(form))
    blib.do_edit(formpage, formind, handler, save=save, verbose=verbose)
def lookup_inflection(lemma_no_macrons, pos, expected_headtemps,
                      expected_infltemps, pagemsg, errandpagemsg):
    """Find a lemma's matching headwords and generate their inflected forms.

    lemma_no_macrons: lemma to look up; a leading "*" maps the lookup to the
      "Reconstruction:Latin/" page for the rest of the string.
    pos: part of speech, passed to `lalib.generate_infl_forms` and used in
      warning messages.
    expected_headtemps: container of acceptable headword template names.
    expected_infltemps: iterable of acceptable inflection template names.
    pagemsg, errandpagemsg: logging callbacks; inside this function they are
      wrapped so each message is prefixed with `lemma_no_macrons`.

    Returns a list of `(heads, inflargs_list)` tuples, one per headword whose
    template name is in `expected_headtemps` and one of whose heads equals
    `lemma_no_macrons` once links and macrons are removed. Returns None when
    the title is invalid, the page does not exist,
    `lalib.find_heads_and_defns` returns None, or no inflection template of a
    matching headword produced forms.

    Reads and writes the module-level `expand_text_cache` and
    `heads_and_defns_cache`, and reads the module-level `args.verbose`.
    """
    global args
    lemma_pagetitle = lemma_no_macrons
    if lemma_pagetitle.startswith("*"):
        lemma_pagetitle = "Reconstruction:Latin/" + lemma_pagetitle[1:]

    # Keep the caller's callbacks: the nested definitions below rebind the
    # parameter names `pagemsg` and `errandpagemsg`.
    orig_pagemsg = pagemsg
    orig_errandpagemsg = errandpagemsg

    def pagemsg(txt):
        orig_pagemsg("%s: %s" % (lemma_no_macrons, txt))

    def errandpagemsg(txt):
        orig_errandpagemsg("%s: %s" % (lemma_no_macrons, txt))

    def expand_text(tempcall):
        # Memoize expansions per (template call, page title) pair.
        cache_key = (tempcall, lemma_pagetitle)
        if cache_key in expand_text_cache:
            retval = expand_text_cache[cache_key]
            if args.verbose:
                pagemsg("Found (%s, %s)=%s in expand_text_cache" %
                        (tempcall, lemma_pagetitle, retval))
            return retval
        if args.verbose:
            pagemsg("Couldn't find (%s, %s) in expand_text_cache" %
                    (tempcall, lemma_pagetitle))
        result = blib.expand_text(tempcall, lemma_pagetitle, pagemsg,
                                  args.verbose)
        expand_text_cache[cache_key] = result
        return result

    # heads_and_defns_cache maps a page title to the find_heads_and_defns()
    # result, or to the string "nonexistent" for invalid/missing pages.
    if lemma_pagetitle in heads_and_defns_cache:
        if args.verbose:
            pagemsg("Found %s in heads_and_defns_cache" % lemma_pagetitle)
        retval = heads_and_defns_cache[lemma_pagetitle]
    else:
        if args.verbose:
            pagemsg("Couldn't find %s in heads_and_defns_cache" %
                    lemma_pagetitle)
        page = pywikibot.Page(site, lemma_pagetitle)
        try:
            exists = blib.try_repeatedly(lambda: page.exists(), pagemsg,
                                         "determine if page exists")
        except pywikibot.exceptions.InvalidTitle as e:
            pagemsg("WARNING: Invalid title %s, skipping" % lemma_pagetitle)
            heads_and_defns_cache[lemma_pagetitle] = "nonexistent"
            traceback.print_exc(file=sys.stdout)
            return None
        if not exists:
            pagemsg("WARNING: Lemma %s doesn't exist" % lemma_no_macrons)
            heads_and_defns_cache[lemma_pagetitle] = "nonexistent"
            return None

        retval = lalib.find_heads_and_defns(unicode(page.text), pagemsg)
        heads_and_defns_cache[lemma_pagetitle] = retval

    # Only a cached entry can reach this point as "nonexistent"; the
    # first-time lookup above returns directly.
    if retval == "nonexistent":
        pagemsg("WARNING: Lemma %s doesn't exist (cached)" % lemma_no_macrons)
        return None
    if retval is None:
        return None

    (sections, j, secbody, sectail, has_non_latin, subsections,
     parsed_subsections, headwords, pronun_sections, etym_sections) = retval

    # Becomes True once an inflection template of a matching headword has
    # successfully generated forms.
    matched_head = False

    inflargs_sets = []

    # Every head and inflection template name seen, kept in first-seen order
    # for the diagnostic message emitted when nothing matches.
    seen_heads = []
    seen_infltns = []
    for headword in headwords:
        ht = headword['head_template']
        tn = tname(ht)
        heads = lalib.la_get_headword_from_template(ht, lemma_pagetitle,
                                                    pagemsg, expand_text)
        for head in heads:
            if head not in seen_heads:
                seen_heads.append(head)
        for inflt in headword['infl_templates']:
            infltn = tname(inflt)
            if infltn not in seen_infltns:
                seen_infltns.append(infltn)
        if tn in expected_headtemps:
            oright = unicode(ht)
            # for/else: skip this headword unless one of its heads equals the
            # lemma after links and macrons are removed.
            for head in heads:
                head_no_links = blib.remove_links(head)
                if lalib.remove_macrons(head_no_links) == lemma_no_macrons:
                    break
            else:
                # no break
                continue
            this_inflargs = []
            for inflt in headword['infl_templates']:
                infltn = tname(inflt)
                if infltn not in expected_infltemps:
                    pagemsg(
                        "WARNING: Saw bad declension template for %s, expected one of {{%s}}: %s"
                        % (pos, ",".join(
                            "{{%s}}" % temp
                            for temp in expected_infltemps), unicode(inflt)))
                    continue

                originflt = unicode(inflt)
                inflargs = lalib.generate_infl_forms(pos, originflt,
                                                     errandpagemsg,
                                                     expand_text)
                if inflargs is None:
                    continue
                this_inflargs.append(inflargs)
                matched_head = True
            # Appended even when this_inflargs is empty, i.e. the head
            # matched but no inflection template produced forms.
            inflargs_sets.append((heads, this_inflargs))
    if not matched_head:
        pagemsg(
            "WARNING: Couldn't find any matching heads, even allowing macron differences (seen heads %s, seen infl template names %s)"
            % (",".join(seen_heads), ",".join(seen_infltns)))
        return None
    return inflargs_sets
Esempio n. 4
0
def process_page(page, index):
    """Process the non-lemma forms of a participle or proper-noun lemma page.

    Headwords on `page` are classified as participle, adjective, proper noun
    or noun. The page is processed as pos "part" when it has participle
    headwords and no adjective headwords, otherwise as pos "pn" when it has
    proper-noun headwords and no noun headwords; in any other case nothing is
    done. For each expected inflection template of the chosen headwords, the
    generated forms (deduplicated after macron removal, skipping forms that
    contain links and the lemma itself) are passed to `blib.do_edit` with a
    handler that calls `process_form`.

    page: page object providing `title()` and `text`.
    index: index of the page, used in log messages and in the
      "<index>.<formindex>" label given to `blib.do_edit`.

    Returns `(None, None)` when there is nothing to process, and None after
    processing forms.

    Reads the module-level `args` (`verbose`, `save`, `diff`).
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def head_is(ht, tn, template_name, head_poses):
        # True for the dedicated headword template, or for the generic
        # {{head|la|<pos>}} with one of the given part-of-speech values.
        return tn == template_name or (
            tn == "head" and getparam(ht, "1") == "la"
            and getparam(ht, "2") in head_poses)

    text = unicode(page.text)

    retval = lalib.find_heads_and_defns(text, pagemsg)
    if retval is None:
        return None, None

    (sections, j, secbody, sectail, has_non_latin, subsections,
     parsed_subsections, headwords, pronun_sections, etym_sections) = retval

    part_headwords = []
    adj_headwords = []
    pn_headwords = []
    noun_headwords = []

    for headword in headwords:
        ht = headword['head_template']
        tn = tname(ht)
        if head_is(ht, tn, "la-part", ["participle", "participles"]):
            part_headwords.append(headword)
        elif head_is(ht, tn, "la-adj", ["adjective", "adjectives"]):
            adj_headwords.append(headword)
        elif head_is(ht, tn, "la-proper noun",
                     ["proper noun", "proper nouns"]):
            pn_headwords.append(headword)
        elif head_is(ht, tn, "la-noun", ["noun", "nouns"]):
            noun_headwords.append(headword)

    headwords_to_do = None
    if part_headwords and not adj_headwords:
        pos = "part"
        headwords_to_do = part_headwords
        expected_inflt = "la-adecl"
    elif pn_headwords and not noun_headwords:
        pos = "pn"
        headwords_to_do = pn_headwords
        expected_inflt = "la-ndecl"

    if not headwords_to_do:
        return None, None

    for headword in headwords_to_do:
        for inflt in headword['infl_templates']:
            infltn = tname(inflt)
            if infltn != expected_inflt:
                pagemsg(
                    "WARNING: Saw bad declension template for %s, expected {{%s}}: %s"
                    % (pos, expected_inflt, unicode(inflt)))
                continue
            inflargs = lalib.generate_infl_forms(pos, unicode(inflt),
                                                 errandpagemsg, expand_text)
            if inflargs is None:
                # generate_infl_forms() can return None (the other callers in
                # this file guard against it); skip this template instead of
                # crashing on `.iteritems()`.
                continue
            # Deduplicate on the macron-less form, which is the page title;
            # the first (slot, form) pair seen for a title is the one kept.
            forms_seen = set()
            slots_and_forms_to_process = []
            for slot, formarg in inflargs.iteritems():
                for form in formarg.split(","):
                    if "[" in form or "|" in form:
                        continue
                    form_no_macrons = lalib.remove_macrons(form)
                    if form_no_macrons == pagetitle:
                        continue
                    if form_no_macrons in forms_seen:
                        continue
                    forms_seen.add(form_no_macrons)
                    slots_and_forms_to_process.append((slot, form))
            for formindex, (slot, form) in blib.iter_items(
                    sorted(slots_and_forms_to_process,
                           key=lambda x: lalib.remove_macrons(x[1]))):

                def handler(page, formindex, parsed):
                    return process_form(page, formindex, slot, form, pos,
                                        pagemsg)

                blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
                             "%s.%s" % (index, formindex),
                             handler,
                             save=args.save,
                             verbose=args.verbose,
                             diff=args.diff)
    # NOTE(review): the early exits return (None, None) but this path falls
    # through and returns None; left unchanged because the caller's
    # expectations are not visible here -- confirm which shape it needs.
def process_page(index, lemma, pos, infl, slots, pages_to_delete,
                 preserve_diaeresis, save, verbose, diff):
    """Delete selected inflected forms of a lemma.

    Forms are generated with `lalib.generate_infl_forms` (with
    `add_sync_verb_forms=True`); nothing is done if that returns None.
    `slots` is a comma-separated list whose entries are handled as follows,
    in this order of precedence:

      "@FORM:SLOT"   delete FORM under slot SLOT
      "@FORM"        delete FORM with no slot
      a slot name present in the generated forms: delete that slot's forms
      "allbutlemma"  delete every generated form that differs from the lemma
                     once macrons are removed
      anything else  treated as a spec; delete the forms of every slot for
                     which `lalib.slot_matches_spec(slot, spec)` is true

    Forms in participle slots go to `delete_participle`; all others go to
    `delete_form` with a form part of speech derived from `pos`.

    Raises ValueError if a non-participle form has to be deleted and `pos`
    is not one of noun, verb, adj, nounadj, numadj, part.

    `pages_to_delete` is accepted but not used in this function.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, lemma, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, lemma, txt))

    def expand_text(tempcall):
        title = remove_macrons(lemma, preserve_diaeresis)
        return blib.expand_text(tempcall, title, pagemsg, verbose)

    def form_of(entry):
        # Name shown by blib.iter_items(): the form of a (slot, form) pair.
        return entry[1]

    pagemsg("Processing")

    infl_forms = lalib.generate_infl_forms(pos, infl, errandpagemsg,
                                           expand_text,
                                           add_sync_verb_forms=True)
    if infl_forms is None:
        return

    pending = []    # (slot or None, comma-separated forms)
    tag_sets = []   # tag sets of every slot selected for deletion
    lemma_no_macrons = remove_macrons(lemma)

    for spec in slots.split(","):
        if spec.startswith("@"):
            literal = spec[1:]
            if ":" in spec:
                real_form, real_slot = literal.split(":")
                tag_sets.append(lalib.slot_to_tag_set(real_slot))
                pending.append((real_slot, real_form))
            else:
                pending.append((None, literal))
        elif spec in infl_forms:
            tag_sets.append(lalib.slot_to_tag_set(spec))
            pending.append((spec, infl_forms[spec]))
        elif spec == "allbutlemma":
            for sl, formspec in infl_forms.iteritems():
                kept = [
                    candidate for candidate in formspec.split(",")
                    if remove_macrons(candidate) != lemma_no_macrons
                ]
                if kept:
                    tag_sets.append(lalib.slot_to_tag_set(sl))
                    pending.append((sl, ",".join(kept)))
        else:
            for sl, formspec in infl_forms.iteritems():
                if lalib.slot_matches_spec(sl, spec):
                    tag_sets.append(lalib.slot_to_tag_set(sl))
                    pending.append((sl, formspec))

    # Expand each comma-separated spec into individual (slot, form) pairs.
    single_forms = []
    for sl, formspec in pending:
        single_forms.extend((sl, one) for one in formspec.split(","))

    # Participle slot -> part of speech handed to delete_participle().
    participle_pos = {
        "pres_actv_ptc": "presactpart",
        "perf_actv_ptc": "perfpasspart",
        "perf_pasv_ptc": "perfpasspart",
        "futr_actv_ptc": "futactpart",
        "futr_pasv_ptc": "futpasspart",
    }
    # Lemma part of speech -> form part of speech handed to delete_form().
    # "nounadj" is a noun that uses an adjective declension.
    form_pos = {
        "noun": "nounform",
        "verb": "verbform",
        "adj": "adjform",
        "nounadj": "nounform",
        "numadj": "numform",
        "part": "partform",
    }

    for formind, (slot, formval) in blib.iter_items(single_forms,
                                                    get_name=form_of):
        partpos = participle_pos.get(slot)
        if partpos:
            delete_participle(index, lemma, formind, formval, partpos,
                              preserve_diaeresis, save, verbose, diff)
            continue

        # Validated lazily: an unknown pos is only an error once a
        # non-participle form actually needs deleting.
        if pos not in form_pos:
            raise ValueError("Invalid part of speech %s" % pos)
        tag_sets_arg = tag_sets if slot is not None else True
        delete_form(index, lemma, formind, formval, form_pos[pos],
                    tag_sets_arg, preserve_diaeresis, save, verbose, diff)