def process_page(page, index, args, comment):
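    # Delete an empty category page: skip it if it still has member pages or
    # subcategories, and only actually delete when --save is given.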
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))
        errmsg("Page %s %s: %s" % (index, pagetitle, txt))

    if args.verbose:
        pagemsg("Processing")
    catname = re.sub("^Category:", "", pagetitle)
    num_pages = len(list(blib.cat_articles(catname)))
    num_subcats = len(list(blib.cat_subcats(catname)))
    if num_pages > 0 or num_subcats > 0:
        errandpagemsg("Skipping (not empty): num_pages=%s, num_subcats=%s" %
                      (num_pages, num_subcats))
        return
    this_comment = comment or 'delete empty category'
    if page.exists():
        if args.save:
            delete_page(
                page,
                '%s (content was "%s")' % (this_comment, unicode(page.text)),
                errandpagemsg)
            errandpagemsg("Deleted (comment=%s)" % this_comment)
        else:
            pagemsg("Would delete (comment=%s)" % this_comment)
    else:
        pagemsg("Skipping, page doesn't exist")
Example #2
def create_cat(cat, catargs, extratext=None):
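    # Create a Belarusian part-of-speech category page containing a
    # {{be-<pos> cat}} template; "~" in the passed-in name becomes e.g.
    # "verbs"/"nouns"/"adjectives". Empty categories are skipped.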
    global args
    if args.pos == "verb":
        pos = "verb"
        shortpos = "verb"
    elif args.pos == "adj":
        pos = "adjective"
        shortpos = "adj"
    elif args.pos == "noun":
        pos = "noun"
        shortpos = "noun"
    else:
        assert False, "Invalid pos %s" % args.pos
    cat = "Belarusian " + cat.replace("~", "%ss" % pos)
    text = "{{be-%s cat%s}}" % (shortpos, "".join("|" + arg
                                                  for arg in catargs))
    if extratext:
        text += "\n%s" % extratext
    num_pages = len(list(blib.cat_articles(cat)))
    if num_pages == 0:
        return
    cat = "Category:" + cat
    page = pywikibot.Page(site, cat)
    if not args.overwrite and page.exists():
        msg("Page %s already exists, not overwriting" % cat)
        return
    page.text = unicode(text)
    changelog = "Creating '%s' with text '%s'" % (cat, text)
    msg("Changelog = %s" % changelog)
    if args.save:
        blib.safe_page_save(page, changelog, errandmsg)
Example #3
def rewrite_verb_headword(save, startFrom, upTo):
    for cat in [u"Arabic verbs"]:
        for index, page in blib.cat_articles(cat, startFrom, upTo):
            blib.do_edit(page,
                         index,
                         rewrite_one_page_verb_headword,
                         save=save)
Example #4
def do_nouns(poses, headtempls, save, startFrom, upTo):
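  # Strip i3rab (Arabic case-ending diacritics) from the parameters of noun
  # headword templates on each page in the given part-of-speech categories.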
  def do_one_page_noun(page, index, text):
    pagename = page.title()
    nouncount = 0
    nounids = []
    for template in text.filter_templates():
      if template.name in headtempls:
        nouncount += 1
        params_done = []
        entry = getparam(template, "1")
        for param in template.params:
          value = param.value
          newvalue = remove_i3rab(pagename, index, entry, unicode(value))
          if newvalue != value:
            param.value = newvalue
            params_done.append(unicode(param.name))
        if params_done:
          nounids.append("#%s %s %s (%s)" %
              (nouncount, template.name, entry, ", ".join(params_done)))
    return text, "Remove i3rab from params in %s" % (
          '; '.join(nounids))

  for pos in poses:
    for index, page in blib.cat_articles("Arabic %ss" % pos.lower(), startFrom, upTo):
      blib.do_edit(page, index, do_one_page_noun, save=save, verbose=verbose)
Example #5
def rewrite_pages(refrom, reto, refs, cat, pages, pagefile, pagetitle_sub,
    comment, filter_pages, save, verbose, startFrom, upTo):
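  # Apply each refrom -> reto regex replacement to every selected page; pages
  # can come from an explicit list, a page file, template references or a
  # category, optionally filtered by the --filter-pages regex.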
  def rewrite_one_page(page, index, text):
    #blib.msg("From: [[%s]], To: [[%s]]" % (refrom, reto))
    text = unicode(text)
    text = reorder_shadda(text)
    zipped_fromto = zip(refrom, reto)
    for fromval, toval in zipped_fromto:
      if pagetitle_sub:
        pagetitle = unicode(page.title())
        fromval = fromval.replace(pagetitle_sub, re.escape(pagetitle))
        toval = toval.replace(pagetitle_sub, pagetitle)
      text = re.sub(fromval, toval, text)
    return text, comment or "replace %s" % (", ".join("%s -> %s" % (f, t) for f, t in zipped_fromto))

  if pages:
    pages = ((pywikibot.Page(blib.site, page), index) for page, index in blib.iter_pages(pages, startFrom, upTo))
  elif pagefile:
    lines = [x.strip() for x in codecs.open(pagefile, "r", "utf-8")]
    pages = ((pywikibot.Page(blib.site, page), index) for page, index in blib.iter_pages(lines, startFrom, upTo))
  elif refs:
    pages = blib.references(refs, startFrom, upTo, includelinks=True)
  else:
    pages = blib.cat_articles(cat, startFrom, upTo)
  for page, index in pages:
    pagetitle = unicode(page.title())
    if filter_pages and not re.search(filter_pages, pagetitle):
      blib.msg("Skipping %s because doesn't match --filter-pages regex %s" %
          (pagetitle, filter_pages))
    else:
      if verbose:
        blib.msg("Processing %s" % pagetitle)
      blib.do_edit(page, index, rewrite_one_page, save=save, verbose=verbose)
Example #6
def do_nouns(poses, headtempls, save, startFrom, upTo):
  def do_one_page_noun(page, index, text):
    pagename = page.title()
    nouncount = 0
    nounids = []
    for template in text.filter_templates():
      if template.name in headtempls:
        nouncount += 1
        params_done = []
        entry = getparam(template, "1")
        for param in template.params:
          value = param.value
          newvalue = remove_i3rab(pagename, index, entry, unicode(value))
          if newvalue != value:
            param.value = newvalue
            params_done.append(unicode(param.name))
        if params_done:
          nounids.append("#%s %s %s (%s)" %
              (nouncount, template.name, entry, ", ".join(params_done)))
    return text, "Remove i3rab from params in %s" % (
          '; '.join(nounids))

  for pos in poses:
    for page, index in blib.cat_articles("Arabic %ss" % pos.lower(), startFrom, upTo):
      blib.do_edit(page, index, do_one_page_noun, save=save, verbose=verbose)
Example #7
def snarf_adj_accents():
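    # Build adjs_to_accents, mapping each unaccented Bulgarian adjective to
    # its accented headword, warning about missing or conflicting accents.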
    for index, page in blib.cat_articles("Bulgarian adjectives"):
        pagetitle = unicode(page.title())

        def pagemsg(txt):
            msg("Page %s %s: %s" % (index, pagetitle, txt))

        parsed = blib.parse(page)
        for t in parsed.filter_templates():
            if tname(t) == "bg-adj":
                adj = getparam(t, "1")
                if not adj:
                    pagemsg("WARNING: Missing headword in adj: %s" %
                            unicode(t))
                    continue
                if bglib.needs_accents(adj):
                    pagemsg("WARNING: Adjective %s missing an accent: %s" %
                            (adj, unicode(t)))
                    continue
                unaccented_adj = bglib.remove_accents(adj)
                if unaccented_adj in adjs_to_accents and adjs_to_accents[
                        unaccented_adj] != adj:
                    pagemsg(
                        "WARNING: Two different accents possible for %s: %s and %s: %s"
                        % (unaccented_adj, adjs_to_accents[unaccented_adj],
                           adj, unicode(t)))
                adjs_to_accents[unaccented_adj] = adj
Example #8
def process_headwords(save, verbose, startFrom, upTo):
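  # Run process_one_page_headwords over all Arabic lemma and non-lemma pages.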
  def process_page(page, index, text):
    return process_one_page_headwords(unicode(page.title()), index, text)
  #for page in blib.references(u"Template:tracking/ar-head/head", startFrom, upTo):
  #for page in blib.references("Template:ar-nisba", startFrom, upTo):
  for cat in [u"Arabic lemmas", u"Arabic non-lemma forms"]:
    for index, page in blib.cat_articles(cat, startFrom, upTo):
      blib.do_edit(page, index, process_page, save=save, verbose=verbose)
Example #9
def rewrite_ru_decl_adj(save, verbose, startFrom, upTo):
    for cat in [u"Russian adjectives"]:
        for index, page in blib.cat_articles(cat, startFrom, upTo):
            blib.do_edit(page,
                         index,
                         rewrite_one_page_ru_decl_adj,
                         save=save,
                         verbose=verbose)
Example #10
def rewrite_arz_headword(save, verbose, startFrom, upTo):
    for cat in [u"Egyptian Arabic adjectives", "Egyptian Arabic nouns"]:
        for index, page in blib.cat_articles(cat, startFrom, upTo):
            blib.do_edit(page,
                         index,
                         rewrite_one_page_arz_headword,
                         save=save,
                         verbose=verbose)
Example #11
def search_noconj(startFrom, upTo):
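  # Flag Arabic verb entries that lack {{ar-verb}} or {{ar-conj}}.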
  for index, page in blib.cat_articles(u"Arabic verbs", startFrom, upTo):
    text = unicode(blib.parse(page))
    pagetitle = page.title()
    if "{{ar-verb" not in text:
      msg("* ar-verb not in {{l|ar|%s}}" % pagetitle)
    if "{{ar-conj" not in text:
      msg("* ar-conj not in {{l|ar|%s}}" % pagetitle)
Example #13
def search_iyya_noetym(startFrom, upTo):
  for page, index in blib.cat_articles(u"Arabic nouns", startFrom, upTo):
    text = blib.parse(page)
    pagetitle = page.title()
    etym = False
    suffix = False
    if pagetitle.endswith(u"ية"):
      for t in text.filter_templates():
        if t.name in ["ar-etym-iyya", "ar-etym-nisba-a",
            "ar-etym-noun-nisba", "ar-etym-noun-nisba-linking"]:
          etym = True
        if t.name == "suffix":
          suffix = True
      if not etym:
        msg("Page %s %s: Ends with -iyya, no appropriate etym template%s" % (
          index, pagetitle, " (has suffix template)" if suffix else ""))
Example #14
def search_iyya_noetym(startFrom, upTo):
    for index, page in blib.cat_articles(u"Arabic nouns", startFrom, upTo):
        text = blib.parse(page)
        pagetitle = page.title()
        etym = False
        suffix = False
        if pagetitle.endswith(u"ية"):
            for t in text.filter_templates():
                if t.name in [
                        "ar-etym-iyya", "ar-etym-nisba-a",
                        "ar-etym-noun-nisba", "ar-etym-noun-nisba-linking"
                ]:
                    etym = True
                if t.name == "suffix":
                    suffix = True
            if not etym:
                msg("Page %s %s: Ends with -iyya, no appropriate etym template%s"
                    % (index, pagetitle,
                       " (has suffix template)" if suffix else ""))
Example #15
def process_page(page, index):
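  # Create a category page with {{auto cat}}, skipping pages that already
  # exist, non-category titles, blacklisted names and empty categories, and
  # refusing to create anything whose expansion reports errors.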
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
    errmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  if args.verbose:
    pagemsg("Processing")
  if page.exists():
    errandpagemsg("Page already exists, not overwriting")
    return
  if not pagetitle.startswith("Category:"):
    pagemsg("Page not a category, skipping")
    return
  catname = re.sub("^Category:", "", pagetitle)
  if blacklist(catname):
    pagemsg("Category is blacklisted, skipping")
    return
  num_pages = len(list(blib.cat_articles(catname)))
  num_subcats = len(list(blib.cat_subcats(catname)))
  if num_pages == 0 and num_subcats == 0:
    pagemsg("Skipping empty category")
    return
  contents = u"{{auto cat}}"
  result = expand_text(contents)
  if not result:
    return
  if ("Category:Categories with invalid label" in result or
      "The automatically-generated contents of this category has errors" in result):
    pagemsg("Won't create page, would lead to errors: <%s>" % result)
  else:
    pagemsg("Creating page, output is <%s>" % result)
    comment = 'Created page with "%s"' % contents
    if args.save:
      page.text = contents
      if blib.safe_page_save(page, comment, errandpagemsg):
        errandpagemsg("Created page, comment = %s" % comment)
    else:
      pagemsg("Would create, comment = %s" % comment)
Example #16
def do_verbs(save, startFrom, upTo):
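  # Strip i3rab from the vn= (verbal noun) parameter of {{ar-conj}},
  # preserving a trailing "?" that marks the verbal nouns as uncertain.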
  def do_one_page_verb(page, index, text):
    pagename = page.title()
    verbcount = 0
    verbids = []
    for template in text.filter_templates():
      if template.name == "ar-conj":
        verbcount += 1
        vnvalue = getparam(template, "vn")
        uncertain = False
        if vnvalue.endswith("?"):
          vnvalue = vnvalue[:-1]
          msg("Page %s %s: Verbal noun(s) identified as uncertain" % (
            index, pagename))
          uncertain = True
        if not vnvalue:
          continue
        vns = re.split(u"[,،]", vnvalue)
        form = getparam(template, "1")
        verbid = "#%s form %s" % (verbcount, form)
        if re.match("^[1I](-|$)", form):
          verbid += " (%s,%s)" % (getparam(template, "2"), getparam(template, "3"))
        no_i3rab_vns = []
        for vn in vns:
          no_i3rab_vns.append(remove_i3rab(pagename, index, verbid, vn))
        newvn = ",".join(no_i3rab_vns)
        if uncertain:
          newvn += "?"
        if newvn != vnvalue:
          msg("Page %s %s: Verb %s, replacing %s with %s" % (
            index, pagename, verbid, vnvalue, newvn))
          addparam(template, "vn", newvn)
          verbids.append(verbid)
    return text, "Remove i3rab from verbal nouns for verb(s) %s" % (
          ', '.join(verbids))
  for index, page in blib.cat_articles("Arabic verbs", startFrom, upTo):
    blib.do_edit(page, index, do_one_page_verb, save=save, verbose=verbose)
Example #17
def do_verbs(save, startFrom, upTo):
  def do_one_page_verb(page, index, text):
    pagename = page.title()
    verbcount = 0
    verbids = []
    for template in text.filter_templates():
      if template.name == "ar-conj":
        verbcount += 1
        vnvalue = getparam(template, "vn")
        uncertain = False
        if vnvalue.endswith("?"):
          vnvalue = vnvalue[:-1]
          msg("Page %s %s: Verbal noun(s) identified as uncertain" % (
            index, pagename))
          uncertain = True
        if not vnvalue:
          continue
        vns = re.split(u"[,،]", vnvalue)
        form = getparam(template, "1")
        verbid = "#%s form %s" % (verbcount, form)
        if re.match("^[1I](-|$)", form):
          verbid += " (%s,%s)" % (getparam(template, "2"), getparam(template, "3"))
        no_i3rab_vns = []
        for vn in vns:
          no_i3rab_vns.append(remove_i3rab(pagename, index, verbid, vn))
        newvn = ",".join(no_i3rab_vns)
        if uncertain:
          newvn += "?"
        if newvn != vnvalue:
          msg("Page %s %s: Verb %s, replacing %s with %s" % (
            index, pagename, verbid, vnvalue, newvn))
          addparam(template, "vn", newvn)
          verbids.append(verbid)
    return text, "Remove i3rab from verbal nouns for verb(s) %s" % (
          ', '.join(verbids))
  for page, index in blib.cat_articles("Arabic verbs", startFrom, upTo):
    blib.do_edit(page, index, do_one_page_verb, save=save, verbose=verbose)
Example #18
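# Yield (index, page, topage) triples from whichever sources were specified;
# topage is only non-None for lines read from --from-to-pagefile.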
def yield_pages():
    if pages:
        for index, page in blib.iter_items(pages, startFrom, upTo):
            yield index, pywikibot.Page(blib.site, page), None
    if pagefile:
        lines = [x.strip() for x in codecs.open(pagefile, "r", "utf-8")]
        for index, page in blib.iter_items(lines, startFrom, upTo):
            yield index, pywikibot.Page(blib.site, page), None
    if from_to_pagefile:
        lines = [
            x.strip() for x in codecs.open(from_to_pagefile, "r", "utf-8")
        ]
        for index, line in blib.iter_items(lines, startFrom, upTo):
            if " ||| " not in line:
                msg("WARNING: Saw bad line in --from-to-pagefile: %s" %
                    line)
                continue
            frompage, topage = line.split(" ||| ")
            yield index, pywikibot.Page(blib.site, frompage), topage
    if refs:
        for ref in refs:
            for index, page in blib.references(
                    ref, startFrom, upTo, only_template_inclusion=False):
                yield index, page, None
    if pages_and_refs:
        for page_and_refs in pages_and_refs:
            for index, page in blib.references(
                    page_and_refs,
                    startFrom,
                    upTo,
                    only_template_inclusion=False,
                    include_page=True):
                yield index, page, None
    if cats:
        for cat in cats:
            for index, page in blib.cat_articles(cat, startFrom, upTo):
                yield index, page, None
Example #19
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pywikibot, re, sys, codecs, argparse

import blib
from blib import getparam, rmparam, msg, site

parser = blib.create_argparser(u"Purge (null-save) pages in category or references")
parser.add_argument('--cat', help="Category to purge")
parser.add_argument('--ref', help="References to purge")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

pages = []
if args.cat:
  pages_to_list = blib.cat_articles(args.cat, start, end)
else:
  pages_to_list = blib.references(args.ref, start, end)
for i, page in pages_to_list:
  # msg("Page %s %s: Null-saving" % (i, unicode(page.title())))
  page.save(comment="null save")
Example #20
def correct_link_formatting(save, startFrom, upTo):
  for cat in [u"Arabic lemmas", u"Arabic non-lemma forms"]:
    for index, page in blib.cat_articles(cat, startFrom, upTo):
      blib.do_edit(page, index, correct_one_page_link_formatting, save=save)
Example #21
def search_category_for_missing_form(form, pos, templates, save, startFrom,
    upTo):
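  # Scan [[Category:Arabic <form>s]] for pages that lack the given headword
  # template(s) and rewrite old-style {{head|ar|...}} or bare-link headword
  # lines to the proper template, folding in any inflections found.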
  if not isinstance(templates, list):
    templates = [templates]
  cat = "Arabic %ss" % form
  repltemplate = templates[0]
  msg("---Searching [[Category:%s|%s]] for %s:---" %
      (cat, cat, ' or '.join(["{{temp|%s}}" % temp for temp in templates])))

  def parse_infls(infltext, tr):
    fs = []
    ftrs = []
    pls = []
    pltrs = []
    fpls = []
    fpltrs = []
    for rawinfl in re.split(", *", infltext):
      if not rawinfl:
        continue
      infl = re.match("'*\{\{(?:lang|l)\|ar\|(.*?)\}\}'* *(?:(?:\{\{IPAchar\|)?\((.*?)\)(?:\}\})?)? *\{\{g\|(.*?)\}\}",
        rawinfl)
      if not infl:
        msg("WARNING: Unable to match infl-outside-head %s" % rawinfl)
        continue
      msg("Found infl outside head: %s" % infl.group(0))
      if "|" in infl.group(1):
        msg("WARNING: Found | in head, skipping: %s" % infl.group(1))
        continue
      if infl.group(3) == "f":
        fs.append(infl.group(1))
        ftrs.append(infl.group(2))
      elif infl.group(3) == "p":
        pls.append(infl.group(1))
        pltrs.append(infl.group(2))
      elif infl.group(3) == "f-p":
        fpls.append(infl.group(1))
        fpltrs.append(infl.group(2))
      else:
        msg("WARNING: Unrecognized inflection gender '%s'" % infl.group(3))
    infls = ""
    if tr:
      infls += "|tr=%s" % tr
    def handle_infls(infls, arabic, latin, argname):
      count = 1
      for ar in arabic:
        if count == 1:
          arg = argname
        else:
          arg = "%s%s" % (argname, count)
        infls += "|%s=%s" % (arg, ar)
        if latin[count - 1] is not None:
          if count == 1:
            larg = "%str" % argname
          else:
            larg = "%s%str" % (argname, count)
          infls += "|%s=%s" % (larg, latin[count - 1])
        count += 1
      return infls
    infls = handle_infls(infls, fs, ftrs, "f")
    infls = handle_infls(infls, pls, pltrs, "pl")
    infls = handle_infls(infls, fpls, fpltrs, "fpl")
    return infls

  def remove_empty_args(templ):
    templ = re.sub(r"\|+\}", "}", templ)
    templ = re.sub(r"\|\|+([A-Za-z0-9_]+=)", r"|\1", templ)
    return templ

  def correct_one_page_headword_formatting(page, index, text):
    text = unicode(text)
    pagetitle = page.title()
    sawtemp = False
    for temp in templates:
      if "{{%s" % temp in text:
        sawtemp = True
    if not sawtemp:
      if "{{head|ar|" in text:
        msg("* %s not in {{l|ar|%s}} but {{temp|head|ar}} is" % (' or '.join(templates), pagetitle))
      else:
        msg("* %s not in {{l|ar|%s}}, nor {{temp|head|ar}}" % (' or '.join(templates), pagetitle))
    replsfound = 0
    for m in re.finditer(r'(===+%s===+\s*)\{\{head\|ar\|(?:sc=Arab\|)?%s((?:\|[A-Za-z0-9_]+=(?:\[[^\]]*\]|[^|}])*)*)\}\} *(?:(?:\{\{IPAchar\|)?\((.*?)\)(?:\}\})?)? *((?:,[^,\n]*)*)(.*)' % (pos, form), text, re.I):
      replsfound += 1
      msg("Found match: %s" % m.group(0))
      if m.group(5):
        msg("WARNING: Trailing text %s" % m.group(5))
      head = ""
      g = ""
      tr = None
      for infl in re.finditer(r"\|([A-Za-z0-9_]+)=((?:\[[^\]]*\]|[^|}])*)", m.group(2)):
        msg("Found infl within head: %s" % infl.group(0))
        if infl.group(1) == "head":
          head = infl.group(2).replace("'", "")
        elif infl.group(1) == "g":
          g = infl.group(2).replace("'", "")
        elif infl.group(1) == "tr":
          tr = infl.group(2)
        elif infl.group(1) == "sc":
          pass
        else:
          msg("WARNING: Unrecognized argument '%s'" % infl.group(1))
      if m.group(3):
        tr = m.group(3)
      infls = parse_infls(m.group(4), tr)
      repl = "{{%s|%s|%s%s}}" % (repltemplate, head, g, infls)
      repl = remove_empty_args(repl)
      repl = m.group(1) + repl + m.group(5) # Include leading, trailing text
      msg("Replacing\n%s\nwith\n%s" % (m.group(0), repl))
      newtext = text.replace(m.group(0), repl, 1)
      if newtext == text:
        msg("WARNING: Unable to do replacement")
      else:
        text = newtext
    for m in re.finditer(r"(===+%s===+\s*)(?:'*\{\{(?:lang|l)\|ar\|(.*?)\}\}'*|'+([^{}']+)'+) *(?:(?:\{\{IPAchar\|)?\((.*?)\)(?:\}\})?)? *(?:\{\{g\|(.*?)\}\})? *((?:,[^,\n]*)*)(.*)" % pos, text, re.I):
      replsfound += 1
      msg("Found match: %s" % m.group(0))
      if m.group(7):
        msg("WARNING: Trailing text %s" % m.group(7))
      head = m.group(2) or m.group(3)
      g = m.group(5) or ""
      tr = m.group(4)
      infls = parse_infls(m.group(6), tr)
      repl = "{{%s|%s|%s%s}}" % (repltemplate, head, g, infls)
      repl = remove_empty_args(repl)
      repl = m.group(1) + repl + m.group(7) # Include leading, trailing text
      msg("Replacing\n%s\nwith\n%s" % (m.group(0), repl))
      newtext = text.replace(m.group(0), repl, 1)
      if newtext == text:
        msg("WARNING: Unable to do replacement")
      else:
        text = newtext
      # If there's a blank line before and after the category, leave a single
      # blank line
      newtext, nsubs = \
        re.subn(r"\n\n\[\[Category:%s\]\]\n\n" % cat, "\n\n", text, 1)
      if nsubs == 0:
        newtext = re.sub(r"\[\[Category:%s\]\]\n?" % cat, "", text, 1)
      if newtext != text:
        msg("Removed [[Category:%s]]" % cat)
        text = newtext
      else:
        msg("WARNING: Unable to remove [[Category:%s]]" % cat)
    if not sawtemp and replsfound == 0:
      msg("WARNING: No replacements found for {{l|ar|%s}}" % pagetitle)
    return text, "Correct headword formatting for [[:Category:%s]]" % cat

  for index, page in blib.cat_articles(cat, startFrom, upTo):
    blib.do_edit(page, index, correct_one_page_headword_formatting, save=save)
Example #22
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse(page)

  found_audio = False
  for t in parsed.filter_templates():
    if unicode(t.name) == "audio" and getparam(t, "lang") == "ru":
      found_audio = True
      break
  if found_audio:
    new_text = re.sub(r"\n*\[\[Category:Russian terms with audio links]]\n*", "\n\n", text)
    if new_text != text:
      comment = "Remove redundant [[:Category:Russian terms with audio links]]"
      if save:
        pagemsg("Saving with comment = %s" % comment)
        page.text = new_text
        page.save(comment=comment)
      else:
        pagemsg("Would save with comment = %s" % comment)

parser = blib.create_argparser("Remove redundant audio-link categories")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, page in blib.cat_articles("Russian terms with audio links", start, end):
  process_page(i, page, args.save, args.verbose)
Example #23
def list_category(cat):
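    # Recursively log every article and subcategory under the given category.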
    for i, page in blib.cat_articles(cat, start, end):
        msg("Page %s %s: Processing page" % (i, unicode(page.title())))
    for i, page in blib.cat_subcats(cat, start, end):
        msg("Page %s %s: Processing subcategory" % (i, unicode(page.title())))
        list_category(re.sub("^Category:", "", unicode(page.title())))
Example #24
        arg_set.append(val)

  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-decl-noun-see":
      pagemsg("WARNING: Skipping ru-decl-noun-see, can't handle yet: %s" % unicode(t))
    elif tname in ["ru-noun+", "ru-proper noun+"]:
      pagemsg("Found %s" % unicode(t))
      process_new_style_headword(t)
    elif tname in ["ru-noun", "ru-proper noun"]:
      pagemsg("WARNING: Skipping ru-noun or ru-proper noun, can't handle yet: %s" % unicode(t))

parser = blib.create_argparser(u"Find red links in multiword lemmas")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

msg("Reading Russian lemmas")
for i, page in blib.cat_articles("Russian lemmas", start, end):
  lemmas.add(unicode(page.title()))

for pos in ["nouns", "proper nouns"]:
  tracking_page = "Template:tracking/ru-headword/space-in-headword/" + pos
  msg("PROCESSING REFERENCES TO: %s" % tracking_page)
  for index, page in blib.references(tracking_page, start, end):
    process_page(index, page, args.verbose)

for lemma, nonexistent_msg in sorted(nonexistent_lemmas.items(), key=lambda pair:(-lemma_count[pair[0]], pair[0])):
  msg("* [[%s]] (%s occurrence%s): %s (refs: %s)" % (lemma, lemma_count[lemma],
    "" if lemma_count[lemma] == 1 else "s", nonexistent_msg,
    ", ".join("[[%s]]" % x for x in nonexistent_lemmas_refs[lemma])))
Example #25
def process_page(index, page, pos):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")
  parsed = blib.parse(page)

  found_infl = False
  for t in parsed.filter_templates():
    tn = tname(t)
    if pos == "verbs" and tn.startswith("ang-conj"):
      pagemsg("Found verb conjugation: %s" % unicode(t))
      found_infl = True
    elif pos == "nouns" and tn.startswith("ang-decl-noun"):
      pagemsg("Found noun conjugation: %s" % unicode(t))
      found_infl = True
    elif pos == "adjectives" and tn.startswith("ang-decl-adj"):
      pagemsg("Found adjective conjugation: %s" % unicode(t))
      found_infl = True
  if not found_infl:
    pagemsg("WARNING: Couldn't find inflection template")

parser = blib.create_argparser("Find Old English terms without inflection")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for pos in ["nouns", "verbs", "adjectives"]:
  for index, page in blib.cat_articles("Old English %s" % pos, start, end):
    process_page(index, page, pos)
Example #26
  def yield_lemma_non_lemma_page_titles():
    for cat in yield_cats("lemma,non-lemma"):
      msg("Retrieving pages from %s ..." % cat)
      errmsg("Retrieving pages from %s ..." % cat)
      for index, page in blib.cat_articles(cat, None, None):
        yield page.title()

  if params.ignore_lemma_non_lemma:
    pages_to_ignore = set(yield_lemma_non_lemma_page_titles())
  else:
    pages_to_ignore = set()

  for category in yield_cats():
    msg("Processing category %s ..." % category)
    errmsg("Processing category %s ..." % category)
    for index, page in blib.cat_articles(category, startFrom, upTo):
      if page.title() not in pages_to_ignore:
        blib.do_edit(page, index, remove_translit_one_page, save=params.save,
            verbose=params.verbose)

pa = blib.init_argparser("Remove translit, sc= from hy, xcl, ka, el, grc templates")
pa.add_argument("--langs", default="all",
    help="Languages to do, a comma-separated list or 'all'")
pa.add_argument("--cattype", default="all",
    help="""Categories to examine ('all' or comma-separated list of
'translit', 'lemma', 'non-lemma'; default 'all')""")
pa.add_argument("--ignore-lemma-non-lemma", action="store_true",
    help="""Ignore lemma and non-lemma pages (useful with '--cattype translit').""")
pa.add_argument("--do-head", action="store_true",
    help="""Remove tr= in {{head|..}}""")
params = pa.parse_args()
Example #27
                pn = pname(param)
                if pn not in ["1", "g", "g2", "g3", "g4"]:
                    pagemsg("WARNING: Extraneous param %s=: %s" %
                            (pn, unicode(t)))
    return None, None


def process_page(page, index, parsed):
    pagetitle = unicode(page.title())
    text = unicode(page.text)
    return process_text_on_page(index, pagetitle, text)


parser = blib.create_argparser(
    "Check for Latin non-lemma forms with bad params")
parser.add_argument("--stdin",
                    help="Read dump from stdin.",
                    action="store_true")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.stdin:
    blib.parse_dump(sys.stdin, process_text_on_page)
else:
    for i, page in blib.cat_articles("Latin non-lemma forms", start, end):
        blib.do_edit(page,
                     i,
                     process_page,
                     save=args.save,
                     verbose=args.verbose)
Example #28
def rewrite_ar_plural(save, verbose, startFrom, upTo):
  for cat in [u"Arabic plurals"]:
    for index, page in blib.cat_articles(cat, startFrom, upTo):
      blib.do_edit(page, index, rewrite_one_page_ar_plural, save=save, verbose=verbose)
Example #29
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "ordinalbox" and getparam(t, "1") == "ru":
      if not adjval:
        pagemsg("WARNING: Can't find accented ordinal form")
      elif adjval != pagetitle:
        t.add("alt", adjval)
        notes.append("Add alt=%s to ordinalbox" % adjval)
    if unicode(t.name) == "cardinalbox" and getparam(t, "1") == "ru":
      if not numval:
        pagemsg("WARNING: Can't find accented cardinal form")
      elif numval != pagetitle:
        t.add("alt", numval)
        notes.append("Add alt=%s to cardinalbox" % numval)
      if "[[Category:Russian cardinal numbers]]" not in unicode(parsed):
        pagemsg("WARNING: Numeral not in [[Category:Russian cardinal numbers]]")
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  return parsed, notes

parser = blib.create_argparser(u"Add accented forms to {{cardinalbox}} and {{ordinalbox}}")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, page in blib.cat_articles("Russian ordinal numbers", start, end):
  blib.do_edit(page, i, process_page, save=args.save, verbose=args.verbose)
for i, page in blib.cat_articles("Russian numerals", start, end):
  blib.do_edit(page, i, process_page, save=args.save, verbose=args.verbose)
Example #30
            newt = unicode(t)
            if origt != newt:
                pagemsg("Replaced %s with %s" % (origt, newt))

    return unicode(parsed), notes


parser = blib.create_argparser(
    u"Convert Japanese headwords from old-style to new-style",
    include_pagefile=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

romaji_to_keep = set()
for i, page in blib.cat_articles(
        "Japanese terms with romaji needing attention"):
    pagetitle = unicode(page.title())
    romaji_to_keep.add(pagetitle)


def do_process_page(page, index, parsed):
    return process_page(index, page, romaji_to_keep)


blib.do_pagefile_cats_refs(
    args,
    start,
    end,
    do_process_page,
    edit=True,
    default_refs=[
Example #31
                        t.add("2", "irreg/c'")
                        notes.append(
                            "make past stress /c' explicit in irreg verb")
                    else:
                        t.add("2", "irreg/c")
                        notes.append(
                            "make past stress /c explicit in irreg verb")
                elif param2 == "irreg/a":
                    t.add("2", "irreg")
                    notes.append("make past stress /a default in irreg verb")
                elif not param2.startswith("irreg/"):
                    errpagemsg("WARNING: Unable to parse param2 %s" % param2)

        newt = unicode(t)
        if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))

    return parsed, notes


parser = blib.create_argparser(
    u"Fix up class-8 and irregular arguments to have class a as default past stress"
)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, page in blib.cat_articles("Russian class 8b verbs", start, end):
    blib.do_edit(page, i, process_page, save=args.save, verbose=args.verbose)
for i, page in blib.cat_articles("Russian irregular verbs", start, end):
    blib.do_edit(page, i, process_page, save=args.save, verbose=args.verbose)
Example #32
            (",".join(manual_ppps), ",".join(auto_ppps), unicode(t)))
      else: # no break in for loop
        for m in notsamemsgs:
          pagemsg(m)

    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)

parser = blib.create_argparser(u"Infer the past passive participle variant from the actual PPP")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for category in ["Russian verbs"]:
  for i, page in blib.cat_articles(category, start, end):
    process_page(i, page, args.save, args.verbose)
Example #33
def process_text_on_page(index, pagetitle, text):
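    # For "Category:<lang> terms spelled with X read as Y" pages, determine
    # the reading type(s) from {{ja-readings}}/{{ryu-readings}} on the page
    # for the spelling itself and, failing that, from {{ja-kanjitab}}/
    # {{ryu-kanjitab}} on the category's member pages; return {{auto cat}}
    # arguments accordingly.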
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    m = re.search(
        "^Category:(Japanese|Okinawan) terms spelled with (.*) read as (.*)$",
        pagetitle)
    if not m:
        pagemsg("Skipped")
        return

    notes = []

    lang, spelling, reading = m.groups()
    langcode = lang == "Japanese" and "ja" or "ryu"
    spelling_page = pywikibot.Page(site, spelling)

    def pagemsg_with_spelling(txt):
        pagemsg("%s: %s" % (spelling, txt))

    def errandpagemsg_with_spelling(txt):
        pagemsg_with_spelling(txt)
        errmsg("Page %s %s: %s: %s" % (index, pagetitle, spelling, txt))

    if not blib.safe_page_exists(spelling_page, pagemsg_with_spelling):
        pagemsg_with_spelling("Spelling page doesn't exist, skipping")
        return
    spelling_page_text = blib.safe_page_text(spelling_page,
                                             pagemsg_with_spelling)
    retval = blib.find_modifiable_lang_section(spelling_page_text, lang,
                                               pagemsg_with_spelling)
    if retval is None:
        pagemsg_with_spelling("WARNING: Couldn't find %s section" % lang)
        return
    sections, j, secbody, sectail, has_non_lang = retval

    parsed = blib.parse_text(secbody)
    saw_readings_template = False
    reading_types = []
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "%s-readings" % langcode:
            saw_readings_template = True
            for reading_type in allowed_reading_types:
                readings = getparam(t, reading_type).strip()
                if readings:
                    readings = re.split(r"\s*,\s*", readings)
                    readings = [re.sub("[<-].*", "", r) for r in readings]
                    if reading in readings:
                        reading_type = canonicalize_reading_types.get(
                            reading_type, reading_type)
                        pagemsg_with_spelling(
                            "Appending reading type %s based on %s" %
                            (reading_type, unicode(t)))
                        if reading_type not in reading_types:
                            reading_types.append(reading_type)
                            notes.append(
                                "add %s reading based on {{%s-readings}} on page [[%s]]"
                                % (reading_type, langcode, spelling))
            if not reading_types:
                pagemsg_with_spelling(
                    "WARNING: Can't find reading %s among readings listed in %s"
                    % (reading, unicode(t).replace("\n", r"\n")))

    if not saw_readings_template:
        pagemsg_with_spelling(
            "WARNING: Couldn't find reading template {{%s-readings}}" %
            langcode)

    if reading_types:
        contents = "{{auto cat|%s}}" % "|".join(reading_types)
        return contents, notes
    else:
        pagemsg_with_spelling("WARNING: Can't find reading %s on page" %
                              reading)

    for i, contents_page in blib.cat_articles(
            re.sub("^Category:", "", pagetitle)):
        contents_title = unicode(contents_page.title())

        def pagemsg_with_contents(txt):
            pagemsg("%s: %s" % (contents_title, txt))

        def errandpagemsg_with_contents(txt):
            pagemsg_with_contents(txt)
            errmsg("Page %s %s: %s: %s" %
                   (index, pagetitle, contents_title, txt))

        contents_page_text = blib.safe_page_text(contents_page,
                                                 pagemsg_with_contents)
        retval = blib.find_modifiable_lang_section(contents_page_text, lang,
                                                   pagemsg_with_contents)
        if retval is None:
            pagemsg_with_contents("WARNING: Couldn't find %s section" % lang)
            return
        sections, j, secbody, sectail, has_non_lang = retval

        saw_kanjitab = False
        must_continue = False
        for ch in contents_title:
            if 0xD800 <= ord(ch) <= 0xDFFF:
                pagemsg_with_contents(
                    "WARNING: Surrogates in page name, skipping: %s" % ord(ch))
                must_continue = True
                break
        if must_continue:
            continue
        chars_in_contents_title = [x for x in contents_title]
        for i, ch in enumerate(chars_in_contents_title):
            if ch == u"々":  # kanji repeat char
                if i == 0:
                    pagemsg_with_contents(
                        u"Repeat char 々 found at beginning of contents title")
                    must_continue = True
                    break
                else:
                    chars_in_contents_title[i] = chars_in_contents_title[i - 1]
        if must_continue:
            continue
        kanji_in_contents_title = [
            x for x in chars_in_contents_title
            if unicodedata.name(x).startswith("CJK UNIFIED IDEOGRAPH")
        ]
        parsed = blib.parse_text(secbody)
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "%s-kanjitab" % langcode:
                saw_kanjitab = True
                readings = []
                for i in range(1, 10):
                    contents_reading = getparam(t, str(i))
                    if contents_reading:
                        readings.append(contents_reading)
                if len(kanji_in_contents_title) != len(readings):
                    pagemsg_with_contents(
                        "WARNING: Saw %s chars in contents title but %s readings %s, skipping: %s"
                        % (len(kanji_in_contents_title), len(readings),
                           ",".join(readings), unicode(t)))
                    continue
                yomi = getparam(t, "yomi")
                if not yomi:
                    pagemsg_with_contents("WARNING: No yomi, skipping: %s" %
                                          unicode(t))
                    continue
                if "," in yomi or re.search("[0-9]$", yomi):
                    yomi = yomi.split(",")
                if type(yomi) is list:
                    expanded_yomi = []
                    for y in yomi:
                        m = re.search("^(.*?)([0-9]+)$", y)
                        if m:
                            baseyomi, numyomi = m.groups()
                            numyomi = int(numyomi)
                            expanded_yomi.extend([baseyomi] * numyomi)
                        else:
                            expanded_yomi.append(y)
                    if expanded_yomi != yomi:
                        pagemsg_with_contents(
                            "Expanding yomi %s to %s" %
                            (",".join(yomi), ",".join(expanded_yomi)))
                    yomi = expanded_yomi
                if type(yomi) is list and len(yomi) != len(
                        kanji_in_contents_title):
                    pagemsg_with_contents(
                        "WARNING: %s values in yomi=%s but %s chars in contents, skipping: %s"
                        % (len(yomi), ",".join(yomi),
                           len(kanji_in_contents_title), unicode(t)))
                    continue
                saw_spelling_in_contents = False
                must_continue = False
                for i, (ch, contents_reading) in enumerate(
                        zip(kanji_in_contents_title, readings)):
                    if ch == spelling:
                        saw_spelling_in_contents = True
                        if contents_reading == reading:
                            if type(yomi) is list:
                                reading_type = yomi[i]
                            else:
                                reading_type = yomi
                            yomi_to_canonical_reading_type = {
                                "o": "on",
                                "on": "on",
                                "kanon": "kanon",
                                "goon": "goon",
                                "soon": "soon",
                                "toon": "toon",
                                "kan": "kanyoon",
                                "kanyo": "kanyoon",
                                "kanyoon": "kanyoon",
                                "k": "kun",
                                "kun": "kun",
                                "juku": "jukujikun",
                                "jukuji": "jukujikun",
                                "jukujikun": "jukujikun",
                                "n": "nanori",
                                "nanori": "nanori",
                                "ok": "jubakoyomi",
                                "j": "jubakoyomi",
                                "ko": "yutoyomi",
                                "y": "yutoyomi",
                                "irr": "irregular",
                                "irreg": "irregular",
                                "irregular": "irregular",
                            }
                            if reading_type not in yomi_to_canonical_reading_type:
                                pagemsg_with_contents(
                                    "WARNING: Unrecognized reading type %s: %s"
                                    % (reading_type, unicode(t)))
                                must_continue = True
                                break
                            reading_type = yomi_to_canonical_reading_type[
                                reading_type]
                            if reading_type not in allowed_reading_types:
                                pagemsg_with_contents(
                                    "WARNING: Disallowed reading type %s: %s" %
                                    (reading_type, unicode(t)))
                                must_continue = True
                                break
                            reading_type = canonicalize_reading_types.get(
                                reading_type, reading_type)
                            pagemsg_with_contents(
                                "Appending reading type %s based on %s" %
                                (reading_type, unicode(t)))
                            if reading_type not in reading_types:
                                reading_types.append(reading_type)
                                notes.append(
                                    "add %s reading based on {{%s-kanjitab}} on page [[%s]]"
                                    % (reading_type, langcode, contents_title))
                if must_continue:
                    continue
                if not saw_spelling_in_contents:
                    pagemsg_with_contents(
                        "WARNING: Didn't see spelling in contents: %s" %
                        unicode(t))
                    continue
        if not saw_kanjitab:
            pagemsg_with_contents("WARNING: Didn't see {{%s-kanjitab}}" %
                                  langcode)

    if reading_types:
        contents = "{{auto cat|%s}}" % "|".join(reading_types)
        return contents, notes
    else:
        pagemsg_with_spelling(
            "WARNING: Can't find reading %s by looking through category contents"
            % reading)
Example #34
def split_etymologies(save, verbose, startFrom, upTo):
  def split_page_etymologies(page, index, pagetext):
    return split_one_page_etymologies(page, index, pagetext, verbose)
  for index, page in blib.cat_articles("Arabic lemmas", startFrom, upTo):
    blib.do_edit(page, index, split_page_etymologies, save=save,
        verbose=verbose)
Example #35
                pagemsg("Skipping adjective with multiple heads: %s" %
                        ",".join(heads))
                return
            tr = getparam(t, "tr")

            nounsection = blib.find_lang_section(noun, "Russian", pagemsg,
                                                 errandpagemsg)
            if not nounsection:
                pagemsg("Couldn't find Russian section for %s" % noun)
                continue
            if "==Etymology" in nounsection:
                pagemsg("Noun %s already has etymology" % noun)
                continue
            if tr:
                msg(u"%s %s+tr1=%s+-ость no-etym" % (noun, heads[0], tr))
            else:
                msg(u"%s %s+-ость no-etym" % (noun, heads[0]))


parser = blib.create_argparser(u"Find etymologies for nouns in -ость")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

nouns = []
for i, page in blib.cat_articles("Russian nouns"):
    nouns.append(page.title())

for category in ["Russian adjectives"]:
    for i, page in blib.cat_articles(category, start, end):
        process_page(i, page, args.save, args.verbose, nouns)
Example #36
def yield_lemma_non_lemma_page_titles():
  for cat in yield_cats("lemma,non-lemma"):
    msg("Retrieving pages from %s ..." % cat)
    errmsg("Retrieving pages from %s ..." % cat)
    for index, page in blib.cat_articles(cat, None, None):
      yield page.title()
Example #37
# for {{R:vep:UVVV}} templates, and check the pages in those templates to
# see if they exist.

import pywikibot, re, sys, codecs, argparse

import blib
from blib import getparam, rmparam, msg, site

def process_page(index, page, save, verbose):
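  # Check that every page cited in an {{R:vep:UVVV}} reference template
  # actually exists, logging the red links.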
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    if unicode(t.name) == "R:vep:UVVV":
      refpages = blib.fetch_param_chain(t, "1", "")
      for refpage in refpages:
        if not pywikibot.Page(site, refpage).exists():
          pagemsg("Page [[%s]] does not exist" % refpage)

parser = blib.create_argparser(u"Find red links in pages in Category:R:vep:UVVV with red link")
parser.add_argument("--pagefile", help="File containing pages to check")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, page in blib.cat_articles("R:vep:UVVV with red link", start, end):
  process_page(i, page, args.save, args.verbose)
Example #38
def rewrite_ar_plural(save, verbose, startFrom, upTo):
  for cat in [u"Arabic plurals"]:
    for page, index in blib.cat_articles(cat, startFrom, upTo):
      blib.do_edit(page, index, rewrite_one_page_ar_plural, save=save, verbose=verbose)
Example #39
                    help="Categories to do (can be comma-separated list)")
parser.add_argument('--refs',
                    help="References to do (can be comma-separated list)")
parser.add_argument('--lemmafile',
                    help="File of lemmas to process. May have accents.")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.lemmafile:
    lemmas = []
    for i, pagename in blib.iter_items([
            ru.remove_accents(x.strip())
            for x in codecs.open(args.lemmafile, "r", "utf-8")
    ]):
        page = pywikibot.Page(site, pagename)
        process_page(i, page, args.verbose)
elif args.refs:
    for ref in re.split(",", args.refs):
        msg("Processing references to: %s" % ref)
        for i, page in blib.references(ref, start, end):
            process_page(i, page, args.verbose)
else:
    for cat in re.split(",", args.cats):
        msg("Processing category: %s" % cat)
        lemmas = []
        if cat == "Russian verbs":
            for i, page in blib.cat_articles(cat):
                lemmas.append(page.title())
        for i, page in blib.cat_articles(cat, start, end):
            process_page(i, page, args.verbose)
Example #40
def rewrite_arz_headword(save, verbose, startFrom, upTo):
  for cat in [u"Egyptian Arabic adjectives", "Egyptian Arabic nouns"]:
    for index, page in blib.cat_articles(cat, startFrom, upTo):
      blib.do_edit(page, index, rewrite_one_page_arz_headword, save=save,
          verbose=verbose)
Example #41
                    help="Do derived adverbs")
parser.add_argument("--base-lemmafile", help="File containing base lemmas")
parser.add_argument("--derived-lemmafile",
                    help="File containing derived lemmas")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

derived_lemmas = []
if args.derived_lemmafile:
    derived_lemmas = [
        rulib.remove_accents(x.strip())
        for x in codecs.open(args.derived_lemmafile, "r", "utf-8")
    ]
else:
    for i, page in blib.cat_articles(
            "Russian adverbs" if args.adverbs
            else "Russian nouns" if args.nouns
            else "Russian adjectives"):
        derived_lemmas.append(page.title())

if args.base_lemmafile:
    for i, pagename in blib.iter_items([
            rulib.remove_accents(x.strip())
            for x in codecs.open(args.base_lemmafile, "r", "utf-8")
    ]):
        page = pywikibot.Page(site, pagename)
        process_page(i, page, args.save, args.verbose, derived_lemmas)
else:
    for category in ["Russian adjectives"] if args.adverbs else [
            "Russian proper nouns", "Russian nouns", "Russian verbs"
    ]:
        for i, page in blib.cat_articles(category, start, end):
Example #42
        if val:
          seenval = True
        if seenval:
          t.add(str(i + 1), val)
      t.add("1", conjtype)
      blib.sort_params(t)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)

parser = blib.create_argparser(u"Convert ru-conj-* to ru-conj and move variant")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, page in blib.cat_articles("Russian verbs", start, end):
  process_page(i, page, args.save, args.verbose)
Example #43
    pagemsg("Processing")
    notes = []

    text = unicode(page.text)
    text = re.sub(r"\n(===+)Adjective(===+)\n\{\{head\|de\|adjective form\}\}",
                  "\n" + r"\1Numeral\2" + "\n{{head|de|numeral form}}", text)
    notes.append("change headword from adjective form to numeral form")
    return text, notes


parser = blib.create_argparser(
    u"Change ordinal numeral form headwords from adjective to numeral")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

endings = ["en", "er", "em", "es"]

for index, page in blib.cat_articles("German ordinal numbers", start, end):
    pagetitle = unicode(page.title())
    if not pagetitle.endswith("e"):
        continue
    for ending in endings:
        page = pywikibot.Page(site, pagetitle[:-1] + ending)
        if page.exists():
            blib.do_edit(page,
                         index,
                         process_page,
                         save=args.save,
                         verbose=args.verbose)
Example #44
def rewrite_ru_decl_adj(save, verbose, startFrom, upTo):
  for cat in [u"Russian adjectives"]:
    for page, index in blib.cat_articles(cat, startFrom, upTo):
      blib.do_edit(page, index, rewrite_one_page_ru_decl_adj, save=save, verbose=verbose)
Example #45
  text = unicode(page.text)

  foundrussian = False
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      found_headword_template = False
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        tname = unicode(t.name)
        if tname == "ru-adj" or (tname == "head" and getparam(t, "1") == "ru" and getparam(t, "2") == "adjective form"):
          found_headword_template = True
      if not found_headword_template and "===Adjective===" in sections[j]:
        pagemsg("WARNING: Missing adj headword template")

parser = blib.create_argparser("Find missing adjective headwords")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for cat in ["Russian adjectives", "Russian adjective forms", "Russian lemmas", "Russian non-lemma forms"]:
  msg("Processing category %s" % cat)
  for index, page in blib.cat_articles(cat, start, end):
    process_page(index, page)
Example #46
      if origt != newt:
        pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)

  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)

parser = blib.create_argparser(u"Convert Japanese headwords from old-style to new-style")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

romaji_to_keep = set()
for i, page in blib.cat_articles("Japanese terms with romaji needing attention"):
  pagetitle = unicode(page.title())
  romaji_to_keep.add(pagetitle)

for ref in ["ja-noun", "ja-adj", "ja-verb", "ja-pos"]:
  msg("Processing references to Template:%s" % ref)
  for i, page in blib.references("Template:%s" % ref, start, end):
    process_page(i, page, args.save, args.verbose, romaji_to_keep)
Example #47
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pywikibot, re, sys, codecs, argparse

import blib
from blib import getparam, rmparam, set_template_name, msg, errmsg, site, tname

northern_kurdish_lemmas = set()
for i, art in blib.cat_articles("Northern Kurdish lemmas"):
    northern_kurdish_lemmas.add(unicode(art.title()))
central_kurdish_lemmas = set()
for i, art in blib.cat_articles("Central Kurdish lemmas"):
    central_kurdish_lemmas.add(unicode(art.title()))

trans_templates = [
    "t", "t+", "t-", "tt", "tt+", "t-check", "t+check", "t-needed"
]

arabic_charset = u"؀-ۿݐ-ݿࢠ-ࣿﭐ-﷽ﹰ-ﻼ"


def process_text_on_page(index, pagename, text):
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")
Example #48
    tname = unicode(t.name)
    if tname in ru_head_templates:
      headname = tname
      found_this_head = True
    elif tname == "head" and getparam(t, "1") == "ru":
      headtype = getparam(t, "2")
      headname = "head|ru|%s" % headtype
      if headtype in ru_heads_to_warn_about:
        pagemsg("WARNING: Found %s" % headname)
      found_this_head = True
    if found_this_head:
      cat_head_count[headname] = cat_head_count.get(headname, 0) + 1
      overall_head_count[headname] = overall_head_count.get(headname, 0) + 1
      found_page_head = True
  if not found_page_head:
    pagemsg("WARNING: No head")
  if index % 100 == 0:
    output_heads_seen()

parser = blib.create_argparser(u"Find Russian terms without a proper headword line")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for category in ["Russian nouns", "Russian proper nouns", "Russian pronouns",
    "Russian determiners", "Russian adjectives", "Russian verbs",
    "Russian participles", "Russian adverbs", "Russian prepositions",
    "Russian conjunctions", "Russian interjections", "Russian idioms",
    "Russian phrases", "Russian abbreviations", "Russian acronyms",
    "Russian initialisms", "Russian noun forms", "Russian proper noun forms",
    "Russian pronoun forms", "Russian determiner forms", "Russian verb forms",
    "Russian adjective forms", "Russian participle forms"]:
  cat_head_count = {}
  msg("Processing category: %s" % category)
  for i, page in blib.cat_articles(category, start, end):
    process_page(i, page, args.save, args.verbose)
  output_heads_seen()
output_heads_seen(overall=True)
Example #49
#    find_rfdef.py is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Find pages that need definitions among a set list (e.g. most frequent words).

import pywikibot, re, sys, codecs, argparse

import blib
from blib import getparam, rmparam, msg, site

parser = blib.create_argparser(u"Find pages that need definitions")
parser.add_argument("--pagefile", help="File containing pages to check")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

lines = set([x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")])
for i, page in blib.cat_articles("Russian entries needing definition", start, end):
    pagetitle = page.title()
    if pagetitle in lines:
        msg("* Page %s [[%s]]" % (i, pagetitle))
Example #50
def rewrite_verb_headword(save, startFrom, upTo):
  for cat in [u"Arabic verbs"]:
    for page, index in blib.cat_articles(cat, startFrom, upTo):
      blib.do_edit(page, index, rewrite_one_page_verb_headword, save=save)
Example #51
            process_new_style_headword(t)
        elif tname in ["ru-verb"]:
            pagemsg("Found %s" % unicode(t))
            process_verb_headword(t)
        elif tname in ["ru-noun", "ru-proper noun"]:
            pagemsg(
                "WARNING: Skipping ru-noun or ru-proper noun, can't handle yet: %s"
                % unicode(t))


parser = blib.create_argparser(u"Find red links in multiword lemmas")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

msg("Reading Russian lemmas")
for i, page in blib.cat_articles("Russian lemmas", start, end):
    lemmas.add(unicode(page.title()))

for pos in ["nouns", "proper nouns", "verbs"]:
    tracking_page = "Template:tracking/ru-headword/space-in-headword/" + pos
    msg("PROCESSING REFERENCES TO: %s" % tracking_page)
    for index, page in blib.references(tracking_page, start, end):
        process_page(index, page, args.verbose)

for lemma, nonexistent_msg in sorted(nonexistent_lemmas.items(),
                                     key=lambda pair:
                                     (-lemma_count[pair[0]], pair[0])):
    msg("* [[%s]] (%s occurrence%s): %s (refs: %s)" %
        (lemma, lemma_count[lemma], "" if lemma_count[lemma] == 1 else "s",
         nonexistent_msg, ", ".join("[[%s]]" % x
                                    for x in nonexistent_lemmas_refs[lemma])))
Example #52
# Find redlinks (non-existent pages).

import pywikibot, re, sys, codecs, argparse

import blib
from blib import getparam, rmparam, msg, site

parser = blib.create_argparser(u"Find red links")
parser.add_argument("--pagefile", help="File containing pages to check")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

lemmas = set()
msg("Reading Bulgarian lemmas")
for i, page in blib.cat_articles("Bulgarian lemmas", start, end):
  lemmas.add(unicode(page.title()))

lines = [x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")]
words = lines

for i, line in blib.iter_items(words, start, end):
  pagename, freq = line.split("\t")
  m = re.search(u"[^-Ѐ-џҊ-ԧꚀ-ꚗ]", pagename)
  def fmsg(txt):
    msg("Page %s [[%s]]: %s (freq %s)" % (i, pagename, txt, freq))
  if m:
    fmsg("skipped due to non-Cyrillic characters")
  else:
    for pagenm, pagetype in [(pagename, ""),
        (pagename.capitalize(), " (capitalized)"),
Example #53
def clean_verb_headword(save, startFrom, upTo):
  for cat in [u"Arabic verbs"]:
    for index, page in blib.cat_articles(cat, startFrom, upTo):
      blib.do_edit(page, index, clean_one_page_verb_headword, save=save)