Example #1
 def yield_pages():
     if pages:
         for index, page in blib.iter_items(pages, startFrom, upTo):
             yield index, pywikibot.Page(blib.site, page), None
     if pagefile:
         lines = [x.strip() for x in codecs.open(pagefile, "r", "utf-8")]
         for index, page in blib.iter_items(lines, startFrom, upTo):
             yield index, pywikibot.Page(blib.site, page), None
     if from_to_pagefile:
         lines = [
             x.strip() for x in codecs.open(from_to_pagefile, "r", "utf-8")
         ]
         for index, line in blib.iter_items(lines, startFrom, upTo):
             if " ||| " not in line:
                 msg("WARNING: Saw bad line in --from-to-pagefile: %s" %
                     line)
                 continue
             frompage, topage = line.split(" ||| ")
             yield index, pywikibot.Page(blib.site, frompage), topage
     if refs:
         for ref in refs:
             for index, page in blib.references(
                     ref, startFrom, upTo, only_template_inclusion=False):
                 yield index, page, None
     if pages_and_refs:
         for page_and_refs in pages_and_refs:
             for index, page in blib.references(
                     page_and_refs,
                     startFrom,
                     upTo,
                     only_template_inclusion=False,
                     include_page=True):
                 yield index, page, None
     if cats:
         for cat in cats:
             for index, page in blib.cat_articles(cat, startFrom, upTo):
                 yield index, page, None
def process_page(index, lemma, forms, lang, pages_to_delete, save, verbose,
                 diff):
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, lemma, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, lemma, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, lemma, pagemsg, verbose)

    pagemsg("Processing")

    for formind, form in blib.iter_items(forms):
        delete_form(index, lemma, formind, form, lang, save, verbose, diff)
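
Every example on this page funnels through blib.iter_items. Judging purely from the call sites (this is an inferred sketch, not the actual blib source), it yields (index, item) pairs restricted to a start/end window, with an optional get_index hook so structured items such as (index, title, text) tuples can be filtered on the right field:

def iter_items(items, start=None, end=None, get_name=None, get_index=None):
    # Inferred sketch only; the real blib implementation may differ (e.g.
    # it may also accept page names for start/end, which is presumably
    # what get_name supports). Reconstructed from the call sites above.
    for i, item in enumerate(items, 1):
        index = int(get_index(item)) if get_index else i
        if start is not None and index < int(start):
            continue
        if end is not None and index > int(end):
            break
        yield i, item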
Example #3
def search_pages(args, regex, invert, input_from_diff, start, end, lang_only):
    def do_process_text_on_page(index, title, text):
        process_text_on_page(index, title, text, regex, invert, args.verbose,
                             args.text, args.all, args.mainspace_only,
                             lang_only, args.from_to)

    if input_from_diff:
        lines = codecs.open(input_from_diff, "r", "utf-8")
        index_pagename_and_text = blib.yield_text_from_diff(lines, args.verbose)
        for _, (index, pagename,
                text) in blib.iter_items(index_pagename_and_text,
                                         start,
                                         end,
                                         get_name=lambda x: x[1],
                                         get_index=lambda x: x[0]):
            do_process_text_on_page(index, pagename, text)
        return

    blib.do_pagefile_cats_refs(args,
                               start,
                               end,
                               do_process_text_on_page,
                               stdin=True)
Example #4
def process_page(index, pos, lemma, subs, infl, save, verbose):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, lemma, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, lemma, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, remove_macrons(lemma), pagemsg, verbose)

  pagemsg("Processing")

  args = lalib.generate_infl_forms(pos, infl, errandpagemsg, expand_text)
  if args is None:
    return

  forms_to_delete = []

  for key, form in args.iteritems():
    forms_to_delete.extend(form.split(","))

  for formind, form in blib.iter_items(forms_to_delete):
    def handler(page, formind, parsed):
      return process_form(index, page, lemma, formind, form, subs)
    blib.do_edit(pywikibot.Page(site, remove_macrons(form)), formind, handler, save=save, verbose=verbose)
Example #5
  for k in xrange(1, len(splitsections), 2):
    if splitsections[k] == "English":
      saw_english = True
    else:
      saw_langs.add(splitsections[k])
  if saw_english:
    english_pages[pagetitle] = saw_langs

def process_line(index, line):
  m = re.search("^Page [0-9]+ (.*?): Replacing (.*) with (.*) in .* section in (.*)$", line)
  if not m:
    return
  pagetitle, fromtext, totext, lang = m.groups()
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  for m in re.finditer(r"\{\{(?:m|l|term)\|.*?\|(.*?)\}\}", totext):
    linkpage = m.group(1)
    if linkpage in english_pages and lang not in english_pages[linkpage]:
      pagemsg("Possible false positive for [[%s]] in %s: %s" % (linkpage, lang, fromtext))

parser = blib.create_argparser("Check for likely false-positive links converted from raw links")
parser.add_argument("--direcfile", help="File of output from fix_links.py")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

blib.parse_dump(sys.stdin, find_english_pages)

for index, line in blib.iter_items(codecs.open(args.direcfile, "r", encoding="utf-8"), start, end):
  process_line(index, line)
Example #6
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  newtext = unicode(parsed)

  if newtext != text:
    if verbose:
      pagemsg("Replacing <<%s>> with <<%s>>" % (text, newtext))
    comment = "Add phon= to ru-IPA templates"
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
  else:
    pagemsg("Skipping")

parser = argparse.ArgumentParser(description="Add phon= to ru-IPA uses")
parser.add_argument('start', help="Starting page index", nargs="?")
parser.add_argument('end', help="Ending page index", nargs="?")
parser.add_argument('--save', action="store_true", help="Save results")
parser.add_argument('--verbose', action="store_true", help="More verbose output")
parser.add_argument('--pagefile', help="File containing pages to process, one per line")
args = parser.parse_args()
start, end = blib.get_args(args.start, args.end)

pages = [x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")]
for i, page in blib.iter_items(pages, start, end):
  msg("Page %s %s: Processing" % (i, page))
  process_page(i, pywikibot.Page(site, page), args.save, args.verbose)
Example #7
    "16b",
    u"irreg-бежать",
    u"irreg-спать",
    u"irreg-хотеть",
    u"irreg-дать",
    u"irreg-есть",
    u"irreg-сыпать",
    u"irreg-лгать",
    u"irreg-мочь",
    u"irreg-слать",
    u"irreg-идти",
    u"irreg-ехать",
    u"irreg-минуть",
    u"irreg-живописать-миновать",
    u"irreg-лечь",
    u"irreg-зиждиться",
    u"irreg-клясть",
    u"irreg-слыхать-видать",
    u"irreg-стелить-стлать",
    u"irreg-быть",
    u"irreg-ссать-сцать",
    u"irreg-чтить",
    u"irreg-ошибиться",
    u"irreg-плескать",
    u"irreg-внимать",
    u"irreg-обязывать",
]
for i, ty in blib.iter_items(types, start, end):
    template = "Template:ru-conj-%s/documentation" % ty
    process_page(i, pywikibot.Page(site, template), args.save, args.verbose)
                "Rewrite {{#invoke:form of|%s}} with {{#invoke:form of/templates|form_of_t}}"
                % getparam(t, "1"))
        if tn == "#invoke:form of" and getparam(t, "1") == "alt_form_of_t":
            t.add("2", getparam(t, "text"), before="text")
            rmparam(t, "text")
            if t.has("nocap"):
                rmparam(t, "nocap")
            else:
                t.add("withcap", "1")
            if t.has("nodot"):
                rmparam(t, "nodot")
            else:
                t.add("withdot", "1")
            t.add("1", "form_of_t")

        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes


parser = blib.create_argparser(
    "Convert form_of_t and alt_form_of_t invocations in [[Module:form of]] to form_of_t in [[Module:form of/templates]]"
)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, template in blib.iter_items(templates_to_process, start, end):
    page = pywikibot.Page(site, "Template:%s" % template)
    blib.do_edit(page, i, process_page, save=args.save, verbose=args.verbose)
Example #9
parser.add_argument("--comment",
                    help="Comment to use when saving pages.",
                    required=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

fulltext = codecs.open(args.textfile, "r", "utf-8").read()

titles_and_text = re.split(r"\n\n\n\n+", fulltext)

assert len(titles_and_text) % 2 == 0

title_and_text_pairs = []
for i in xrange(0, len(titles_and_text), 2):
    title_and_text_pairs.append((titles_and_text[i], titles_and_text[i + 1]))
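
The split on four or more consecutive newlines alternates page titles and page bodies, so the xrange loop pairs even slots (titles) with odd slots (texts). A minimal illustration with made-up input:

fulltext = "foo\n\n\n\n==English==\nfoo body\n\n\n\nbar\n\n\n\n==English==\nbar body"
chunks = re.split(r"\n\n\n\n+", fulltext)
# chunks == ["foo", "==English==\nfoo body", "bar", "==English==\nbar body"]
pairs = zip(chunks[0::2], chunks[1::2])  # same pairing as the xrange loop above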

for i, (pagetitle, pagetext) in blib.iter_items(title_and_text_pairs,
                                                start,
                                                end,
                                                get_name=lambda x: x[0]):

    def handler(page, index, parsed):
        return process_page(page, index, pagetext,
                            args.comment.decode('utf-8'))

    blib.do_edit(pywikibot.Page(site, pagetitle),
                 i,
                 handler,
                 save=args.save,
                 verbose=args.verbose)
Example #10
                    help="File containing derived lemmas")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

derived_lemmas = []
if args.derived_lemmafile:
    derived_lemmas = [
        rulib.remove_accents(x.strip())
        for x in codecs.open(args.derived_lemmafile, "r", "utf-8")
    ]
else:
    for i, page in blib.cat_articles(
            "Russian adverbs" if args.adverbs
            else "Russian nouns" if args.nouns
            else "Russian adjectives"):
        derived_lemmas.append(page.title())

if args.base_lemmafile:
    for i, pagename in blib.iter_items([
            rulib.remove_accents(x.strip())
            for x in codecs.open(args.base_lemmafile, "r", "utf-8")
    ]):
        page = pywikibot.Page(site, pagename)
        process_page(i, page, args.save, args.verbose, derived_lemmas)
else:
    for category in ["Russian adjectives"] if args.adverbs else [
            "Russian proper nouns", "Russian nouns", "Russian verbs"
    ]:
        for i, page in blib.cat_articles(category, start, end):
            process_page(i, page, args.save, args.verbose, args.adverbs,
                         derived_lemmas)
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pywikibot, re, sys, codecs, argparse

import blib
from blib import getparam, rmparam, msg, site

parser = blib.create_argparser(u"Find verbs with impersonal conjugations")
parser.add_argument('--verbfile', help="File listing verbs to check.")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, line in blib.iter_items(codecs.open(args.verbfile, "r", "utf-8"), start, end):
  page = pywikibot.Page(site, line.strip())
  if "-impers|" in page.text:
    msg("Page %s %s: Found impersonal conjugation" % (i, unicode(page.title())))
  else:
    msg("Page %s %s: No impersonal conjugation" % (i, unicode(page.title())))
        msg("Page %s %s: %s" % (index, pagetitle, txt))
      origcontents = origpages.get(pagetitle, None)
      newtext = newpages.get(pagetitle, None)
      if not newtext:
        pagemsg("Skipping because not found in among new page contents")
        return
      if origcontents == newtext:
        pagemsg("Page %s %s: Skipping contents for %s because no change" % pagetitle)
        return
      return process_page(index, page, newtext, origcontents,
        args.verbose, args.comment.decode("utf-8"), args.lang_only and args.lang_only.decode("utf-8"),
        args.allow_page_creation)
    blib.do_pagefile_cats_refs(args, start, end, do_process_page, edit=True)

  else:
    lines = codecs.open(args.direcfile.decode("utf-8"), "r", "utf-8")

    index_pagetitle_and_text = blib.yield_text_from_find_regex(lines, args.verbose)
    for _, (index, pagetitle, newtext) in blib.iter_items(index_pagetitle_and_text, start, end,
        get_name=lambda x:x[1], get_index=lambda x:x[0]):
      origcontents = origpages.get(pagetitle, None)
      if origcontents == newtext:
        msg("Page %s %s: Skipping contents for %s because no change" % (index, pagetitle, pagetitle))
      else:
        def do_process_page(page, index, parsed):
          return process_page(index, page, newtext, origcontents,
              args.verbose, args.comment.decode("utf-8"), args.lang_only and args.lang_only.decode("utf-8"),
              args.allow_page_creation)
        blib.do_edit(pywikibot.Page(site, pagetitle), index, do_process_page,
            save=args.save, verbose=args.verbose, diff=args.diff)
Example #13
doc_comment = "Delete documentation page of " + re.sub(
    "^([Dd]elete|[Rr]emove) ", "", comment)


def delete_page(page, comment):
    for i in range(11):
        try:
            page.delete(comment)
            return
        except APIError as e:
            if i == 10:
                raise e
            errandmsg("APIError, try #%s: %s" % (i + 1, e))


for i, pagename in blib.iter_items(pages_to_delete, start, end):
    page = pywikibot.Page(site, pagename)
    if page.exists():
        msg("Deleting %s (comment=%s)" % (page.title(), comment))
        delete_page(page,
                    '%s (content was "%s")' % (comment, unicode(page.text)))
        errandmsg("Page [[%s]] deleted" % page.title())
    if args.delete_docs:
        doc_page = pywikibot.Page(site, "%s/documentation" % pagename)
        if doc_page.exists():
            msg("Deleting %s (comment=%s)" % (doc_page.title(), doc_comment))
            delete_page(
                doc_page, '%s (content was "%s")' %
                (doc_comment, unicode(doc_page.text)))
            errandmsg("Page [[%s]] deleted" % doc_page.title())
Example #14
def read_pages(filename, start, end):
  lines = [x.strip() for x in codecs.open(filename, "r", "utf-8")]
  for i, line in blib.iter_items(lines, start, end):
    if line.startswith("#"):
      continue
    yield i, line
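
read_pages yields the same (index, line) pairs as blib.iter_items, with #-comment lines skipped, so it drops straight into the usual per-page loop. A hypothetical caller (filename and handler invented for illustration, assuming the usual blib/pywikibot imports):

for i, pagename in read_pages("lemmas.txt", start, end):
    page = pywikibot.Page(blib.site, pagename)
    msg("Page %s %s: Processing" % (i, pagename))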
Example #15
parser = blib.create_argparser("Add pronunciation sections to Latin Wiktionary entries", include_pagefile=True)
parser.add_argument('--lemma-file', help="File containing lemmas to process, one per line; non-lemma forms will be done")
parser.add_argument('--lemmas', help="List of comma-separated lemmas to process; non-lemma forms will be done")
parser.add_argument("--slots", help="Slots to process in conjunction with --lemmas and --lemma-file.")
parser.add_argument('--override-pronun', action="store_true", help="Override existing pronunciations")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.lemma_file or args.lemmas:
  slots = args.slots.split(",")

  if args.lemma_file:
    lemmas = read_pages(args.lemma_file, start, end)
  else:
    lemmas = blib.iter_items(re.split(",", args.lemmas.decode("utf-8")), start, end)
  for i, lemma in lemmas:
    process_lemma(i, lalib.remove_macrons(lemma), slots, args)

else:
  def do_process_page(page, index, parsed):
    return process_page(index, page, args)
  blib.do_pagefile_cats_refs(args, start, end, do_process_page,
      default_cats=["Latin lemmas", "Latin non-lemma forms"], edit=True)

def subval_to_string(subval):
  if type(subval) is tuple:
    pron, extra_params, pre, post = subval
    return unicode(FoundPronun(pron, extra_params, pre, post))
  else:
    return subval
    "oblique plural of",
    "oblique singular of",
    "terminative plural of",
    "terminative singular of",
    "ancient form of",
    "early form of",
    "late form of",
    "masculine animate plural past participle of",
    "masculine inanimate plural past participle of",
    "masculine singular past participle of",
    "neuter plural past participle of",
    "dative dual of",
    "dative plural definite of",
    "dative plural indefinite of",
    "paucal of",
    "second-person singular of",
]

for i, temp in blib.iter_items(templates_to_delete, start, end):
    template_page = pywikibot.Page(site, "Template:%s" % temp)
    if template_page.exists():
        template_page.delete(
            'Delete obsoleted and orphaned form-of template (content was "%s")'
            % unicode(template_page.text))
    template_doc_page = pywikibot.Page(site,
                                       "Template:%s/documentation" % temp)
    if template_doc_page.exists():
        template_doc_page.delete(
            'Delete documentation page of obsoleted and orphaned form-of template (content was "%s")'
            % unicode(template_doc_page.text))
Example #17
    text = re.sub("\n\n\n+", "\n\n", text)
    if not notes:
        notes.append("convert 3+ newlines to 2")
    return text, notes


parser = blib.create_argparser("Add missing declension to Latin terms")
parser.add_argument(
    "--direcfile",
    help="File of output directives from make_latin_missing_decl.py",
    required=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

lines = [x.rstrip('\n') for x in codecs.open(args.direcfile, "r", "utf-8")]
for i, line in blib.iter_items(lines, start, end):
    m = re.search("^Page [0-9]+ (.*?): For noun (.*?), declension (.*?)$",
                  line)
    if not m:
        msg("Unrecognized line, skipping: %s" % line)
    else:
        pagename, headword_template, decl_template = m.groups()

        def do_process_page(page, index, parsed):
            return process_page(page, index, headword_template, decl_template)

        blib.do_edit(pywikibot.Page(site, pagename),
                     i,
                     do_process_page,
                     save=args.save,
                     verbose=args.verbose,
Example #18
parser.add_argument("--field", help="Field containing terms", type=int, default=1)
parser.add_argument("--output-orig", help="Output original lines", action="store_true")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

lemmas = set()
msg("Reading %s lemmas" % args.lang)
for i, page in blib.cat_articles("%s lemmas" % args.lang, start, end):
  lemmas.add(unicode(page.title()))

words_freq = {}

lines = [re.split(r"\s", x.strip()) for x in codecs.open(args.pagefile, "r", "utf-8")]
lines = [(x[args.field - 1], x) for x in lines]

for i, (pagename, origline) in blib.iter_items(lines, start, end):
  m = re.search(u"[^-'Ѐ-џҊ-ԧꚀ-ꚗ]", pagename)
  if m:
    outtext = "skipped due to non-Cyrillic characters"
  else:
    for pagenm, pagetype in [(pagename, ""),
        (pagename.capitalize(), " (capitalized)"),
        (pagename.upper(), " (uppercased)")]:
      if pagenm in lemmas:
        outtext = "exists%s" % pagetype
        break
      else:
        page = pywikibot.Page(site, pagenm)
        if page.exists():
          text = unicode(page.text)
          if re.search("#redirect", text, re.I):
Example #19
if args.lang not in ["uk", "be"]:
  raise ValueError("Unrecognized language: %s" % args.lang)

lines = [x.strip() for x in codecs.open(args.declfile, "r", "utf-8")]

def yield_decls():
  for line in lines:
    found_ndecl_style = False
    for m in re.finditer(r"\{\{(?:User:Benwing2/)?" + args.lang + "-ndecl\|(.*?)\}\}", line):
      found_ndecl_style = True
      yield m.group(1)
    if not found_ndecl_style:
      for m in re.finditer(r"\(\(.*?\)\)|[^| \[\]]+<.*?\>", line):
        yield m.group(0)

for index, decl in blib.iter_items(yield_decls(), start, end):
  module = uk if args.lang == "uk" else be
  if decl.startswith("(("):
    m = re.search(r"^\(\((.*)\)\)$", decl)
    subdecls = m.group(1).split(",")
    decl_for_page = subdecls[0]
  else:
    decl_for_page = decl
  m = re.search(r"^(.+?)<.*>$", decl_for_page)
  if not m:
    msg("WARNING: Can't extract lemma from decl: %s" % decl)
    pagename = "UNKNOWN"
  else:
    pagename = module.remove_accents(blib.remove_links(m.group(1)))
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
Example #20
from blib import getparam, rmparam, msg, site

parser = blib.create_argparser(u"Find red links")
parser.add_argument("--pagefile", help="File containing pages to check")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

lemmas = set()
msg("Reading Bulgarian lemmas")
for i, page in blib.cat_articles("Bulgarian lemmas", start, end):
  lemmas.add(unicode(page.title()))

lines = [x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")]
words = lines

for i, line in blib.iter_items(words, start, end):
  pagename, freq = line.split("\t")
  m = re.search(u"[^-Ѐ-џҊ-ԧꚀ-ꚗ]", pagename)
  def fmsg(txt):
    msg("Page %s [[%s]]: %s (freq %s)" % (i, pagename, txt, freq))
  if m:
    fmsg("skipped due to non-Cyrillic characters")
  else:
    for pagenm, pagetype in [(pagename, ""),
        (pagename.capitalize(), " (capitalized)"),
        (pagename.upper(), " (uppercased)")]:
      if pagenm in lemmas:
        fmsg("exists%s" % pagetype)
        break
      else:
        page = pywikibot.Page(site, pagenm)
Example #21
from blib import getparam, rmparam, msg, site

def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return

  pagemsg("Processing")
  return "#REDIRECT [[Module:ru-verb/documentation]]", "redirect to [[Module:ru-verb/documentation]]"

parser = blib.create_argparser("Redirect ru-conj-* documentation pages")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

types = ["7a", "7b", "8a", "8b", "9a", "9b", "10a", "10c", "11a", "11b",
    "12a", "12b", "13b", "14a", "14b", "14c", "15a", "16a", "16b",
    u"irreg-бежать", u"irreg-спать", u"irreg-хотеть", u"irreg-дать",
    u"irreg-есть", u"irreg-сыпать", u"irreg-лгать", u"irreg-мочь",
    u"irreg-слать", u"irreg-идти", u"irreg-ехать", u"irreg-минуть",
    u"irreg-живописать-миновать", u"irreg-лечь", u"irreg-зиждиться",
    u"irreg-клясть", u"irreg-слыхать-видать", u"irreg-стелить-стлать",
    u"irreg-быть", u"irreg-ссать-сцать", u"irreg-чтить", u"irreg-ошибиться",
    u"irreg-плескать", u"irreg-внимать", u"irreg-обязывать"]
for i, ty in blib.iter_items(types, start, end):
  template = "Template:ru-conj-%s/documentation" % ty
  blib.do_edit(pywikibot.Page(site, template), i, process_page, save=args.save,
    verbose=args.verbose, diff=args.diff)
Example #22
    else:
      ru_proper_noun_changed = 1

  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed

parser = blib.create_argparser("Copy the declension in ru-noun-table to ru-noun+, preserving any m=, f=, g=, etc. in the latter.")
parser.add_argument('--cats', default="nouns,proper nouns", help="Categories to do ('nouns', 'proper nouns' or 'nouns,proper nouns')")
parser.add_argument('--lemma-file', help="File containing lemmas to copy declension of. Will remove extraneous params from ru-noun-table and copy links to ru-noun-table regardless of this.")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.lemma_file:
  lemmas = set([x.strip() for x in codecs.open(args.lemma_file, "r", "utf-8")])
else:
  lemmas = None

for cat in re.split(",", args.cats):
  if cat == "nouns":
    template = "Template:ru-noun+"
  elif cat == "proper nouns":
    template = "Template:ru-proper noun+"
  else:
    raise ValueError("Invalid value to --cats: %s" % cat)
  msg("Processing references to %s" % template)
  if lemmas:
    for i, page in blib.iter_items(lemmas, start, end):
      process_page(i, pywikibot.Page(site, page), args.save, args.verbose, lemmas)
  else:
    for i, page in blib.references(template, start, end):
      process_page(i, page, args.save, args.verbose, lemmas)
    help="File of ///-separated pairs of base declensions to move")
parser.add_argument('--comment', help="Comment to use when deleting")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

pages_to_move = [
    x.rstrip('\n').split("///")
    for x in codecs.open(args.pagefile, "r", "utf-8")
]

comment = args.comment or "Move erroneously-created non-lemma form"

endings = ["e", "en", "er", "em", "es"]

for i, (frombase, tobase) in blib.iter_items(pages_to_move,
                                             start,
                                             end,
                                             get_name=lambda x: x[1]):
    for ending in endings:
        page = pywikibot.Page(site, frombase + ending)

        def pagemsg(txt):
            msg("Page %s %s: %s" % (i, unicode(page.title()), txt))

        topagename = tobase + ending
        if page.exists():
            if pywikibot.Page(site, topagename).exists():
                pagemsg(
                    "WARNING: Destination page %s already exists, not moving" %
                    topagename)
            else:
                pagemsg("Moving to %s (comment=%s)" % (topagename, comment))
Example #24
def process_page(page, index):
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    text = unicode(page.text)

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return

    sections, j, secbody, sectail, has_non_latin = retval

    parsed = blib.parse_text(secbody)
    saw_noun = None
    saw_proper_noun = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "la-noun":
            if saw_noun:
                pagemsg(
                    "WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping"
                    % (unicode(saw_noun), unicode(t)))
                return
            saw_noun = t
        elif tn == "la-proper noun":
            if saw_proper_noun:
                pagemsg(
                    "WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping"
                    % (unicode(saw_proper_noun), unicode(t)))
                return
            saw_proper_noun = t
    if saw_noun and saw_proper_noun:
        pagemsg(
            "WARNING: Saw both noun and proper noun, can't correct header/headword"
        )
        return
    if not saw_noun and not saw_proper_noun:
        pagemsg(
            "WARNING: Saw neither noun nor proper noun, can't correct header/headword"
        )
        return
    pos = "pn" if saw_proper_noun else "n"
    ht = saw_proper_noun or saw_noun
    if getparam(ht, "indecl"):
        pagemsg("Noun is indeclinable, skipping: %s" % unicode(ht))
        return
    generate_template = blib.parse_text(unicode(ht)).filter_templates()[0]
    blib.set_template_name(generate_template, "la-generate-noun-forms")
    blib.remove_param_chain(generate_template, "lemma", "lemma")
    blib.remove_param_chain(generate_template, "m", "m")
    blib.remove_param_chain(generate_template, "f", "f")
    blib.remove_param_chain(generate_template, "g", "g")
    rmparam(generate_template, "type")
    rmparam(generate_template, "indecl")
    rmparam(generate_template, "id")
    rmparam(generate_template, "pos")
    result = expand_text(unicode(generate_template))
    if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        return
    tempargs = blib.split_generate_args(result)
    forms_seen = set()
    slots_and_forms_to_process = []
    for slot, formarg in tempargs.iteritems():
        forms = formarg.split(",")
        for form in forms:
            if "[" in form or "|" in form:
                continue
            form_no_macrons = lalib.remove_macrons(form)
            if form_no_macrons == pagetitle:
                continue
            if form_no_macrons in forms_seen:
                continue
            forms_seen.add(form_no_macrons)
            slots_and_forms_to_process.append((slot, form))
    for index, (slot, form) in blib.iter_items(
            sorted(slots_and_forms_to_process,
                   key=lambda x: lalib.remove_macrons(x[1]))):

        def handler(page, index, parsed):
            return process_form(page, index, slot, form, pos)

        blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
                     index,
                     handler,
                     save=args.save,
                     verbose=args.verbose,
                     diff=args.diff)
Example #25
lines = [x for x in lines if x]


def get_items(lines):
    for line in lines:
        m = re.search("^Page ([0-9]*) (.*): <respelling> *(.*?) *<end>", line)
        if not m:
            # Not a warning: several of these are expected in the output of snarf_it_pron.py
            msg("Unrecognized line: %s" % line)
        else:
            yield m.groups()


for _, (index, pagetitle, spec) in blib.iter_items(get_items(lines),
                                                   start,
                                                   end,
                                                   get_name=lambda x: x[1],
                                                   get_index=lambda x: x[0]):

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    page = pywikibot.Page(site, pagetitle)
    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
    else:

        def do_process_page(page, index, parsed):
            return process_page(index, page, spec)

        blib.do_edit(page,
Example #26
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import blib
from blib import msg
import sys
import lalib

parser = blib.create_argparser("Remove Latin macrons from input",
                               no_beginning_line=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for index, line in blib.iter_items(sys.stdin, start, end):
    line = line.strip().decode('utf-8')
    msg(lalib.remove_macrons(line))
Example #27
def process_page(page, index):
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    text = unicode(page.text)

    retval = lalib.find_heads_and_defns(text, pagemsg)
    if retval is None:
        return None, None

    (sections, j, secbody, sectail, has_non_latin, subsections,
     parsed_subsections, headwords, pronun_sections, etym_sections) = retval

    part_headwords = []
    adj_headwords = []
    pn_headwords = []
    noun_headwords = []

    for headword in headwords:
        ht = headword['head_template']
        tn = tname(ht)
        if tn == "la-part" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(
                    ht, "2") in ["participle", "participles"]:
            part_headwords.append(headword)
        elif tn == "la-adj" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(
                    ht, "2") in ["adjective", "adjectives"]:
            adj_headwords.append(headword)
        elif tn == "la-proper noun" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(
                    ht, "2") in ["proper noun", "proper nouns"]:
            pn_headwords.append(headword)
        elif tn == "la-noun" or tn == "head" and getparam(
                ht, "1") == "la" and getparam(ht, "2") in ["noun", "nouns"]:
            noun_headwords.append(headword)
    headwords_to_do = None
    if part_headwords and not adj_headwords:
        pos = "part"
        headwords_to_do = part_headwords
        expected_inflt = "la-adecl"
    elif pn_headwords and not noun_headwords:
        pos = "pn"
        headwords_to_do = pn_headwords
        expected_inflt = "la-ndecl"

    if not headwords_to_do:
        return None, None

    for headword in headwords_to_do:
        for inflt in headword['infl_templates']:
            infltn = tname(inflt)
            if infltn != expected_inflt:
                pagemsg(
                    "WARNING: Saw bad declension template for %s, expected {{%s}}: %s"
                    % (pos, expected_inflt, unicode(inflt)))
                continue
            inflargs = lalib.generate_infl_forms(pos, unicode(inflt),
                                                 errandpagemsg, expand_text)
            forms_seen = set()
            slots_and_forms_to_process = []
            for slot, formarg in inflargs.iteritems():
                forms = formarg.split(",")
                for form in forms:
                    if "[" in form or "|" in form:
                        continue
                    form_no_macrons = lalib.remove_macrons(form)
                    if form_no_macrons == pagetitle:
                        continue
                    if form_no_macrons in forms_seen:
                        continue
                    forms_seen.add(form_no_macrons)
                    slots_and_forms_to_process.append((slot, form))
            for formindex, (slot, form) in blib.iter_items(
                    sorted(slots_and_forms_to_process,
                           key=lambda x: lalib.remove_macrons(x[1]))):

                def handler(page, formindex, parsed):
                    return process_form(page, formindex, slot, form, pos,
                                        pagemsg)

                blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
                             "%s.%s" % (index, formindex),
                             handler,
                             save=args.save,
                             verbose=args.verbose,
                             diff=args.diff)
Example #28
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
  elif warn_on_no_change:
    pagemsg("WARNING: No changes")

parser = blib.create_argparser(u"Fix indentation of Pronunciation, Declension, Conjugation, Alternative forms sections")
parser.add_argument("--pagefile",
    help="""List of pages to process.""")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.pagefile:
  lines = [x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")]
  for i, line in blib.iter_items(lines, start, end):
    m = re.search("^Page [0-9]+ (.*?): WARNING: .*?$", line)
    if not m:
      msg("WARNING: Can't process line: %s" % line)
    else:
      page = m.group(1)
      process_page(i, pywikibot.Page(site, page), args.save, args.verbose,
          warn_on_no_change=True)
else:
  for cat in ["Russian lemmas", "Russian non-lemma forms"]:
    msg("Processing category %s" % cat)
    for i, page in blib.cat_articles(cat, start, end):
      process_page(i, page, args.save, args.verbose) 
Example #29
if args.fix_pagefile:
    fixdireclines = [
        x.strip() for x in codecs.open(args.fix_pagefile, "r", "utf-8")
    ]
    fixdirecs = {}
    fixpages = []
    for line in fixdireclines:
        verb, direc = re.split(" ", line)
        fixdirecs[verb] = direc
        fixpages.append(verb)

    def do_process_page(page, index, parsed):
        return process_page(page, index, fixdirecs)

    for i, page in blib.iter_items(fixpages, start, end):
        blib.do_edit(pywikibot.Page(site, page),
                     i,
                     do_process_page,
                     save=args.save,
                     verbose=args.verbose,
                     diff=args.diff)
else:

    def do_process_page(page, index, parsed):
        return process_page(page, index, {})

    for category in ["Russian verbs"]:
        for i, page in blib.cat_articles(category, start, end):
            blib.do_edit(pywikibot.Page(site, page),
                         i,
Example #30
    if notes:
        comment = "Add inanimacy to neuters (%s)" % "; ".join(notes)
    else:
        comment = "Add inanimacy to neuters"

    return unicode(parsed), notes


parser = blib.create_argparser("Fix hard-е nouns according to directives")
parser.add_argument("--direcfile",
                    help="File listing directives to apply to nouns",
                    required=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, line in blib.iter_items(codecs.open(args.direcfile, "r", "utf-8"),
                               start, end):
    line = line.strip()
    if "!!!" in line:
        page, direc = re.split("!!!", line)
    else:
        page, direc = re.split(" ", line)

    def do_process_page(page, index, parsed):
        return process_page(index, page, direc)

    blib.do_edit(pywikibot.Page(site, page),
                 i,
                 do_process_page,
                 save=args.save,
                 verbose=args.verbose,
                 diff=args.diff)
Example #31
if __name__ == "__main__":
    parser = blib.create_argparser("Push new entries from generate_entries.py")
    parser.add_argument('--direcfile', help="File containing entries.")
    parser.add_argument('--comment', help="Comment to use.", required="true")
    parser.add_argument('--lang', help="Language of entries.", required="true")
    args = parser.parse_args()
    start, end = blib.parse_start_end(args.start, args.end)

    lines = codecs.open(args.direcfile, "r", "utf-8")

    index_pagename_and_text = blib.yield_text_from_find_regex(
        lines, args.verbose)
    for _, (index, pagename,
            text) in blib.iter_items(index_pagename_and_text,
                                     start,
                                     end,
                                     get_name=lambda x: x[1],
                                     get_index=lambda x: x[0]):

        def do_process_page(page, index, parsed):
            return process_page(index, page, text, args.lang, args.verbose,
                                args.comment.decode("utf-8"))

        blib.do_edit(pywikibot.Page(site, pagename),
                     index,
                     do_process_page,
                     save=args.save,
                     verbose=args.verbose,
                     diff=args.diff)
Example #32
    if tn == "form of":
      lang = getparam(t, "lang")
      if lang:
        form = getparam(t, "1")
      else:
        form = getparam(t, "2")
      form_of_forms[form] += 1

parser = blib.create_argparser("Clean up bad inflection tags")
parser.add_argument("--textfile", help="File containing inflection templates to process.")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.textfile:
  with codecs.open(args.textfile, "r", "utf-8") as fp:
    text = fp.read()
  pages = re.split('\nPage [0-9]+ ', text)
  title_text_split = ': Found match for regex: '
  for index, page in blib.iter_items(pages, start, end):
    if not page: # e.g. first entry
      continue
    split_vals = re.split(title_text_split, page, 1)
    if len(split_vals) < 2:
      msg("Page %s: Skipping bad text: %s" % (index, page))
      continue
    pagetitle, pagetext = split_vals
    process_text_on_page(pagetitle, index, pagetext)

  for form, count in sorted(list(form_of_forms.iteritems()), key=lambda x: -x[1]):
    msg("%-50s = %s" % (form, count))
Example #33
words_freq = {}

lines = [x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")]
if args.with_freq:
  for line in lines:
    freq, word = re.split(r"\s", line)
    freq = int(freq)
    if word in words_freq:
      words_freq[word] += freq
    else:
      words_freq[word] = freq
  words = [x[0] for x in sorted(words_freq.items(), key=lambda y:-y[1])]
else:
  words = lines

for i, pagename in blib.iter_items(words, start, end):
  m = re.search(u"[^-Ѐ-џҊ-ԧꚀ-ꚗ]", pagename)
  if m:
    msg("Page %s [[%s]]: skipped due to non-Cyrillic characters" % (i, pagename))
  else:
    for pagenm, pagetype in [(pagename, ""),
        (pagename.capitalize(), " (capitalized)"),
        (pagename.upper(), " (uppercased)")]:
      if pagenm in lemmas:
        msg("Page %s [[%s]]: exists%s" % (i, pagename, pagetype))
        break
      else:
        page = pywikibot.Page(site, pagenm)
        if page.exists():
          if re.search("#redirect", unicode(page.text), re.I):
            msg("Page %s [[%s]]: exists%s as redirect" % (i, pagename, pagetype))
Example #34
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pywikibot, re, sys, codecs, argparse

import blib
from blib import getparam, rmparam, msg, site

parser = blib.create_argparser(u"List pages, lemmas and/or non-lemmas",
                               include_pagefile=True)
parser.add_argument('--namespace', help="List all pages in namespace")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.namespace:
    ns = args.namespace.decode("utf-8")
    for i, page in blib.iter_items(
            site.allpages(
                start=start if isinstance(start, basestring) else '!',
                namespace=ns,
                filterredir=False), start, end):
        msg("Page %s %s: Processing" % (i, unicode(page.title())))
else:

    def process_page(page, index):
        msg("Page %s %s: Processing" % (index, unicode(page.title())))

    blib.do_pagefile_cats_refs(args, start, end, process_page)
Example #35
      notes.append("add (manually specified) Etymology section to Russian lemma")
      break
  else:
    errandpagemsg("WARNING: Can't find Russian section, skipping")
    return

  if newtext != pagetext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (pagetext, newtext))
    assert notes
    comment = "; ".join(group_notes(notes))
    if save:
      blib.safe_page_save(page, comment, errandpagemsg)
    else:
      pagemsg("Would save with comment = %s" % comment)

if __name__ == "__main__":
  parser = blib.create_argparser("Add etymologies to Russian pages based on directives")
  parser.add_argument('--direcfile', help="File containing directives.")
  parser.add_argument('--add-passive-of', action='store_true',
      help="Add {{passive of|lang=ru|...}} to defn.")
  parser.add_argument('--override-etym', action='store_true',
      help="Automatically override any existing etymologies.")
  args = parser.parse_args()
  start, end = blib.parse_start_end(args.start, args.end)

  lines = codecs.open(args.direcfile, "r", "utf-8")
  for i, line in blib.iter_items(lines, start, end):
    line = line.strip()
    process_line(i, line, args.add_passive_of, args.override_etym, args.save, args.verbose)
Example #36
          unicode(t.name), " (NEEDS REVIEW)" if fixed_plural_warning else ""))

  newtext = unicode(parsed)
  if newtext != text:
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      blib.try_repeatedly(lambda: page.save(comment=comment), pagemsg,
                    "save page")
    else:
      pagemsg("Would save with comment = %s" % comment)

parser = blib.create_argparser("Convert head|fr|* to fr-*")
parser.add_argument("--fix-missing-plurals", action="store_true", help="Fix cases with missing plurals by just assuming the default plural.")
parser.add_argument("--lemma-file",help="File containing lemmas to do.")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.lemma_file:
  lines = [x.strip() for x in codecs.open(args.lemma_file, "r", "utf-8")]
  for i, pagename in blib.iter_items(lines, start, end):
    process_page(i, pywikibot.Page(site, pagename), args.save, args.verbose, args.fix_missing_plurals)
else:
  for cat in ["French nouns", "French proper nouns", "French pronouns", "French determiners", "French adjectives", "French verbs", "French participles", "French adverbs", "French prepositions", "French conjunctions", "French interjections", "French idioms", "French phrases", "French abbreviations", "French acronyms", "French initialisms", "French noun forms", "French proper noun forms", "French pronoun forms", "French determiner forms", "French verb forms", "French adjective forms", "French participle forms", "French proverbs", "French prefixes", "French suffixes", "French diacritical marks", "French punctuation marks"]:
  #for cat in ["French adjective forms", "French participle forms", "French proverbs", "French prefixes", "French suffixes", "French diacritical marks", "French punctuation marks"]:
    msg("Processing category: %s" % cat)
    for i, page in blib.cat_articles(cat, start, end):
      process_page(i, page, args.save, args.verbose, args.fix_missing_plurals)
Example #37
  lines = [x.strip() for x in codecs.open(args.cmu, "r", "iso8859-1") if not
      x.startswith(";;;")]
  joined_lines = []
  prev_word = None
  seen_pronuns = []
  for line in lines:
    word, pronun = re.split("  ", line)
    m = re.search(r"^(.*)\([0-9]+\)$", word)
    if m and m.group(1) == prev_word:
      seen_pronuns.append(pronun)
    else:
      if prev_word:
        joined_lines.append([prev_word, seen_pronuns])
      prev_word = word
      seen_pronuns = [pronun]
  if prev_word:
    joined_lines.append([prev_word, seen_pronuns])

  for i, line in blib.iter_items(joined_lines, start, end):
    word, pronuns = line
    process_cmu_line(i, word, pronuns)

  for i, onset in enumerate(list(sorted(seen_onsets))):
    msg("#%3s %s" % (i, onset))

if args.moby:
  lines = [x.strip() for x in codecs.open(args.moby, "r", "mac_roman")]
  for i, line in blib.iter_items(lines, start, end):
    word, pronun = re.split(" ", line)
    process_moby_line(i, word, pronun)