def yield_pages():
  if pages:
    for index, page in blib.iter_items(pages, startFrom, upTo):
      yield index, pywikibot.Page(blib.site, page), None
  if pagefile:
    lines = [x.strip() for x in codecs.open(pagefile, "r", "utf-8")]
    for index, page in blib.iter_items(lines, startFrom, upTo):
      yield index, pywikibot.Page(blib.site, page), None
  if from_to_pagefile:
    lines = [x.strip() for x in codecs.open(from_to_pagefile, "r", "utf-8")]
    for index, line in blib.iter_items(lines, startFrom, upTo):
      if " ||| " not in line:
        msg("WARNING: Saw bad line in --from-to-pagefile: %s" % line)
        continue
      frompage, topage = line.split(" ||| ")
      yield index, pywikibot.Page(blib.site, frompage), topage
  if refs:
    for ref in refs:
      for index, page in blib.references(ref, startFrom, upTo, only_template_inclusion=False):
        yield index, page, None
  if pages_and_refs:
    for page_and_refs in pages_and_refs:
      for index, page in blib.references(page_and_refs, startFrom, upTo,
          only_template_inclusion=False, include_page=True):
        yield index, page, None
  if cats:
    for cat in cats:
      for index, page in blib.cat_articles(cat, startFrom, upTo):
        yield index, page, None
def process_page(index, lemma, forms, lang, pages_to_delete, save, verbose, diff):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, lemma, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, lemma, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, lemma, pagemsg, verbose)
  pagemsg("Processing")
  for formind, form in blib.iter_items(forms):
    delete_form(index, lemma, formind, form, lang, save, verbose, diff)
def search_pages(args, regex, invert, input_from_diff, start, end, lang_only):
  def do_process_text_on_page(index, title, text):
    process_text_on_page(index, title, text, regex, invert, args.verbose, args.text,
      args.all, args.mainspace_only, lang_only, args.from_to)
  if input_from_diff:
    lines = codecs.open(input_from_diff, "r", "utf-8")
    # Pass args.verbose here; a bare `verbose` is not defined in this scope.
    index_pagename_and_text = blib.yield_text_from_diff(lines, args.verbose)
    for _, (index, pagename, text) in blib.iter_items(index_pagename_and_text, start, end,
        get_name=lambda x: x[1], get_index=lambda x: x[0]):
      do_process_text_on_page(index, pagename, text)
    return
  blib.do_pagefile_cats_refs(args, start, end, do_process_text_on_page, stdin=True)
def process_page(index, pos, lemma, subs, infl, save, verbose):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, lemma, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, lemma, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, remove_macrons(lemma), pagemsg, verbose)
  pagemsg("Processing")
  args = lalib.generate_infl_forms(pos, infl, errandpagemsg, expand_text)
  if args is None:
    return
  forms_to_delete = []
  for key, form in args.iteritems():
    forms_to_delete.extend(form.split(","))
  for formind, form in blib.iter_items(forms_to_delete):
    def handler(page, formind, parsed):
      return process_form(index, page, lemma, formind, form, subs)
    blib.do_edit(pywikibot.Page(site, remove_macrons(form)), formind, handler,
      save=save, verbose=verbose)
  for k in xrange(1, len(splitsections), 2):
    if splitsections[k] == "English":
      saw_english = True
    else:
      saw_langs.add(splitsections[k])
  if saw_english:
    english_pages[pagetitle] = saw_langs

def process_line(index, line):
  m = re.search("^Page [0-9]+ (.*?): Replacing (.*) with (.*) in .* section in (.*)$", line)
  if not m:
    return
  pagetitle, fromtext, totext, lang = m.groups()
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  for m in re.finditer(r"\{\{(?:m|l|term)\|.*?\|(.*?)\}\}", totext):
    linkpage = m.group(1)
    if linkpage in english_pages and lang not in english_pages[linkpage]:
      pagemsg("Possible false positive for [[%s]] in %s: %s" % (linkpage, lang, fromtext))

parser = blib.create_argparser("Check for likely false-positive links converted from raw links")
parser.add_argument("--direcfile", help="File of output from fix_links.py")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

blib.parse_dump(sys.stdin, find_english_pages)
for index, line in blib.iter_items(codecs.open(args.direcfile, "r", encoding="utf-8"), start, end):
  process_line(index, line)
pagemsg("Replaced %s with %s" % (origt, unicode(t))) newtext = unicode(parsed) if newtext != text: if verbose: pagemsg("Replacing <<%s>> with <<%s>>" % (text, newtext)) comment = "Add phon= to ru-IPA templates" if save: pagemsg("Saving with comment = %s" % comment) page.text = newtext page.save(comment=comment) else: pagemsg("Would save with comment = %s" % comment) else: pagemsg("Skipping") parser = argparse.ArgumentParser(description="Add phon= to ru-IPA uses") parser.add_argument('start', help="Starting page index", nargs="?") parser.add_argument('end', help="Ending page index", nargs="?") parser.add_argument('--save', action="store_true", help="Save results") parser.add_argument('--verbose', action="store_true", help="More verbose output") parser.add_argument('--pagefile', help="File containing pages to process, one per line") args = parser.parse_args() start, end = blib.get_args(args.start, args.end) pages = [x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")] for i, page in blib.iter_items(pages, start, end): msg("Page %s %s: Processing" % (i, page)) process_page(i, pywikibot.Page(site, page), args.save, args.verbose)
"16b", u"irreg-бежать", u"irreg-спать", u"irreg-хотеть", u"irreg-дать", u"irreg-есть", u"irreg-сыпать", u"irreg-лгать", u"irreg-мочь", u"irreg-слать", u"irreg-идти", u"irreg-ехать", u"irreg-минуть", u"irreg-живописать-миновать", u"irreg-лечь", u"irreg-зиждиться", u"irreg-клясть", u"irreg-слыхать-видать", u"irreg-стелить-стлать", u"irreg-быть", u"irreg-ссать-сцать", u"irreg-чтить", u"irreg-ошибиться", u"irreg-плескать", u"irreg-внимать", u"irreg-обязывать", ] for i, ty in blib.iter_items(types, start, end): template = "Template:ru-conj-%s/documentation" % ty process_page(i, pywikibot.Page(site, template), args.save, args.verbose)
"Rewrite {{#invoke:form of|%s}} with {{#invoke:form of/templates|form_of_t}}" % getparam(t, "1")) if tn == "#invoke:form of" and getparam(t, "1") == "alt_form_of_t": t.add("2", getparam(t, "text"), before="text") rmparam(t, "text") if t.has("nocap"): rmparam(t, "nocap") else: t.add("withcap", "1") if t.has("nodot"): rmparam(t, "nodot") else: t.add("withdot", "1") t.add("1", "form_of_t") if unicode(t) != origt: pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t))) return unicode(parsed), notes parser = blib.create_argparser( "Convert form_of_t and alt_form_of_t invocations in [[Module:form of]] to form_of_t in [[Module:form of/templates]]" ) args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) for i, template in blib.iter_items(templates_to_process, start, end): page = pywikibot.Page(site, "Template:%s" % template) blib.do_edit(page, i, process_page, save=args.save, verbose=args.verbose)
parser.add_argument("--comment", help="Comment to use when saving pages.", required=True) args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) fulltext = codecs.open(args.textfile, "r", "utf-8").read() titles_and_text = re.split(r"\n\n\n\n+", fulltext) assert len(titles_and_text) % 2 == 0 title_and_text_pairs = [] for i in xrange(0, len(titles_and_text), 2): title_and_text_pairs.append((titles_and_text[i], titles_and_text[i + 1])) for i, (pagetitle, pagetext) in blib.iter_items(title_and_text_pairs, start, end, get_name=lambda x: x[0]): def handler(page, index, parsed): return process_page(page, index, pagetext, args.comment.decode('utf-8')) blib.do_edit(pywikibot.Page(site, pagetitle), i, handler, save=args.save, verbose=args.verbose)
help="File containing derived lemmas") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) derived_lemmas = [] if args.derived_lemmafile: derived_lemmas = [ rulib.remove_accents(x.strip()) for x in codecs.open(args.derived_lemmafile, "r", "utf-8") ] else: for i, page in blib.cat_articles( "Russian adverbs" if args.adverbs else "Russian nouns" if args. nouns else "Russian adjectives"): derived_lemmas.append(page.title()) if args.base_lemmafile: for i, pagename in blib.iter_items([ rulib.remove_accents(x.strip()) for x in codecs.open(args.base_lemmafile, "r", "utf-8") ]): page = pywikibot.Page(site, pagename) process_page(i, page, args.save, args.verbose, derived_lemmas) else: for category in ["Russian adjectives"] if args.adverbs else [ "Russian proper nouns", "Russian nouns", "Russian verbs" ]: for i, page in blib.cat_articles(category, start, end): process_page(i, page, args.save, args.verbose, args.adverbs, derived_lemmas)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pywikibot, re, sys, codecs, argparse

import blib
from blib import getparam, rmparam, msg, site

parser = blib.create_argparser(u"Find verbs with impersonal conjugations")
parser.add_argument('--verbfile', help="File listing verbs to check.")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, line in blib.iter_items(codecs.open(args.verbfile, "r", "utf-8"), start, end):
  page = pywikibot.Page(site, line.strip())
  if "-impers|" in page.text:
    msg("Page %s %s: Found impersonal conjugation" % (i, unicode(page.title())))
  else:
    msg("Page %s %s: No impersonal conjugation" % (i, unicode(page.title())))
msg("Page %s %s: %s" % (index, pagetitle, txt)) origcontents = origpages.get(pagetitle, None) newtext = newpages.get(pagetitle, None) if not newtext: pagemsg("Skipping because not found in among new page contents") return if origcontents == newtext: pagemsg("Page %s %s: Skipping contents for %s because no change" % pagetitle) return return process_page(index, page, newtext, origcontents, args.verbose, args.comment.decode("utf-8"), args.lang_only and args.lang_only.decode("utf-8"), args.allow_page_creation) blib.do_pagefile_cats_refs(args, start, end, do_process_page, edit=True) else: lines = codecs.open(args.direcfile.decode("utf-8"), "r", "utf-8") index_pagetitle_and_text = blib.yield_text_from_find_regex(lines, args.verbose) for _, (index, pagetitle, newtext) in blib.iter_items(index_pagetitle_and_text, start, end, get_name=lambda x:x[1], get_index=lambda x:x[0]): origcontents = origpages.get(pagetitle, None) if origcontents == newtext: msg("Page %s %s: Skipping contents for %s because no change" % (index, pagetitle, pagetitle)) else: def do_process_page(page, index, parsed): return process_page(index, page, newtext, origcontents, args.verbose, args.comment.decode("utf-8"), args.lang_only and args.lang_only.decode("utf-8"), args.allow_page_creation) blib.do_edit(pywikibot.Page(site, pagetitle), index, do_process_page, save=args.save, verbose=args.verbose, diff=args.diff)
doc_comment = "Delete documentation page of " + re.sub( "^([Dd]elete|[Rr]emove) ", "", comment) def delete_page(page, comment): for i in range(11): try: page.delete(comment) return except APIError as e: if i == 10: raise e errandmsg("APIError, try #%s: %s" % (i + 1, e)) for i, pagename in blib.iter_items(pages_to_delete, start, end): page = pywikibot.Page(site, pagename) if page.exists(): msg("Deleting %s (comment=%s)" % (page.title(), comment)) delete_page(page, '%s (content was "%s")' % (comment, unicode(page.text))) errandmsg("Page [[%s]] deleted" % page.title()) if args.delete_docs: doc_page = pywikibot.Page(site, "%s/documentation" % pagename) if doc_page.exists(): msg("Deleting %s (comment=%s)" % (doc_page.title(), doc_comment)) delete_page( doc_page, '%s (content was "%s")' % (doc_comment, unicode(doc_page.text))) errandmsg("Page [[%s]] deleted" % doc_page.title())
def read_pages(filename, start, end):
  lines = [x.strip() for x in codecs.open(filename, "r", "utf-8")]
  for i, line in blib.iter_items(lines, start, end):
    if line.startswith("#"):
      continue
    yield i, line
parser = blib.create_argparser("Add pronunciation sections to Latin Wiktionary entries", include_pagefile=True) parser.add_argument('--lemma-file', help="File containing lemmas to process, one per line; non-lemma forms will be done") parser.add_argument('--lemmas', help="List of comma-separated lemmas to process; non-lemma forms will be done") parser.add_argument("--slots", help="Slots to process in conjunction with --lemmas and --lemma-file.") parser.add_argument('--override-pronun', action="store_true", help="Override existing pronunciations") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) if args.lemma_file or args.lemmas: slots = args.slots.split(",") if args.lemma_file: lemmas = read_pages(args.lemma_file, start, end) else: lemmas = blib.iter_items(re.split(",", args.lemmas.decode("utf-8")), start, end) for i, lemma in lemmas: process_lemma(i, lalib.remove_macrons(lemma), slots, args) else: def do_process_page(page, index, parsed): return process_page(index, page, args) blib.do_pagefile_cats_refs(args, start, end, do_process_page, default_cats=["Latin lemmas", "Latin non-lemma forms"], edit=True) def subval_to_string(subval): if type(subval) is tuple: pron, extra_params, pre, post = subval return unicode(FoundPronun(pron, extra_params, pre, post)) else: return subval
"oblique plural of", "oblique singular of", "terminative plural of", "terminative singular of", "ancient form of", "early form of", "late form of", "masculine animate plural past participle of", "masculine inanimate plural past participle of", "masculine singular past participle of", "neuter plural past participle of", "dative dual of", "dative plural definite of", "dative plural indefinite of", "paucal of", "second-person singular of", ] for i, temp in blib.iter_items(templates_to_delete, start, end): template_page = pywikibot.Page(site, "Template:%s" % temp) if template_page.exists(): template_page.delete( 'Delete obsoleted and orphaned form-of template (content was "%s")' % unicode(template_page.text)) template_doc_page = pywikibot.Page(site, "Template:%s/documentation" % temp) if template_doc_page.exists(): template_doc_page.delete( 'Delete documentation page of obsoleted and orphaned form-of template (content was "%s")' % unicode(template_doc_page.text))
  text = re.sub("\n\n\n+", "\n\n", text)
  if not notes:
    notes.append("convert 3+ newlines to 2")
  return text, notes

parser = blib.create_argparser("Add missing declension to Latin terms")
parser.add_argument("--direcfile",
  help="File of output directives from make_latin_missing_decl.py", required=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

lines = [x.rstrip('\n') for x in codecs.open(args.direcfile, "r", "utf-8")]
for i, line in blib.iter_items(lines, start, end):
  m = re.search("^Page [0-9]+ (.*?): For noun (.*?), declension (.*?)$", line)
  if not m:
    msg("Unrecognized line, skipping: %s" % line)
  else:
    pagename, headword_template, decl_template = m.groups()
    def do_process_page(page, index, parsed):
      return process_page(page, index, headword_template, decl_template)
    blib.do_edit(pywikibot.Page(site, pagename), i, do_process_page,
      save=args.save, verbose=args.verbose,
parser.add_argument("--field", help="Field containing terms", type=int, default=1) parser.add_argument("--output-orig", help="Output original lines", action="store_true") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) lemmas = set() msg("Reading %s lemmas" % args.lang) for i, page in blib.cat_articles("%s lemmas" % args.lang, start, end): lemmas.add(unicode(page.title())) words_freq = {} lines = [re.split(r"\s", x.strip()) for x in codecs.open(args.pagefile, "r", "utf-8")] lines = [(x[args.field - 1], x) for x in lines] for i, (pagename, origline) in blib.iter_items(lines, start, end): m = re.search(u"[^-'Ѐ-џҊ-ԧꚀ-ꚗ]", pagename) if m: outtext = "skipped due to non-Cyrillic characters" else: for pagenm, pagetype in [(pagename, ""), (pagename.capitalize(), " (capitalized)"), (pagename.upper(), " (uppercased)")]: if pagenm in lemmas: outtext = "exists%s" % pagetype break else: page = pywikibot.Page(site, pagenm) if page.exists(): text = unicode(page.text) if re.search("#redirect", text, re.I):
if args.lang not in ["uk", "be"]:
  raise ValueError("Unrecognized language: %s" % args.lang)

lines = [x.strip() for x in codecs.open(args.declfile, "r", "utf-8")]

def yield_decls():
  for line in lines:
    found_ndecl_style = False
    for m in re.finditer(r"\{\{(?:User:Benwing2/)?" + args.lang + r"-ndecl\|(.*?)\}\}", line):
      found_ndecl_style = True
      yield m.group(1)
    if not found_ndecl_style:
      for m in re.finditer(r"\(\(.*?\)\)|[^| \[\]]+<.*?\>", line):
        yield m.group(0)

for index, decl in blib.iter_items(yield_decls(), start, end):
  module = uk if args.lang == "uk" else be
  if decl.startswith("(("):
    m = re.search(r"^\(\((.*)\)\)$", decl)
    subdecls = m.group(1).split(",")
    decl_for_page = subdecls[0]
  else:
    decl_for_page = decl
  m = re.search(r"^(.+?)<.*>$", decl_for_page)
  if not m:
    msg("WARNING: Can't extract lemma from decl: %s" % decl)
    pagename = "UNKNOWN"
  else:
    pagename = module.remove_accents(blib.remove_links(m.group(1)))
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
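# Illustration (hypothetical input line) of what yield_decls() produces and how
# the page name is derived above: a --declfile line containing "{{uk-ndecl|ву́лиця<>}}"
# yields the decl "ву́лиця<>"; the lemma "ву́лиця" is taken from the text before "<",
# and remove_accents() strips the stress mark to give the page name "вулиця".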
from blib import getparam, rmparam, msg, site

parser = blib.create_argparser(u"Find red links")
parser.add_argument("--pagefile", help="File containing pages to check")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

lemmas = set()
msg("Reading Bulgarian lemmas")
for i, page in blib.cat_articles("Bulgarian lemmas", start, end):
  lemmas.add(unicode(page.title()))

lines = [x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")]
words = lines
for i, line in blib.iter_items(words, start, end):
  pagename, freq = line.split("\t")
  m = re.search(u"[^-Ѐ-џҊ-ԧꚀ-ꚗ]", pagename)
  def fmsg(txt):
    msg("Page %s [[%s]]: %s (freq %s)" % (i, pagename, txt, freq))
  if m:
    fmsg("skipped due to non-Cyrillic characters")
  else:
    for pagenm, pagetype in [(pagename, ""), (pagename.capitalize(), " (capitalized)"),
        (pagename.upper(), " (uppercased)")]:
      if pagenm in lemmas:
        fmsg("exists%s" % pagetype)
        break
      else:
        page = pywikibot.Page(site, pagenm)
from blib import getparam, rmparam, msg, site

def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return
  # NOTE: the code below is unreachable; the script has been disabled by the
  # early return above.
  pagemsg("Processing")
  return "#REDIRECT [[Module:ru-verb/documentation]]", "redirect to [[Module:ru-verb/documentation]]"

parser = blib.create_argparser("Redirect ru-conj-* documentation pages")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

types = ["7a", "7b", "8a", "8b", "9a", "9b", "10a", "10c", "11a", "11b", "12a", "12b",
  "13b", "14a", "14b", "14c", "15a", "16a", "16b",
  u"irreg-бежать", u"irreg-спать", u"irreg-хотеть", u"irreg-дать", u"irreg-есть",
  u"irreg-сыпать", u"irreg-лгать", u"irreg-мочь", u"irreg-слать", u"irreg-идти",
  u"irreg-ехать", u"irreg-минуть", u"irreg-живописать-миновать", u"irreg-лечь",
  u"irreg-зиждиться", u"irreg-клясть", u"irreg-слыхать-видать", u"irreg-стелить-стлать",
  u"irreg-быть", u"irreg-ссать-сцать", u"irreg-чтить", u"irreg-ошибиться",
  u"irreg-плескать", u"irreg-внимать", u"irreg-обязывать"]

for i, ty in blib.iter_items(types, start, end):
  template = "Template:ru-conj-%s/documentation" % ty
  blib.do_edit(pywikibot.Page(site, template), i, process_page,
    save=args.save, verbose=args.verbose, diff=args.diff)
  else:
    ru_proper_noun_changed = 1
  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed

parser = blib.create_argparser("Copy the declension in ru-noun-table to ru-noun+, preserving any m=, f=, g=, etc. in the latter.")
parser.add_argument('--cats', default="nouns,proper nouns",
  help="Categories to do ('nouns', 'proper nouns' or 'nouns,proper nouns')")
parser.add_argument('--lemma-file',
  help="File containing lemmas to copy declension of. Will remove extraneous params from ru-noun-table and copy links to ru-noun-table regardless of this.")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.lemma_file:
  lemmas = set([x.strip() for x in codecs.open(args.lemma_file, "r", "utf-8")])
else:
  lemmas = None

for cat in re.split(",", args.cats):
  if cat == "nouns":
    template = "Template:ru-noun+"
  elif cat == "proper nouns":
    template = "Template:ru-proper noun+"
  else:
    raise ValueError("Invalid value to --cats: %s" % cat)
  msg("Processing references to %s" % template)
  if lemmas:
    for i, page in blib.iter_items(lemmas, start, end):
      process_page(i, pywikibot.Page(site, page), args.save, args.verbose, lemmas)
  else:
    for i, page in blib.references(template, start, end):
      process_page(i, page, args.save, args.verbose, lemmas)
help="File of ///-separated pairs of base declensions to move") parser.add_argument('--comment', help="Comment to use when deleting") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) pages_to_move = [ x.rstrip('\n').split("///") for x in codecs.open(args.pagefile, "r", "utf-8") ] comment = args.comment or "Move erroneously-created non-lemma form" endings = ["e", "en", "er", "em", "es"] for i, (frombase, tobase) in blib.iter_items(pages_to_move, start, end, get_name=lambda x: x[1]): for ending in endings: page = pywikibot.Page(site, frombase + ending) def pagemsg(txt): msg("Page %s %s: %s" % (i, unicode(page.title()), txt)) topagename = tobase + ending if page.exists(): if pywikibot.Page(site, topagename).exists(): pagemsg( "WARNING: Destination page %s already exists, not moving" % topagename) else: pagemsg("Moving to %s (comment=%s)" % (topagename, comment))
def process_page(page, index):
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  text = unicode(page.text)

  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval

  parsed = blib.parse_text(secbody)
  saw_noun = None
  saw_proper_noun = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-noun":
      if saw_noun:
        pagemsg("WARNING: Saw multiple nouns %s and %s, not sure how to proceed, skipping"
          % (unicode(saw_noun), unicode(t)))
        return
      saw_noun = t
    elif tn == "la-proper noun":
      if saw_proper_noun:
        pagemsg("WARNING: Saw multiple proper nouns %s and %s, not sure how to proceed, skipping"
          % (unicode(saw_proper_noun), unicode(t)))
        return
      saw_proper_noun = t

  if saw_noun and saw_proper_noun:
    pagemsg("WARNING: Saw both noun and proper noun, can't correct header/headword")
    return
  if not saw_noun and not saw_proper_noun:
    pagemsg("WARNING: Saw neither noun nor proper noun, can't correct header/headword")
    return

  pos = "pn" if saw_proper_noun else "n"
  ht = saw_proper_noun or saw_noun
  if getparam(ht, "indecl"):
    pagemsg("Noun is indeclinable, skipping: %s" % unicode(ht))
    return

  generate_template = blib.parse_text(unicode(ht)).filter_templates()[0]
  blib.set_template_name(generate_template, "la-generate-noun-forms")
  blib.remove_param_chain(generate_template, "lemma", "lemma")
  blib.remove_param_chain(generate_template, "m", "m")
  blib.remove_param_chain(generate_template, "f", "f")
  blib.remove_param_chain(generate_template, "g", "g")
  rmparam(generate_template, "type")
  rmparam(generate_template, "indecl")
  rmparam(generate_template, "id")
  rmparam(generate_template, "pos")
  result = expand_text(unicode(generate_template))
  if not result:
    pagemsg("WARNING: Error generating forms, skipping")
    return
  tempargs = blib.split_generate_args(result)

  forms_seen = set()
  slots_and_forms_to_process = []
  for slot, formarg in tempargs.iteritems():
    forms = formarg.split(",")
    for form in forms:
      if "[" in form or "|" in form:
        continue
      form_no_macrons = lalib.remove_macrons(form)
      if form_no_macrons == pagetitle:
        continue
      if form_no_macrons in forms_seen:
        continue
      forms_seen.add(form_no_macrons)
      slots_and_forms_to_process.append((slot, form))

  for index, (slot, form) in blib.iter_items(
      sorted(slots_and_forms_to_process, key=lambda x: lalib.remove_macrons(x[1]))):
    def handler(page, index, parsed):
      return process_form(page, index, slot, form, pos)
    blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)), index, handler,
      save=args.save, verbose=args.verbose, diff=args.diff)
lines = [x for x in lines if x]

def get_items(lines):
  for line in lines:
    m = re.search("^Page ([0-9]*) (.*): <respelling> *(.*?) *<end>", line)
    if not m:
      # Not a warning, there will be several of these from output of snarf_it_pron.py
      msg("Unrecognized line: %s" % line)
    else:
      yield m.groups()

for _, (index, pagetitle, spec) in blib.iter_items(get_items(lines), start, end,
    get_name=lambda x: x[1], get_index=lambda x: x[0]):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  page = pywikibot.Page(site, pagetitle)
  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
  else:
    def do_process_page(page, index, parsed):
      return process_page(index, page, spec)
    blib.do_edit(page,
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import blib
from blib import msg
import sys
import lalib

parser = blib.create_argparser("Remove Latin macrons from input", no_beginning_line=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for index, line in blib.iter_items(sys.stdin, start, end):
  line = line.strip().decode('utf-8')
  msg(lalib.remove_macrons(line))
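# Example invocation (hypothetical filenames): the script acts as a stdin/stdout
# filter, reading macronned Latin forms one per line and writing the demacronized
# forms via msg():
#
#   python remove_latin_macrons.py < macronned_forms.txt > plain_forms.txt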
def process_page(page, index):
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  text = unicode(page.text)

  retval = lalib.find_heads_and_defns(text, pagemsg)
  if retval is None:
    return None, None
  (sections, j, secbody, sectail, has_non_latin, subsections, parsed_subsections,
    headwords, pronun_sections, etym_sections) = retval

  part_headwords = []
  adj_headwords = []
  pn_headwords = []
  noun_headwords = []
  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    if tn == "la-part" or tn == "head" and getparam(ht, "1") == "la" and getparam(
        ht, "2") in ["participle", "participles"]:
      part_headwords.append(headword)
    elif tn == "la-adj" or tn == "head" and getparam(ht, "1") == "la" and getparam(
        ht, "2") in ["adjective", "adjectives"]:
      adj_headwords.append(headword)
    elif tn == "la-proper noun" or tn == "head" and getparam(ht, "1") == "la" and getparam(
        ht, "2") in ["proper noun", "proper nouns"]:
      pn_headwords.append(headword)
    elif tn == "la-noun" or tn == "head" and getparam(ht, "1") == "la" and getparam(
        ht, "2") in ["noun", "nouns"]:
      noun_headwords.append(headword)

  headwords_to_do = None
  if part_headwords and not adj_headwords:
    pos = "part"
    headwords_to_do = part_headwords
    expected_inflt = "la-adecl"
  elif pn_headwords and not noun_headwords:
    pos = "pn"
    headwords_to_do = pn_headwords
    expected_inflt = "la-ndecl"
  if not headwords_to_do:
    return None, None

  for headword in headwords_to_do:
    for inflt in headword['infl_templates']:
      infltn = tname(inflt)
      if infltn != expected_inflt:
        pagemsg("WARNING: Saw bad declension template for %s, expected {{%s}}: %s"
          % (pos, expected_inflt, unicode(inflt)))
        continue
      inflargs = lalib.generate_infl_forms(pos, unicode(inflt), errandpagemsg, expand_text)
      forms_seen = set()
      slots_and_forms_to_process = []
      for slot, formarg in inflargs.iteritems():
        forms = formarg.split(",")
        for form in forms:
          if "[" in form or "|" in form:
            continue
          form_no_macrons = lalib.remove_macrons(form)
          if form_no_macrons == pagetitle:
            continue
          if form_no_macrons in forms_seen:
            continue
          forms_seen.add(form_no_macrons)
          slots_and_forms_to_process.append((slot, form))
      for formindex, (slot, form) in blib.iter_items(
          sorted(slots_and_forms_to_process, key=lambda x: lalib.remove_macrons(x[1]))):
        def handler(page, formindex, parsed):
          return process_form(page, formindex, slot, form, pos, pagemsg)
        blib.do_edit(pywikibot.Page(site, lalib.remove_macrons(form)),
          "%s.%s" % (index, formindex), handler,
          save=args.save, verbose=args.verbose, diff=args.diff)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
  elif warn_on_no_change:
    pagemsg("WARNING: No changes")

parser = blib.create_argparser(u"Fix indentation of Pronunciation, Declension, Conjugation, Alternative forms sections")
parser.add_argument("--pagefile", help="""List of pages to process.""")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.pagefile:
  lines = [x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")]
  for i, line in blib.iter_items(lines, start, end):
    m = re.search("^Page [0-9]+ (.*?): WARNING: .*?$", line)
    if not m:
      msg("WARNING: Can't process line: %s" % line)
    else:
      page = m.group(1)
      process_page(i, pywikibot.Page(site, page), args.save, args.verbose,
        warn_on_no_change=True)
else:
  for cat in ["Russian lemmas", "Russian non-lemma forms"]:
    msg("Processing category %s" % cat)
    for i, page in blib.cat_articles(cat, start, end):
      process_page(i, page, args.save, args.verbose)
if args.fix_pagefile:
  fixdireclines = [x.strip() for x in codecs.open(args.fix_pagefile, "r", "utf-8")]
  fixdirecs = {}
  fixpages = []
  for line in fixdireclines:
    verb, direc = re.split(" ", line)
    fixdirecs[verb] = direc
    fixpages.append(verb)
  def do_process_page(page, index, parsed):
    return process_page(page, index, fixdirecs)
  for i, page in blib.iter_items(fixpages, start, end):
    blib.do_edit(pywikibot.Page(site, page), i, do_process_page,
      save=args.save, verbose=args.verbose, diff=args.diff)
else:
  def do_process_page(page, index, parsed):
    return process_page(page, index, {})
  for category in ["Russian verbs"]:
    for i, page in blib.cat_articles(category, start, end):
      blib.do_edit(pywikibot.Page(site, page), i,
  if notes:
    comment = "Add inanimacy to neuters (%s)" % "; ".join(notes)
  else:
    comment = "Add inanimacy to neuters"
  return unicode(parsed), notes

parser = blib.create_argparser("Fix hard-е nouns according to directives")
parser.add_argument("--direcfile", help="File listing directives to apply to nouns", required=True)
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

for i, line in blib.iter_items(codecs.open(args.direcfile, "r", "utf-8"), start, end):
  line = line.strip()
  if "!!!" in line:
    page, direc = re.split("!!!", line)
  else:
    page, direc = re.split(" ", line)
  def do_process_page(page, index, parsed):
    return process_page(index, page, direc)
  blib.do_edit(pywikibot.Page(site, page), i, do_process_page,
    save=args.save, verbose=args.verbose, diff=args.diff)
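# Sketch of the --direcfile line formats handled by the loop above (directive
# values here are hypothetical): a line such as "сердце!!!3*c" splits on "!!!"
# into page "сердце" and directive "3*c"; a line with a single space, such as
# "море 1c", splits into page "море" and directive "1c" instead.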
if __name__ == "__main__": parser = blib.create_argparser("Push new entries from generate_entries.py") parser.add_argument('--direcfile', help="File containing entries.") parser.add_argument('--comment', help="Comment to use.", required="true") parser.add_argument('--lang', help="Language of entries.", required="true") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) lines = codecs.open(args.direcfile, "r", "utf-8") index_pagename_and_text = blib.yield_text_from_find_regex( lines, args.verbose) for _, (index, pagename, text) in blib.iter_items(index_pagename_and_text, start, end, get_name=lambda x: x[1], get_index=lambda x: x[0]): def do_process_page(page, index, parsed): return process_page(index, page, text, args.lang, args.verbose, args.comment.decode("utf-8")) blib.do_edit(pywikibot.Page(site, pagename), index, do_process_page, save=args.save, verbose=args.verbose, diff=args.diff)
if tn == "form of": lang = getparam(t, "lang") if lang: form = getparam(t, "1") else: form = getparam(t, "2") form_of_forms[form] += 1 parser = blib.create_argparser("Clean up bad inflection tags") parser.add_argument("--textfile", help="File containing inflection templates to process.") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) if args.textfile: with codecs.open(args.textfile, "r", "utf-8") as fp: text = fp.read() pages = re.split('\nPage [0-9]+ ', text) title_text_split = ': Found match for regex: ' for index, page in blib.iter_items(pages, start, end): if not page: # e.g. first entry continue split_vals = re.split(title_text_split, page, 1) if len(split_vals) < 2: msg("Page %s: Skipping bad text: %s" % (index, page)) continue pagetitle, pagetext = split_vals process_text_on_page(pagetitle, index, pagetext) for form, count in sorted(list(form_of_forms.iteritems()), key=lambda x: -x[1]): msg("%-50s = %s" % (form, count))
words_freq = {}
lines = [x.strip() for x in codecs.open(args.pagefile, "r", "utf-8")]
if args.with_freq:
  for line in lines:
    freq, word = re.split(r"\s", line)
    freq = int(freq)
    if word in words_freq:
      words_freq[word] += freq
    else:
      words_freq[word] = freq
  words = [x[0] for x in sorted(words_freq.items(), key=lambda y: -y[1])]
else:
  words = lines

for i, pagename in blib.iter_items(words, start, end):
  m = re.search(u"[^-Ѐ-џҊ-ԧꚀ-ꚗ]", pagename)
  if m:
    msg("Page %s [[%s]]: skipped due to non-Cyrillic characters" % (i, pagename))
  else:
    for pagenm, pagetype in [(pagename, ""), (pagename.capitalize(), " (capitalized)"),
        (pagename.upper(), " (uppercased)")]:
      if pagenm in lemmas:
        msg("Page %s [[%s]]: exists%s" % (i, pagename, pagetype))
        break
      else:
        page = pywikibot.Page(site, pagenm)
        if page.exists():
          if re.search("#redirect", unicode(page.text), re.I):
            msg("Page %s [[%s]]: exists%s as redirect" % (i, pagename, pagetype))
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pywikibot, re, sys, codecs, argparse

import blib
from blib import getparam, rmparam, msg, site

parser = blib.create_argparser(u"List pages, lemmas and/or non-lemmas", include_pagefile=True)
parser.add_argument('--namespace', help="List all pages in namespace")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.namespace:
  ns = args.namespace.decode("utf-8")
  for i, page in blib.iter_items(
      site.allpages(start=start if isinstance(start, basestring) else '!',
        namespace=ns, filterredir=False),
      start, end):
    msg("Page %s %s: Processing" % (i, unicode(page.title())))
else:
  def process_page(page, index):
    msg("Page %s %s: Processing" % (index, unicode(page.title())))
  blib.do_pagefile_cats_refs(args, start, end, process_page)
notes.append("add (manually specified) Etymology section to Russian lemma") break else: errandpagemsg("WARNING: Can't find Russian section, skipping") return if newtext != pagetext: if verbose: pagemsg("Replacing <%s> with <%s>" % (pagetext, newtext)) assert notes comment = "; ".join(group_notes(notes)) if save: blib.safe_page_save(page, comment, errandpagemsg) else: pagemsg("Would save with comment = %s" % comment) if __name__ == "__main__": parser = blib.create_argparser("Add etymologies to Russian pages based on directives") parser.add_argument('--direcfile', help="File containing directives.") parser.add_argument('--add-passive-of', action='store_true', help="Add {{passive of|lang=ru|...}} to defn.") parser.add_argument('--override-etym', action='store_true', help="Automatically override any existing etymologies.") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) lines = codecs.open(args.direcfile, "r", "utf-8") for i, line in iter_items(lines, start, end): line = line.strip() process_line(i, line, args.add_passive_of, args.override_etym, args.save, args.verbose)
      unicode(t.name), " (NEEDS REVIEW)" if fixed_plural_warning else ""))
  newtext = unicode(parsed)
  if newtext != text:
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      blib.try_repeatedly(lambda: page.save(comment=comment), pagemsg, "save page")
    else:
      pagemsg("Would save with comment = %s" % comment)

parser = blib.create_argparser("Convert head|fr|* to fr-*")
parser.add_argument("--fix-missing-plurals", action="store_true",
  help="Fix cases with missing plurals by just assuming the default plural.")
parser.add_argument("--lemma-file", help="File containing lemmas to do.")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)

if args.lemma_file:
  lines = [x.strip() for x in codecs.open(args.lemma_file, "r", "utf-8")]
  for i, pagename in blib.iter_items(lines, start, end):
    process_page(i, pywikibot.Page(site, pagename), args.save, args.verbose,
      args.fix_missing_plurals)
else:
  for cat in ["French nouns", "French proper nouns", "French pronouns", "French determiners",
      "French adjectives", "French verbs", "French participles", "French adverbs",
      "French prepositions", "French conjunctions", "French interjections", "French idioms",
      "French phrases", "French abbreviations", "French acronyms", "French initialisms",
      "French noun forms", "French proper noun forms", "French pronoun forms",
      "French determiner forms", "French verb forms", "French adjective forms",
      "French participle forms", "French proverbs", "French prefixes", "French suffixes",
      "French diacritical marks", "French punctuation marks"]:
  #for cat in ["French adjective forms", "French participle forms", "French proverbs", "French prefixes", "French suffixes", "French diacritical marks", "French punctuation marks"]:
    msg("Processing category: %s" % cat)
    for i, page in blib.cat_articles(cat, start, end):
      process_page(i, page, args.save, args.verbose, args.fix_missing_plurals)
lines = [x.strip() for x in codecs.open(args.cmu, "r", "iso8859-1") if not x.startswith(";;;")]
joined_lines = []
prev_word = None
seen_pronuns = []
for line in lines:
  word, pronun = re.split(" ", line)
  m = re.search(r"^(.*)\([0-9]+\)$", word)
  if m and m.group(1) == prev_word:
    seen_pronuns.append(pronun)
  else:
    if prev_word:
      joined_lines.append([prev_word, seen_pronuns])
    prev_word = word
    seen_pronuns = [pronun]
if prev_word:
  joined_lines.append([prev_word, seen_pronuns])
for i, line in blib.iter_items(joined_lines, start, end):
  word, pronuns = line
  process_cmu_line(i, word, pronuns)
for i, onset in enumerate(list(sorted(seen_onsets))):
  msg("#%3s %s" % (i, onset))

if args.moby:
  lines = [x.strip() for x in codecs.open(args.moby, "r", "mac_roman")]
  for i, line in blib.iter_items(lines, start, end):
    word, pronun = re.split(" ", line)
    process_moby_line(i, word, pronun)