import pywikibot
import re
import sys
import codecs
import argparse
from collections import defaultdict

import blib
from blib import getparam, rmparam, msg, errandmsg, site, tname


def process_text_on_page(index, pagetitle, text):
    """Log the names of the level-2 (``==Name==``) sections found in `text`.

    Args:
        index: Page index, used only as a prefix in the log message.
        pagetitle: Page title; checked with blib.page_should_be_ignored() and
            used in the log message.
        text: Full wikitext of the page.

    Returns:
        None. The result is emitted through msg() as a single
        "Languages = a,b,c" line; ignored pages produce no output.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    #pagemsg("Processing")

    if blib.page_should_be_ignored(pagetitle):
        #pagemsg("WARNING: Page should be ignored")
        return

    # Splitting on a pattern with a capturing group keeps the matched headers:
    # they land at the odd indices, with the text between them at the even
    # indices.
    header_re = re.compile("(^==[^=\n]+==\n)", re.M)
    pieces = header_re.split(text)
    langs = [
        re.search("^==(.*)==$", header).group(1)
        for header in pieces[1::2]
    ]
    pagemsg("Languages = %s" % ",".join(langs))


# Script driver: read a dump from stdin and report the sections of each page.
parser = blib.create_argparser("Find languages on pages")
args = parser.parse_args()
start, end = blib.parse_start_end(args.start, args.end)
blib.parse_dump(sys.stdin, process_text_on_page, startsort=start, endsort=end)
for k in xrange(1, len(splitsections), 2): if splitsections[k] == "English": saw_english = True else: saw_langs.add(splitsections[k]) if saw_english: english_pages[pagetitle] = saw_langs def process_line(index, line): m = re.search("^Page [0-9]+ (.*?): Replacing (.*) with (.*) in .* section in (.*)$", line) if not m: return pagetitle, fromtext, totext, lang = m.groups() def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) for m in re.finditer(r"\{\{(?:m|l|term)\|.*?\|(.*?)\}\}", totext): linkpage = m.group(1) if linkpage in english_pages and lang not in english_pages[linkpage]: pagemsg("Possible false positive for [[%s]] in %s: %s" % (linkpage, lang, fromtext)) parser = blib.create_argparser("Check for likely false-positive links converted from raw links") parser.add_argument("--direcfile", help="File of output from fix_links.py") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) blib.parse_dump(sys.stdin, find_english_pages) for index, line in blib.iter_items(codecs.open(args.direcfile, "r", encoding="utf-8"), start, end): process_line(index, line)
pn = pname(param) if pn not in ["1", "g", "g2", "g3", "g4"]: pagemsg("WARNING: Extraneous param %s=: %s" % (pn, unicode(t))) return None, None def process_page(page, index, parsed): pagetitle = unicode(page.title()) text = unicode(page.text) return process_text_on_page(index, pagetitle, text) parser = blib.create_argparser( "Check for Latin non-lemma forms with bad params") parser.add_argument("--stdin", help="Read dump from stdin.", action="store_true") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) if args.stdin: blib.parse_dump(sys.stdin, process_text_on_page) else: for i, page in blib.cat_articles("Latin non-lemma forms", start, end): blib.do_edit(page, i, process_page, save=args.save, verbose=args.verbose)