def infer_one_page_decls(page, index, text):
  """Error-trapping wrapper around infer_one_page_decls_1().

  On StandardError, logs a warning with the page title plus a traceback and
  returns (None, None) so the caller can continue with the next page.
  """
  try:
    result = infer_one_page_decls_1(page, index, text)
  except StandardError as e:
    msg("%s %s: WARNING: Got an error: %s" % (index, unicode(page.title()), repr(e)))
    traceback.print_exc(file=sys.stdout)
    return None, None
  return result
def rewrite_pages(refrom, reto, refs, cat, pages, pagefile, pagetitle_sub, comment, filter_pages, save, verbose, startFrom, upTo):
  """Apply the parallel regex replacement lists REFROM -> RETO to a set of pages.

  The page set comes from, in priority order: an explicit list PAGES, a file
  PAGEFILE (one title per line), pages referencing REFS, or category CAT.
  PAGETITLE_SUB, if given, is a placeholder in the regexes that gets replaced
  with the current page title (regex-escaped on the from side). Pages whose
  title doesn't match FILTER_PAGES are skipped. Editing/saving is delegated
  to blib.do_edit().
  """
  def rewrite_one_page(page, index, text):
    #blib.msg("From: [[%s]], To: [[%s]]" % (refrom, reto))
    text = unicode(text)
    # Normalize shadda+vowel ordering so Arabic regexes match consistently.
    text = reorder_shadda(text)
    zipped_fromto = zip(refrom, reto)
    for fromval, toval in zipped_fromto:
      if pagetitle_sub:
        # Substitute the placeholder with the (escaped) page title.
        pagetitle = unicode(page.title())
        fromval = fromval.replace(pagetitle_sub, re.escape(pagetitle))
        toval = toval.replace(pagetitle_sub, pagetitle)
      text = re.sub(fromval, toval, text)
    return text, comment or "replace %s" % (", ".join("%s -> %s" % (f, t) for f, t in zipped_fromto))
  if pages:
    pages = ((pywikibot.Page(blib.site, page), index) for page, index in blib.iter_pages(pages, startFrom, upTo))
  elif pagefile:
    lines = [x.strip() for x in codecs.open(pagefile, "r", "utf-8")]
    pages = ((pywikibot.Page(blib.site, page), index) for page, index in blib.iter_pages(lines, startFrom, upTo))
  elif refs:
    pages = blib.references(refs, startFrom, upTo, includelinks=True)
  else:
    pages = blib.cat_articles(cat, startFrom, upTo)
  for page, index in pages:
    pagetitle = unicode(page.title())
    if filter_pages and not re.search(filter_pages, pagetitle):
      blib.msg("Skipping %s because doesn't match --filter-pages regex %s" % (pagetitle, filter_pages))
    else:
      if verbose:
        blib.msg("Processing %s" % pagetitle)
      blib.do_edit(page, index, rewrite_one_page, save=save, verbose=verbose)
def do_one_page_verb(page, index, text):
  """Strip i3rab (final short-vowel diacritics) from the vn= (verbal noun)
  parameter of every {{ar-conj}} template on the page.

  A trailing "?" on vn= marks the verbal nouns as uncertain and is preserved.
  Returns (text, changelog).
  """
  pagename = page.title()
  verbcount = 0
  verbids = []
  for template in text.filter_templates():
    if template.name == "ar-conj":
      verbcount += 1
      vnvalue = getparam(template, "vn")
      uncertain = False
      if vnvalue.endswith("?"):
        vnvalue = vnvalue[:-1]
        msg("Page %s %s: Verbal noun(s) identified as uncertain" % (
          index, pagename))
        uncertain = True
      if not vnvalue:
        continue
      # Verbal nouns may be separated by either an ASCII or an Arabic comma.
      vns = re.split(u"[,،]", vnvalue)
      form = getparam(template, "1")
      verbid = "#%s form %s" % (verbcount, form)
      # Form I verbs are ambiguous without their vowels (params 2 and 3),
      # so include them in the identifier used in log messages.
      if re.match("^[1I](-|$)", form):
        verbid += " (%s,%s)" % (getparam(template, "2"), getparam(template, "3"))
      no_i3rab_vns = []
      for vn in vns:
        no_i3rab_vns.append(remove_i3rab(pagename, index, verbid, vn))
      newvn = ",".join(no_i3rab_vns)
      if uncertain:
        newvn += "?"
      if newvn != vnvalue:
        msg("Page %s %s: Verb %s, replacing %s with %s" % (
          index, pagename, verbid, vnvalue, newvn))
        addparam(template, "vn", newvn)
      verbids.append(verbid)
  return text, "Remove i3rab from verbal nouns for verb(s) %s" % (
    ', '.join(verbids))
def investigate_possible_adj(index, adj_pagename, adv, adv_defns):
  """Look up the Latin adjective ADJ_PAGENAME presumed to underlie adverb ADV.

  If the page exists and has a Latin section, print one line per la-adj /
  la-part headword, pairing the adverb, the adjective and both sets of
  definitions (separated by ///) for later offline review. ADV_DEFNS are the
  previously extracted definitions of the adverb.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, adj_pagename, txt))
  pagemsg("Trying for adverb %s" % adv)
  page = pywikibot.Page(site, adj_pagename)
  if not page.exists():
    pagemsg("Doesn't exist for adverb %s" % adv)
    return
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  # Split the Latin section body on ===...=== headers; odd indexes are the
  # headers themselves, even indexes (from 2 on) are the subsection bodies.
  subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    for t in parsed.filter_templates():
      origt = unicode(t)
      tn = tname(t)
      if tn in ["la-adj", "la-part"]:
        adj = lalib.la_get_headword_from_template(
          t, adj_pagename, pagemsg)[0]
        adj_defns = lalib.find_defns(subsections[k])
        msg("%s /// %s /// %s /// %s" % (adv, adj, ";".join(adv_defns), ";".join(adj_defns)))
def canon_param(pagetitle, index, template, lang, param, paramtr, translit_module):
  """Canonicalize foreign text and its Latin transliteration in one param pair.

  PARAM is either a [from, to] pair of parameter names or a single name used
  for both; the special name "page title" takes the foreign text from the
  page title instead of the template. PARAMTR names the transliteration
  parameter. Returns False when there is no foreign text to work on,
  otherwise the list of change actions from do_canon_param().
  """
  if isinstance(param, list):
    fromparam, toparam = param
  else:
    fromparam, toparam = (param, param)
  foreign = (pagetitle if fromparam == "page title" else getparam(
    template, fromparam))
  latin = getparam(template, paramtr)
  if not foreign:
    return False
  canonforeign, canonlatin, actions = do_canon_param(pagetitle, index,
    template, lang, fromparam, toparam, paramtr, foreign, latin,
    translit_module)
  oldtempl = "%s" % unicode(template)
  if canonforeign:
    # NOTE(review): the sibling canon_param() variants call addparam() here;
    # confirm add_param_handling_head() is the intended helper.
    add_param_handling_head(template, toparam, canonforeign)
  # canonlatin == True means "translit is now redundant, drop it"; any other
  # truthy value is the corrected transliteration.
  if canonlatin == True:
    template.remove(paramtr)
  elif canonlatin:
    addparam(template, paramtr, canonlatin)
  if canonforeign or canonlatin:
    msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl, unicode(template)))
  return actions
def process_page(index, page, verbose):
  """Log "PAGETITLE defn1;defn2;..." for the Russian section of PAGE.

  Pages with alternative-ё forms, no Russian section, or no extractable
  definitions are skipped (with a log message where appropriate).
  """
  title = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, title, txt))

  def expand_text(tempcall):
    return blib.expand_text(tempcall, title, pagemsg, verbose)

  pagemsg("Processing")
  pagetext = unicode(page.text)
  if rulib.check_for_alt_yo_terms(pagetext, pagemsg):
    return
  russian_sec = blib.find_lang_section_from_text(pagetext, "Russian", pagemsg)
  if not russian_sec:
    pagemsg("Couldn't find Russian section for %s" % title)
    return
  defns = rulib.find_defns(russian_sec)
  if defns:
    msg("%s %s" % (title, ';'.join(defns)))
  else:
    pagemsg("Couldn't find definitions for %s" % title)
def canon_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=False):
  """Canonicalize Arabic text and its Latin transliteration in one param pair.

  PARAM may be a [from, to] pair of parameter names or one name used for
  both; the special name "page title" means the Arabic comes from the page
  title rather than the template. Returns False when there is no Arabic text
  to work on, otherwise the action list produced by do_canon_param().
  """
  fromparam, toparam = param if isinstance(param, list) else (param, param)
  if fromparam == "page title":
    arabic = pagetitle
  else:
    arabic = getparam(template, fromparam)
  latin = getparam(template, paramtr)
  if not arabic:
    return False
  canonarabic, canonlatin, actions = do_canon_param(
    pagetitle, index, template, fromparam, toparam, paramtr, arabic, latin,
    include_tempname_in_changelog)
  oldtempl = "%s" % unicode(template)
  if canonarabic:
    addparam(template, toparam, canonarabic)
  # canonlatin == True means the translit became redundant and is removed;
  # any other truthy value is the corrected transliteration.
  if canonlatin == True:
    template.remove(paramtr)
  elif canonlatin:
    addparam(template, paramtr, canonlatin)
  if canonarabic or canonlatin:
    msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl,
      unicode(template)))
  return actions
def canon_param(pagetitle, index, template, param, paramtr, translit_module, include_tempname_in_changelog=False):
  """Canonicalize foreign text and its Latin transliteration in one param
  pair, using TRANSLIT_MODULE for the language-specific transliteration.

  PARAM is either a [from, to] pair of parameter names or one name used for
  both; the special name "page title" takes the foreign text from the page
  title. Returns False when there is no foreign text, otherwise the action
  list from do_canon_param().
  """
  if isinstance(param, list):
    fromparam, toparam = param
  else:
    fromparam, toparam = (param, param)
  foreign = (pagetitle if fromparam == "page title" else getparam(template, fromparam))
  latin = getparam(template, paramtr)
  if not foreign:
    return False
  canonforeign, canonlatin, actions = do_canon_param(pagetitle, index,
    template, fromparam, toparam, paramtr, foreign, latin, translit_module,
    include_tempname_in_changelog)
  oldtempl = "%s" % unicode(template)
  if canonforeign:
    addparam(template, toparam, canonforeign)
  # canonlatin == True means "translit now redundant, remove it"; any other
  # truthy value is the corrected transliteration.
  if canonlatin == True:
    template.remove(paramtr)
  elif canonlatin:
    addparam(template, paramtr, canonlatin)
  if canonforeign or canonlatin:
    msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl, unicode(template)))
  return actions
def vocalize_one_page_headwords(pagetitle, index, text):
  """Add vocalization to all non-verbal Arabic headword templates in TEXT.

  For each headword template, vocalizes the head itself and then the
  standard chain of inflection parameters. Returns (text, changelog).
  """
  inflection_params = [
    "pl", "plobl", "cpl", "cplobl", "fpl", "fplobl", "f", "fobl", "m",
    "mobl", "obl", "el", "sing", "coll", "d", "dobl", "pauc", "cons"]
  actions_taken = []
  for template in text.filter_templates():
    if template.name not in arabiclib.arabic_non_verbal_headword_templates:
      continue
    paramschanged = list(vocalize_head(pagetitle, index, template))
    for param in inflection_params:
      paramschanged.extend(vocalize_param_chain(pagetitle, index, template, param))
    if paramschanged:
      # Include the translit in the log entry when present, to disambiguate.
      if template.has("tr"):
        tempname = "%s %s" % (template.name, getparam(template, "tr"))
      else:
        tempname = template.name
      actions_taken.append("%s (%s)" % (', '.join(paramschanged), tempname))
  changelog = "vocalize parameters: %s" % '; '.join(actions_taken)
  #if len(actions_taken) > 0:
  msg("Page %s %s: Change log = %s" % (index, pagetitle, changelog))
  return text, changelog
def process_page_for_generate(page, index, verbs):
  """Emit a "PREFIX VERB<spec> rest..." generation line for a multiword
  verb page.

  VERBS maps verb infinitives to their conjugation spec ("*" stands for the
  default spec "<>"). Titles without a space, or whose verb is unknown, are
  skipped with a warning. A leading "no " is kept as a prefix. Each word of
  the remainder is passed through singularize().
  """
  title = unicode(page.title())

  def pagemsg(txt):
    msg("# Page %s %s: %s" % (index, title, txt))

  if " " not in title:
    pagemsg("WARNING: No space in page title")
    return
  if title.startswith("no "):
    # Keep the negation as a prefix and parse the verb from what follows.
    prefix, verb_rest = title.split(" ", 1)
    verb, _, rest = verb_rest.partition(" ")
    prefix += " "
  else:
    verb, rest = title.split(" ", 1)
    prefix = ""
  if verb not in verbs:
    pagemsg("WARNING: Unrecognized verb '%s'" % verb)
    return
  linked_rest = " ".join(singularize(word) for word in rest.split(" "))
  spec = verbs[verb]
  if spec == "*":
    spec = "<>"
  msg("%s%s%s %s" % (prefix, verb, spec, linked_rest))
def create_cat(cat, catargs, extratext=None):
  """Create a Belarusian POS category page for CAT (with ~ replaced by e.g.
  "nouns"), containing the matching {{be-POS cat}} boilerplate template.

  CATARGS are extra template arguments; EXTRATEXT, if given, is appended
  verbatim. Empty categories are skipped, as are existing pages unless
  --overwrite was given. Saves only with --save.
  """
  global args
  if args.pos == "verb":
    pos = "verb"
    shortpos = "verb"
  elif args.pos == "adj":
    pos = "adjective"
    shortpos = "adj"
  elif args.pos == "noun":
    pos = "noun"
    shortpos = "noun"
  else:
    assert False, "Invalid pos %s" % args.pos
  cat = "Belarusian " + cat.replace("~", "%ss" % pos)
  text = "{{be-%s cat%s}}" % (shortpos, "".join("|" + arg for arg in catargs))
  if extratext:
    text += "\n%s" % extratext
  # Don't create empty categories.
  num_pages = len(list(blib.cat_articles(cat)))
  if num_pages == 0:
    return
  cat = "Category:" + cat
  page = pywikibot.Page(site, cat)
  if not args.overwrite and page.exists():
    msg("Page %s already exists, not overwriting" % cat)
    return
  page.text = unicode(text)
  changelog = "Creating '%s' with text '%s'" % (cat, text)
  msg("Changelog = %s" % changelog)
  if args.save:
    blib.safe_page_save(page, changelog, errandmsg)
def replace_raw_pos(m):
  """re.sub callback: rewrite a raw part-of-speech link as {{l|...|pos=...}}.

  Relies on enclosing-scope langnamecode/langname and the pos_to_pos map;
  leaves the match untouched when the language name couldn't be parsed.
  """
  if langnamecode:
    return "\n* {{l|%s|%s|pos=%s}}" % (langnamecode, m.group(1),
      pos_to_pos[m.group(2)])
  msg("WARNING: Unable to parse langname %s when trying to replace raw link %s" % (langname, m.group(0)))
  return m.group(0)
def process_page(index, page, contents, origcontents, verbose, comment, lang_only, allow_page_creation):
  """Push pre-computed CONTENTS onto PAGE, verifying the page hasn't changed.

  ORIGCONTENTS is what the page (or, with LANG_ONLY, its ==LANG_ONLY==
  section) looked like when CONTENTS was computed; if the live text differs
  from that snapshot the save is refused. Returns (newtext, comment) on
  success, or (None, None) when nothing should be saved.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  if contents == origcontents:
    pagemsg("Skipping contents for %s because no change" % pagetitle)
    return None, None
  if verbose:
    pagemsg("For [[%s]]:" % pagetitle)
    pagemsg("------- begin text --------")
    msg(contents.rstrip('\n'))
    msg("------- end text --------")
  # Treat a page whose snapshot is missing as nonexistent for our purposes.
  page_exists = page.exists() and origcontents is not None
  if not page_exists:
    if lang_only or not allow_page_creation:
      errandpagemsg("WARNING: Trying to create page when --lang-only or not --allow-page-creation")
      return None, None
  else:
    if lang_only:
      foundlang = False
      sec_to_search = 0
      # Odd indexes are the ==Language== headers, even indexes the bodies.
      sections = re.split("(^==[^=]*==\n)", page.text, 0, re.M)
      for j in xrange(2, len(sections), 2):
        if sections[j-1] == "==%s==\n" % lang_only:
          if foundlang:
            errandpagemsg("WARNING: Found multiple %s sections, skipping page" % lang_only)
            return None, None
          foundlang = True
          sec_to_search = j
      if not sec_to_search:
        errandpagemsg("WARNING: Couldn't find %s section, skipping page" % lang_only)
        return None, None
      # Separate the section body from its trailing newlines so the latter
      # can be reattached to the replacement text unchanged.
      m = re.match(r"\A(.*?)(\n*)\Z", sections[sec_to_search], re.S)
      curtext, curnewlines = m.groups()
      # Compare NFC-normalized so Unicode composition differences don't
      # produce false "page changed" refusals.
      curtext = unicodedata.normalize('NFC', curtext)
      supposed_curtext = unicodedata.normalize('NFC', origcontents.rstrip('\n'))
      if curtext != supposed_curtext:
        if curtext == contents.rstrip('\n'):
          pagemsg("Section has already been changed to new text, not saving")
        else:
          errandpagemsg("WARNING: Text has changed from supposed original text, not saving")
        return None, None
      sections[sec_to_search] = contents.rstrip('\n') + curnewlines
      contents = "".join(sections)
    else:
      curtext = unicodedata.normalize('NFC', page.text.rstrip('\n'))
      supposed_curtext = unicodedata.normalize('NFC', origcontents.rstrip('\n'))
      if curtext != supposed_curtext:
        if curtext == contents.rstrip('\n'):
          pagemsg("Page has already been changed to new text, not saving")
        else:
          errandpagemsg("WARNING: Text has changed from supposed original text, not saving")
        return None, None
  return contents, comment
def search_noconj(startFrom, upTo):
  """List Arabic verb pages lacking {{ar-verb}} and/or {{ar-conj}} templates."""
  for index, page in blib.cat_articles(u"Arabic verbs", startFrom, upTo):
    text = unicode(blib.parse(page))
    pagetitle = page.title()
    for temp in ("ar-verb", "ar-conj"):
      if "{{%s" % temp not in text:
        msg("* %s not in {{l|ar|%s}}" % (temp, pagetitle))
def get_items(lines):
  """Yield (page-index, page-name, respelling) tuples parsed from LINES.

  Lines not matching the expected "Page N TITLE: <respelling> ... <end>"
  shape are logged and skipped.
  """
  item_re = re.compile("^Page ([0-9]*) (.*): <respelling> *(.*?) *<end>")
  for line in lines:
    m = item_re.search(line)
    if m:
      yield m.groups()
    else:
      # Not a warning, there will be several of these from output of snarf_it_pron.py
      msg("Unrecognized line: %s" % line)
def undo_greek_removal(save, verbose, direcfile, startFrom, upTo):
  """Undo a previous removal of Greek template params, as logged in DIRECFILE.

  Each usable line of DIRECFILE has the form
  "* [[PAGE]]: Removed PARAM=...: <nowiki>TEMPLATE</nowiki>". For each such
  page we reconstruct the stripped-down template (TEMPLATE minus PARAM and
  minus any sc=polytonic), locate it in the page text and replace it with
  the version that still has PARAM.
  """
  template_removals = []
  for line in codecs.open(direcfile, "r", encoding="utf-8"):
    line = line.strip()
    m = re.match(r"\* \[\[(.*?)]]: Removed (.*?)=.*?: <nowiki>(.*?)</nowiki>$", line)
    if not m:
      msg("WARNING: Unable to parse line: [%s]" % line)
    else:
      template_removals.append(m.groups())
  for current, index in blib.iter_pages(template_removals, startFrom, upTo,
      # key is the page name
      key = lambda x: x[0]):
    pagename, removed_param, template_text = current
    def undo_one_page_greek_removal(page, index, text):
      def pagemsg(txt):
        msg("Page %s %s: %s" % (index, unicode(page.title()), txt))
      template = blib.parse_text(template_text).filter_templates()[0]
      orig_template = unicode(template)
      # sc=polytonic was dropped at removal time, so drop it here too.
      if getparam(template, "sc") == "polytonic":
        template.remove("sc")
      to_template = unicode(template)
      param_value = getparam(template, removed_param)
      template.remove(removed_param)
      from_template = unicode(template)
      text = unicode(text)
      found_orig_template = orig_template in text
      newtext = text.replace(from_template, to_template)
      changelog = ""
      if newtext == text:
        if not found_orig_template:
          pagemsg("WARNING: Unable to locate 'from' template when undoing Greek param removal: %s" % from_template)
        else:
          pagemsg("Original template found, taking no action")
      else:
        if found_orig_template:
          pagemsg("WARNING: Undid removal, but original template %s already present!"
            % orig_template)
        # A larger delta than one substitution implies multiple matches.
        if len(newtext) - len(text) != len(to_template) - len(from_template):
          pagemsg("WARNING: Length mismatch when undoing Greek param removal, may have matched multiple templates: from=%s, to=%s" % (
            from_template, to_template))
        changelog = "Undid removal of %s=%s in %s" % (removed_param, param_value, to_template)
        pagemsg("Change log = %s" % changelog)
      return newtext, changelog
    page = pywikibot.Page(site, pagename)
    if not page.exists():
      msg("Page %s %s: WARNING, something wrong, does not exist" % (
        index, pagename))
    else:
      blib.do_edit(page, index, undo_one_page_greek_removal, save=save,
        verbose=verbose)
def rewrite_one_page_ar_nisba(page, index, text):
  """Move head= to positional 1= in {{ar-nisba}} templates, preserving
  parameter order. Pages with plhead= are only reported, not changed.
  Returns (text, changelog).
  """
  for template in text.filter_templates():
    if template.name == "ar-nisba":
      if template.has("head") and not template.has(1):
        head = unicode(template.get("head").value)
        template.remove("head")
        # Insert 1= before the current first param to keep it in front.
        addparam(template, "1", head, before=template.params[0].name if len(template.params) > 0 else None)
      if template.has("plhead"):
        blib.msg("%s has plhead=" % page.title())
  return text, "ar-nisba: head= -> 1="
def parse_log_file(fn, startFrom, upTo):
  """Replay a saved log file, rewriting each "Page N ..." line as
  "Page OUTERINDEX/N ..." so nested runs remain distinguishable; all other
  lines are passed through unchanged."""
  page_line_re = re.compile(r"^Page ([0-9/.-]+) (.*)$")
  for current, index in blib.iter_pages(yield_page_lines(fn), startFrom, upTo,
      key=lambda x: x[1]):
    pageindex, pagename, lines = current
    for line in lines:
      m = page_line_re.match(line)
      if not m:
        msg(line)
      else:
        msg("Page %s/%s %s" % (pageindex, m.group(1), m.group(2)))
def test_infer():
  """Exercise infer_one_page_decls() against the canned test_templates
  fixtures using a stub page object, logging the result of each run."""
  class FakePage(object):
    def title(self):
      return "test_infer"
  for pagetext in test_templates:
    parsed = blib.parse_text(pagetext)
    newtext, comment = infer_one_page_decls(FakePage(), 1, parsed)
    msg("newtext = %s" % unicode(newtext))
    msg("comment = %s" % comment)
def process_page(index, page, save, verbose, nouns):
  """For an adjective in -ый/-ий whose derived -ость noun exists, emit an
  etymology seed line "NOUN HEAD+-ость no-etym" (with tr1= when the
  adjective has a manual transliteration).

  NOUNS is the set of known -ость nouns. Adjectives with alt-ё variants or
  multiple heads are skipped, as are nouns that already have an etymology.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  if not re.search(u"[иы]й$", pagetitle):
    pagemsg(u"Skipping adjective not in -ый or -ий")
    return
  # Derive the expected abstract noun by swapping the ending for -ость.
  noun = re.sub(u"[иы]й$", u"ость", pagetitle)
  if noun not in nouns:
    return
  text = unicode(page.text)
  parsed = blib.parse(page)
  # First pass: bail out entirely on alt-ё adjectives.
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == u"ru-adj-alt-ё":
      pagemsg(u"Skipping alt-ё adjective")
      return
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-adj":
      heads = blib.fetch_param_chain(t, "1", "head", pagetitle)
      if len(heads) > 1:
        pagemsg("Skipping adjective with multiple heads: %s" % ",".join(heads))
        return
      tr = getparam(t, "tr")
      nounsection = blib.find_lang_section(noun, "Russian", pagemsg, errandpagemsg)
      if not nounsection:
        pagemsg("Couldn't find Russian section for %s" % noun)
        continue
      if "==Etymology" in nounsection:
        pagemsg("Noun %s already has etymology" % noun)
        continue
      if tr:
        msg(u"%s %s+tr1=%s+-ость no-etym" % (noun, heads[0], tr))
      else:
        msg(u"%s %s+-ость no-etym" % (noun, heads[0]))
def process_page(page, index, refrom, reto, pagetitle_sub, comment, lang_only, warn_on_no_replacement, verbose):
  """Apply the parallel regex replacement lists REFROM -> RETO to one page.

  PAGETITLE_SUB, if given, is a placeholder in the regexes replaced by the
  page title (regex-escaped on the from side). With LANG_ONLY, only the
  ==LANG_ONLY== section is rewritten. Returns (newtext, comment), or nothing
  when the page is skipped. WARN_ON_NO_REPLACEMENT logs when no text changed.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    blib.msg("Page %s %s: %s" % (index, pagetitle, txt))
  if verbose:
    blib.msg("Processing %s" % pagetitle)
  #blib.msg("From: [[%s]], To: [[%s]]" % (refrom, reto))
  text = unicode(page.text)
  origtext = text
  # Normalize shadda+vowel ordering so Arabic regexes match consistently.
  text = reorder_shadda(text)
  zipped_fromto = zip(refrom, reto)
  def replace_text(text):
    for fromval, toval in zipped_fromto:
      if pagetitle_sub:
        fromval = fromval.replace(pagetitle_sub, re.escape(pagetitle))
        toval = toval.replace(pagetitle_sub, pagetitle)
      text = re.sub(fromval, toval, text, 0, re.M)
    return text
  if not lang_only:
    text = replace_text(text)
  else:
    sec_to_replace = None
    foundlang = False
    # Odd indexes are the ==Language== headers, even indexes the bodies.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
    for j in xrange(2, len(sections), 2):
      if sections[j - 1] == "==%s==\n" % lang_only:
        # NOTE(review): the `break` below exits on the first match, so this
        # multiple-sections warning can never trigger; confirm intent.
        if foundlang:
          pagemsg(
            "WARNING: Found multiple %s sections, skipping page" % lang_only)
          if warn_on_no_replacement:
            pagemsg("WARNING: No replacements made")
          return
        foundlang = True
        sec_to_replace = j
        break
    if sec_to_replace is None:
      if warn_on_no_replacement:
        pagemsg("WARNING: No replacements made")
      return
    sections[sec_to_replace] = replace_text(sections[sec_to_replace])
    text = "".join(sections)
  if warn_on_no_replacement and text == origtext:
    pagemsg("WARNING: No replacements made")
  return text, comment or "replace %s" % (", ".join(
    "%s -> %s" % (f, t) for f, t in zipped_fromto))
def process_page(index, page, save, verbose):
  """Disabled checker for ru-conj-7a/7b past-tense overrides.

  NOTE: the function returns immediately after the first warning below, so
  everything past that `return` is dead code retained from the original
  script. If it were re-enabled, note that `notes` is never appended to, so
  the `assert notes` near the end would fire on any actual change.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("WARNING: Script no longer applies and would need fixing up")
  return
  # ---- dead code below; see docstring ----
  pagemsg("Processing")
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) in ["ru-conj-7a", "ru-conj-7b"]:
      past_stem = getparam(t, "4")
      vowel_end = re.search(u"[аэыоуяеиёю́]$", past_stem)
      past_m = getparam(t, "past_m")
      past_f = getparam(t, "past_f")
      past_n = getparam(t, "past_n")
      past_pl = getparam(t, "past_pl")
      if past_m or past_f or past_n or past_pl:
        # Compute the accent-pattern-b past forms that the overrides would
        # be redundant with.
        upast_stem = ru.make_unstressed(past_stem)
        expected_past_m = past_stem + (u"л" if vowel_end else "")
        expected_past_f = upast_stem + u"ла́"
        expected_past_n = upast_stem + u"ло́"
        expected_past_pl = upast_stem + u"ли́"
        if ((not past_m or expected_past_m == past_m) and
            expected_past_f == past_f and
            expected_past_n == past_n and
            expected_past_pl == past_pl):
          msg("Would remove past overrides and add arg5=b")
        else:
          msg("WARNING: Remaining past overrides: past_m=%s, past_f=%s, past_n=%s, past_pl=%s, expected_past_m=%s, expected_past_f=%s, expected_past_n=%s, expected_past_pl=%s" % (
            past_m, past_f, past_n, past_pl, expected_past_m, expected_past_f,
            expected_past_n, expected_past_pl))
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))
  new_text = unicode(parsed)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def undo_ru_auto_accent(save, verbose, direcfile, startFrom, upTo):
  """Undo Russian auto-accenting edits recorded in DIRECFILE.

  Each usable log line has the form
  "Page N TITLE: Replaced {{ORIG}} with {{REPL}}". Only ux/usex/ru-ux/lang
  templates are undone, and only when the replacement template still sits on
  a quotation/usex line (#*:-style).
  """
  template_removals = []
  for line in codecs.open(direcfile, "r", encoding="utf-8"):
    line = line.strip()
    m = re.search(r"^Page [0-9]+ (.*?): Replaced (\{\{.*?\}\}) with (\{\{.*?\}\})$", line)
    if not m:
      msg("WARNING: Unable to parse line: [%s]" % line)
    else:
      template_removals.append(m.groups())
  for current, index in blib.iter_pages(template_removals, startFrom, upTo,
      # key is the page name
      key = lambda x: x[0]):
    pagename, orig_template, repl_template = current
    # Only usage-example/quotation templates are eligible for undoing.
    if not re.search(r"^\{\{(ux|usex|ru-ux|lang)\|", orig_template):
      continue
    def undo_one_page_ru_auto_accent(page, index, text):
      def pagemsg(txt):
        msg("Page %s %s: %s" % (index, unicode(page.title()), txt))
      text = unicode(text)
      # Only undo when the replacement still appears on a quotation line.
      if not re.search("^#\*:* *%s" % re.escape(repl_template), text, re.M):
        return None, ""
      found_orig_template = orig_template in text
      newtext = text.replace(repl_template, orig_template)
      changelog = ""
      if newtext == text:
        if not found_orig_template:
          pagemsg("WARNING: Unable to locate 'repl' template when undoing Russian auto-accenting: %s" % repl_template)
        else:
          pagemsg("Original template found, taking no action")
      else:
        pagemsg("Replaced %s with %s" % (repl_template, orig_template))
        if found_orig_template:
          pagemsg("WARNING: Undid replacement, but original template %s already present!" % orig_template)
        # A larger delta than one substitution implies multiple matches.
        if len(newtext) - len(text) != len(orig_template) - len(repl_template):
          pagemsg("WARNING: Length mismatch when undoing Russian auto-accenting, may have matched multiple templates: orig=%s, repl=%s" % (
            orig_template, repl_template))
        changelog = "Undid auto-accenting (per Wikitiki89) of %s" % (orig_template)
        pagemsg("Change log = %s" % changelog)
      return newtext, changelog
    page = pywikibot.Page(site, pagename)
    if not page.exists():
      msg("Page %s %s: WARNING, something wrong, does not exist" % (
        index, pagename))
    else:
      blib.do_edit(page, index, undo_one_page_ru_auto_accent, save=save,
        verbose=verbose)
def correct_one_page_link_formatting(page, index, text):
  """Fold stray transliteration "(...)" and gender {{g|...}} markup that
  follows an {{l|ar|...}} link into the link itself as |tr= and |g= params.

  Links already carrying |tr= are skipped. Returns (text, changelog).
  """
  text = unicode(text)
  pagetitle = page.title()
  linkschanged = []
  # Group 1 = link target, group 2 = parenthesized translit (possibly inside
  # {{IPAchar|...}}), group 3 = trailing gender spec.
  for m in re.finditer(r"\{\{l\|ar\|([^}]*?)\}\} *(?:'*(?:(?:\{\{IPAchar\|)?\(([^{})]*?)\)(?:\}\})?)'*)? *(?:\{\{g\|(.*?)\}\})?", text):
    if not m.group(2) and not m.group(3):
      continue
    msg("On page %s, found match: %s" % (pagetitle, m.group(0)))
    if "|tr=" in m.group(1):
      msg("Skipping because translit already present")
      continue
    if m.group(3):
      # "m|f" needs to be split into separate g=/g2= params.
      if m.group(3) == "m|f":
        gender = "|g=m|g2=f"
      else:
        gender = "|g=%s" % m.group(3)
    else:
      gender = ""
    if m.group(2):
      tr = "|tr=%s" % m.group(2)
    else:
      tr = ""
    repl = "{{l|ar|%s%s%s}}" % (m.group(1), tr, gender)
    msg("Replacing\n%s\nwith\n%s" % (m.group(0), repl))
    # Replace only the first occurrence; the finditer positions were
    # computed against the original text.
    newtext = text.replace(m.group(0), repl, 1)
    if newtext == text:
      msg("WARNING: Unable to do replacement")
    else:
      text = newtext
      linkschanged.append(m.group(1))
  return text, "incorporated translit/gender into links: %s" % ', '.join(linkschanged)
def vocalize_param(pagetitle, index, template, param, paramtr):
  """Vocalize TEMPLATE's PARAM using the Latin transliteration in PARAMTR.

  Returns False when PARAM is empty, the vocalized text when a change was
  made, and True otherwise (no translit available, or nothing to change).
  """
  arabic = getparam(template, param)
  if not arabic:
    return False
  latin = getparam(template, paramtr)
  if latin:
    vocalized = do_vocalize_param(pagetitle, index, template, param, arabic, latin)
    if vocalized:
      oldtempl = "%s" % unicode(template)
      addparam(template, param, vocalized)
      msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl,
        unicode(template)))
      return vocalized
  return True
def read_existing_pages(filename):
  """Read a gzipped "Page N TITLE: Langs=..." log and return a dict mapping
  each page title to the set of language names found on it.

  Warning lines are skipped with a message; any other unparsable line gets
  its own warning.
  """
  pages_with_langs = {}
  reader = codecs.getreader("utf-8")(gzip.open(filename, "rb"), errors="replace")
  for line in reader:
    line = line.rstrip("\n")
    if re.search("^Page [0-9]+ .*: WARNING: .*", line):
      msg("Skipping warning: %s" % line)
      continue
    m = re.search("^Page [0-9-]+ (.*): Langs=(.*?)$", line)
    if m:
      pages_with_langs[m.group(1)] = set(m.group(2).split(","))
    else:
      msg("WARNING: Unrecognized line: %s" % line)
  return pages_with_langs
def render_groups(groups):
  """Render GROUPS (lists of (perfective, imperfective) wiki-line pairs) as
  a two-column Derived terms section, imperfectives on the left and
  perfectives on the right, each group sorted consistently."""
  def is_noequiv(x):
    # Placeholder line used when one aspect has no equivalent verb.
    return x == "* (no equivalent)"
  def compare_aspect_pair(xpf, ximpf, ypf, yimpf):
    # Compare on whichever aspect both entries actually have, preferring
    # pf-vs-pf, then impf-vs-impf, then the two cross-aspect combinations.
    if not is_noequiv(xpf) and not is_noequiv(ypf):
      return cmp(xpf, ypf)
    elif not is_noequiv(ximpf) and not is_noequiv(yimpf):
      return cmp(ximpf, yimpf)
    elif not is_noequiv(xpf) and not is_noequiv(yimpf):
      return cmp(xpf, yimpf)
    elif not is_noequiv(ximpf) and not is_noequiv(ypf):
      return cmp(ximpf, ypf)
    else:
      return 0
  def sort_aspect_pair(x, y):
    xpf, ximpf = x
    ypf, yimpf = y
    # First compare ignoring accents, so that влить goes before вли́ться,
    # then compare with accents so e.g. рассы́пать and рассыпа́ть are ordered
    # consistently.
    retval = compare_aspect_pair(rulib.remove_accents(xpf),
      rulib.remove_accents(ximpf), rulib.remove_accents(ypf),
      rulib.remove_accents(yimpf))
    if retval == 0:
      return compare_aspect_pair(xpf, ximpf, ypf, yimpf)
    else:
      return retval
  pfs = []
  impfs = []
  for gr in groups:
    gr = sorted(gr, cmp=sort_aspect_pair)
    for pf, impf in gr:
      pfs.append(pf)
      impfs.append(impf)
  # NOTE(review): this emits {{bottom}} while the otherwise-identical
  # sibling render_groups() emits {{bottom2}} to pair with {{top2}}/{{mid2}}
  # -- confirm which closing template is intended.
  msg("""
====Derived terms====
{{top2}}
''imperfective''
%s
{{mid2}}
''perfective''
%s
{{bottom}}
""" % ("\n".join(impfs), "\n".join(pfs)))
def do_process_param(pagetitle, index, pagetext, template, templang, param, paramtr):
  """Canonicalize one param pair via process_param(), additionally removing
  a redundant sc=Arab from the template; returns the combined action list
  (or whatever process_param() returned when nothing was removed)."""
  result = process_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=True)
  if getparam(template, "sc") == "Arab":
    msg("Page %s %s: %s.%s: Removing sc=Arab" % (index, pagetitle,
      template.name, "sc"))
    oldtempl = "%s" % unicode(template)
    template.remove("sc")
    msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl,
      unicode(template)))
    removal_actions = ["remove %s.sc=Arab" % template.name]
    if isinstance(result, list):
      result = result + removal_actions
    else:
      result = removal_actions
  return result
def do_process_param(pagetitle, index, template, param, paramtr):
  """Canonicalize one param pair via process_param(), additionally removing
  a redundant sc=Arab from the template.

  Returns process_param()'s action list extended with the sc removal, or
  just the removal action when process_param() didn't return a list.
  """
  result = process_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=True)
  if getparam(template, "sc") == "Arab":
    msg("Page %s %s: %s.%s: Removing sc=Arab" % (index, pagetitle,
      template.name, "sc"))
    oldtempl = "%s" % unicode(template)
    template.remove("sc")
    msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl,
      unicode(template)))
    newresult = ["remove %s.sc=Arab" % template.name]
    if isinstance(result, list):
      result = result + newresult
    else:
      result = newresult
  return result
def search_iyya_noetym(startFrom, upTo):
  """Report Arabic nouns in -iyya (ending in ية) that lack all of the
  expected etymology templates; notes when a generic {{suffix}} is present.

  NOTE(review): this unpacks blib.cat_articles() as (page, index) while the
  sibling version unpacks (index, page) -- confirm the actual yield order.
  """
  for page, index in blib.cat_articles(u"Arabic nouns", startFrom, upTo):
    text = blib.parse(page)
    pagetitle = page.title()
    etym = False
    suffix = False
    if pagetitle.endswith(u"ية"):
      for t in text.filter_templates():
        if t.name in ["ar-etym-iyya", "ar-etym-nisba-a", "ar-etym-noun-nisba", "ar-etym-noun-nisba-linking"]:
          etym = True
        if t.name == "suffix":
          suffix = True
      if not etym:
        msg("Page %s %s: Ends with -iyya, no appropriate etym template%s" % (
          index, pagetitle, " (has suffix template)" if suffix else ""))
def process_one_page_headwords(pagetitle, index, text):
  """Canonicalize all non-verbal Arabic headword templates on one page.

  For each matching template, processes the head and the standard chain of
  inflection parameters, collecting per-template action descriptions into
  the changelog. Returns (text, changelog-string).

  Fix: dropped an unused local (tr = getparam(template, "tr")) that was
  fetched but never read.
  """
  actions = []
  for template in text.filter_templates():
    if template.name in arabiclib.arabic_non_verbal_headword_templates:
      thisactions = []
      thisactions += process_head(pagetitle, index, template)
      for param in ["pl", "plobl", "cpl", "cplobl", "fpl", "fplobl", "f",
          "fobl", "m", "mobl", "obl", "el", "sing", "coll", "d", "dobl",
          "pauc", "cons"]:
        thisactions += process_param_chain(pagetitle, index, template, param)
      if len(thisactions) > 0:
        actions.append("%s: %s" % (template.name, ', '.join(thisactions)))
  changelog = '; '.join(actions)
  #if len(actions) > 0:
  msg("Change log for page %s = %s" % (pagetitle, changelog))
  return text, changelog
def render_groups(groups):
  """Render GROUPS (lists of (perfective, imperfective) wiki-line pairs) as
  a two-column Derived terms section, imperfectives on the left and
  perfectives on the right, each group sorted consistently."""
  def is_noequiv(x):
    # Placeholder line used when one aspect has no equivalent verb.
    return x == "* (no equivalent)"
  def compare_aspect_pair(xpf, ximpf, ypf, yimpf):
    # Compare on whichever aspect both entries actually have, preferring
    # pf-vs-pf, then impf-vs-impf, then the two cross-aspect combinations.
    if not is_noequiv(xpf) and not is_noequiv(ypf):
      return cmp(xpf, ypf)
    elif not is_noequiv(ximpf) and not is_noequiv(yimpf):
      return cmp(ximpf, yimpf)
    elif not is_noequiv(xpf) and not is_noequiv(yimpf):
      return cmp(xpf, yimpf)
    elif not is_noequiv(ximpf) and not is_noequiv(ypf):
      return cmp(ximpf, ypf)
    else:
      return 0
  def sort_aspect_pair(x, y):
    xpf, ximpf = x
    ypf, yimpf = y
    # First compare ignoring accents, so that влить goes before вли́ться,
    # then compare with accents so e.g. рассы́пать and рассыпа́ть are ordered
    # consistently.
    retval = compare_aspect_pair(ru.remove_accents(xpf),
      ru.remove_accents(ximpf), ru.remove_accents(ypf),
      ru.remove_accents(yimpf))
    if retval == 0:
      return compare_aspect_pair(xpf, ximpf, ypf, yimpf)
    else:
      return retval
  pfs = []
  impfs = []
  for gr in groups:
    gr = sorted(gr, cmp=sort_aspect_pair)
    for pf, impf in gr:
      pfs.append(pf)
      impfs.append(impf)
  msg("""
====Derived terms====
{{top2}}
''imperfective''
%s
{{mid2}}
''perfective''
%s
{{bottom2}}
""" % ("\n".join(impfs), "\n".join(pfs)))
def process_page(page, index):
  """Handle one Template-namespace page: enumerate its redirects/aliases.

  In --table-of-uses mode, emits a single "NAME,ALIAS1,ALIAS2,..." line
  (with the Template: prefix stripped); otherwise each referencing subpage
  is handed to process_subpage().
  """
  global args
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errpagemsg(txt):
    errmsg("Page %s %s: %s" % (index, pagetitle, txt))
  errpagemsg("Processing references")
  if not args.table_of_uses:
    pagemsg("Processing references")
  aliases = []
  # NOTE(review): this unpacks the iterator as (i, subpage) while other
  # callers of blib.references() unpack (page, index); confirm yield order.
  for i, subpage in blib.references(pagetitle, namespaces=[10], only_template_inclusion=False, filter_redirects=args.redirects_only):
    aliases.append(unicode(subpage.title()))
    if not args.table_of_uses:
      process_subpage(page, index, subpage, i)
  if args.table_of_uses:
    msg("%s%s" % (pagetitle.replace("Template:", ""), aliases and "," + ",".join(x.replace("Template:", "") for x in aliases) or ""))
def parse_infls(infltext, tr):
  """Parse an inflection blob ("{{l|ar|...}} (translit) {{g|f}}, ...") into
  a param string of feminine/plural/feminine-plural headword params (f=,
  pl=, fpl=, their numbered continuations, matching ...tr= params, and an
  optional leading tr=TR).

  Fixes: compare the optional translit group against None with "is not
  None" instead of "!="; use raw strings for the regexes (they contain
  non-escape sequences like "\\{").
  """
  fs = []
  ftrs = []
  pls = []
  pltrs = []
  fpls = []
  fpltrs = []
  for rawinfl in re.split(r", *", infltext):
    if not rawinfl:
      continue
    # Group 1 = Arabic link target, group 2 = optional translit (possibly
    # inside {{IPAchar|...}}), group 3 = gender spec.
    infl = re.match(r"'*\{\{(?:lang|l)\|ar\|(.*?)\}\}'* *(?:(?:\{\{IPAchar\|)?\((.*?)\)(?:\}\})?)? *\{\{g\|(.*?)\}\}",
      rawinfl)
    if not infl:
      msg("WARNING: Unable to match infl-outside-head %s" % rawinfl)
      continue
    msg("Found infl outside head: %s" % infl.group(0))
    if "|" in infl.group(1):
      msg("WARNING: Found | in head, skipping: %s" % infl.group(1))
      continue
    if infl.group(3) == "f":
      fs.append(infl.group(1))
      ftrs.append(infl.group(2))
    elif infl.group(3) == "p":
      pls.append(infl.group(1))
      pltrs.append(infl.group(2))
    elif infl.group(3) == "f-p":
      fpls.append(infl.group(1))
      fpltrs.append(infl.group(2))
    else:
      msg("WARNING: Unrecognized inflection gender '%s'" % infl.group(3))
  infls = ""
  if tr:
    infls += "|tr=%s" % tr
  def handle_infls(infls, arabic, latin, argname):
    # Append arg, arg2, arg3, ... (and argtr, arg2tr, ...) params.
    count = 1
    for ar in arabic:
      if count == 1:
        arg = argname
      else:
        arg = "%s%s" % (argname, count)
      infls += "|%s=%s" % (arg, ar)
      # The translit group is None when absent from the match.
      if latin[count - 1] is not None:
        if count == 1:
          larg = "%str" % argname
        else:
          larg = "%s%str" % (argname, count)
        infls += "|%s=%s" % (larg, latin[count - 1])
      count += 1
    return infls
  infls = handle_infls(infls, fs, ftrs, "f")
  infls = handle_infls(infls, pls, pltrs, "pl")
  infls = handle_infls(infls, fpls, fpltrs, "fpl")
  return infls
def search_iyya_noetym(startFrom, upTo):
  """Report Arabic nouns ending in -iyya (ية) that have none of the expected
  etymology templates, noting whether a generic {{suffix}} is present."""
  etym_templates = [
    "ar-etym-iyya", "ar-etym-nisba-a", "ar-etym-noun-nisba",
    "ar-etym-noun-nisba-linking"]
  for index, page in blib.cat_articles(u"Arabic nouns", startFrom, upTo):
    pagetitle = page.title()
    if not pagetitle.endswith(u"ية"):
      continue
    text = blib.parse(page)
    has_etym = False
    has_suffix = False
    for t in text.filter_templates():
      if t.name in etym_templates:
        has_etym = True
      if t.name == "suffix":
        has_suffix = True
    if not has_etym:
      msg("Page %s %s: Ends with -iyya, no appropriate etym template%s" % (
        index, pagetitle, " (has suffix template)" if has_suffix else ""))
def create_cat(cat, args, adj=False, verb=False):
  """Create a Russian category page for CAT (with ~ replaced by "nouns",
  "adjectives" or "verbs") containing the matching catboiler template, with
  ARGS as template arguments.

  Relies on module-level globals `overwrite` and `dosave`: existing pages
  are skipped unless `overwrite` is set, and the page is only saved when
  `dosave` is set.
  """
  if verb:
    cat = "Category:Russian " + cat.replace("~", "verbs")
    text = "{{ruverbcatboiler}}"
  elif adj:
    cat = "Category:Russian " + cat.replace("~", "adjectives")
    text = "{{ruadjcatboiler|%s}}" % "|".join(args)
  else:
    cat = "Category:Russian " + cat.replace("~", "nouns")
    text = "{{runouncatboiler|%s}}" % "|".join(args)
  page = pywikibot.Page(site, cat)
  if not overwrite and page.exists():
    msg("Page %s already exists, not overwriting" % cat)
    return
  page.text = unicode(text)
  changelog = "Creating '%s' with text '%s'" % (cat, text)
  msg("Changelog = %s" % changelog)
  if dosave:
    blib.safe_page_save(page, changelog, errandmsg)
def vocalize_one_page_headwords(pagetitle, index, text):
  """Add vocalization to every non-verbal Arabic headword template in TEXT:
  first the head itself, then the standard chain of inflection parameters.
  Returns (text, changelog).
  """
  actions_taken = []
  for template in text.filter_templates():
    paramschanged = []
    if template.name in arabiclib.arabic_non_verbal_headword_templates:
      paramschanged += vocalize_head(pagetitle, index, template)
      for param in ["pl", "plobl", "cpl", "cplobl", "fpl", "fplobl", "f", "fobl", "m", "mobl", "obl", "el", "sing", "coll", "d", "dobl", "pauc", "cons"]:
        paramschanged += vocalize_param_chain(pagetitle, index, template, param)
      if len(paramschanged) > 0:
        # Include the translit in the log entry when present, to
        # disambiguate multiple templates of the same name.
        if template.has("tr"):
          tempname = "%s %s" % (template.name, getparam(template, "tr"))
        else:
          tempname = template.name
        actions_taken.append("%s (%s)" % (', '.join(paramschanged), tempname))
  changelog = "vocalize parameters: %s" % '; '.join(actions_taken)
  #if len(actions_taken) > 0:
  msg("Page %s %s: Change log = %s" % (index, pagetitle, changelog))
  return text, changelog
def process_page(index, page, contents, lang, verbose, comment):
  """Insert the new language section `contents` for `lang` into `page`,
  keeping second-level (==Language==) sections in alphabetical order.

  Returns (newtext, comment) describing the edit, or None (no edit) when a
  section for `lang` already exists or a malformed header is encountered.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  if verbose:
    # BUG FIX: originally referenced the undefined name `pagename`, which
    # raised NameError whenever verbose output was enabled.
    pagemsg("For [[%s]]:" % pagetitle)
    pagemsg("------- begin text --------")
    msg(contents.rstrip('\n'))
    msg("------- end text --------")
  if not page.exists():
    # New page: the contents become the entire page.
    return contents, comment
  insert_before = 0
  curtext = page.text
  # Split into [prefix, ==Header==, body, ==Header==, body, ...]; the
  # capturing group keeps the headers at odd indices.
  sections = re.split("(^==[^=]*==\n)", curtext, 0, re.M)
  for j in xrange(2, len(sections), 2):
    m = re.search(r"^==\s*(.*?)\s*==\n", sections[j - 1])
    if not m:
      errandpagemsg("WARNING: Saw bad second-level header: %s" %
        sections[j - 1].strip())
      return
    foundlang = m.group(1)
    if foundlang == lang:
      errandpagemsg("WARNING: Already found %s section" % lang)
      return
    if foundlang > lang:
      # First section alphabetically after `lang`: insert before its header.
      insert_before = j - 1
      break
  if insert_before == 0:
    # No later section found: append at the end.
    newtext = curtext.rstrip("\n") + "\n\n----\n\n" + contents
    return newtext, comment
  # Insert as a single list element (the original assigned a bare string to
  # the slice, splicing it in char-by-char; the joined result is identical,
  # but this form is explicit).
  sections[insert_before:insert_before] = [
    contents.rstrip("\n") + "\n\n----\n\n"]
  return "".join(sections), comment
def canonicalize_one_page_verb_form(page, index, text):
  """Canonicalize the form argument (position `formarg`) of every
  {{`tempname`}} template on the page to Roman numerals; return
  (text, changelog)."""
  pagetitle = page.title()
  msg("Processing page %s" % pagetitle)
  actions_taken = []
  for template in text.filter_templates():
    if template.name != tempname:
      continue
    origtemp = unicode(template)
    form = getparam(template, formarg)
    if not form:
      continue
    addparam(template, formarg, canonicalize_form(form))
    newtemp = unicode(template)
    if origtemp != newtemp:
      msg("Replacing %s with %s" % (origtemp, newtemp))
    if re.match("^[1I](-|$)", form):
      # Form I: also record the two radical params that follow formarg.
      actions_taken.append("form=%s (%s/%s)" % (
        form,
        getparam(template, str(1 + int(formarg))),
        getparam(template, str(2 + int(formarg)))))
    else:
      actions_taken.append("form=%s" % form)
  changelog = "%s: canonicalize form (%s=) to Roman numerals: %s" % (
    tempname, formarg, '; '.join(actions_taken))
  if len(actions_taken) > 0:
    msg("Change log = %s" % changelog)
  return text, changelog
def rewrite_one_page_verb_headword(page, index, text):
  # Convert {{ar-verb|...|form=X}} to {{ar-verb|X|...}}: the form= value is
  # canonicalized to Roman numerals and moved into positional param 1, and
  # the existing positional params are shifted up by one.  Returns
  # (text, changelog).
  pagetitle = page.title()
  msg("Processing page %s" % pagetitle)
  actions_taken = []
  for template in text.filter_templates():
    if template.name in ["ar-verb"]:
      origtemp = unicode(template)
      form = getparam(template, "form")
      if form:
        # In order to keep in the same order, just forcibly change the
        # param "names" (numbers).  Iterate downward so renaming 9 -> 10
        # does not collide with an existing param 10.
        for pno in xrange(10, 0, -1):
          if template.has(str(pno)):
            template.get(str(pno)).name = str(pno + 1)
        # Make sure form= param is first ...
        template.remove("form")
        addparam(template, "form", canonicalize_form(form),
          before=template.params[0].name if len(template.params) > 0 else None)
        # ... then forcibly change its name to 1=
        template.get("form").name = "1"
        # showkey=False renders it positionally (|I|) instead of |1=I|.
        template.get("1").showkey = False
        newtemp = unicode(template)
        if origtemp != newtemp:
          msg("Replacing %s with %s" % (origtemp, newtemp))
        if re.match("^[1I](-|$)", form):
          # Form I: also log the radical params (now at 2 and 3).
          actions_taken.append("form=%s (%s/%s)" % (form,
            getparam(template, "2"), getparam(template, "3")))
        else:
          actions_taken.append("form=%s" % form)
  changelog = "ar-verb: form= -> 1= and canonicalize to Roman numerals, move other params up: %s" % '; '.join(actions_taken)
  if len(actions_taken) > 0:
    msg("Change log = %s" % changelog)
  return text, changelog
def process_param(pagetitle, index, pagetext, template, templang, param, paramtr):
  # Canonicalize one (param, paramtr) pair on `template`, then strip a
  # now-redundant sc= parameter whose value is in the module-global `script`
  # set.  Returns a list of changelog strings, or False if nothing changed.
  # NOTE(review): the `templang` parameter is never used; canon_param is
  # called with the module-global `lang` instead -- confirm this is intended.
  result = canon_param(pagetitle, index, template, lang, param, paramtr,
    translit_module)
  scvalue = getparam(template, "sc")
  if scvalue in script:
    tname = unicode(template.name)
    # Log the whole template once if canon_param itself made no change.
    if show_template and result == False:
      msg("Page %s %s: %s.%s: Processing %s" % (
        index, pagetitle, tname, "sc", unicode(template)))
    msg("Page %s %s: %s.%s: Removing sc=%s" % (
      index, pagetitle, tname, "sc", scvalue))
    oldtempl = "%s" % unicode(template)
    template.remove("sc")
    msg("Page %s %s: Replaced %s with %s" % (
      index, pagetitle, oldtempl, unicode(template)))
    newresult = ["remove sc=%s in {{%s}}" % (
      scvalue, template_changelog_name(template, lang))]
    # Merge with canon_param's changelog (False means "no changes").
    if result != False:
      result = result + newresult
    else:
      result = newresult
  return result
def output_heads_seen(overall=False):
  """Dump headword-template counts (overall or per-category), most
  frequent first."""
  if overall:
    counts = overall_head_count
    msg("Overall templates seen:")
  else:
    counts = cat_head_count
    msg("Templates seen per category:")
  for head, count in sorted(counts.items(), key=lambda item: item[1],
      reverse=True):
    msg(" %s = %s" % (head, count))
def fix_one_page_tool_place_noun(page, index, text):
  """For each occurrence of the module-global `template` on the page, remove
  cap= if present, otherwise add lc=1; return (text, changelog)."""
  pagetitle = page.title()
  for tmpl in text.filter_templates():
    if tmpl.name != template:
      continue
    if getparam(tmpl, "cap"):
      msg("Page %s %s: Template %s: Remove cap=" % (index, pagetitle, template))
      tmpl.remove("cap")
    else:
      msg("Page %s %s: Template %s: Add lc=1" % (index, pagetitle, template))
      addparam(tmpl, "lc", "1")
  changelog = "%s: If cap= is present, remove it, else add lc=" % template
  msg("Page %s %s: Change log = %s" % (index, pagetitle, changelog))
  return text, changelog
def do_pages(createfn, iterfn=iter_pages):
  """Create the pages yielded by iterfn(createfn) as (pagename, text,
  changelog) triples, or just print them in offline mode.  Pages that
  already exist are skipped with a warning."""
  for entry, index in blib.iter_pages(iterfn(createfn), startFrom, upTo,
      key=lambda x: x[0]):
    pagename, text, changelog = entry
    pagetitle = remove_diacritics(pagename)
    if params.offline:
      # Offline mode: dump what would have been saved.
      msg("Text for %s: [[%s]]" % (pagename, text))
      msg("Changelog = %s" % changelog)
      continue
    page = pywikibot.Page(site, pagetitle)
    if page.exists():
      msg("Page %s %s: WARNING, page already exists, skipping" % (index, pagename))
      continue
    # Closure is consumed by do_edit within this same iteration, so
    # capturing text/changelog by reference is safe.
    def save_text(page, index, parsed):
      return text, changelog
    blib.do_edit(page, index, save_text, save=params.save, verbose=params.verbose)
def process_param(pagetitle, index, template, param, paramtr):
  """Canonicalize one (param, paramtr) pair on `template` and strip a
  redundant sc=Arab; return a list of changelog strings or False."""
  result = canon_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=True)
  if getparam(template, "sc") == "Arab":
    tname = unicode(template.name)
    # Log the whole template once if canon_param itself made no change.
    if show_template and result == False:
      msg("Page %s %s: %s.%s: Processing %s" % (index, pagetitle, tname, "sc", unicode(template)))
    msg("Page %s %s: %s.%s: Removing sc=Arab" % (index, pagetitle, tname, "sc"))
    oldtempl = "%s" % unicode(template)
    template.remove("sc")
    msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl, unicode(template)))
    removal_note = ["remove %s.sc=Arab" % tname]
    result = result + removal_note if result != False else removal_note
  return result
def process_param(pagetitle, index, template, param, paramtr):
  """Canonicalize one (param, paramtr) pair on `template` and strip any sc=
  value found in the module-global `script` set; return a list of changelog
  strings or False."""
  result = canon_param(pagetitle, index, template, param, paramtr,
    translit_module, include_tempname_in_changelog=True)
  scvalue = getparam(template, "sc")
  if scvalue not in script:
    return result
  tname = unicode(template.name)
  # Log the whole template once if canon_param itself made no change.
  if show_template and result == False:
    msg("Page %s %s: %s.%s: Processing %s" % (index, pagetitle, tname, "sc", unicode(template)))
  msg("Page %s %s: %s.%s: Removing sc=%s" % (index, pagetitle, tname, "sc", scvalue))
  oldtempl = "%s" % unicode(template)
  template.remove("sc")
  msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl, unicode(template)))
  removal_note = ["remove %s.sc=%s" % (tname, scvalue)]
  return result + removal_note if result != False else removal_note
def fix_one_page_smp(page, index, text):
  """In ar-decl-* templates, change pl=smp (and pl2=, pl3=, ...) to sp,
  skipping heads with a feminine (taa marbuta) ending; return
  (text, changelog)."""
  pagetitle = page.title()
  for tmpl in text.filter_templates():
    head = reorder_shadda(getparam(tmpl, "1"))
    if not tmpl.name.startswith("ar-decl-"):
      continue
    # Walk pl, pl2, pl3, ... until an empty value terminates the chain.
    idx = 2
    pname = "pl"
    value = getparam(tmpl, pname)
    while value:
      if value == "smp":
        if head.endswith(TAM):
          msg("Page %s %s: WARNING: Found %s=smp with feminine ending head %s in %s: not changing" % (
            index, pagetitle, pname, head, tmpl.name))
        else:
          msg("Page %s %s: Changing %s=smp to %s=sp in %s" % (
            index, pagetitle, pname, pname, tmpl.name))
          addparam(tmpl, pname, "sp")
      pname = "pl%s" % idx
      value = getparam(tmpl, pname)
      idx += 1
  changelog = "Change pl=smp to pl=sp"
  msg("Page %s %s: Change log = %s" % (index, pagetitle, changelog))
  return text, changelog
def pagemsg(txt):
  # Log `txt` prefixed with the page index and title; relies on `index` and
  # `pagetitle` being bound in the enclosing/global scope.
  msg("Page %s %s: %s" % (index, pagetitle, txt))
unicode(t.name), " (NEEDS REVIEW)" if fixed_plural_warning else "")) newtext = unicode(parsed) if newtext != text: assert notes comment = "; ".join(notes) if save: pagemsg("Saving with comment = %s" % comment) page.text = newtext blib.try_repeatedly(lambda: page.save(comment=comment), pagemsg, "save page") else: pagemsg("Would save with comment = %s" % comment) parser = blib.create_argparser("Convert head|fr|* to fr-*") parser.add_argument("--fix-missing-plurals", action="store_true", help="Fix cases with missing plurals by just assuming the default plural.") parser.add_argument("--lemma-file",help="File containing lemmas to do.") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) if args.lemma_file: lines = [x.strip() for x in codecs.open(args.lemma_file, "r", "utf-8")] for i, pagename in blib.iter_items(lines, start, end): process_page(i, pywikibot.Page(site, pagename), args.save, args.verbose, args.fix_missing_plurals) else: for cat in ["French nouns", "French proper nouns", "French pronouns", "French determiners", "French adjectives", "French verbs", "French participles", "French adverbs", "French prepositions", "French conjunctions", "French interjections", "French idioms", "French phrases", "French abbreviations", "French acronyms", "French initialisms", "French noun forms", "French proper noun forms", "French pronoun forms", "French determiner forms", "French verb forms", "French adjective forms", "French participle forms", "French proverbs", "French prefixes", "French suffixes", "French diacritical marks", "French punctuation marks"]: #for cat in ["French adjective forms", "French participle forms", "French proverbs", "French prefixes", "French suffixes", "French diacritical marks", "French punctuation marks"]: msg("Processing category: %s" % cat) for i, page in blib.cat_articles(cat, start, end): process_page(i, page, args.save, args.verbose, args.fix_missing_plurals)
if origt != newt: pagemsg("Replaced %s with %s" % (origt, newt)) new_text = unicode(parsed) if new_text != text: if verbose: pagemsg("Replacing <%s> with <%s>" % (text, new_text)) assert notes comment = "; ".join(blib.group_notes(notes)) if save: pagemsg("Saving with comment = %s" % comment) page.text = new_text page.save(comment=comment) else: pagemsg("Would save with comment = %s" % comment) parser = blib.create_argparser(u"Convert Japanese headwords from old-style to new-style") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) romaji_to_keep = set() for i, page in blib.cat_articles("Japanese terms with romaji needing attention"): pagetitle = unicode(page.title()) romaji_to_keep.add(pagetitle) for ref in ["ja-noun", "ja-adj", "ja-verb", "ja-pos"]: msg("Processing references to Template:%s" % ref) for i, page in blib.references("Template:%s" % ref, start, end): process_page(i, page, args.save, args.verbose, romaji_to_keep)