def process_page(page, index):
    """Save an existing page with the edit summary "null save".

    The page is left alone when ``args.ignore_non_mainspace`` is set and the
    title contains a colon, and also when the page does not exist (saving it
    would create it), in which case a warning is logged instead.

    Args:
        page: the page object to save.
        index: running index of the page, used only in log messages.
    """
    pagetitle = unicode(page.title())

    def with_prefix(txt):
        # Every message for this page carries the same "Page <index> <title>" prefix.
        return "Page %s %s: %s" % (index, pagetitle, txt)

    def pagemsg(txt):
        msg(with_prefix(txt))

    def errandpagemsg(txt):
        errandmsg(with_prefix(txt))

    # `args` is only read here, so no `global` declaration is required.
    skip_namespaced = args.ignore_non_mainspace and ':' in pagetitle
    if skip_namespaced:
        return

    if blib.safe_page_exists(page, pagemsg):
        blib.safe_page_save(page, "null save", errandpagemsg)
    else:
        pagemsg("WARNING: Page doesn't exist, null-saving it would create it")
def check_participle(form, pagemsg):
    """Warn if the page for `form` declares a different participle in {{la-part}}.

    The page title is `form` with macrons removed. Each {{la-part}} template on
    that page has its first parameter (truncated at the first slash) compared
    with `form`; a mismatch is logged as a warning.

    Args:
        form: the expected participle form (compared verbatim against the
            template parameter).
        pagemsg: logging callback taking a single message string; messages
            emitted here are prefixed with `form`.
    """
    orig_pagemsg = pagemsg

    def pagemsg(txt):
        orig_pagemsg("%s: %s" % (form, txt))

    if "[" in form or "|" in form:
        pagemsg("Skipping form with brackets or vertical bar")
        return

    page = pywikibot.Page(site, lalib.remove_macrons(form))
    if not blib.safe_page_exists(page, pagemsg):
        pagemsg("Skipping nonexistent page")
        # Actually skip: previously execution fell through and tried to parse
        # the text of a page that had just been reported as nonexistent.
        return

    parsed = blib.parse_text(unicode(page.text))
    for t in parsed.filter_templates():
        if tname(t) != "la-part":
            continue
        # Drop everything from the first slash onwards before comparing.
        actual_part = re.sub("/.*", "", getparam(t, "1"))
        if actual_part != form:
            pagemsg("WARNING: Found actual participle %s, expected %s" % (
                actual_part, form))
def process_page(page, index):
    """Log a one-line description of whether and how `page` exists.

    The logged text is one of: "does not exist", "exists as redirect",
    "exists in LANG", "exists but not in LANG" (when ``args.lang`` is set),
    or plain "exists".

    Args:
        page: the page object to inspect.
        index: running index of the page, used only in the log message.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def describe_status():
        # Guard clauses, checked in the same order as the conditions they replace.
        if not blib.safe_page_exists(page, pagemsg):
            return "does not exist"
        text = blib.safe_page_text(page, pagemsg)
        if re.search("#redirect", text, re.I):
            return "exists as redirect"
        if not args.lang:
            return "exists"
        # A language counts as present when its level-2 header occurs in the text.
        if "==%s==" % args.lang in text:
            return "exists in %s" % args.lang
        return "exists but not in %s" % args.lang

    pagemsg(describe_status())
def process_page(page, index):
    """Report which translation targets linked from `page` exist as pages.

    Collects the second parameter (with links removed) of every {{t}}, {{t+}},
    {{t-}}, {{t+check}} and {{t-check}} template on the page, then checks each
    distinct target -- and the page's own title, which seeds the list -- for
    existence, logging a message for every one that exists.

    Args:
        page: the page object whose translation templates are scanned.
        index: running index of the page, used only in log messages.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    translation_templates = ("t", "t+", "t-", "t+check", "t-check")

    # A list (not a set) so that targets are checked in order of first appearance.
    seen_trans = [pagetitle]
    parsed = blib.parse(page)
    for t in parsed.filter_templates():
        if tname(t) not in translation_templates:
            continue
        trans = blib.remove_links(getparam(t, "2"))
        # A template with no term yields an empty target, which cannot name a
        # page; skip it instead of handing an empty title to pywikibot.Page.
        if trans and trans not in seen_trans:
            seen_trans.append(trans)

    for trans in seen_trans:
        # Defined per iteration and used immediately, so `trans` is the current
        # target whenever the callback runs.
        def pagemsg_with_trans(txt):
            pagemsg("%s: %s" % (trans, txt))

        if blib.safe_page_exists(pywikibot.Page(site, trans), pagemsg_with_trans):
            msg("Page %s %s: Found existing translation for %s" % (
                index, trans, pagetitle))
def process_page(page, index, templates):
    """Log, for each matching template on `page`, whether its linked term exists.

    For every template whose name is in `templates`, parameter 1 is taken as a
    language code and parameter 2 as the term. The term is converted to a page
    name with ``remove_diacritics`` (terms starting with "*" are mapped into
    the Reconstruction namespace), and the status of that page is logged:
    "exists", "exists as redirect", "exists only in some other language" or
    "does not exist".

    Args:
        page: the page object whose templates are scanned.
        index: running index of the page, used only in log messages.
        templates: container of template names to examine.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    parsed = blib.parse_text(unicode(page.text))
    for t in parsed.filter_templates():
        if tname(t) not in templates:
            continue

        lang = getparam(t, "1")
        if lang not in blib.languages_byCode:
            pagemsg("WARNING: Unrecognized language code %s" % lang)
            continue
        langname = blib.languages_byCode[lang]["canonicalName"]

        term = getparam(t, "2")
        pagenm = remove_diacritics(term, lang)
        if not pagenm:
            continue
        if pagenm.startswith("*"):
            pagenm = "Reconstruction:%s/%s" % (langname, pagenm[1:])

        # Use a separate name for the linked page rather than rebinding the
        # `page` parameter, which still refers to the page being processed.
        target_page = pywikibot.Page(site, pagenm)
        if blib.safe_page_exists(target_page, pagemsg):
            text = unicode(target_page.text)
            if re.search("#redirect", text, re.I):
                outtext = "exists as redirect"
            elif "==%s==" % langname in text:
                outtext = "exists"
            else:
                outtext = "exists only in some other language"
        else:
            outtext = "does not exist"
        # (A stray bare `end` token used to follow here; it is not Python and
        # would have raised NameError when reached.)

        if term == pagenm:
            pagemsg("%s [[%s]] %s" % (langname, pagenm, outtext))
        else:
            # Show the original term as the link's display text.
            pagemsg("%s [[%s|%s]] %s" % (langname, pagenm, term, outtext))
def process_page(index, num, save, verbose, params):
    """Create or update the page holding the Russian numeral entry for `num`.

    The lemma comes from ``ru_num(num)``, the page title is the lemma with
    accents removed, and the section text comes from ``generate_page(num)``.
    If the page does not exist it is created with that text. Otherwise the
    page is split into level-2 language sections and the new section either
    replaces an existing Russian section (only when ``params.overwrite_page``
    is set), is inserted before the first section whose name sorts after
    "Russian", or is appended at the end.

    Args:
        index: running index, used only in log messages.
        num: the number whose numeral entry is generated.
        save: if true, save the page; otherwise only log what would be saved.
        verbose: if true, log the full old/new page text.
        params: object providing the ``overwrite_page`` and
            ``overwrite_etymologies`` flags.

    Returns:
        None. Returns early without saving when the page text cannot be
        fetched or when an existing Russian section must not be overwritten.
    """
    comment = None
    notes = []

    lemma = ru_num(num)
    pagetitle = rulib.remove_accents(lemma)
    # Text of the Russian section to insert. Kept under its own name so it
    # cannot be confused with `newtext`, the full re-joined page text below.
    newsection = generate_page(num)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    # Prepare to create page
    pagemsg("Creating entry")
    page = pywikibot.Page(site, pagetitle)

    # If invalid title, don't do anything.
    existing_text = blib.safe_page_text(page, errandpagemsg, bad_value_ret=None)
    if existing_text is None:
        return

    if not blib.safe_page_exists(page, errandpagemsg):
        # Page doesn't exist. Create it.
        pagemsg("Creating page")
        comment = "Create page for Russian numeral %s (%s)" % (lemma, num)
        page.text = newsection
        if verbose:
            pagemsg("New text is [[%s]]" % page.text)
    else:
        # Page does exist
        pagetext = existing_text

        # Split into sections
        splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
        # Extract off pagehead and recombine section headers with following text
        pagehead = splitsections[0]
        sections = []
        for i in xrange(1, len(splitsections)):
            if (i % 2) == 1:
                # Odd indices are the captured headers; each starts a new section.
                sections.append("")
            sections[-1] += splitsections[i]

        # Go through each section in turn, looking for existing Russian section
        for i in xrange(len(sections)):
            m = re.match("^==([^=\n]+)==$", sections[i], re.M)
            if not m:
                pagemsg("Can't find language name in text: [[%s]]" % (sections[i]))
            elif m.group(1) == "Russian":
                # Extract off trailing separator
                mm = re.match(r"^(.*?\n)(\n*--+\n*)$", sections[i], re.S)
                if mm:
                    # Note that this changes the number of sections, which is
                    # seemingly a problem because the for-loop above calculates
                    # the end point at the beginning of the loop, but is not
                    # actually a problem because we always break (or return)
                    # after processing the Russian section.
                    sections[i:i + 1] = [mm.group(1), mm.group(2)]

                if not params.overwrite_page:
                    errandpagemsg(
                        "WARNING: Not overwriting existing Russian section")
                    return
                if ("==Etymology 1==" in sections[i]
                        and not params.overwrite_etymologies):
                    errandpagemsg(
                        "WARNING: Found ==Etymology 1== in page text, not overwriting, skipping form")
                    return
                pagemsg("WARNING: Overwriting entire Russian section")
                comment = "Create Russian section for numeral %s (%s)" % (
                    lemma, num)
                sections[i] = newsection
                notes.append("overwrite section")
                break
            elif m.group(1) > "Russian":
                # First language name sorting after "Russian": insert before it.
                pagemsg("Exists; inserting before %s section" % (m.group(1)))
                comment = "Create Russian section and entry for numeral %s (%s); insert before %s section" % (
                    lemma, num, m.group(1))
                sections[i:i] = [newsection, "\n----\n\n"]
                break
        else:
            # else of for loop over sections, i.e. no break out of loop
            pagemsg("Exists; adding section to end")
            comment = "Create Russian section and entry for numeral %s (%s); append at end" % (
                lemma, num)

            if sections:
                sections[-1] = ensure_two_trailing_nl(sections[-1])
                # Fixed: this used an undefined name and raised NameError.
                sections += ["----\n\n", newsection]
            else:
                if not params.overwrite_page:
                    notes.append("formerly empty")
                if pagehead.lower().startswith("#redirect"):
                    pagemsg("WARNING: Page is redirect, overwriting")
                    notes.append("overwrite redirect")
                    # Turn the redirect into an {{also|...}} hatnote.
                    pagehead = re.sub(
                        r"#redirect *\[\[(.*?)\]\] *(<!--.*?--> *)*\n*",
                        r"{{also|\1}}\n", pagehead, 0, re.I)
                elif not params.overwrite_page:
                    pagemsg("WARNING: No language sections in current page")
                # Fixed: this used an undefined name and raised NameError.
                sections += [newsection]

        # End of loop over sections in existing page; rejoin sections
        newtext = pagehead + ''.join(sections)

        if page.text != newtext:
            # Internal invariant: every change must have a reason attached.
            assert comment or notes

        # Eliminate sequences of 3 or more newlines, which may come from
        # ensure_two_trailing_nl(). Add comment if none, in case of existing
        # page with extra newlines.
        newnewtext = re.sub(r"\n\n\n+", r"\n\n", newtext)
        if newnewtext != newtext and not comment and not notes:
            notes = ["eliminate sequences of 3 or more newlines"]
        newtext = newnewtext

        if page.text == newtext:
            pagemsg("No change in text")
        elif verbose:
            pagemsg("Replacing <%s> with <%s>" % (page.text, newtext))
        else:
            pagemsg("Text has changed")
        page.text = newtext

    # Executed whether creating new page or modifying existing page.
    # Check for changed text and save if so.
    notestext = '; '.join(notes)
    if notestext:
        if comment:
            comment += " (%s)" % notestext
        else:
            comment = notestext

    if page.text != existing_text:
        if save:
            pagemsg("Saving with comment = %s" % comment)
            blib.safe_page_save(page, comment, errandpagemsg)
        else:
            pagemsg("Would save with comment = %s" % comment)
# Maps values of the yomi= parameter of the kanjitab template (including
# abbreviations) to the canonical reading-type name. Defined once at module
# level instead of being rebuilt for every matching kanji.
_YOMI_TO_CANONICAL_READING_TYPE = {
    "o": "on",
    "on": "on",
    "kanon": "kanon",
    "goon": "goon",
    "soon": "soon",
    "toon": "toon",
    "kan": "kanyoon",
    "kanyo": "kanyoon",
    "kanyoon": "kanyoon",
    "k": "kun",
    "kun": "kun",
    "juku": "jukujikun",
    "jukuji": "jukujikun",
    "jukujikun": "jukujikun",
    "n": "nanori",
    "nanori": "nanori",
    "ok": "jubakoyomi",
    "j": "jubakoyomi",
    "ko": "yutoyomi",
    "y": "yutoyomi",
    "irr": "irregular",
    "irreg": "irregular",
    "irregular": "irregular",
}


def _expand_yomi_counts(yomi_parts):
    """Expand entries with a trailing count, e.g. ["on2", "k"] -> ["on", "on", "k"]."""
    expanded = []
    for part in yomi_parts:
        counted = re.search("^(.*?)([0-9]+)$", part)
        if counted:
            base, count = counted.groups()
            expanded.extend([base] * int(count))
        else:
            expanded.append(part)
    return expanded


def process_text_on_page(index, pagetitle, text):
    """Work out the ``{{auto cat|...}}`` contents for a kanji-reading category.

    Only titles of the form "Category:(Japanese|Okinawan) terms spelled with X
    read as Y" are handled. The reading type(s) of Y are first looked up in
    the ``{{ja-readings}}``/``{{ryu-readings}}`` template on the page for X.
    If none is found there, every page in the category is examined and the
    type is taken from the ``yomi=`` parameter of its kanjitab template.

    Args:
        index: running index of the page, used only in log messages.
        pagetitle: title of the category page.
        text: current text of the category page (not used here).

    Returns:
        A ``(contents, notes)`` tuple, where `contents` is the new page text
        and `notes` is a list of change notes, or None when the title does not
        match or no reading type could be determined.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    m = re.search(
        "^Category:(Japanese|Okinawan) terms spelled with (.*) read as (.*)$",
        pagetitle)
    if not m:
        pagemsg("Skipped")
        return

    notes = []

    lang, spelling, reading = m.groups()
    langcode = "ja" if lang == "Japanese" else "ryu"
    spelling_page = pywikibot.Page(site, spelling)

    def pagemsg_with_spelling(txt):
        pagemsg("%s: %s" % (spelling, txt))

    if not blib.safe_page_exists(spelling_page, pagemsg_with_spelling):
        pagemsg_with_spelling("Spelling page doesn't exist, skipping")
        return
    spelling_page_text = blib.safe_page_text(spelling_page, pagemsg_with_spelling)
    retval = blib.find_modifiable_lang_section(spelling_page_text, lang,
                                               pagemsg_with_spelling)
    if retval is None:
        pagemsg_with_spelling("WARNING: Couldn't find %s section" % lang)
        return
    sections, j, secbody, sectail, has_non_lang = retval

    # Pass 1: look for the reading in the readings template on the page of the
    # kanji itself.
    parsed = blib.parse_text(secbody)
    saw_readings_template = False
    reading_types = []
    for t in parsed.filter_templates():
        if tname(t) != "%s-readings" % langcode:
            continue
        saw_readings_template = True
        for reading_type in allowed_reading_types:
            readings = getparam(t, reading_type).strip()
            if not readings:
                continue
            readings = re.split(r"\s*,\s*", readings)
            # Strip everything from the first "<" or "-" in each listed reading.
            readings = [re.sub("[<-].*", "", r) for r in readings]
            if reading not in readings:
                continue
            canon_type = canonicalize_reading_types.get(reading_type,
                                                        reading_type)
            pagemsg_with_spelling(
                "Appending reading type %s based on %s" % (canon_type,
                                                           unicode(t)))
            if canon_type not in reading_types:
                reading_types.append(canon_type)
                notes.append(
                    "add %s reading based on {{%s-readings}} on page [[%s]]" %
                    (canon_type, langcode, spelling))
        if not reading_types:
            pagemsg_with_spelling(
                "WARNING: Can't find reading %s among readings listed in %s" %
                (reading, unicode(t).replace("\n", r"\n")))

    if not saw_readings_template:
        pagemsg_with_spelling(
            "WARNING: Couldn't find reading template {{%s-readings}}" % langcode)

    if reading_types:
        contents = "{{auto cat|%s}}" % "|".join(reading_types)
        return contents, notes
    pagemsg_with_spelling("WARNING: Can't find reading %s on page" % reading)

    # Pass 2: fall back to the kanjitab templates of the pages in the category.
    for _, contents_page in blib.cat_articles(
            re.sub("^Category:", "", pagetitle)):
        contents_title = unicode(contents_page.title())

        def pagemsg_with_contents(txt):
            pagemsg("%s: %s" % (contents_title, txt))

        contents_page_text = blib.safe_page_text(contents_page,
                                                 pagemsg_with_contents)
        retval = blib.find_modifiable_lang_section(contents_page_text, lang,
                                                   pagemsg_with_contents)
        if retval is None:
            pagemsg_with_contents("WARNING: Couldn't find %s section" % lang)
            # Skip just this page. This used to `return`, which aborted the
            # whole category and discarded reading types already collected
            # from earlier pages; every other per-page failure continues.
            continue
        sections, j, secbody, sectail, has_non_lang = retval

        surrogates = [ch for ch in contents_title
                      if 0xD800 <= ord(ch) <= 0xDFFF]
        if surrogates:
            pagemsg_with_contents(
                "WARNING: Surrogates in page name, skipping: %s" %
                ord(surrogates[0]))
            continue

        # Replace each kanji repeat char with the character it repeats.
        chars_in_contents_title = list(contents_title)
        if chars_in_contents_title and chars_in_contents_title[0] == u"々":
            pagemsg_with_contents(
                u"Repeat char 々 found at beginning of contents title")
            continue
        for pos in range(1, len(chars_in_contents_title)):
            if chars_in_contents_title[pos] == u"々":
                chars_in_contents_title[pos] = chars_in_contents_title[pos - 1]

        # unicodedata.name() raises ValueError for characters without a name
        # unless a default is supplied; such characters are simply not kanji.
        kanji_in_contents_title = [
            x for x in chars_in_contents_title
            if unicodedata.name(x, "").startswith("CJK UNIFIED IDEOGRAPH")
        ]

        saw_kanjitab = False
        parsed = blib.parse_text(secbody)
        for t in parsed.filter_templates():
            if tname(t) != "%s-kanjitab" % langcode:
                continue
            saw_kanjitab = True

            # Numbered params 1..9 hold the per-kanji readings; blanks are dropped.
            readings = [
                r for r in (getparam(t, str(n)) for n in range(1, 10)) if r
            ]
            if len(kanji_in_contents_title) != len(readings):
                pagemsg_with_contents(
                    "WARNING: Saw %s chars in contents title but %s readings %s, skipping: %s"
                    % (len(kanji_in_contents_title), len(readings),
                       ",".join(readings), unicode(t)))
                continue

            yomi = getparam(t, "yomi")
            if not yomi:
                pagemsg_with_contents("WARNING: No yomi, skipping: %s" %
                                      unicode(t))
                continue
            # yomi is either a single type applying to every kanji (kept as a
            # string) or a comma-separated/counted list with one type per kanji.
            if "," in yomi or re.search("[0-9]$", yomi):
                split_yomi = yomi.split(",")
                yomi = _expand_yomi_counts(split_yomi)
                if yomi != split_yomi:
                    pagemsg_with_contents(
                        "Expanding yomi %s to %s" % (",".join(split_yomi),
                                                     ",".join(yomi)))
                if len(yomi) != len(kanji_in_contents_title):
                    pagemsg_with_contents(
                        "WARNING: %s values in yomi=%s but %s chars in contents, skipping: %s"
                        % (len(yomi), ",".join(yomi),
                           len(kanji_in_contents_title), unicode(t)))
                    continue

            saw_spelling_in_contents = False
            for pos, (ch, contents_reading) in enumerate(
                    zip(kanji_in_contents_title, readings)):
                if ch != spelling:
                    continue
                saw_spelling_in_contents = True
                if contents_reading != reading:
                    continue
                reading_type = yomi[pos] if isinstance(yomi, list) else yomi
                if reading_type not in _YOMI_TO_CANONICAL_READING_TYPE:
                    pagemsg_with_contents(
                        "WARNING: Unrecognized reading type %s: %s" %
                        (reading_type, unicode(t)))
                    break
                reading_type = _YOMI_TO_CANONICAL_READING_TYPE[reading_type]
                if reading_type not in allowed_reading_types:
                    pagemsg_with_contents(
                        "WARNING: Disallowed reading type %s: %s" %
                        (reading_type, unicode(t)))
                    break
                reading_type = canonicalize_reading_types.get(
                    reading_type, reading_type)
                pagemsg_with_contents(
                    "Appending reading type %s based on %s" % (reading_type,
                                                               unicode(t)))
                if reading_type not in reading_types:
                    reading_types.append(reading_type)
                    notes.append(
                        "add %s reading based on {{%s-kanjitab}} on page [[%s]]"
                        % (reading_type, langcode, contents_title))

            # A `break` above only happens after the spelling was seen, so this
            # warning cannot fire for a template that was abandoned early.
            if not saw_spelling_in_contents:
                pagemsg_with_contents(
                    "WARNING: Didn't see spelling in contents: %s" % unicode(t))

        if not saw_kanjitab:
            pagemsg_with_contents("WARNING: Didn't see {{%s-kanjitab}}" %
                                  langcode)

    if reading_types:
        contents = "{{auto cat|%s}}" % "|".join(reading_types)
        return contents, notes
    pagemsg_with_spelling(
        "WARNING: Can't find reading %s by looking through category contents"
        % reading)