Esempio n. 1
0
def process_page(page, index):
    """Perform a null save (edit summary "null save") on an existing page.

    Nothing is saved when ``args.ignore_non_mainspace`` is set and the title
    contains a colon, nor when the page does not exist (a save would then
    create it).

    page: page object to save.
    index: ordinal shown at the start of every log message.
    """
    title = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, title, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, title, txt))

    skip_namespaced = args.ignore_non_mainspace and ':' in title
    if skip_namespaced:
        return

    if blib.safe_page_exists(page, pagemsg):
        blib.safe_page_save(page, "null save", errandpagemsg)
    else:
        pagemsg("WARNING: Page doesn't exist, null-saving it would create it")
def check_participle(form, pagemsg):
  """Warn when the page for `form` declares a different participle.

  The page looked up is `form` with macrons removed. For every {{la-part}}
  template on it, parameter 1 (with everything from the first slash onwards
  dropped) is compared against `form`; a mismatch is logged as a WARNING.

  form: expected participle. Forms containing "[" or "|" are skipped.
  pagemsg: one-argument logging callback; each message is prefixed with
    `form` before being passed on.
  """
  def formmsg(txt):
    pagemsg("%s: %s" % (form, txt))

  if "[" in form or "|" in form:
    formmsg("Skipping form with brackets or vertical bar")
    return
  page = pywikibot.Page(site, lalib.remove_macrons(form))
  if not blib.safe_page_exists(page, formmsg):
    formmsg("Skipping nonexistent page")
    # Actually skip: without this return the page announced as skipped
    # would still be read and parsed below.
    return
  parsed = blib.parse_text(unicode(page.text))
  for t in parsed.filter_templates():
    if tname(t) != "la-part":
      continue
    actual_part = re.sub("/.*", "", getparam(t, "1"))
    if actual_part != form:
      formmsg("WARNING: Found actual participle %s, expected %s" % (
        actual_part, form))
Esempio n. 3
0
def process_page(page, index):
    """Log a one-line existence status for a page.

    The status is one of "does not exist", "exists as redirect", "exists",
    or, when ``args.lang`` is set, "exists in LANG" / "exists but not in
    LANG" depending on whether "==LANG==" occurs in the page text.

    page: page object to inspect.
    index: ordinal shown at the start of the log message.
    """
    title = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, title, txt))

    def status():
        # Each test only makes sense once the previous one has failed, so
        # return as soon as a status is known.
        if not blib.safe_page_exists(page, pagemsg):
            return "does not exist"
        body = blib.safe_page_text(page, pagemsg)
        if re.search("#redirect", body, re.I):
            return "exists as redirect"
        if not args.lang:
            return "exists"
        if "==%s==" % args.lang in body:
            return "exists in %s" % args.lang
        return "exists but not in %s" % args.lang

    pagemsg(status())
Esempio n. 4
0
def process_page(page, index):
    """Report which translation targets of a page already exist as pages.

    The candidate titles are the page's own title followed by parameter 2
    (links removed) of every t / t+ / t- / t+check / t-check template on the
    page, without duplicates and in order of first appearance.  A message is
    logged for each candidate for which a page exists.

    page: page object to scan.
    index: ordinal shown at the start of every log message.
    """
    title = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, title, txt))

    translation_templates = ("t", "t+", "t-", "t+check", "t-check")

    # Ordered, duplicate-free list of titles to probe; starts with the page
    # itself.
    candidates = [title]
    for template in blib.parse(page).filter_templates():
        if tname(template) not in translation_templates:
            continue
        target = blib.remove_links(getparam(template, "2"))
        if target not in candidates:
            candidates.append(target)

    for trans in candidates:

        def pagemsg_with_trans(txt):
            pagemsg("%s: %s" % (trans, txt))

        target_page = pywikibot.Page(site, trans)
        if not blib.safe_page_exists(target_page, pagemsg_with_trans):
            continue
        msg("Page %s %s: Found existing translation for %s" %
            (index, trans, title))
def process_page(page, index, templates):
    """Log whether the target of each selected template exists.

    For every template on the page whose name is in `templates`, parameter 1
    is taken as a language code and parameter 2 as the term.  The term is
    passed through remove_diacritics() to get the target page name; a leading
    "*" turns it into "Reconstruction:<language name>/<rest>".  One message
    is logged per template saying that the target "does not exist", "exists
    as redirect", "exists" (it has a "==<language name>==" header) or
    "exists only in some other language".

    page: page object whose text is scanned.
    index: ordinal shown at the start of every log message.
    templates: container of template names to examine.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    parsed = blib.parse_text(unicode(page.text))
    for t in parsed.filter_templates():
        if tname(t) not in templates:
            continue
        lang = getparam(t, "1")
        if lang not in blib.languages_byCode:
            pagemsg("WARNING: Unrecognized language code %s" % lang)
            continue
        langname = blib.languages_byCode[lang]["canonicalName"]
        term = getparam(t, "2")
        pagenm = remove_diacritics(term, lang)
        if not pagenm:
            continue
        if pagenm.startswith("*"):
            pagenm = "Reconstruction:%s/%s" % (langname, pagenm[1:])
        # Use a separate name so the `page` argument is not rebound while
        # iterating over templates parsed from it.
        target_page = pywikibot.Page(site, pagenm)
        if not blib.safe_page_exists(target_page, pagemsg):
            outtext = "does not exist"
        else:
            text = unicode(target_page.text)
            if re.search("#redirect", text, re.I):
                outtext = "exists as redirect"
            elif "==%s==" % langname in text:
                outtext = "exists"
            else:
                outtext = "exists only in some other language"
        # Show the displayed term only when it differs from the page name.
        if term == pagenm:
            pagemsg("%s [[%s]] %s" % (langname, pagenm, outtext))
        else:
            pagemsg("%s [[%s|%s]] %s" % (langname, pagenm, term, outtext))
def process_page(index, num, save, verbose, params):
    """Create or update the entry for the Russian numeral `num`.

    The lemma comes from ru_num(num), the page title is the lemma with
    accents removed, and the section text comes from generate_page(num).

    * If the page does not exist, its text becomes the generated section.
    * If it exists, the text is split into "==Language==" sections.  An
      existing Russian section is replaced only when params.overwrite_page is
      set (and, if it contains "==Etymology 1==", only when
      params.overwrite_etymologies is set as well); otherwise the function
      returns without changes.  If there is no Russian section, the new one
      is inserted before the first section whose name sorts after "Russian",
      or appended at the end.

    index: ordinal shown at the start of every log message.
    num: numeral to generate the entry for.
    save: when true the page is saved; otherwise only "Would save" is logged.
    verbose: when true the old/new text is included in log messages.
    params: object providing `overwrite_page` and `overwrite_etymologies`.
    """
    comment = None
    notes = []

    lemma = ru_num(num)
    pagetitle = rulib.remove_accents(lemma)
    # Text of the Russian section to insert.  Kept under its own name because
    # `newtext` below holds the text of the whole reassembled page.
    newsection = generate_page(num)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    # Prepare to create page
    pagemsg("Creating entry")
    page = pywikibot.Page(site, pagetitle)

    # If invalid title, don't do anything.
    existing_text = blib.safe_page_text(page,
                                        errandpagemsg,
                                        bad_value_ret=None)
    if existing_text is None:
        return

    if not blib.safe_page_exists(page, errandpagemsg):
        # Page doesn't exist. Create it.
        pagemsg("Creating page")
        comment = "Create page for Russian numeral %s (%s)" % (lemma, num)
        page.text = newsection
        if verbose:
            pagemsg("New text is [[%s]]" % page.text)
    else:  # Page does exist
        pagetext = existing_text

        # Split into sections.  Because the pattern is a capturing group,
        # the result alternates: pagehead, header, body, header, body, ...
        splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
        # Extract off pagehead and recombine section headers with following text
        pagehead = splitsections[0]
        sections = []
        for i in xrange(1, len(splitsections)):
            if (i % 2) == 1:
                # Odd positions are headers: start a new section.
                sections.append("")
            sections[-1] += splitsections[i]

        # Go through each section in turn, looking for existing Russian section
        for i in xrange(len(sections)):
            m = re.match("^==([^=\n]+)==$", sections[i], re.M)
            if not m:
                pagemsg("Can't find language name in text: [[%s]]" %
                        (sections[i]))
            elif m.group(1) == "Russian":
                # Extract off trailing separator
                mm = re.match(r"^(.*?\n)(\n*--+\n*)$", sections[i], re.S)
                if mm:
                    # Note that this changes the number of sections, which is seemingly
                    # a problem because the for-loop above calculates the end point
                    # at the beginning of the loop, but is not actually a problem
                    # because we always break (or return) after processing the
                    # Russian section.
                    sections[i:i + 1] = [mm.group(1), mm.group(2)]

                if params.overwrite_page:
                    if "==Etymology 1==" in sections[
                            i] and not params.overwrite_etymologies:
                        errandpagemsg(
                            "WARNING: Found ==Etymology 1== in page text, not overwriting, skipping form"
                        )
                        return
                    else:
                        pagemsg("WARNING: Overwriting entire Russian section")
                        comment = "Create Russian section for numeral %s (%s)" % (
                            lemma, num)
                        sections[i] = newsection
                        notes.append("overwrite section")
                        break
                else:
                    errandpagemsg(
                        "WARNING: Not overwriting existing Russian section")
                    return
            elif m.group(1) > "Russian":
                # First language name that sorts after "Russian": insert the
                # new section (plus separator) in front of it.
                pagemsg("Exists; inserting before %s section" % (m.group(1)))
                comment = "Create Russian section and entry for numeral %s (%s); insert before %s section" % (
                    lemma, num, m.group(1))
                sections[i:i] = [newsection, "\n----\n\n"]
                break

        else:  # else of for loop over sections, i.e. no break out of loop
            pagemsg("Exists; adding section to end")
            comment = "Create Russian section and entry for numeral %s (%s); append at end" % (
                lemma, num)

            if sections:
                sections[-1] = ensure_two_trailing_nl(sections[-1])
                sections += ["----\n\n", newsection]
            else:
                if not params.overwrite_page:
                    notes.append("formerly empty")
                if pagehead.lower().startswith("#redirect"):
                    pagemsg("WARNING: Page is redirect, overwriting")
                    notes.append("overwrite redirect")
                    # Turn the redirect into an {{also|...}} hatnote pointing
                    # at the former redirect target.
                    pagehead = re.sub(
                        r"#redirect *\[\[(.*?)\]\] *(<!--.*?--> *)*\n*",
                        r"{{also|\1}}\n", pagehead, 0, re.I)
                elif not params.overwrite_page:
                    pagemsg("WARNING: No language sections in current page")
                sections += [newsection]

        # End of loop over sections in existing page; rejoin sections
        newtext = pagehead + ''.join(sections)

        if page.text != newtext:
            assert comment or notes

        # Eliminate sequences of 3 or more newlines, which may come from
        # ensure_two_trailing_nl(). Add comment if none, in case of existing page
        # with extra newlines.
        newnewtext = re.sub(r"\n\n\n+", r"\n\n", newtext)
        if newnewtext != newtext and not comment and not notes:
            notes = ["eliminate sequences of 3 or more newlines"]
        newtext = newnewtext

        if page.text == newtext:
            pagemsg("No change in text")
        elif verbose:
            pagemsg("Replacing <%s> with <%s>" % (page.text, newtext))
        else:
            pagemsg("Text has changed")
        page.text = newtext

    # Executed whether creating new page or modifying existing page.
    # Check for changed text and save if so.
    notestext = '; '.join(notes)
    if notestext:
        if comment:
            comment += " (%s)" % notestext
        else:
            comment = notestext
    if page.text != existing_text:
        if save:
            pagemsg("Saving with comment = %s" % comment)
            blib.safe_page_save(page, comment, errandpagemsg)
        else:
            pagemsg("Would save with comment = %s" % comment)
Esempio n. 7
0
def process_text_on_page(index, pagetitle, text):
    """Work out the ``{{auto cat|...}}`` contents for a kanji-reading category.

    Only titles of the form "Category:(Japanese|Okinawan) terms spelled with
    SPELLING read as READING" are handled; anything else is logged as
    "Skipped".

    Two sources are consulted, in order:

    1. The {{ja-readings}} / {{ryu-readings}} template in the language
       section of the page named SPELLING: every parameter named in
       ``allowed_reading_types`` whose comma-separated value contains READING
       contributes a reading type.
    2. If that yields nothing, the {{ja-kanjitab}} / {{ryu-kanjitab}}
       templates in the language section of each page returned by
       ``blib.cat_articles`` for this category: the ``yomi`` value belonging
       to the kanji equal to SPELLING, when its reading equals READING,
       contributes a reading type.

    index: ordinal shown at the start of every log message.
    pagetitle: title of the category page.
    text: not referenced anywhere in this function body.

    Returns a ``(contents, notes)`` tuple when at least one reading type was
    found, where ``contents`` is "{{auto cat|TYPE|TYPE...}}" and ``notes`` is
    a list of change-note strings.  In every other case the function returns
    None (bare ``return`` or falling off the end).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    m = re.search(
        "^Category:(Japanese|Okinawan) terms spelled with (.*) read as (.*)$",
        pagetitle)
    if not m:
        pagemsg("Skipped")
        return

    notes = []

    lang, spelling, reading = m.groups()
    # "ja" for Japanese; the only other language the regex admits (Okinawan)
    # gets "ryu".
    langcode = lang == "Japanese" and "ja" or "ryu"
    spelling_page = pywikibot.Page(site, spelling)

    def pagemsg_with_spelling(txt):
        pagemsg("%s: %s" % (spelling, txt))

    # NOTE(review): defined but never called within this function.
    def errandpagemsg_with_spelling(txt):
        pagemsg_with_spelling(txt)
        errmsg("Page %s %s: %s: %s" % (index, pagetitle, spelling, txt))

    # ---- Source 1: the readings template on the spelling page ----
    if not blib.safe_page_exists(spelling_page, pagemsg_with_spelling):
        pagemsg_with_spelling("Spelling page doesn't exist, skipping")
        return
    spelling_page_text = blib.safe_page_text(spelling_page,
                                             pagemsg_with_spelling)
    retval = blib.find_modifiable_lang_section(spelling_page_text, lang,
                                               pagemsg_with_spelling)
    if retval is None:
        pagemsg_with_spelling("WARNING: Couldn't find %s section" % lang)
        return
    # Only `secbody` is used below; the other unpacked values are ignored.
    sections, j, secbody, sectail, has_non_lang = retval

    parsed = blib.parse_text(secbody)
    saw_readings_template = False
    # Reading types found so far, without duplicates, in discovery order.
    reading_types = []
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "%s-readings" % langcode:
            saw_readings_template = True
            for reading_type in allowed_reading_types:
                readings = getparam(t, reading_type).strip()
                if readings:
                    readings = re.split(r"\s*,\s*", readings)
                    # Drop everything from the first "<" or "-" in each
                    # reading before comparing.
                    readings = [re.sub("[<-].*", "", r) for r in readings]
                    if reading in readings:
                        reading_type = canonicalize_reading_types.get(
                            reading_type, reading_type)
                        pagemsg_with_spelling(
                            "Appending reading type %s based on %s" %
                            (reading_type, unicode(t)))
                        if reading_type not in reading_types:
                            reading_types.append(reading_type)
                            notes.append(
                                "add %s reading based on {{%s-readings}} on page [[%s]]"
                                % (reading_type, langcode, spelling))
            # `reading_types` accumulates across templates, so this warns
            # only if nothing has been found up to and including this one.
            if not reading_types:
                pagemsg_with_spelling(
                    "WARNING: Can't find reading %s among readings listed in %s"
                    % (reading, unicode(t).replace("\n", r"\n")))

    if not saw_readings_template:
        pagemsg_with_spelling(
            "WARNING: Couldn't find reading template {{%s-readings}}" %
            langcode)

    if reading_types:
        contents = "{{auto cat|%s}}" % "|".join(reading_types)
        return contents, notes
    else:
        pagemsg_with_spelling("WARNING: Can't find reading %s on page" %
                              reading)

    # ---- Source 2: kanjitab templates on the pages in the category ----
    # Reached only when source 1 found no reading type.
    for i, contents_page in blib.cat_articles(
            re.sub("^Category:", "", pagetitle)):
        contents_title = unicode(contents_page.title())

        def pagemsg_with_contents(txt):
            pagemsg("%s: %s" % (contents_title, txt))

        # NOTE(review): defined but never called within this function.
        def errandpagemsg_with_contents(txt):
            pagemsg_with_contents(txt)
            errmsg("Page %s %s: %s: %s" %
                   (index, pagetitle, contents_title, txt))

        contents_page_text = blib.safe_page_text(contents_page,
                                                 pagemsg_with_contents)
        retval = blib.find_modifiable_lang_section(contents_page_text, lang,
                                                   pagemsg_with_contents)
        if retval is None:
            pagemsg_with_contents("WARNING: Couldn't find %s section" % lang)
            # NOTE(review): this leaves the whole function (returning None)
            # rather than moving on to the next page in the category --
            # confirm that is intended and not meant to be `continue`.
            return
        sections, j, secbody, sectail, has_non_lang = retval

        saw_kanjitab = False
        # Set by the inner loops to request skipping this category member.
        must_continue = False
        for ch in contents_title:
            if 0xD800 <= ord(ch) <= 0xDFFF:
                pagemsg_with_contents(
                    "WARNING: Surrogates in page name, skipping: %s" % ord(ch))
                must_continue = True
                break
        if must_continue:
            continue
        chars_in_contents_title = [x for x in contents_title]
        # NOTE(review): `i` here (and in the loops further down) rebinds the
        # index of the enclosing loop over category members; that index is
        # not read anywhere else in the loop body.
        for i, ch in enumerate(chars_in_contents_title):
            if ch == u"々":  # kanji repeat char
                if i == 0:
                    pagemsg_with_contents(
                        u"Repeat char 々 found at beginning of contents title")
                    must_continue = True
                    break
                else:
                    # Replace the repeat mark with the character before it.
                    chars_in_contents_title[i] = chars_in_contents_title[i - 1]
        if must_continue:
            continue
        # Keep only characters whose Unicode name starts with
        # "CJK UNIFIED IDEOGRAPH".
        kanji_in_contents_title = [
            x for x in chars_in_contents_title
            if unicodedata.name(x).startswith("CJK UNIFIED IDEOGRAPH")
        ]
        parsed = blib.parse_text(secbody)
        for t in parsed.filter_templates():
            tn = tname(t)
            if tn == "%s-kanjitab" % langcode:
                saw_kanjitab = True
                # Non-empty numbered parameters 1..9, in order.
                readings = []
                for i in range(1, 10):
                    contents_reading = getparam(t, str(i))
                    if contents_reading:
                        readings.append(contents_reading)
                if len(kanji_in_contents_title) != len(readings):
                    pagemsg_with_contents(
                        "WARNING: Saw %s chars in contents title but %s readings %s, skipping: %s"
                        % (len(kanji_in_contents_title), len(readings),
                           ",".join(readings), unicode(t)))
                    continue
                yomi = getparam(t, "yomi")
                if not yomi:
                    pagemsg_with_contents("WARNING: No yomi, skipping: %s" %
                                          unicode(t))
                    continue
                # `yomi` stays a plain string (one type for all kanji) unless
                # it contains a comma or ends in a digit, in which case it
                # becomes a list with one entry per kanji.
                if "," in yomi or re.search("[0-9]$", yomi):
                    yomi = yomi.split(",")
                if type(yomi) is list:
                    expanded_yomi = []
                    for y in yomi:
                        # A trailing number is a repeat count: the part
                        # before it is added that many times.
                        m = re.search("^(.*?)([0-9]+)$", y)
                        if m:
                            baseyomi, numyomi = m.groups()
                            numyomi = int(numyomi)
                            expanded_yomi.extend([baseyomi] * numyomi)
                        else:
                            expanded_yomi.append(y)
                    if expanded_yomi != yomi:
                        pagemsg_with_contents(
                            "Expanding yomi %s to %s" %
                            (",".join(yomi), ",".join(expanded_yomi)))
                    yomi = expanded_yomi
                if type(yomi) is list and len(yomi) != len(
                        kanji_in_contents_title):
                    pagemsg_with_contents(
                        "WARNING: %s values in yomi=%s but %s chars in contents, skipping: %s"
                        % (len(yomi), ",".join(yomi),
                           len(kanji_in_contents_title), unicode(t)))
                    continue
                saw_spelling_in_contents = False
                # From here on the flag means "skip to the next template".
                must_continue = False
                for i, (ch, contents_reading) in enumerate(
                        zip(kanji_in_contents_title, readings)):
                    if ch == spelling:
                        saw_spelling_in_contents = True
                        if contents_reading == reading:
                            if type(yomi) is list:
                                reading_type = yomi[i]
                            else:
                                reading_type = yomi
                            # Maps the yomi values accepted here to the
                            # reading-type names used below.
                            yomi_to_canonical_reading_type = {
                                "o": "on",
                                "on": "on",
                                "kanon": "kanon",
                                "goon": "goon",
                                "soon": "soon",
                                "toon": "toon",
                                "kan": "kanyoon",
                                "kanyo": "kanyoon",
                                "kanyoon": "kanyoon",
                                "k": "kun",
                                "kun": "kun",
                                "juku": "jukujikun",
                                "jukuji": "jukujikun",
                                "jukujikun": "jukujikun",
                                "n": "nanori",
                                "nanori": "nanori",
                                "ok": "jubakoyomi",
                                "j": "jubakoyomi",
                                "ko": "yutoyomi",
                                "y": "yutoyomi",
                                "irr": "irregular",
                                "irreg": "irregular",
                                "irregular": "irregular",
                            }
                            if reading_type not in yomi_to_canonical_reading_type:
                                pagemsg_with_contents(
                                    "WARNING: Unrecognized reading type %s: %s"
                                    % (reading_type, unicode(t)))
                                must_continue = True
                                break
                            reading_type = yomi_to_canonical_reading_type[
                                reading_type]
                            if reading_type not in allowed_reading_types:
                                pagemsg_with_contents(
                                    "WARNING: Disallowed reading type %s: %s" %
                                    (reading_type, unicode(t)))
                                must_continue = True
                                break
                            reading_type = canonicalize_reading_types.get(
                                reading_type, reading_type)
                            pagemsg_with_contents(
                                "Appending reading type %s based on %s" %
                                (reading_type, unicode(t)))
                            if reading_type not in reading_types:
                                reading_types.append(reading_type)
                                notes.append(
                                    "add %s reading based on {{%s-kanjitab}} on page [[%s]]"
                                    % (reading_type, langcode, contents_title))
                if must_continue:
                    continue
                if not saw_spelling_in_contents:
                    pagemsg_with_contents(
                        "WARNING: Didn't see spelling in contents: %s" %
                        unicode(t))
                    continue
        if not saw_kanjitab:
            pagemsg_with_contents("WARNING: Didn't see {{%s-kanjitab}}" %
                                  langcode)

    if reading_types:
        contents = "{{auto cat|%s}}" % "|".join(reading_types)
        return contents, notes
    else:
        pagemsg_with_spelling(
            "WARNING: Can't find reading %s by looking through category contents"
            % reading)