Ejemplo n.º 1
0
    def update_page(title, text, langlinks, weak_update=True):
        """
        Rebuild the header of a page so it carries the given interlanguage links.

        :param str title: title of the page
        :param str text: wikitext of the page
        :param langlinks: a sorted list of ``(tag, title)`` tuples as obtained
                          from :py:meth:`self.get_langlinks`
        :param weak_update:
            When ``True``, the langlinks already present on the page are merged
            with the supplied ones instead of being replaced. This matters only
            when multiple "intersecting" families exist; the intersection is
            then kept for manual resolution (reported in _merge_families).
        :returns: updated wikicode
        """
        # Behavior switches (__NOTOC__ etc.) cannot be parsed by
        # mwparserfromhell yet, so main pages are skipped for now.
        # NOTE: handling whitespace right will be hard: https://wiki.archlinux.org/index.php?title=Main_page&diff=383144&oldid=382787
        if re.search("__NOTOC__|__NOEDITSECTION__", text):
            logger.warning("Skipping page '{}' (contains behavior switch(es))".format(title))
            return text

        # Render each (tag, title) pair as a prefix-form interlanguage link,
        # e.g. ("cs", "Some title") -> "[[cs:Some title]]".
        rendered = []
        for tag, link_title in langlinks:
            rendered.append("[[{}:{}]]".format(tag, link_title))
        langlinks = rendered

        logger.info("Parsing page [[{}]] ...".format(title))
        wikicode = mwparserfromhell.parse(text)
        if weak_update is True:
            # Merge the page's existing langlinks with the supplied ones.
            parent, magics, cats, langlinks = header.get_header_parts(wikicode, langlinks=langlinks, remove_from_parent=True)
        else:
            # drop the extracted langlinks
            parent, magics, cats, _ = header.get_header_parts(wikicode, remove_from_parent=True)
        header.build_header(wikicode, parent, magics, cats, langlinks)
        return wikicode
Ejemplo n.º 2
0
    def fix_page(title, text_old):
        """Retag header categories so their language matches the page's language.

        :param str title: title of the page (its language is the target)
        :param str text_old: current wikitext of the page
        :returns: updated wikicode with a rebuilt header
        """
        target_lang = lang.detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text_old)
        parent, magics, cats, langlinks = get_header_parts(wikicode, remove_from_parent=True)

        # get_header_parts returns wikicode objects, each wrapping one node.
        for cat_code in cats:
            node = cat_code.nodes[0]
            base_title, cat_lang = lang.detect_language(str(node.title))
            # Rewrite only categories tagged with a different language.
            if cat_lang != target_lang:
                node.title = lang.format_title(base_title, target_lang)

        build_header(wikicode, parent, magics, cats, langlinks)
        return wikicode
Ejemplo n.º 3
0
    def fix_page(title, text_old):
        """Align the language of each header category with the page's language.

        :param str title: page title; its detected language drives the fix
        :param str text_old: current wikitext of the page
        :returns: updated wikicode with a rebuilt header
        """
        _, langname = lang.detect_language(title)
        wikicode = mwparserfromhell.parse(text_old)
        parts = get_header_parts(wikicode, remove_from_parent=True)
        parent, magics, cats, langlinks = parts

        for wrapper in cats:
            # Each category wrapper holds exactly one wikilink node.
            link = wrapper.nodes[0]
            pure_title, link_lang = lang.detect_language(str(link.title))
            if link_lang != langname:
                # Re-format the title under the page's language.
                link.title = lang.format_title(pure_title, langname)

        build_header(wikicode, parent, magics, cats, langlinks)
        return wikicode
Ejemplo n.º 4
0
    def update_page(title, text, langlinks, weak_update=True):
        """
        Update a page's header with the supplied interlanguage links.

        :param str title: title of the page
        :param str text: wikitext of the page
        :param langlinks: a sorted list of ``(tag, title)`` tuples as obtained
                          from :py:meth:`self.get_langlinks`
        :param weak_update:
            ``True`` merges the langlinks found on the page with the supplied
            ones; needed only when several "intersecting" families exist, in
            which case the intersection is kept for manual resolution
            (reported in _merge_families).
        :returns: updated wikicode
        """
        # mwparserfromhell cannot parse behavior switches (__NOTOC__ etc.)
        # yet, so such pages are temporarily left untouched.
        # NOTE: handling whitespace right will be hard: https://wiki.archlinux.org/index.php?title=Main_page&diff=383144&oldid=382787
        if re.search("__NOTOC__|__NOEDITSECTION__", text):
            msg = "Skipping page '{}' (contains behavior switch(es))".format(title)
            logger.warning(msg)
            return text

        # Convert each (tag, title) pair into prefix form,
        # e.g. tag="cs", title="Some title" -> "[[cs:Some title]]".
        langlinks = ["[[{}:{}]]".format(t, tt) for t, tt in langlinks]

        logger.info("Parsing page [[{}]] ...".format(title))
        wikicode = mwparserfromhell.parse(text)
        if weak_update is True:
            parent, magics, cats, langlinks = header.get_header_parts(
                wikicode,
                langlinks=langlinks,
                remove_from_parent=True,
            )
        else:
            # drop the extracted langlinks
            parent, magics, cats, _ = header.get_header_parts(
                wikicode,
                remove_from_parent=True,
            )
        header.build_header(wikicode, parent, magics, cats, langlinks)
        return wikicode
Ejemplo n.º 5
0
 def decategorize(title, text_old):
     """Rebuild the page header with its category list emptied out.

     Magic words and language links extracted from the header are kept;
     only the categories are dropped.

     :param str title: title of the page (unused here, kept for API parity)
     :param str text_old: current wikitext of the page
     :returns: updated wikicode without header categories
     """
     wikicode = mwparserfromhell.parse(text_old)
     parts = get_header_parts(wikicode, remove_from_parent=True)
     parent, magics, _cats, langlinks = parts
     # An empty list here means build_header writes no categories back.
     build_header(wikicode, parent, magics, [], langlinks)
     return wikicode