Example #1
0
    def update_wikilink(self, wikicode, wikilink, src_title, summary_parts):
        """
        Run all wikilink checks on a single link, modifying it in place.

        Links whose string form is in ``self.void_update_cache`` and
        interlanguage links are returned untouched.  For all other links the
        title is beautified, the individual ``check_*`` helpers are applied,
        and the {{Broken section link}} flag is added or removed depending on
        the result of the anchor check.

        :param wikicode: the wikicode object containing ``wikilink``
        :param wikilink: the wikilink node to check and update
        :param src_title: title of the page on which the link appears
        :param summary_parts:
            collection handed to ``get_edit_checker``; presumably it receives
            the summaries of the steps that changed something (TODO: confirm
            against ``get_edit_checker``)
        """
        if str(wikilink) in self.void_update_cache:
            logger.debug("Skipping wikilink {} due to void-update cache.".format(wikilink))
            return

        title = self.api.Title(wikilink.title)
        # skip interlanguage links (handled by interlanguage.py)
        if title.iwprefix in self.api.site.interlanguagemap.keys():
            return

        # NOTE(review): each `with summary(...)` block below presumably records
        # its text in summary_parts when the wikicode changed inside the block
        # - confirm against get_edit_checker
        summary = get_edit_checker(wikicode, summary_parts)

        with summary("simplification and beautification of wikilinks"):
            # beautify if urldecoded
            # FIXME: make it implicit - it does not always propagate from the Title class
            if not title.iwprefix and re.search("%[0-9a-f]{2}", str(wikilink.title), re.IGNORECASE):
                # handle links with leading colon properly
                wikilink.title = title.leading_colon + str(title)
                # FIXME: should be done in the Title class
                # the anchor is dot-encoded, but percent-encoding works for links too
                # and is even rendered nicely
                wikilink.title = str(wikilink.title).replace("[", "%5B").replace("|", "%7C").replace("]", "%5D")

            self.collapse_whitespace_pipe(wikilink)
            self.check_trivial(wikilink, title)
            self.check_relative(src_title, wikilink, title)
            # the exact-redirect check is applied only on pages detected as English
            if lang.detect_language(src_title)[1] == "English":
                self.check_redirect_exact(src_title, wikilink, title)
            self.check_redirect_capitalization(wikilink, title)

            # reparse the title, the redirect checks might change it non-equivalently
            title = self.api.Title(wikilink.title)

            self.check_displaytitle(wikilink, title)

        with summary("fixed section fragments"):
            anchor_result = self.check_anchor(src_title, wikilink, title)
        # only an explicit False flags the link; any other result removes the flag
        if anchor_result is False:
            with summary("flagged broken section links"):
                ensure_flagged_by_template(wikicode, wikilink, "Broken section link")
        else:
            with summary("unflagged working section links"):
                ensure_unflagged_by_template(wikicode, wikilink, "Broken section link")

        with summary("simplification and beautification of wikilinks"):
            # partial second pass
            self.check_trivial(wikilink, title)
            if lang.detect_language(src_title)[1] == "English":
                self.check_redirect_exact(src_title, wikilink, title)

            # collapse whitespace around the link, e.g. 'foo [[ bar]]' -> 'foo [[bar]]'
            self.collapse_whitespace(wikicode, wikilink)

        # cache context-less, correct wikilinks that don't need any update
        # (non-empty pagename, nothing in summary_parts, anchor check returned True)
        if title.pagename and len(summary_parts) == 0 and anchor_result is True:
            self.void_update_cache.add(str(wikilink))
Example #2
0
    def update_wikilink(self, wikicode, wikilink, src_title, summary_parts):
        """
        Run all wikilink checks on a single link, modifying it in place.

        Links whose string form is in ``self.void_update_cache`` and
        interlanguage links are returned untouched.  For all other links the
        title is beautified, the individual ``check_*`` helpers are applied,
        and the {{Broken section link}} flag is added or removed depending on
        the result of the anchor check.

        :param wikicode: the wikicode object containing ``wikilink``
        :param wikilink: the wikilink node to check and update
        :param src_title: title of the page on which the link appears
        :param summary_parts:
            collection handed to ``get_edit_checker``; presumably it receives
            the summaries of the steps that changed something (TODO: confirm
            against ``get_edit_checker``)
        """
        if str(wikilink) in self.void_update_cache:
            logger.debug("Skipping wikilink {} due to void-update cache.".format(wikilink))
            return

        title = self.api.Title(wikilink.title)
        # skip interlanguage links (handled by interlanguage.py)
        if title.iwprefix in self.api.site.interlanguagemap.keys():
            return

        # NOTE(review): each `with summary(...)` block below presumably records
        # its text in summary_parts when the wikicode changed inside the block
        # - confirm against get_edit_checker
        summary = get_edit_checker(wikicode, summary_parts)

        with summary("simplification and beautification of wikilinks"):
            # beautify if urldecoded
            # FIXME: make it implicit - it does not always propagate from the Title class
            if not title.iwprefix and re.search("%[0-9a-f]{2}", str(wikilink.title), re.IGNORECASE):
                # handle links with leading colon properly
                wikilink.title = title.leading_colon + str(title)
                # FIXME: should be done in the Title class
                # the anchor is dot-encoded, but percent-encoding works for links too
                # and is even rendered nicely
                wikilink.title = str(wikilink.title).replace("[", "%5B").replace("|", "%7C").replace("]", "%5D")

            self.collapse_whitespace_pipe(wikilink)
            self.check_trivial(wikilink, title)
            self.check_relative(src_title, wikilink, title)
            # the exact-redirect check is applied only on pages detected as English
            if lang.detect_language(src_title)[1] == "English":
                self.check_redirect_exact(src_title, wikilink, title)
            self.check_redirect_capitalization(wikilink, title)

            # reparse the title, the redirect checks might change it non-equivalently
            title = self.api.Title(wikilink.title)

            self.check_displaytitle(wikilink, title)

        with summary("fixed section fragments"):
            anchor_result = self.check_anchor(src_title, wikilink, title)
        # only an explicit False flags the link; any other result removes the flag
        if anchor_result is False:
            with summary("flagged broken section links"):
                ensure_flagged_by_template(wikicode, wikilink, "Broken section link")
        else:
            with summary("unflagged working section links"):
                ensure_unflagged_by_template(wikicode, wikilink, "Broken section link")

        with summary("simplification and beautification of wikilinks"):
            # partial second pass
            self.check_trivial(wikilink, title)
            if lang.detect_language(src_title)[1] == "English":
                self.check_redirect_exact(src_title, wikilink, title)

            # collapse whitespace around the link, e.g. 'foo [[ bar]]' -> 'foo [[bar]]'
            self.collapse_whitespace(wikicode, wikilink)

        # cache context-less, correct wikilinks that don't need any update
        # (non-empty pagename, nothing in summary_parts, anchor check returned True)
        if title.pagename and len(summary_parts) == 0 and anchor_result is True:
            self.void_update_cache.add(str(wikilink))
Example #3
0
    def check_url_replacements(self, wikicode, extlink, url):
        """
        Rewrite the URL of an external link using the first matching rule.

        Every entry of ``self.url_replacements`` is a triple of an edit
        summary, a compiled regex and a jinja2 template string.  The first
        regex that fully matches ``url.url`` decides the outcome; later rules
        are not tried.

        :param wikicode: the wikicode object containing ``extlink``
        :param extlink: the external link node whose ``url`` gets replaced
        :param url: parsed URL object of the link (its ``url`` attribute is matched)
        :returns:
            the edit summary of the applied rule, or ``False`` when no rule
            matched or the replacement was rejected
        """
        for edit_summary, url_regex, url_replacement in self.url_replacements:
            matched = url_regex.fullmatch(url.url)
            if not matched:
                continue

            # the replacement is a jinja2 template which gets the positional
            # groups as "m" and the named groups as keyword variables
            jinja_env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True)
            replacement_template = jinja_env.from_string(url_replacement)
            new_url = replacement_template.render(m=matched.groups(),
                                                  **matched.groupdict())

            # validate the result, except for irc:// and ircs:// URLs which
            # cannot be validated (requests throws requests.exceptions.InvalidSchema)
            is_irc = new_url.startswith(("irc://", "ircs://"))
            if not is_irc and not self.check_url(new_url, allow_redirects=True):
                logger.warning("URL not replaced: {}".format(url))
                return False

            # gitlab.archlinux.org needs post-processing:
            #   - "blob" is for files and "tree" for directories; when used
            #     incorrectly, gitlab gives 302 to the correct one, so we take
            #     over the final URL of the redirect chain
            #   - gitlab gives 302 to the master branch instead of 404 for
            #     non-existent files/directories
            #   - the "/-/" disambiguator added by the redirects is ugly and
            #     gets removed at the end
            if new_url.startswith("https://gitlab.archlinux.org"):
                # same query as ExtlinkStatusChecker.check_url
                gitlab_response = self.session.get(new_url,
                                                   headers=self.headers,
                                                   timeout=self.timeout,
                                                   stream=True,
                                                   allow_redirects=True)
                # close explicitly to release the connection back to the pool
                # (important especially with pool_block=True)
                gitlab_response.close()
                if gitlab_response.history:
                    final_url = gitlab_response.url
                    if final_url.endswith("/master"):
                        # this is gitlab's "404" in most cases
                        logger.warning(
                            "URL not replaced (Gitlab redirected to a master branch): {}"
                            .format(url))
                        return False
                    new_url = final_url
                new_url = new_url.replace("/-/", "/", 1)

            # some patterns match even their own target (e.g. links on
            # addons.mozilla.org which already lack a language code)
            if new_url == url.url:
                return False

            extlink.url = new_url
            ensure_unflagged_by_template(wikicode,
                                         extlink,
                                         "Dead link",
                                         match_only_prefix=True)
            return edit_summary

        return False
Example #4
0
    def check_extlink_status(self, wikicode, extlink, src_title):
        """
        Check an external link and update its {{Dead link}} flag accordingly.

        ``self.lock_wikicode`` is held while the URL is prepared and while the
        wikicode is modified, but not during the status check itself.

        :param wikicode: the wikicode object containing ``extlink``
        :param extlink: the external link node to check
        :param src_title: title of the page, used to select the localized flag
        """
        with self.lock_wikicode:
            url = self.prepare_url(wikicode, extlink)
        if url is None:
            return

        logger.info("Checking link {} ...".format(extlink))
        link_ok = self.check_url(url)

        with self.lock_wikicode:
            if link_ok is True:
                # TODO: the link might still be flagged for a reason (e.g. when the server redirects to some dummy page without giving a proper status code)
                ensure_unflagged_by_template(wikicode,
                                             extlink,
                                             "Dead link",
                                             match_only_prefix=True)
                return

            if link_ok is not False:
                # TODO: ask the user for manual check (good/bad/skip) and move the URL from self.cache_indeterminate_urls to self.cache_valid_urls or self.cache_invalid_urls
                logger.warning(
                    "status check indeterminate for external link {}".format(
                        extlink))
                return

            # the link is dead
            # TODO: handle bbs.archlinux.org (some links may require login)
            # TODO: handle links inside {{man|url=...}} properly
            # swap an existing flag (if any) for its translated version first
            src_lang = lang.detect_language(src_title)[1]
            flag = self.get_localized_template("Dead link", src_lang)
            localize_flag(wikicode, extlink, flag)
            # flag the link, keeping an existing date and not touching the status yet
            flag = ensure_flagged_by_template(wikicode,
                                              extlink,
                                              flag,
                                              *self.deadlink_params,
                                              overwrite_parameters=False)
            # the cache is looked up without the URL fragment
            if url.fragment:
                fragmentless = url.url.rsplit("#", maxsplit=1)[0]
                url = urllib3.util.url.parse_url(fragmentless)
            # the date is kept only when the flag already carries the cached status
            status_matches = flag.has("status") and str(
                flag.get("status").value) == str(self.cache_invalid_urls[url])
            if not status_matches:
                # overwrite status as well as date
                flag.add("status", self.cache_invalid_urls[url], showkey=True)
                for index in range(3):
                    flag.add(str(index + 1),
                             self.deadlink_params[index],
                             showkey=False)
    def update_page(self, title, text):
        """
        Update package templates on given page.

        The wikitext is parsed and every Aur/AUR/Grp/Pkg template is passed to
        :py:meth:`update_package_template`.  When that returns a hint, the
        package link is considered broken:

            - a warning is printed to console
            - a line is added to the report via :py:meth:`add_report_line`
            - the template is marked with a localized {{Broken package link}}

        Otherwise an existing {{Broken package link}} flag is removed.

        :param title: title of the wiki page
        :param text: content of the wiki page
        :returns: a :py:class:`mwparserfromhell.wikicode.Wikicode` object with the updated
                  content of the page
        """
        logger.info("Parsing page [[{}]]...".format(title))
        page_lang = detect_language(title)[1]
        wikicode = mwparserfromhell.parse(text)
        package_templates = ["Aur", "AUR", "Grp", "Pkg"]

        for template in wikicode.ifilter_templates():
            # only package templates are of interest
            if not any(template.name.matches(name) for name in package_templates):
                continue

            # ifilter still yields templates nested under a parent template
            # which was removed previously - skip those
            try:
                wikicode.index(template, True)
            except ValueError:
                continue

            # whitespace around the parameter would be added to the link and
            # rendered incorrectly
            self.strip_whitespace(wikicode, template)

            hint = self.update_package_template(template, page_lang)
            broken = hint is not None

            if broken:
                logger.warning("broken package link: {}: {}".format(template, hint))
                self.add_report_line(title, template, hint)

            # always unflag first: either the link is fine, or the localized
            # template might change and the flag gets re-added below
            ensure_unflagged_by_template(wikicode, template, "Broken package link", match_only_prefix=True)

            if broken:
                # flag with a localized template and hint
                flag = self.get_localized_template("Broken package link", page_lang)
                ensure_flagged_by_template(wikicode, template, flag, hint, overwrite_parameters=True)

        return wikicode
Example #6
0
    def update_man_template(self, wikicode, template):
        """
        Check the URL behind a {{man}} template and (un)flag it with {{Dead link}}.

        Templates with a different name are ignored.  A template without
        parameter 1 or without a non-empty parameter 2 is flagged right away.
        Otherwise the URL is built from the template parameters and checked;
        when it is not valid, an explicit ``url=`` parameter (if present and
        non-empty) is checked instead.  A valid built URL makes the ``url=``
        parameter redundant, so it gets removed.

        :param wikicode: the wikicode object containing ``template``
        :param template: the template node to check (modified in place)
        :raises NotImplementedError:
            when the server responds with a status code other than 200 or >= 400
        """
        if template.name.lower() != "man":
            return

        # date parameters for {{Dead link}}; a timezone-aware "now" is used
        # because datetime.utcnow() is deprecated since Python 3.12
        now = datetime.datetime.now(datetime.timezone.utc)
        deadlink_params = ["{:02d}".format(i) for i in (now.year, now.month, now.day)]

        if not template.has(1) or not template.has(2, ignore_empty=True):
            ensure_flagged_by_template(wikicode, template, "Dead link", *deadlink_params, overwrite_parameters=False)
            return

        section = template.get(1).value.strip()
        pagename = queryencode(template.get(2).value.strip())
        if section:
            url = self.url_template.format(section=section, pagename=pagename)
        else:
            url = self.url_template_nosection.format(pagename=pagename)
        if template.has(3):
            url += "#{}".format(queryencode(template.get(3).value.strip()))
        if template.has("url"):
            explicit_url = template.get("url").value.strip()
        else:
            explicit_url = None

        def check_url(url):
            """Return True if the URL is valid, using and updating the URL caches."""
            if url.startswith("ftp://"):
                logger.error("The FTP protocol is not supported by the requests module. URL: {}".format(url))
                return True
            if url in self.cache_valid_urls:
                return True
            if url in self.cache_invalid_urls:
                return False
            response = self.session.get(url, timeout=self.timeout)
            if response.status_code == 200:
                # heuristics to get the missing section (redirect from some_page to some_page.1)
                # WARNING: if the manual exists in multiple sections, the first one might not be the best
                # The section is filled in only when parameter 1= is empty, so
                # an unexpected redirect can neither abort the whole run nor
                # overwrite a section given by the user.
                if (len(response.history) == 1
                        and response.url.startswith(url + ".")
                        and not template.has(1, ignore_empty=True)):
                    template.add(1, response.url[len(url) + 1:])
                    self.cache_valid_urls.add(response.url)
                else:
                    self.cache_valid_urls.add(url)
                return True
            if response.status_code >= 400:
                self.cache_invalid_urls.add(url)
                return False
            raise NotImplementedError("Unexpected status code {} for man page URL: {}".format(response.status_code, url))

        # check if the template parameters form a valid URL
        if check_url(url):
            ensure_unflagged_by_template(wikicode, template, "Dead link")
            # remove explicit url= parameter - not necessary
            if explicit_url is not None:
                template.remove("url")
        elif explicit_url is None:
            ensure_flagged_by_template(wikicode, template, "Dead link", *deadlink_params, overwrite_parameters=False)
        elif explicit_url != "":
            # fall back to the URL given explicitly by the user
            # (an empty url= parameter leaves the flag state untouched)
            if check_url(explicit_url):
                ensure_unflagged_by_template(wikicode, template, "Dead link")
            else:
                ensure_flagged_by_template(wikicode, template, "Dead link", *deadlink_params, overwrite_parameters=False)
Example #7
0
    def update_man_template(self, wikicode, template):
        """
        Check the URL behind a {{man}} template and (un)flag it with {{Dead link}}.

        Templates with a different name are ignored.  A template without
        parameter 1 or without a non-empty parameter 2 is flagged right away.
        Otherwise the URL is built from ``self.url_prefix`` and the template
        parameters (``pkg``, 2, 1 and 3) and checked; when it is not valid, an
        explicit ``url=`` parameter (if present and non-empty) is checked
        instead.  A valid built URL makes the ``url=`` parameter redundant, so
        it gets removed.

        :param wikicode: the wikicode object containing ``template``
        :param template: the template node to check (modified in place)
        :raises NotImplementedError:
            when the server responds with a status code other than 200 or >= 400
        """
        if template.name.lower() != "man":
            return

        # date parameters for {{Dead link}}; a timezone-aware "now" is used
        # because datetime.utcnow() is deprecated since Python 3.12
        now = datetime.datetime.now(datetime.timezone.utc)
        deadlink_params = [
            "{:02d}".format(i) for i in (now.year, now.month, now.day)
        ]

        if not template.has(1) or not template.has(2, ignore_empty=True):
            ensure_flagged_by_template(wikicode,
                                       template,
                                       "Dead link",
                                       *deadlink_params,
                                       overwrite_parameters=False)
            return

        # build the URL: prefix [pkg/] pagename [.section] [#anchor]
        url = self.url_prefix
        if template.has("pkg"):
            url += template.get("pkg").value.strip() + "/"
        url += queryencode(template.get(2).value.strip())
        section = template.get(1).value.strip()
        if section:
            url += "." + section
        if template.has(3):
            url += "#{}".format(queryencode(template.get(3).value.strip()))

        if template.has("url"):
            explicit_url = template.get("url").value.strip()
        else:
            explicit_url = None

        def check_url(url):
            """Return True if the URL is valid, using and updating the URL caches."""
            if url.startswith("ftp://"):
                logger.error(
                    "The FTP protocol is not supported by the requests module. URL: {}"
                    .format(url))
                return True
            if url in self.cache_valid_urls:
                return True
            if url in self.cache_invalid_urls:
                return False
            response = self.session.get(url, timeout=self.timeout)
            if response.status_code == 200:
                # heuristics to get the missing section (redirect from some_page to some_page.1)
                # WARNING: if the manual exists in multiple sections, the first one might not be the best
                # The section is filled in only when parameter 1= is empty, so
                # an unexpected redirect can neither abort the whole run nor
                # overwrite a section given by the user.
                if (len(response.history) == 1
                        and response.url.startswith(url + ".")
                        and not template.has(1, ignore_empty=True)):
                    template.add(1, response.url[len(url) + 1:])
                    self.cache_valid_urls.add(response.url)
                else:
                    self.cache_valid_urls.add(url)
                return True
            if response.status_code >= 400:
                self.cache_invalid_urls.add(url)
                return False
            raise NotImplementedError(
                "Unexpected status code {} for man page URL: {}".format(
                    response.status_code, url))

        # check if the template parameters form a valid URL
        if check_url(url):
            ensure_unflagged_by_template(wikicode, template, "Dead link")
            # remove explicit url= parameter - not necessary
            if explicit_url is not None:
                template.remove("url")
        elif explicit_url is None:
            ensure_flagged_by_template(wikicode,
                                       template,
                                       "Dead link",
                                       *deadlink_params,
                                       overwrite_parameters=False)
        elif explicit_url != "":
            # fall back to the URL given explicitly by the user
            # (an empty url= parameter leaves the flag state untouched)
            if check_url(explicit_url):
                ensure_unflagged_by_template(wikicode, template, "Dead link")
            else:
                ensure_flagged_by_template(wikicode,
                                           template,
                                           "Dead link",
                                           *deadlink_params,
                                           overwrite_parameters=False)
Example #8
0
    def check_extlink_status(self, wikicode, extlink):
        """
        Check an external link and update its {{Dead link}} flag accordingly.

        URLs which cannot be parsed, use a scheme other than http(s), have an
        empty or dot-less host, or point to localhost or 127.0.0.0/8 are
        skipped without touching the wikicode.

        :param wikicode: the wikicode object containing ``extlink``
        :param extlink: the external link node to check
        """
        # work on a copy of the URL object (the skip_style_flags parameter is
        # False, so URLs terminated by a wiki markup are parsed properly too)
        parsed = mwparserfromhell.parse(str(extlink.url))

        # mwparserfromhell parses a free URL immediately followed by a template
        # (e.g. http://domain.tld/{{Dead link|2020|02|20}}) completely as one
        # URL, so it has to be split manually
        head, marker, tail = str(parsed).partition("{{")
        if marker:
            parsed = mwparserfromhell.parse(head)
            # drop everything after the real URL from the extlink...
            for node in extlink.url.nodes[1:]:
                extlink.url.remove(node)
            # ...and put it into the parent wikicode right after the link
            parent = get_parent_wikicode(wikicode, extlink)
            parent.insert_after(extlink, marker + tail)

        # replace HTML entities like "&#61;" or "&Sigma;" with their unicode equivalents
        for entity in parsed.ifilter_html_entities(recursive=True):
            parsed.replace(entity, entity.normalize())

        try:
            # parsing fails e.g. if the port is not a number
            # reference: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.parse_url
            url = urllib3.util.url.parse_url(str(parsed))
        except urllib3.exceptions.LocationParseError:
            logger.debug("skipped invalid URL: {}".format(parsed))
            return

        # find the first reason (if any) why this URL should not be checked
        skip_message = None
        if url.scheme not in ("http", "https"):
            skip_message = "skipped URL with unsupported scheme: {}"
        elif not url.host:
            # e.g. "http://" or "http://git@" or "http:///var/run"
            # (partial workaround for https://github.com/earwig/mwparserfromhell/issues/196 )
            skip_message = "skipped URL with empty host: {}"
        elif "." not in url.host:
            # top-level domain only: in practice it would be resolved relative
            # to the local domain, on the wiki it is used mostly as
            # a pseudo-variable like http://server/path or http://mydomain/path
            skip_message = "skipped URL with only top-level domain host: {}"
        elif url.host == "localhost" or url.host.endswith(".localhost"):
            skip_message = "skipped URL to localhost: {}"
        else:
            # hosts which are not IP addresses raise ValueError and are fine
            try:
                addr = ipaddress.ip_address(url.host)
            except ValueError:
                pass
            else:
                if addr in ipaddress.ip_network("127.0.0.0/8"):
                    skip_message = "skipped URL to local IP address: {}"
        if skip_message is not None:
            logger.debug(skip_message.format(url))
            return

        # the fragment is dropped from the URL (to optimize caching)
        if url.fragment:
            fragmentless = url.url.rsplit("#", maxsplit=1)[0]
            url = urllib3.util.url.parse_url(fragmentless)

        logger.info("Checking link {} ...".format(extlink))

        link_ok = self.check_url(url)
        if link_ok is True:
            # TODO: the link might still be flagged for a reason (e.g. when the server redirects to some dummy page without giving a proper status code)
            ensure_unflagged_by_template(wikicode, extlink, "Dead link")
            return

        if link_ok is not False:
            # TODO: ask the user for manual check (good/bad/skip) and move the URL from self.cache_indeterminate_urls to self.cache_valid_urls or self.cache_invalid_urls
            logger.warning(
                "status check indeterminate for external link {}".format(
                    extlink))
            return

        # the link is dead
        # TODO: handle bbs.archlinux.org (some links may require login)
        # TODO: handle links inside {{man|url=...}} properly
        # flag the link, keeping an existing date and not touching the status yet
        flag = ensure_flagged_by_template(wikicode,
                                          extlink,
                                          "Dead link",
                                          *self.deadlink_params,
                                          overwrite_parameters=False)
        # the date is kept only when the flag already carries the cached status
        status_matches = flag.has("status") and str(
            flag.get("status").value) == str(self.cache_invalid_urls[url])
        if not status_matches:
            # overwrite status as well as date
            flag.add("status", self.cache_invalid_urls[url], showkey=True)
            for index in range(3):
                flag.add(str(index + 1),
                         self.deadlink_params[index],
                         showkey=False)
    def update_man_template(self, wikicode, template, src_title):
        """
        Check the URL behind a {{man}} template and (un)flag it with {{Dead link}}.

        Templates with a different name are ignored.  A template without
        parameter 1 or without a non-empty parameter 2 is flagged right away.
        Otherwise the URL is built from ``self.man_url_prefix`` and the
        template parameters (``pkg``, 2, 1 and 3) and checked with
        ``self.check_url``; when it is not valid, an explicit ``url=``
        parameter (if present and non-empty) is checked instead.  A valid
        built URL makes the ``url=`` parameter redundant, so it gets removed.

        The flag is added using the {{Dead link}} template localized for the
        language of the page.

        :param wikicode: the wikicode object containing ``template``
        :param template: the template node to check (modified in place)
        :param src_title: title of the page, used to detect its language
        """
        if template.name.lower() != "man":
            return
        src_lang = lang.detect_language(src_title)[1]

        # date parameters for {{Dead link}}; a timezone-aware "now" is used
        # because datetime.utcnow() is deprecated since Python 3.12
        now = datetime.datetime.now(datetime.timezone.utc)
        deadlink_params = [
            "{:02d}".format(i) for i in (now.year, now.month, now.day)
        ]

        def flag_as_dead():
            """Flag the template with the localized {{Dead link}} template."""
            # first replace the existing template (if any) with a translated version
            flag = self.get_localized_template("Dead link", src_lang)
            localize_flag(wikicode, template, flag)
            # flag with the correct translated template
            ensure_flagged_by_template(wikicode,
                                       template,
                                       flag,
                                       *deadlink_params,
                                       overwrite_parameters=False)

        def unflag():
            """Remove any {{Dead link}} flag (matched by prefix) from the template."""
            ensure_unflagged_by_template(wikicode,
                                         template,
                                         "Dead link",
                                         match_only_prefix=True)

        if not template.has(1) or not template.has(2, ignore_empty=True):
            flag_as_dead()
            return

        # build the URL: prefix [pkg/] pagename [.section] [#anchor]
        url = self.man_url_prefix
        if template.has("pkg"):
            url += template.get("pkg").value.strip() + "/"
        url += urlencode(template.get(2).value.strip())
        # template parameter 1= should be empty
        if not template.has(1, ignore_empty=True):
            response = self.session.head(url,
                                         timeout=self.timeout,
                                         allow_redirects=True)
            # heuristics to get the missing section (redirect from some_page to some_page.1)
            # WARNING: if the manual exists in multiple sections, the first one might not be the best
            if (response.status_code == 200
                    and len(response.history) == 1
                    and response.url.startswith(url + ".")):
                template.add(1, response.url[len(url) + 1:])
        section = template.get(1).value.strip()
        if section:
            url += "." + section
        if template.has(3):
            url += "#{}".format(
                urlencode(anchorencode(template.get(3).value.strip())))

        if template.has("url"):
            explicit_url = template.get("url").value.strip()
        else:
            explicit_url = None

        # check if the template parameters form a valid URL
        if self.check_url(url):
            unflag()
            # remove explicit url= parameter - not necessary
            if explicit_url is not None:
                template.remove("url")
        elif explicit_url is None:
            flag_as_dead()
        elif explicit_url != "":
            # fall back to the URL given explicitly by the user
            # (an empty url= parameter leaves the flag state untouched)
            if self.check_url(explicit_url):
                unflag()
            else:
                flag_as_dead()