Example #1

import http.client
import os
import re
import urllib.error
import urllib.request

import osg  # project-local helper module providing entry_iterator (import path assumed)

def check_validity_external_links():
    """
    Checks all external links it can find for validity. Prints those with non OK HTTP responses. Does only need to be run
    from time to time.
    """

    # TODO first collect all links across all entries, detect links occurring in multiple entries, then check each link once, following redirects

    print("check external links (can take a while)")

    # regex for finding URLs (they can appear in <>, in ](), or after whitespace)
    regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")
    # regex = re.compile(r"[\s\n<(](http://.*?)[\s\n>)]")

    # count
    number_checked_links = 0

    # ignore the following URLs (they give false positives here)
    ignored_urls = ('https://git.tukaani.org/xz.git',)  # trailing comma keeps this a tuple, not a plain string

    # iterate over all entries
    for _, entry_path, content in osg.entry_iterator():

        # apply regex
        matches = regex.findall(content)

        # for each match
        for match in matches:

            # for each possible clause
            for url in match:

                # if there was something (and not a sourceforge git url)
                if url and not url.startswith('https://git.code.sf.net/p/') and url not in ignored_urls:
                    try:
                        # without a special header, frequent 403 responses occur
                        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'})
                        urllib.request.urlopen(req)
                    except urllib.error.HTTPError as e:
                        print("{}: {} - {}".format(os.path.basename(entry_path), url, e.code))
                    except urllib.error.URLError as e:
                        print("{}: {} - {}".format(os.path.basename(entry_path), url, e.reason))
                    except http.client.RemoteDisconnected:
                        print("{}: {} - disconnected without response".format(os.path.basename(entry_path), url))

                    number_checked_links += 1

                    if number_checked_links % 50 == 0:
                        print("{} links checked".format(number_checked_links))

    print("{} links checked".format(number_checked_links))
Example #2
    def check_template_leftovers(self):
        """
        Checks for template leftovers.
        Should be run only occasionally.
        """
        # load template and get all lines
        text = utils.read_text(os.path.join(c.root_path, 'template.md'))
        text = text.split('\n')
        check_strings = [x for x in text if x and not x.startswith('##')]

        # iterate over all entries
        for _, entry_path, content in osg.entry_iterator():

            for check_string in check_strings:
                if check_string in content:
                    print('{}: found {}'.format(os.path.basename(entry_path), check_string))
        print('checked for template leftovers')
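
A minimal, self-contained sketch of the same leftover check, using in-memory strings instead of the project's template.md and osg.entry_iterator (both replaced by made-up data here): any non-heading template line that still appears verbatim in an entry is reported.

template = "# Title\n## Description\nTBD\n## Links\nNone yet"
check_strings = [x for x in template.split('\n') if x and not x.startswith('##')]

entries = {'foo.md': "# Foo\n## Description\nA game.\n## Links\nTBD"}
for name, content in entries.items():
    for check_string in check_strings:
        if check_string in content:
            print('{}: found {}'.format(name, check_string))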
Example #3
    def check_external_links(self):
        """
        Checks all external links it can find for validity. Prints those with non OK HTTP responses. Does only need to be run
        from time to time.
        """

        # regex for finding URLs (they can appear in <>, in ](), or after whitespace)
        regex = re.compile(
            r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]"
        )

        # ignore the following patterns (they give false positives here)
        ignored_urls = ('https://git.tukaani.org/xz.git',
                        'https://git.code.sf.net/',
                        'http://hg.hedgewars.org/hedgewars/',
                        'https://git.xiph.org/vorbis.git',
                        'http://svn.uktrainsim.com/svn/openrails',
                        'https://www.srb2.org/', 'http://wiki.srb2.org/')

        # some do redirect, but we nevertheless want the original URL in the database
        redirect_okay = ('https://octaforge.org/', 'https://svn.openttd.org/',
                         'https://godotengine.org/download')

        # extract all links from entries
        import urllib3
        urllib3.disable_warnings()  # otherwise we cannot check those with SSL errors without getting warnings
        urls = {}
        for entry, _, content in osg.entry_iterator():
            # apply regex
            matches = regex.findall(content)
            # for each match
            for match in matches:
                for url in match:
                    if url and not any(url.startswith(x) for x in ignored_urls):
                        # ignore bzr.sourceforge, no web address found
                        if 'bzr.sourceforge.net/bzrroot/' in url:
                            continue

                        # add "/" at the end
                        if any(url.startswith(x) for x in (
                                'https://anongit.freedesktop.org/git',
                                'https://git.savannah.gnu.org/git/',
                                'https://git.savannah.nongnu.org/git/',
                                'https://git.artsoft.org/')):
                            url += '/'

                        if url.startswith('https://bitbucket.org/') and url.endswith('.git'):
                            url = url[:-4] + '/commits/'
                        if url.startswith('https://svn.code.sf.net/p/'):
                            url = 'http' + url[5:] + '/'
                        if url.startswith('http://cvs.savannah.nongnu.org:/sources/'):
                            url = 'http://cvs.savannah.nongnu.org/viewvc/' + url[40:] + '/'
                        if url.startswith('http://cvs.savannah.gnu.org:/sources/'):
                            url = 'http://cvs.savannah.gnu.org/viewvc/' + url[37:] + '/'

                        # generally ".git" at the end is not working well, except sometimes
                        if url.endswith('.git') and not any(url.startswith(x) for x in (
                                'https://repo.or.cz',
                                'https://git.tuxfamily.org/fanwor/fanwor')):
                            url = url[:-4]

                        if url in urls:
                            urls[url].add(entry)
                        else:
                            urls[url] = {entry}
        print('found {} unique links'.format(len(urls)))
        print("start checking external links (can take a while)")

        # now iterate over all urls
        for url, names in urls.items():
            names = list(names)  # was a set
            if len(names) == 1:
                names = names[0]
            try:
                verify = True
                # some have an expired certificate but otherwise still work
                if any(url.startswith(x) for x in (
                        'https://perso.b2b2c.ca/~sarrazip/dev/',
                        'https://dreerally.com/', 'https://henlin.net/',
                        'https://www.megamek.org/',
                        'https://pixeldoctrine.com/',
                        'https://gitorious.org/',
                        'https://www.opmon-game.ga/')):
                    verify = False
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
                r = requests.head(url, headers=headers, timeout=20, allow_redirects=True, verify=verify)
                if r.status_code == 405:  # HEAD method not supported, try GET
                    r = requests.get(url, headers=headers, timeout=20, allow_redirects=True, verify=verify)
                # check for bad status
                if r.status_code != requests.codes.ok:
                    print('{}: {} - {}'.format(names, url, r.status_code))
                # check for redirect
                if r.history and url not in redirect_okay:
                    # only / added or http->https sometimes
                    redirected_url = r.url
                    if redirected_url == url + '/':
                        output = '{}: {} -> {} - redirect "/" at end'
                    elif redirected_url == 'https' + url[4:]:
                        output = '{}: {} -> {} - redirect "https" at start'
                    else:
                        output = '{}: {} -> {} - redirect'
                    print(output.format(names, url, redirected_url))
            except Exception as e:
                error_name = type(e).__name__
                if error_name == 'SSLError' and any(
                    (url.startswith(x)
                     for x in ('https://gitorious.org/',
                               'https://www.freedroid.org/download/'))):
                    continue  # even though verify is False, these errors still get through
                print('{}: {} - exception {}'.format(names, url, error_name))
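
The redirect handling above relies on requests recording each intermediate response in r.history when allow_redirects is True; r.url then holds the final address. A minimal sketch (github.com is just a convenient example of an http-to-https redirect):

import requests

r = requests.head('http://github.com', allow_redirects=True, timeout=20)
if r.history:  # non-empty when at least one redirect was followed
    print('{} -> {} ({} hops)'.format(r.history[0].url, r.url, len(r.history)))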
def fix_entries():
    """
    Fixes the keywords, code dependencies, build systems and similar entries, mostly by automatically sorting them.
    """

    keyword_synonyms = {'RTS': ('real time', 'strategy'), 'realtime': 'real time'}  # defined but not yet used below

    # TODO also sort other fields, only read once and then do all, move to separate file

    print('fix entries')

    # keywords
    regex = re.compile(r"(.*)- Keywords:([^\n]*)(.*)", re.DOTALL)

    # iterate over all entries
    for entry, entry_path, content in osg.entry_iterator():

        # match with regex
        matches = regex.findall(content)
        if len(matches) != 1:
            raise RuntimeError('Could not find keywords in entry "{}"'.format(entry))

        match = matches[0]

        # get elements out, split, strip, delete duplicates
        elements = match[1].split(',')
        elements = [x.strip() for x in elements]
        elements = list(set(elements))

        # get category out (every entry is expected to contain one recommended keyword)
        category = None
        for keyword in osg.recommended_keywords:
            if keyword in elements:
                elements.remove(keyword)
                category = keyword
                break
        if category is None:
            raise RuntimeError('Could not find a recommended keyword in entry "{}"'.format(entry))

        # special treatments here
        elements = [x if x != 'TBS' and x != 'TB' else 'turn based' for x in elements]
        elements = [x if x != 'RTS' else 'real time' for x in elements]
        elements = [x if x != 'MMO' else 'massive multiplayer online' for x in elements]
        elements = [x if x != 'SP' else 'singleplayer' for x in elements]
        elements = [x if x != 'MP' else 'multiplayer' for x in elements]
        elements = [x if x != 'engine' else 'game engine' for x in elements]
        elements = [x if x != 'rpg' else 'role playing' for x in elements]
        elements = [x if x != 'turn based' else 'turn-based' for x in elements]
        for keyword in ('browser', 'misc', 'tools'):
            if keyword in elements:
                elements.remove(keyword)

        # sort
        elements.sort(key=str.casefold)

        # add category
        elements.insert(0, category)

        keywords = '- Keywords: {}'.format(', '.join(elements))

        new_content = match[0] + keywords + match[2]

        if new_content != content:
            # write again
            utils.write_text(entry_path, new_content)

    # code dependencies
    regex = re.compile(r"(.*)- Code dependencies:([^\n]*)(.*)", re.DOTALL)

    # iterate over all entries
    for entry, entry_path, content in osg.entry_iterator():
        # match with regex
        matches = regex.findall(content)

        if not matches:
            # no code dependencies given
            continue

        match = matches[0]

        # get code dependencies out, split, strip, delete duplicates
        elements = match[1].split(',')
        elements = [x.strip() for x in elements]
        elements = list(set(elements))

        # special treatments here
        elements = [x if x != 'Blender' else 'Blender game engine' for x in elements]
        elements = [x if x.lower() != 'libgdx' else 'libGDX' for x in elements]
        elements = [x if x != 'SDL 2' else 'SDL2' for x in elements]
        elements = [x if x.lower() != "ren'py" else "Ren'Py" for x in elements]

        # sort
        elements.sort(key=str.casefold)

        code_dependencies = '- Code dependencies: {}'.format(', '.join(elements))

        new_content = match[0] + code_dependencies + match[2]

        if new_content != content:
            # write again
            utils.write_text(entry_path, new_content)

    # build systems
    regex = re.compile(r"(.*)- Build system:([^\n]*)(.*)", re.DOTALL)

    # iterate over all entries
    for entry, entry_path, content in osg.entry_iterator():
        # match with regex
        matches = regex.findall(content)

        if not matches:
            # no build system given
            continue

        match = matches[0]

        # get build systems out, split, strip, delete duplicates
        elements = match[1].split(',')
        elements = [x.strip() for x in elements]
        elements = list(set(elements))

        # special treatments here

        # sort
        elements.sort(key=str.casefold)

        build_system = '- Build system: {}'.format(', '.join(elements))

        new_content = match[0] + build_system + match[2]

        if new_content != content:
            # write again
            utils.write_text(entry_path, new_content)
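
The chains of per-synonym list comprehensions above could also be driven by a single mapping, as the unused keyword_synonyms dict hints at. A minimal sketch of that variant (the mapping below is an assumption assembled from the replacements in fix_entries, not part of the original):

synonyms = {'TBS': 'turn-based', 'TB': 'turn-based', 'RTS': 'real time',
            'MMO': 'massive multiplayer online', 'SP': 'singleplayer',
            'MP': 'multiplayer', 'engine': 'game engine', 'rpg': 'role playing'}

elements = ['RTS', 'SP', 'engine']
elements = [synonyms.get(x, x) for x in elements]  # one pass instead of one comprehension per synonym
print(elements)  # ['real time', 'singleplayer', 'game engine']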