import http.client
import os
import re
import urllib.error
import urllib.request

import requests

# project-local helpers providing entry_iterator, read_text/write_text, root_path and
# recommended_keywords; adjust the import path if the repository layout differs
from utils import constants as c, osg, utils


def check_validity_external_links():
    """
    Checks all external links it can find for validity. Prints those with non-OK HTTP responses.
    Only needs to be run from time to time.
    """
    # TODO check if links occur in multiple entries: first collect all links over all entries,
    #  then check each link only once, following redirects

    print("check external links (can take a while)")

    # regex for finding URLs (can be in <>, in ]() or after whitespace)
    regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")

    # counter
    number_checked_links = 0

    # ignore the following URLs (they give false positives here); the trailing comma is
    # required, without it this would be a plain string and the membership test below
    # would do a substring check instead
    ignored_urls = ('https://git.tukaani.org/xz.git',)

    # iterate over all entries
    for _, entry_path, content in osg.entry_iterator():

        # apply regex
        matches = regex.findall(content)

        # for each match
        for match in matches:

            # for each possible clause
            for url in match:

                # if there was something (and not a sourceforge git url)
                if url and not url.startswith('https://git.code.sf.net/p/') and url not in ignored_urls:
                    try:
                        # without a special header, frequent 403 responses occur
                        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'})
                        urllib.request.urlopen(req)
                    except urllib.error.HTTPError as e:
                        print("{}: {} - {}".format(os.path.basename(entry_path), url, e.code))
                    except urllib.error.URLError as e:
                        print("{}: {} - {}".format(os.path.basename(entry_path), url, e.reason))
                    except http.client.RemoteDisconnected:
                        print("{}: {} - disconnected without response".format(os.path.basename(entry_path), url))
                    number_checked_links += 1
                    if number_checked_links % 50 == 0:
                        print("{} links checked".format(number_checked_links))
    print("{} links checked".format(number_checked_links))
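
# Illustration only (not part of the original tooling): a quick check of what the URL
# regex above extracts. The sample markdown snippet is made up; each findall() result
# is a 3-tuple with one slot per alternative of the pattern.
def _demo_url_regex():
    regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")
    sample = " <https://example.org/a> see [b](http://example.org/b) or http://example.org/c \n"
    for match in regex.findall(sample):
        print([url for url in match if url])  # one URL per match, empty slots filtered out
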
def check_template_leftovers(self):
    """
    Checks for template leftovers. Should be run only occasionally.
    """
    # load the template and keep all non-empty lines that are not headings
    text = utils.read_text(os.path.join(c.root_path, 'template.md'))
    text = text.split('\n')
    check_strings = [x for x in text if x and not x.startswith('##')]

    # iterate over all entries
    for _, entry_path, content in osg.entry_iterator():
        for check_string in check_strings:
            if check_string in content:
                print('{}: found {}'.format(os.path.basename(entry_path), check_string))
    print('checked for template leftovers')
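
# A minimal sketch with a made-up two-line template showing how check_strings are
# derived above: headings ('##') and empty lines are skipped, everything else must
# not appear verbatim in an entry.
def _demo_template_leftovers():
    template = "## Building\n\n- Build system: TODO\n"
    check_strings = [x for x in template.split('\n') if x and not x.startswith('##')]
    entry = "- Build system: TODO\n- Keywords: strategy\n"
    for check_string in check_strings:
        if check_string in entry:
            print('found leftover: {}'.format(check_string))  # prints the TODO line
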
def check_external_links(self):
    """
    Checks all external links it can find for validity. Prints those with non-OK HTTP responses.
    Only needs to be run from time to time.
    """

    # regex for finding URLs (can be in <>, in ]() or after whitespace)
    regex = re.compile(r"[\s\n]<(http.+?)>|\]\((http.+?)\)|[\s\n](http[^\s\n,]+?)[\s\n\)]")

    # ignore the following patterns (they give false positives here)
    ignored_urls = ('https://git.tukaani.org/xz.git', 'https://git.code.sf.net/',
                    'http://hg.hedgewars.org/hedgewars/', 'https://git.xiph.org/vorbis.git',
                    'http://svn.uktrainsim.com/svn/openrails', 'https://www.srb2.org/',
                    'http://wiki.srb2.org/')

    # some do redirect, but we nevertheless want the original URL in the database
    redirect_okay = ('https://octaforge.org/', 'https://svn.openttd.org/',
                     'https://godotengine.org/download')

    # extract all links from entries
    import urllib3
    urllib3.disable_warnings()  # otherwise we cannot verify those with SSL errors without getting warnings
    urls = {}
    for entry, _, content in osg.entry_iterator():

        # apply regex
        matches = regex.findall(content)

        # for each match
        for match in matches:
            for url in match:
                if url and not any(url.startswith(x) for x in ignored_urls):
                    # ignore bzr.sourceforge, no web address found
                    if 'bzr.sourceforge.net/bzrroot/' in url:
                        continue
                    # add "/" at the end
                    if any(url.startswith(x) for x in ('https://anongit.freedesktop.org/git',
                                                       'https://git.savannah.gnu.org/git/',
                                                       'https://git.savannah.nongnu.org/git/',
                                                       'https://git.artsoft.org/')):
                        url += '/'
                    if url.startswith('https://bitbucket.org/') and url.endswith('.git'):
                        url = url[:-4] + '/commits/'
                    if url.startswith('https://svn.code.sf.net/p/'):
                        url = 'http' + url[5:] + '/'
                    if url.startswith('http://cvs.savannah.nongnu.org:/sources/'):
                        url = 'http://cvs.savannah.nongnu.org/viewvc/' + url[40:] + '/'
                    if url.startswith('http://cvs.savannah.gnu.org:/sources/'):
                        url = 'http://cvs.savannah.gnu.org/viewvc/' + url[37:] + '/'
                    # generally ".git" at the end is not working well, except sometimes
                    if url.endswith('.git') and not any(url.startswith(x) for x in
                                                        ('https://repo.or.cz',
                                                         'https://git.tuxfamily.org/fanwor/fanwor')):
                        url = url[:-4]
                    if url in urls:
                        urls[url].add(entry)
                    else:
                        urls[url] = {entry}
    print('found {} unique links'.format(len(urls)))
    print("start checking external links (can take a while)")

    # now iterate over all urls
    for url, names in urls.items():
        names = list(names)  # was a set
        if len(names) == 1:
            names = names[0]
        try:
            verify = True
            # some have an expired certificate but otherwise still work
            if any(url.startswith(x) for x in ('https://perso.b2b2c.ca/~sarrazip/dev/',
                                               'https://dreerally.com/', 'https://henlin.net/',
                                               'https://www.megamek.org/', 'https://pixeldoctrine.com/',
                                               'https://gitorious.org/', 'https://www.opmon-game.ga/')):
                verify = False
            r = requests.head(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'},
                              timeout=20, allow_redirects=True, verify=verify)
            if r.status_code == 405:  # HEAD method not supported, try GET
                r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'},
                                 timeout=20, allow_redirects=True, verify=verify)
            # check for bad status
            if r.status_code != requests.codes.ok:
                print('{}: {} - {}'.format(names, url, r.status_code))
            # check for redirect, distinguishing harmless ones (only "/" added, or
            # http -> https) from real moves
            if r.history and url not in redirect_okay:
                redirected_url = r.url
                if redirected_url == url + '/':
                    output = '{}: {} -> {} - redirect "/" at end'
                elif redirected_url == 'https' + url[4:]:
                    output = '{}: {} -> {} - redirect "https" at start'
                else:
                    output = '{}: {} -> {} - redirect'
                print(output.format(names, url, redirected_url))
        except Exception as e:
            error_name = type(e).__name__
            if error_name == 'SSLError' and any(url.startswith(x) for x in
                                                ('https://gitorious.org/',
                                                 'https://www.freedroid.org/download/')):
                continue  # even though verify is False, these errors still get through
            print('{}: {} - exception {}'.format(names, url, error_name))
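
# A minimal sketch (the URL is a placeholder) of the HEAD-first pattern used above:
# HEAD is cheap, but some servers answer 405 Method Not Allowed, in which case the
# request is retried with GET before the status code is judged.
def _demo_head_then_get(url='https://example.org/'):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
    r = requests.head(url, headers=headers, timeout=20, allow_redirects=True)
    if r.status_code == 405:  # HEAD not supported, fall back to GET
        r = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
    return r.status_code, r.url, bool(r.history)  # status, final URL, was redirected
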
def fix_entries():
    """
    Fixes the keywords, code dependencies, build systems, .. entries, mostly by automatically sorting them.
    """
    keyword_synonyms = {'RTS': ('real time', 'strategy'), 'realtime': 'real time'}  # currently unused

    # TODO also sort other fields, only read once and then do all, move to separate file

    print('fix entries')

    # keywords
    regex = re.compile(r"(.*)- Keywords:([^\n]*)(.*)", re.DOTALL)

    # iterate over all entries
    for entry, entry_path, content in osg.entry_iterator():

        # match with regex
        matches = regex.findall(content)
        if len(matches) != 1:
            raise RuntimeError('Could not find keywords in entry "{}"'.format(entry))
        match = matches[0]

        # get elements out, split, strip, delete duplicates
        elements = match[1].split(',')
        elements = [x.strip() for x in elements]
        elements = list(set(elements))

        # get the category out (the first recommended keyword found)
        for keyword in osg.recommended_keywords:
            if keyword in elements:
                elements.remove(keyword)
                category = keyword
                break
        else:
            # without this, a missing category keyword would only surface later as a NameError
            raise RuntimeError('Could not find a category keyword in entry "{}"'.format(entry))

        # special treatments here
        elements = [x if x != 'TBS' and x != 'TB' else 'turn based' for x in elements]
        elements = [x if x != 'RTS' else 'real time' for x in elements]
        elements = [x if x != 'MMO' else 'massive multiplayer online' for x in elements]
        elements = [x if x != 'SP' else 'singleplayer' for x in elements]
        elements = [x if x != 'MP' else 'multiplayer' for x in elements]
        elements = [x if x != 'engine' else 'game engine' for x in elements]
        elements = [x if x != 'rpg' else 'role playing' for x in elements]
        elements = [x if x != 'turn based' else 'turn-based' for x in elements]
        for keyword in ('browser', 'misc', 'tools'):
            if keyword in elements:
                elements.remove(keyword)

        # sort
        elements.sort(key=str.casefold)

        # add category
        elements.insert(0, category)

        keywords = '- Keywords: {}'.format(', '.join(elements))

        new_content = match[0] + keywords + match[2]
        if new_content != content:
            # write again
            utils.write_text(entry_path, new_content)

    # code dependencies
    regex = re.compile(r"(.*)- Code dependencies:([^\n]*)(.*)", re.DOTALL)

    # iterate over all entries
    for entry, entry_path, content in osg.entry_iterator():

        # match with regex
        matches = regex.findall(content)
        if not matches:
            # no code dependencies given
            continue
        match = matches[0]

        # get code dependencies out, split, strip, delete duplicates
        elements = match[1].split(',')
        elements = [x.strip() for x in elements]
        elements = list(set(elements))

        # special treatments here
        elements = [x if x != 'Blender' else 'Blender game engine' for x in elements]
        elements = [x if x.lower() != 'libgdx' else 'libGDX' for x in elements]
        elements = [x if x != 'SDL 2' else 'SDL2' for x in elements]
        elements = [x if x.lower() != "ren'py" else "Ren'Py" for x in elements]

        # sort
        elements.sort(key=str.casefold)

        code_dependencies = '- Code dependencies: {}'.format(', '.join(elements))

        new_content = match[0] + code_dependencies + match[2]
        if new_content != content:
            # write again
            utils.write_text(entry_path, new_content)

    # build systems
    regex = re.compile(r"(.*)- Build system:([^\n]*)(.*)", re.DOTALL)

    # iterate over all entries
    for entry, entry_path, content in osg.entry_iterator():

        # match with regex
        matches = regex.findall(content)
        if not matches:
            # no build system given
            continue
        match = matches[0]

        # get build systems out, split, strip, delete duplicates
        elements = match[1].split(',')
        elements = [x.strip() for x in elements]
        elements = list(set(elements))

        # special treatments here (currently none)

        # sort
        elements.sort(key=str.casefold)

        build_system = '- Build system: {}'.format(', '.join(elements))

        new_content = match[0] + build_system + match[2]
        if new_content != content:
            # write again
            utils.write_text(entry_path, new_content)
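
# A minimal sketch (sample keyword line made up) of the normalization pipeline above:
# split, strip, de-duplicate, expand abbreviations, sort case-insensitively, then put
# the category keyword first.
def _demo_fix_keywords():
    line = ' strategy, RTS, MP, TB'
    elements = list(set(x.strip() for x in line.split(',')))
    elements.remove('strategy')  # 'strategy' plays the role of the category keyword
    elements = [x if x not in ('TBS', 'TB') else 'turn based' for x in elements]
    elements = [x if x != 'RTS' else 'real time' for x in elements]
    elements = [x if x != 'MP' else 'multiplayer' for x in elements]
    elements = [x if x != 'turn based' else 'turn-based' for x in elements]
    elements.sort(key=str.casefold)
    elements.insert(0, 'strategy')
    print('- Keywords: {}'.format(', '.join(elements)))  # strategy, multiplayer, real time, turn-based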