def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number
    of hits that mathematician's Wikipedia page received in the
    last 60 days, as an `int`.

    Returns `None` when `simple_get` yields no response, when no link
    whose `href` contains 'latest-60' is found, or when that link's
    text cannot be parsed as an integer. Each of these cases is
    reported through `log_error`.
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get(url_root.format(name))

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')

        # Not every <a> carries an href (e.g. named anchors); `.get` with a
        # default skips those instead of raising KeyError on `a['href']`.
        hit_link = [
            a for a in html.select('a')
            if 'latest-60' in a.get('href', '')
        ]

        if hit_link:
            # Strip the thousands separators so the text parses as a number.
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:
                # Only a malformed number is expected here; anything else
                # (KeyboardInterrupt, programming errors) should propagate.
                log_error("couldn't parse {} as an `int`".format(link_text))

    # Reached on a missing response, a missing link, or a parse failure.
    log_error('No pageviews found for {}'.format(name))
    return None
def get_names(url):
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician.

    Every non-blank line of text inside each `<li>` element is treated
    as one name. Names are stripped of surrounding whitespace and
    de-duplicated; the order of the returned list is unspecified
    because the names are collected in a set.

    Raises `Exception` if `simple_get` returns no data for `url`.
    """
    response = simple_get(url)
    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    names = set()
    for li in html.select('li'):
        for line in li.text.split('\n'):
            # Strip BEFORE testing for emptiness: a whitespace-only line
            # would otherwise pass the length check and add '' as a name.
            name = line.strip()
            if name:
                names.add(name)
    return list(names)
# Quick manual check of `simple_get`, written in interactive-session style.
from funcs import simple_get

# Request a page that is expected to exist and keep whatever comes back.
raw_html = simple_get('https://realpython.com/blog/')
# Bare expression: the value is only displayed in an interactive session and
# is discarded when run as a script.
# NOTE(review): this raises TypeError if `simple_get` returned None — confirm
# the request is expected to succeed here.
len(raw_html)

# Request a URL that is presumably missing (per its path) to see the
# failure result.
no_html = simple_get('https://realpython.com/blog/nope-not-gonna-find-it')
# Bare expression again; presumably True because `simple_get` appears to
# return None on failure — verify against `funcs.simple_get`.
no_html is None
# Exploratory script: fetch a page and print every <li> element with its index.
from bs4 import BeautifulSoup
from funcs import simple_get

# NOTE(review): if `simple_get` returns None here (it appears to on failure —
# confirm), the BeautifulSoup call below will not receive markup.
raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
# Parse with the 'html.parser' backend named in the call.
html = BeautifulSoup(raw_html, 'html.parser')
# Print each matched <li> tag next to its position in the select() result.
for i, li in enumerate(html.select('li')):
    print(i, li)