def __init__(self, page_config, fetch_config=None):
    """
    Set up the Page to gather with and the Fetch to request with.

    page_config -- required dict, handed to Page.from_json
    fetch_config -- optional dict, unpacked as keyword arguments to
        Fetch; None is treated as "no options". Possible keys are:
            sleep_time - number, default 5
            cache - a gatherer.Cache, default None
            headers - a dict for requests to send with a request,
                default None
    """
    self.page = Page.from_json(page_config)
    # never hand None to the ** unpacking below
    fetcher_options = {} if fetch_config is None else fetch_config
    self.fetcher = Fetch(**fetcher_options)
class Wiki(object):
    """
    Fetch a wikipedia page (through the search page or directly by
    title) and gather data from it with a gatherer.Page.
    """

    # Every character RFC 3986 allows unescaped in a URL path. Leaving these
    # alone means a title that already produced a valid url (for example
    # "Mercury_(planet)") still produces the exact same url.
    _PATH_SAFE = "/:@!$&'()*+,;="

    def __init__(self, page_config, fetch_config=None):
        """
        page_config is a required dict to create a gatherer.Page
        fetch_config is a dict used to setup the fetcher. possible keys are:
            sleep_time - number, default 5
            cache - a gatherer.Cache, default None
            headers - a dict for requests to send with a request,
                default None
        """
        self.page = Page.from_json(page_config)
        if fetch_config is None:
            fetch_config = {}
        self.fetcher = Fetch(**fetch_config)

    def search(self, term):
        """
        fetch the wikipedia search page for term and return what
        self.fetch returns for it

        term is the plain (not yet url-encoded) search text; it is encoded
        here so characters such as "&", "#" or "+" end up in the search
        value instead of changing the structure of the url
        """
        from urllib.parse import quote_plus

        search_url = "http://en.wikipedia.org/w/index.php?title=Special:Search&search=%s"
        # str() keeps the old "%s" formatting behaviour for non-string terms
        return self.fetch(search_url % quote_plus(str(term)))

    def direct(self, term):
        """
        fetch the wikipedia article whose title is term and return what
        self.fetch returns for it

        spaces become underscores; characters that are not legal in a url
        path ("#", "?", "%", non-ascii, ...) are percent-encoded
        """
        from urllib.parse import quote

        formatted_term = term.replace(" ", "_")
        direct_url = "https://en.wikipedia.org/wiki/%s"
        return self.fetch(direct_url % quote(formatted_term, safe=self._PATH_SAFE))

    def fetch(self, url):
        """
        use the fetcher to return the dom of the given url
        if the get request does not succeed, returns None
        """
        return self.fetcher.get(url)

    def gather(self, dom):
        """
        run the Page over the dom and return its result
        """
        return self.page.gather(dom)

    def forage(self, term):
        """
        given a search term, get the dom of wikipedia page for the term
        (if it exists) and return the data for the Page

        returns None when the fetch did not produce a dom
        """
        dom = self.search(term)
        if dom is not None:
            return self.gather(dom)
import json
import argparse

from gatherer import Page, Fetch, Cache

# rules describing what to gather from a schedule page
with open("pages/schedule.json") as fp:
    schedule_json = json.load(fp)

c = Cache("cache")
f = Fetch(headers={'User-Agent': 'gatherer agent'}, cache=c)
schedule = Page.from_json(schedule_json)


def get_season(year, weeks=17):
    """
    Fetch and save the schedule of every regular season week of year.

    year -- the season, used in the url and in the output file names
    weeks -- number of regular season weeks to fetch, starting at week 1
        (default 17, the value that used to be hard-coded)

    Each week is handled by get_week, so a week whose page could not be
    fetched is skipped and the remaining weeks are still attempted.
    """
    for week in range(1, weeks + 1):
        get_week(year, week)


def get_week(year, week):
    """
    Fetch the schedule page for one regular season week, gather it and
    write the result to data/<year>-<week, two digits>.json.

    Nothing is written when the fetcher does not return a dom.
    """
    BASE_URL = "http://www.nfl.com/schedules/{}/REG{}"
    dom = f.get(BASE_URL.format(year, week))
    if dom is None:
        # failed request: there is nothing to gather from
        return
    s = schedule.gather(dom)
    with open("data/{}-{:02d}.json".format(year, week), "w") as fp:
        json.dump(s, fp, indent=2)
def coordinate_vals(coord):
    """
    Pull the degree and minutes digits out of a coordinate string.

    Returns a dict with "degree" and "minutes" keys (both strings), or
    None when coord has no "<digits>°<digits>" part.
    """
    # seconds are ignored on purpose, that much precision is not needed
    found = re.search(r'(?P<degree>\d+)\u00b0(?P<minutes>\d+)', coord)
    return None if found is None else found.groupdict()


def coordinate_decimal(coord):
    """
    Turn a dict with "degree" and "minutes" values into decimal degrees.
    """
    degrees = float(coord["degree"])
    minutes = float(coord["minutes"])
    return degrees + (minutes / 60)


c = Cache("cache")
f = Fetch(
    headers={'User-Agent': 'gatherer agent'},
    cache=c,
)

# rules for the stadium list page
with open("pages/stadiums.json") as fp:
    stadium_json = json.load(fp)
stadiums = Page.from_json(stadium_json)

# rules for reading coordinates
with open("pages/coordinates.json") as fp:
    coord_json = json.load(fp)
coords = Page.from_json(coord_json)

# start with the basic data of every stadium in the wikipedia list
stadium_dom = f.get(
    "http://en.wikipedia.org/wiki/List_of_current_National_Football_League_stadiums"
)
stadium_data = stadiums.gather(stadium_dom)
stadium_coords = {}
roster_rules = json.load(fp) # load a dict with the urls for all of the FBS D1-A teams' roster urls with open("team_pages.json") as fp: team_urls = json.load(fp) cache = Cache("cache") wiki_city = city.City(city.city_rule_set, { "headers": { "User-Agent": "gatherer" }, "sleep_time": 0, "cache": cache }) fetcher = Fetch(headers={"User-Agent": "gatherer"}, cache=cache) roster_page = Page.from_json(roster_rules) KNOWN_CITIES = {} def get_roster(url): """ given the url (on espn.com) for a football team, return an array of dicts with hometown and position keys """ dom = fetcher.get(url) return roster_page.gather(dom) def get_coordinates(hometown): # if a player does not live in the US or Canada, his hometown is listed as --
import os import json import argparse from gatherer import Page, Fetch os.makedirs("data", exist_ok=True) with open("submissions.json") as fp: sub_json = json.load(fp) f = Fetch(headers={"User-Agent": "gatherer"}) p = Page.from_json(sub_json) def fetch_and_save(filename, subreddit=None): if subreddit is None: url = "http://www.reddit.com" else: url = "http://www.reddit.com/r/{}".format(subreddit) dom = f.get(url) if dom is not None: data = p.gather(dom) path = "data/{}".format(filename) with open(path, "w") as fp: json.dump(data, fp) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-subreddit', dest='subreddit', help='subreddit to get data from') parser.add_argument('-filename', dest='filename',
import json

from gatherer import Fetch, Cache, Page

cache = Cache("cache")
fetcher = Fetch(cache=cache)

# rules for an actor's profile page
with open("rules/www_rottentomatoes_com/actor.json") as fp:
    actor_json = json.load(fp)
actor_page = Page.from_json(actor_json)

# rules for a movie's profile page
with open("rules/www_rottentomatoes_com/movie.json") as fp:
    movie_json = json.load(fp)
movie_page = Page.from_json(movie_json)


def get_actor(url):
    """
    Gather an actor's profile.

    Returns what actor_page gathers from the page at url, or None when
    the fetcher does not return a dom.
    """
    # NOTE(review): the second positional argument (True) is only passed
    # for actor pages; its meaning is defined by Fetch.get - confirm there
    document = fetcher.get(url, True)
    if document is None:
        return None
    return actor_page.gather(document)


def get_movie(url):
    """
    Gather a movie's profile.

    Returns what movie_page gathers from the page at url, or None when
    the fetcher does not return a dom.
    """
    document = fetcher.get(url)
    if document is None:
        return None
    return movie_page.gather(document)
from gatherer import Fetch, Cache

# cache created with the name "cache" and handed to the fetcher below
# (presumably a directory on disk - confirm against gatherer.Cache)
fs_cache = Cache("cache")

# module-wide fetcher: sends a descriptive User-Agent header and uses fs_cache
fetcher = Fetch(
    cache=fs_cache,
    headers={"User-Agent": "Saturday Night Live Data"},
)