def __init__(self, page_config, fetch_config=None):
    """
    page_config is a required dict used to create a gatherer.Page

    fetch_config is a dict used to set up the fetcher. Possible keys are:
        sleep_time - number, default 5
        cache - a gatherer.Cache, default None
        headers - a dict of headers to send with each request, default None
    """
    self.page = Page.from_json(page_config)
    if fetch_config is None:
        fetch_config = {}
    self.fetcher = Fetch(**fetch_config)
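# A minimal usage sketch, assuming this __init__ belongs to a wrapper class
# (called Gatherer below purely for illustration; the snippet does not show
# the class name) and that "rules.json" holds valid gatherer.Page rules:
#
#     with open("rules.json") as fp:
#         page_config = json.load(fp)
#     g = Gatherer(page_config, fetch_config={
#         "sleep_time": 2,
#         "headers": {"User-Agent": "gatherer"},
#         "cache": Cache("cache"),
#     })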
import json
import argparse

from gatherer import Page, Fetch, Cache

with open("pages/schedule.json") as fp:
    schedule_json = json.load(fp)

c = Cache("cache")
f = Fetch(headers={'User-Agent': 'gatherer agent'}, cache=c)
schedule = Page.from_json(schedule_json)


def get_season(year):
    BASE_URL = "http://www.nfl.com/schedules/{}/REG{}"
    for week in range(1, 18):
        dom = f.get(BASE_URL.format(year, week))
        s = schedule.gather(dom)
        with open("data/{}-{:02d}.json".format(year, week), "w") as fp:
            json.dump(s, fp, indent=2)


def get_week(year, week):
    BASE_URL = "http://www.nfl.com/schedules/{}/REG{}"
    dom = f.get(BASE_URL.format(year, week))
    s = schedule.gather(dom)
    with open("data/{}-{:02d}.json".format(year, week), "w") as fp:
        json.dump(s, fp, indent=2)
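# The module imports argparse but the visible snippet never wires it up; a
# minimal sketch of a CLI entry point (the flag names are assumptions, not
# part of the original):
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("year", type=int, help="season year, e.g. 2015")
    parser.add_argument("-week", type=int, default=None,
                        help="fetch a single week instead of the full season")
    args = parser.parse_args()
    if args.week is None:
        get_season(args.year)
    else:
        get_week(args.year, args.week)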
with open("roster.json") as fp: roster_rules = json.load(fp) # load a dict with the urls for all of the FBS D1-A teams' roster urls with open("team_pages.json") as fp: team_urls = json.load(fp) cache = Cache("cache") wiki_city = city.City(city.city_rule_set, { "headers": {"User-Agent": "gatherer"}, "sleep_time": 0, "cache": cache }) fetcher = Fetch(headers={"User-Agent": "gatherer"}, cache=cache) roster_page = Page.from_json(roster_rules) KNOWN_CITIES = {} def get_roster(url): """ given the url (on espn.com) for a football team, return an array of dicts with hometown and position keys """ dom = fetcher.get(url) return roster_page.gather(dom) def get_coordinates(hometown): # if a player does not live in the US or Canada, his hometown is listed as -- if hometown == "--":
import json
import os

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import repertory_cast_member

LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

with open(os.path.join(RULES_DIR, "cast.json")) as fp:
    cast_json = json.load(fp)
cast_page = Page.from_json(cast_json)


def sort_actors(casts):
    repertory = []
    featured = []
    for cast_group in casts:
        for member in cast_group.get("members"):
            if repertory_cast_member(member.get("description")):
                repertory.extend(member.get("actors"))
            else:
                featured.extend(member.get("actors"))
    return {"repertory": repertory, "featured": featured}


def cast(season):
    """
    return a dict with the cast data for a season of Saturday Night Live
    """
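# The shape sort_actors expects, inferred from the loops above (the values
# are illustrative, and it is assumed repertory_cast_member is truthy for
# the first description):
#
#     casts = [{
#         "members": [
#             {"description": "Repertory Players", "actors": ["A", "B"]},
#             {"description": "Featured Players", "actors": ["C"]},
#         ],
#     }]
#     sort_actors(casts)
#     # -> {"repertory": ["A", "B"], "featured": ["C"]}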
import json
import os

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import full_month, infer_gender

LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

with open(os.path.join(RULES_DIR, "actor.json")) as fp:
    actor_json = json.load(fp)
actor_page = Page.from_json(actor_json)


def clean_profile(data):
    if data is None:
        return
    return {
        "name": data.get("name"),
        "birthdate": full_month(data.get("birthdate")),
        "hometown": data.get("hometown"),
        "gender": infer_gender(data.get("description")),
        "roles": data.get("roles")
    }


def profile(actor_url):
    """
    return a dict with the profile data for a Saturday Night Live actor
    """
import json
import os
import re

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import abbr_month

LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

with open(os.path.join(RULES_DIR, "episode.json")) as fp:
    episode_json = json.load(fp)
episode_page = Page.from_json(episode_json)

"""
episode_page gathers data formatted as:
{
    "air_date": <string>,
    "cast": [
        {
            "name": <string>,
            "profile": <string>
        }
    ]
}
"""


def episode_url(season, episode):
    # {index:<fill><len><type>} -- zero-pad the season and episode numbers
    return "https://www.rottentomatoes.com/tv/saturday-night-live/s{0:02d}/e{1:02d}/".format(
        season, episode)
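# A quick check of the URL builder: both numbers are zero-padded to two
# digits, so season 1, episode 5 becomes s01/e05.
assert episode_url(1, 5) == (
    "https://www.rottentomatoes.com/tv/saturday-night-live/s01/e05/")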
import json

from gatherer import Page, Fetch, Cache

with open("pages/stadiums.json") as fp:
    stadium_data = json.load(fp)

c = Cache("cache")
f = Fetch(headers={'User-Agent': 'gatherer agent'}, cache=c)
stadiums = Page.from_json(stadium_data, f)

output = stadiums.get(
    "http://en.wikipedia.org/wiki/List_of_current_National_Football_League_stadiums"
)

with open("data/stadium_info.json", "w") as fp:
    json.dump(output, fp, indent=2)
import json
import re

from gatherer import Page, Fetch, Cache


def coordinate_parts(coord):
    # NOTE: the def line is not in the original snippet; the name and
    # signature are assumptions based on how the body and callers use it
    nums = re.compile(r'(?P<degree>\d+)\u00b0(?P<minutes>\d+)')
    match = nums.search(coord)
    if match is None:
        return
    return match.groupdict()


def coordinate_decimal(coord):
    return float(coord["degree"]) + (float(coord["minutes"]) / 60)


c = Cache("cache")
f = Fetch(headers={'User-Agent': 'gatherer agent'}, cache=c)

with open("pages/stadiums.json") as fp:
    stadium_json = json.load(fp)
stadiums = Page.from_json(stadium_json)

with open("pages/coordinates.json") as fp:
    coord_json = json.load(fp)
coords = Page.from_json(coord_json)

# get the basic stadium data
stadium_dom = f.get("http://en.wikipedia.org/wiki/List_of_current_National_Football_League_stadiums")
stadium_data = stadiums.gather(stadium_dom)

stadium_coords = {}
for stadium in stadium_data['stadiums']:
    dom = f.get(stadium["url"])
    # gather into a fresh name rather than reusing c, which holds the Cache
    gathered = coords.gather(dom)
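# A worked example of the two helpers together (using the reconstructed name
# from above): "40°30" splits into degree/minute strings, and 30 minutes is
# half a degree:
#
#     parts = coordinate_parts(u"40\u00b030")
#     # -> {"degree": "40", "minutes": "30"}
#     coordinate_decimal(parts)
#     # -> 40.5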
import json
import os

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import Y_m_d, infer_gender

LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

with open(os.path.join(RULES_DIR, "profile.json")) as fp:
    profile_json = json.load(fp)
profile_page = Page.from_json(profile_json)


def clean_profile(data):
    if data is None:
        return
    return {
        "name": data.get("name"),
        "hometown": data.get("birthplace"),
        "birthdate": Y_m_d(data.get("birthdate")),
        "gender": infer_gender(data.get("description"))
    }


def profile(url):
    dom = fetcher.get(url)
    if dom is None:
        print("failed to get profile data")
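# The snippet cuts off here; judging from the clean_profile helper above, the
# remainder of profile() most likely gathers and cleans the page, roughly:
#
#     return clean_profile(profile_page.gather(dom))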
import json

from gatherer import Page, Fetch, Cache

with open("rules/en_wikipedia_org/surnames.json") as fp:
    surname_json = json.load(fp)

c = Cache("cache")
f = Fetch(cache=c)
surnames = Page.from_json(surname_json, f)

URL = "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_North_America"
names = surnames.get(URL)
popular_names = names["names"]

# first pass: coerce the frequency strings to floats and total them
sigma = 0
for name in popular_names:
    count = float(name["frequency"])
    name["frequency"] = count
    sigma += count

# second pass: annotate each name with its share of the total and a running
# cumulative threshold scaled to 1000
running_total = 0
for name in popular_names:
    percent = name["frequency"] / sigma
    name["percent"] = percent
    running_total += 1000 * percent
    name["threshold"] = running_total

with open("data/surnames.json", "w") as fp:
    json.dump(popular_names, fp, indent=2)
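# Worked example of the threshold math with toy counts: frequencies 3 and 1
# give sigma = 4, percents 0.75 and 0.25, and running thresholds 750 then
# 1000, so a random draw in [0, 1000) could be mapped to a name by taking the
# first threshold it falls under:
#
#     [{"frequency": 3.0}, {"frequency": 1.0}]
#     # -> thresholds 750.0, 1000.0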
import os
import json
import argparse

from gatherer import Page, Fetch

os.makedirs("data", exist_ok=True)

with open("submissions.json") as fp:
    sub_json = json.load(fp)

f = Fetch(headers={"User-Agent": "gatherer"})
p = Page.from_json(sub_json)


def fetch_and_save(filename, subreddit=None):
    if subreddit is None:
        url = "http://www.reddit.com"
    else:
        url = "http://www.reddit.com/r/{}".format(subreddit)
    dom = f.get(url)
    if dom is not None:
        data = p.gather(dom)
        path = "data/{}".format(filename)
        with open(path, "w") as fp:
            json.dump(data, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-subreddit', dest='subreddit',
                        help='subreddit to get data from')
    parser.add_argument('-filename', dest='filename',
                        help='filename to save the data under')
    args = parser.parse_args()
    fetch_and_save(args.filename, args.subreddit)
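# Example invocations (assuming this file is saved as submissions.py; the
# filename is not shown in the original):
#
#     python submissions.py -subreddit python -filename python.json
#     python submissions.py -filename frontpage.json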
import json

from gatherer import Fetch, Cache, Page

cache = Cache("cache")
fetcher = Fetch(cache=cache)

with open("rules/www_rottentomatoes_com/actor.json") as fp:
    actor_json = json.load(fp)
actor_page = Page.from_json(actor_json)

with open("rules/www_rottentomatoes_com/movie.json") as fp:
    movie_json = json.load(fp)
movie_page = Page.from_json(movie_json)


def get_actor(url):
    """
    return a dict with the data from an actor's profile
    """
    dom = fetcher.get(url, True)
    if dom is not None:
        return actor_page.gather(dom)


def get_movie(url):
    """
    return a dict with the data from a movie's profile
    """
    dom = fetcher.get(url)
    if dom is not None:
        return movie_page.gather(dom)
import json

from gatherer import Page, Fetch, Cache

with open("rules/www_ssa_gov/firstnames.json") as fp:
    name_json = json.load(fp)

c = Cache("cache")
f = Fetch(cache=c)
names = Page.from_json(name_json, f)

URL = "http://www.ssa.gov/oact/babynames/decades/century.html"
all_names = names.get(URL)


def strip_commas(num):
    return int(num.replace(",", ""))


male_names = []
female_names = []
male_sum, female_sum = 0, 0
for name_pair in all_names["ranks"]:
    male_name = name_pair["male_name"]
    male_count = strip_commas(name_pair["male_count"])
    male_sum += male_count

    female_name = name_pair["female_name"]
    female_count = strip_commas(name_pair["female_count"])
    female_sum += female_count
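# strip_commas turns the SSA's comma-grouped counts into plain ints:
assert strip_commas("1,234,567") == 1234567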
import json
import os

from gatherer import Page
from snl.fetch import fetcher
from snl.fetch.helpers import day_month_year

LOCAL_DIR = os.path.dirname(__file__)
RULES_DIR = os.path.join(LOCAL_DIR, "rules")

with open(os.path.join(RULES_DIR, "season.json")) as fp:
    season_json = json.load(fp)
season_page = Page.from_json(season_json)


def season_url(season_number):
    """
    returns the url for the imdb page for a given season of Saturday Night
    Live.

    currently there are 41 seasons. Any number outside of the range 1-41
    will return the most recent season.
    """
    return "http://www.imdb.com/title/tt0072562/episodes?season={}".format(
        season_number)


def clean_episodes(data, season):
    """
    convert episodes to the desired format
    """
    if data is None:
        return
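# A quick check of the URL builder:
assert season_url(40) == "http://www.imdb.com/title/tt0072562/episodes?season=40"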