Beispiel #1
0
 def __init__(self, page_config, fetch_config=None):
     """
     Set up the Page to gather with and the Fetch used to download pages.

     page_config -- required dict, handed to Page.from_json
     fetch_config -- optional dict expanded into Fetch's keyword
         arguments; leaving it out creates Fetch with no arguments.
         possible keys are:
             sleep_time - number, default 5
             cache - a gatherer.Cache, default None
             headers - a dict of headers to send with a request, default None
     """
     self.page = Page.from_json(page_config)
     # only None selects the empty config; any other value is passed through
     fetch_kwargs = {} if fetch_config is None else fetch_config
     self.fetcher = Fetch(**fetch_kwargs)
Beispiel #2
0
class Wiki(object):
    """
    Fetch English Wikipedia pages and gather data from them.

    Pairs a gatherer.Page (what to extract from a dom) with a Fetch (how to
    download the dom).
    """

    def __init__(self, page_config, fetch_config=None):
        """
        page_config is a required dict to create a gatherer.Page
        fetch_config is a dict used to setup the fetcher. possible keys are:
            sleep_time - number, default 5
            cache - a gatherer.Cache, default None
            headers - a dict for requests to send with a request, default None
        """
        self.page = Page.from_json(page_config)
        if fetch_config is None:
            fetch_config = {}
        self.fetcher = Fetch(**fetch_config)

    def search(self, term):
        """
        fetch the Special:Search result for term and return its dom (None if
        the request does not succeed)

        term is the raw, un-encoded search text; it is percent-encoded here
        so characters such as "&", "#" or "+" stay part of the query value
        instead of altering the URL
        """
        from urllib.parse import quote_plus

        search_url = "https://en.wikipedia.org/w/index.php?title=Special:Search&search=%s"
        # str() keeps non-string terms working as they did with plain %s
        return self.fetch(search_url % quote_plus(str(term)))

    def direct(self, term):
        """
        fetch the article whose title is term and return its dom (None if the
        request does not succeed)

        term is the raw, un-encoded title; spaces become underscores and the
        result is percent-encoded ("/" is kept) so a "?" or "#" in a title
        does not cut the path short
        """
        from urllib.parse import quote

        formatted_term = term.replace(" ", "_")
        direct_url = "https://en.wikipedia.org/wiki/%s"
        return self.fetch(direct_url % quote(formatted_term))

    def fetch(self, url):
        """
        use the fetcher to return the dom of the given url
        if the get request does not succeed, returns None
        """
        return self.fetcher.get(url)

    def gather(self, dom):
        """
        run the Page over the dom and return whatever Page.gather returns
        """
        return self.page.gather(dom)

    def forage(self, term):
        """
        given a search term, get the dom of wikipedia page for the term (if
        it exists) and return the data for the Page

        returns None when the page could not be fetched
        """
        dom = self.search(term)
        if dom is None:
            return None
        return self.gather(dom)
Beispiel #3
0
import json
import argparse

from gatherer import Page, Fetch, Cache

# Read the JSON definition that the schedule Page is built from.
with open("pages/schedule.json") as fp:
    schedule_json = json.load(fp)

# Module-wide fetcher: sends a custom User-Agent header and is handed the
# Cache created from "cache" (presumably a cache location -- confirm).
c = Cache("cache")
f = Fetch(headers={'User-Agent': 'gatherer agent'}, cache=c)

# Page whose gather() is applied to every dom fetched by the functions below.
schedule = Page.from_json(schedule_json)


def get_season(year, weeks=17):
    """
    Fetch and save the schedule of every regular-season week of a year.

    year -- season year, substituted into the schedule URL by get_week
    weeks -- how many weeks to fetch, numbered 1..weeks; defaults to 17,
        the count that used to be hard-coded, so existing calls behave the
        same (pass a larger value for seasons with more weeks)

    Every week is delegated to get_week, which fetches the page and writes
    data/<year>-<week>.json.
    """
    for week in range(1, weeks + 1):
        get_week(year, week)


def get_week(year, week):
    """
    Fetch the schedule page for one regular-season week and save its data.

    year -- season year, substituted into the schedule URL
    week -- week number, substituted into the URL and zero-padded to two
        digits in the output file name

    The gathered data is written as indented JSON to
    data/<year>-<week>.json. When the fetcher returns None (no dom was
    obtained) nothing is gathered or written.
    """
    BASE_URL = "http://www.nfl.com/schedules/{}/REG{}"
    dom = f.get(BASE_URL.format(year, week))
    if dom is None:
        # no dom to gather from; skip this week rather than hand None to gather
        return
    s = schedule.gather(dom)
    with open("data/{}-{:02d}.json".format(year, week), "w") as fp:
        json.dump(s, fp, indent=2)
Beispiel #4
0

def coordinate_vals(coord):
    """
    Pull the degree and minutes digits out of a coordinate string.

    Looks for digits, a degree sign, then more digits anywhere in coord.
    Returns a dict with the string values under "degree" and "minutes",
    or None when coord contains no such sequence.
    """
    # seconds are deliberately ignored -- that precision is not needed
    found = re.search(r'(?P<degree>\d+)\u00b0(?P<minutes>\d+)', coord)
    return found.groupdict() if found is not None else None


def coordinate_decimal(coord):
    """
    Convert a mapping with "degree" and "minutes" entries to decimal degrees.

    Both entries are passed through float(); the result is degree plus
    minutes divided by 60.
    """
    degrees = float(coord["degree"])
    minutes = float(coord["minutes"])
    return degrees + minutes / 60

# Module-wide fetcher: sends a custom User-Agent header and is handed the
# Cache created from "cache" (presumably a cache location -- confirm).
c = Cache("cache")
f = Fetch(headers={'User-Agent': 'gatherer agent'}, cache=c)

# Page built from the stadium list definition.
with open("pages/stadiums.json") as fp:
    stadium_json = json.load(fp)
    stadiums = Page.from_json(stadium_json)

# Page built from the coordinates definition.
with open("pages/coordinates.json") as fp:
    coord_json = json.load(fp)
    coords = Page.from_json(coord_json)


# get the basic stadium data
# NOTE(review): the fetch result is passed to gather without checking it;
# confirm how gather behaves if the fetch did not succeed.
stadium_dom = f.get("http://en.wikipedia.org/wiki/List_of_current_National_Football_League_stadiums")
stadium_data = stadiums.gather(stadium_dom)

# starts empty; presumably filled by code further down -- not visible here
stadium_coords = {}
Beispiel #5
0
    roster_rules = json.load(fp)

# load a dict with the roster urls of all of the FBS D1-A teams
with open("team_pages.json") as fp:
    team_urls = json.load(fp)

# One Cache instance is shared by the city lookup and the roster fetcher.
cache = Cache("cache")
# The second argument carries headers / sleep_time / cache settings;
# presumably it is forwarded to a Fetch inside city.City -- confirm.
wiki_city = city.City(city.city_rule_set, {
    "headers": {
        "User-Agent": "gatherer"
    },
    "sleep_time": 0,
    "cache": cache
})

# Fetcher and Page used by get_roster.
fetcher = Fetch(headers={"User-Agent": "gatherer"}, cache=cache)
roster_page = Page.from_json(roster_rules)
# starts empty; presumably a memo of already resolved cities -- confirm
KNOWN_CITIES = {}


def get_roster(url):
    """
    Fetch a team's roster page and gather its data.

    url is the address of a football team's roster (on espn.com). The
    fetched dom is handed to roster_page.gather and that result is returned;
    it is expected to be an array of dicts with hometown and position keys.
    """
    return roster_page.gather(fetcher.get(url))


def get_coordinates(hometown):
    # if a player does not live in the US or Canada, his hometown is listed as --
Beispiel #6
0
import os
import json
import argparse

from gatherer import Page, Fetch

# Make sure the output directory exists before anything is saved into it.
os.makedirs("data", exist_ok=True)
# JSON definition that the submissions Page is built from.
with open("submissions.json") as fp:
    sub_json = json.load(fp)

# Fetcher (custom User-Agent header, no cache passed) and Page used by
# fetch_and_save.
f = Fetch(headers={"User-Agent": "gatherer"})
p = Page.from_json(sub_json)


def fetch_and_save(filename, subreddit=None):
    """
    Gather the submissions of a reddit page and store them as JSON.

    filename -- name of the file created inside the data/ directory
    subreddit -- subreddit to read; None (the default) reads the front page

    Nothing is written when the fetch yields no dom.
    """
    url = (
        "http://www.reddit.com"
        if subreddit is None
        else "http://www.reddit.com/r/{}".format(subreddit)
    )
    dom = f.get(url)
    if dom is None:
        return
    gathered = p.gather(dom)
    with open("data/{}".format(filename), "w") as out:
        json.dump(gathered, out)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-subreddit', dest='subreddit',
                        help='subreddit to get data from')
    parser.add_argument('-filename', dest='filename',
Beispiel #7
0
import json
from gatherer import Fetch, Cache, Page

# Fetcher shared by get_actor and get_movie, handed the Cache created from
# "cache" (presumably a cache location -- confirm).
cache = Cache("cache")
fetcher = Fetch(cache=cache)

# Page used to gather an actor's profile.
with open("rules/www_rottentomatoes_com/actor.json") as fp:
    actor_json = json.load(fp)
actor_page = Page.from_json(actor_json)

# Page used to gather a movie's profile.
with open("rules/www_rottentomatoes_com/movie.json") as fp:
    movie_json = json.load(fp)
movie_page = Page.from_json(movie_json)


def get_actor(url):
    """
    Gather the data of an actor's profile page.

    Returns what actor_page.gather produces for the fetched dom (expected to
    be a dict), or None when the fetch yields no dom.
    """
    # NOTE(review): the meaning of the positional True is defined by
    # Fetch.get, which is not visible here -- confirm before changing it.
    dom = fetcher.get(url, True)
    if dom is None:
        return None
    return actor_page.gather(dom)


def get_movie(url):
    """
    Gather the data of a movie's profile page.

    Returns what movie_page.gather produces for the fetched dom (expected to
    be a dict), or None when the fetch yields no dom.
    """
    dom = fetcher.get(url)
    if dom is None:
        return None
    return movie_page.gather(dom)
Beispiel #8
0
from gatherer import Fetch, Cache

# Cache created from "cache" (presumably a cache location -- confirm).
fs_cache = Cache("cache")
# Fetcher that sends a custom User-Agent header and is handed fs_cache.
fetcher = Fetch(headers={"User-Agent": "Saturday Night Live Data"}, cache=fs_cache)