Example No. 1
def test_scrape_schedule():
    """Test to see if successfully get the correct number of games between two dates"""
    assert len(json_schedule.scrape_schedule("2017-08-01", "2017-09-01")) == 0
    assert len(json_schedule.scrape_schedule("2017-09-01",
                                             "2017-11-15")) == 277
    assert len(
        json_schedule.scrape_schedule("2017-09-01",
                                      "2017-11-15",
                                      preseason="True")) == 385
Example No. 2
    def get_games(self):
        """
        Get initial game info -> Called on object creation. Includes: players, espn_ids, standard game info

        :return: List of LiveGame objects for all games today
        """
        game_objs = []

        # Get the initial schedule & espn game ids just in case
        games = json_schedule.scrape_schedule(
            self.date, self.date, live=True, preseason=self.preseason
        )
        games = self.get_espn_ids(games)

        # Only keep the games we want if the user specified games
        if self.user_game_ids:
            games = [game for game in games if game["game_id"] in self.user_game_ids]

        # Get rosters for each game
        for game in games:
            game_objs.append(
                LiveGame(
                    game["game_id"],
                    game["start_time"],
                    game["home_team"],
                    game["away_team"],
                    game["status"],
                    game["espn_id"],
                    self.date,
                    self.if_scrape_shifts,
                )
            )

        return game_objs
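The user_game_ids filter in get_games is a plain membership test inside a list comprehension; here is a self-contained sketch of that same pattern, using made-up placeholder game dicts and ids:

# Stand-alone illustration of the game-id filter used in get_games above.
# The game dicts and ids here are hypothetical placeholders.
games = [
    {"game_id": 2017020001, "home_team": "TOR", "away_team": "MTL"},
    {"game_id": 2017020002, "home_team": "BOS", "away_team": "NYR"},
]
user_game_ids = [2017020002]

games = [game for game in games if game["game_id"] in user_game_ids]
print(games)  # only the game whose id was requested remains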
def scrape_date_range(from_date, to_date, if_scrape_shifts, data_format='csv', preseason=False, rescrape=False,
                      docs_dir=False):
    """
    Scrape games in a given date range.

    :param from_date: date you want to scrape from
    :param to_date: date you want to scrape to
    :param if_scrape_shifts: Boolean indicating whether to also scrape shifts
    :param data_format: format you want the data in - csv or pandas (csv is the default)
    :param preseason: Boolean indicating whether to include preseason games (default is False).
                      This may or may not work.
    :param rescrape: Whether to rescrape pages already scraped. Only applies if you supply a docs_dir. (default is False)
    :param docs_dir: Directory that either contains previously scraped docs or is where you want them deposited
                     after scraping. When True it refers to (or, if needed, creates) such a repository in the home
                     directory. When given a string it tries to use that path, which must be an existing directory
                     (it won't be created for you). When False the files won't be saved.

    :return: Dictionary with DataFrames or None
    """
    # First check if the inputs are good
    shared.check_data_format(data_format)
    shared.check_valid_dates(from_date, to_date)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    games = json_schedule.scrape_schedule(from_date, to_date, preseason)
    pbp_df, shifts_df = scrape_list_of_games(games, if_scrape_shifts)

    if data_format.lower() == 'csv':
        shared.to_csv(from_date + '--' + to_date, pbp_df, "nhl", "pbp")
        shared.to_csv(from_date + '--' + to_date, shifts_df, "nhl", "shifts")
    else:
        return {"pbp": pbp_df, "shifts": shifts_df} if if_scrape_shifts else {"pbp": pbp_df}
def scrape_schedule(from_date, to_date, data_format='pandas', rescrape=False, docs_dir=False):
    """
    Scrape the games schedule in a given range.
    
    :param from_date: date you want to scrape from
    :param to_date: date you want to scrape to 
    :param data_format: format you want the data in - csv or pandas (pandas is the default)
    :param rescrape: Whether to rescrape pages already scraped. Only applies if you supply a docs_dir. (default is False)
    :param docs_dir: Directory that either contains previously scraped docs or is where you want them deposited
                     after scraping. When True it refers to (or, if needed, creates) such a repository in the home
                     directory. When given a string it tries to use that path, which must be an existing directory
                     (it won't be created for you). When False the files won't be saved.

    :return: DataFrame or None
    """
    cols = ["game_id", "date", "venue", "home_team", "away_team", "start_time", "home_score", "away_score", "status"]

    # First check if the inputs are good
    shared.check_data_format(data_format)
    shared.check_valid_dates(from_date, to_date)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    print("Scraping the schedule between {} and {}".format(from_date, to_date))

    # not_over = True allows us to scrape games that aren't final
    sched = json_schedule.scrape_schedule(from_date, to_date, preseason=True, not_over=True)
    sched_df = pd.DataFrame(sched, columns=cols)

    if data_format.lower() == 'csv':
        shared.to_csv(from_date + '--' + to_date, sched_df, "nhl", "schedule")
    else:
        return sched_df
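A usage sketch for the schedule wrapper above (not json_schedule.scrape_schedule); with the default data_format='pandas' it returns a DataFrame with the columns listed in cols:

# Usage sketch (assumption: scrape_schedule here is the wrapper defined above).
sched_df = scrape_schedule("2018-01-01", "2018-01-31")
print(sched_df[["game_id", "date", "home_team", "away_team", "status"]].head())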
def scrape_seasons(
    seasons,
    if_scrape_shifts,
    data_format="csv",
    preseason=False,
    rescrape=False,
    docs_dir=False,
):
    """
    Given a list of seasons, scrape every game in those seasons.

    :param seasons: list of seasons (e.g. 2017 for the 2017-2018 season)
    :param if_scrape_shifts: Boolean indicating whether to also scrape shifts
    :param data_format: format you want the data in - csv or pandas (csv is the default)
    :param preseason: Boolean indicating whether to include preseason games (default is False).
                      This may or may not work.
    :param rescrape: Whether to rescrape pages already scraped. Only applies if you supply a docs_dir.
    :param docs_dir: Directory that either contains previously scraped docs or is where you want them deposited
                     after scraping. When True it refers to (or, if needed, creates) such a repository in the home
                     directory. When given a string it tries to use that path, which must be an existing directory
                     (it won't be created for you). When False the files won't be saved.

    :return: Dictionary with DataFrames or None
    """
    # First check if the inputs are good
    shared.check_data_format(data_format)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # Holds all seasons scraped (if not csv)
    master_pbps, master_shifts = [], []

    for season in seasons:
        from_date = "-".join([str(season), "9", "1"])
        to_date = "-".join([str(season + 1), "7", "1"])

        games = json_schedule.scrape_schedule(from_date, to_date, preseason)
        pbp_df, shifts_df = scrape_list_of_games(games, if_scrape_shifts)

        if data_format.lower() == "csv":
            shared.to_csv(
                str(season) + str(season + 1), pbp_df, shifts_df, "nhl")
        else:
            master_pbps.append(pbp_df)
            master_shifts.append(shifts_df)

    if data_format.lower() == "pandas":
        if if_scrape_shifts:
            return {
                "pbp": pd.concat(master_pbps),
                "shifts": pd.concat(master_shifts),
            }
        else:
            return {"pbp": pd.concat(master_pbps)}