def test_scrape_schedule():
    """Test to see if successfully get the correct number of games between two dates"""
    # No NHL games are scheduled in this window
    assert len(json_schedule.scrape_schedule("2017-08-01", "2017-09-01")) == 0
    # Regular-season games only (preseason excluded by default)
    assert len(json_schedule.scrape_schedule("2017-09-01", "2017-11-15")) == 277
    # Including preseason games.
    # Fixed: pass the boolean True, not the string "True" — the string only
    # worked because any non-empty string is truthy, and would silently break
    # if the callee ever compared `preseason is True` or `preseason == True`.
    assert len(
        json_schedule.scrape_schedule("2017-09-01", "2017-11-15", preseason=True)) == 385
def get_games(self):
    """
    Get initial game info -> Called with object creation.

    Includes: players, espn_ids, standard game info

    :return: Dict - LiveGame objects for all games today
    """
    # Pull today's schedule (live=True so non-final games are included)
    # and attach the espn ids up front, just in case.
    todays_games = json_schedule.scrape_schedule(
        self.date, self.date, live=True, preseason=self.preseason
    )
    todays_games = self.get_espn_ids(todays_games)

    # Restrict to the user-specified games, when any were given
    if self.user_game_ids:
        todays_games = [g for g in todays_games if g["game_id"] in self.user_game_ids]

    # Build a LiveGame (which fetches rosters) for each remaining game
    return [
        LiveGame(
            g["game_id"],
            g["start_time"],
            g["home_team"],
            g["away_team"],
            g["status"],
            g["espn_id"],
            self.date,
            self.if_scrape_shifts,
        )
        for g in todays_games
    ]
def scrape_date_range(from_date, to_date, if_scrape_shifts, data_format='csv',
                      preseason=False, rescrape=False, docs_dir=False):
    """
    Scrape games in given date range

    :param from_date: date you want to scrape from
    :param to_date: date you want to scrape to
    :param if_scrape_shifts: Boolean indicating whether to also scrape shifts
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param preseason: Boolean indicating whether to include preseason games (default is False).
                      NOTE: preseason scraping is not guaranteed to work reliably.
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir. (def. = None)
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping. When True it'll refer to (or if needed create) such a
                     repository in the home directory. When provided a string it'll try to use that. Here it
                     must be a valid directory otherwise it won't work (it won't be created for you). When
                     False the files won't be saved.

    :return: Dictionary with DataFrames and errors or None
    """
    # First check if the inputs are good
    shared.check_data_format(data_format)
    shared.check_valid_dates(from_date, to_date)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    games = json_schedule.scrape_schedule(from_date, to_date, preseason)
    pbp_df, shifts_df = scrape_list_of_games(games, if_scrape_shifts)

    if data_format.lower() == 'csv':
        shared.to_csv(from_date + '--' + to_date, pbp_df, "nhl", "pbp")
        # Fixed: only write the shifts file when shifts were actually scraped.
        # Previously this wrote a shifts CSV unconditionally, while the pandas
        # branch below correctly guards on if_scrape_shifts.
        if if_scrape_shifts:
            shared.to_csv(from_date + '--' + to_date, shifts_df, "nhl", "shifts")
    else:
        return {"pbp": pbp_df, "shifts": shifts_df} if if_scrape_shifts else {"pbp": pbp_df}
def scrape_schedule(from_date, to_date, data_format='pandas', rescrape=False, docs_dir=False):
    """
    Scrape the games schedule in a given range.

    :param from_date: date you want to scrape from
    :param to_date: date you want to scrape to
    :param data_format: format you want data in - csv or pandas (pandas is default)
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir. (def. = None)
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping. When True it'll refer to (or if needed create) such a
                     repository in the home directory. When provided a string it'll try to use that. Here it
                     must be a valid directory otherwise it won't work (it won't be created for you). When
                     False the files won't be saved.

    :return: DataFrame or None
    """
    cols = ["game_id", "date", "venue", "home_team", "away_team", "start_time",
            "home_score", "away_score", "status"]

    # Validate the inputs before doing any work
    shared.check_data_format(data_format)
    shared.check_valid_dates(from_date, to_date)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    print("Scraping the schedule between {} and {}".format(from_date, to_date))

    # not_over=True lets us pick up games that aren't final yet;
    # preseason games are always included in the schedule listing.
    schedule_rows = json_schedule.scrape_schedule(from_date, to_date, preseason=True, not_over=True)
    schedule_df = pd.DataFrame(schedule_rows, columns=cols)

    if data_format.lower() != 'csv':
        return schedule_df
    shared.to_csv(from_date + '--' + to_date, schedule_df, "nhl", "schedule")
def scrape_seasons(
    seasons,
    if_scrape_shifts,
    data_format="csv",
    preseason=False,
    rescrape=False,
    docs_dir=False,
):
    """
    Given list of seasons it scrapes all the seasons

    :param seasons: list of seasons
    :param if_scrape_shifts: Boolean indicating whether to also scrape shifts
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param preseason: Boolean indicating whether to include preseason games (default is False).
                      NOTE: preseason scraping is not guaranteed to work reliably.
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir.
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping. When True it'll refer to (or if needed create) such a
                     repository in the home directory. When provided a string it'll try to use that. Here it
                     must be a valid directory otherwise it won't work (it won't be created for you). When
                     False the files won't be saved.

    :return: Dictionary with DataFrames and errors or None
    """
    # First check if the inputs are good
    shared.check_data_format(data_format)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # Holds all seasons scraped (if not csv)
    master_pbps, master_shifts = [], []
    # Fixed: `errors` was previously undefined, so the pandas return path
    # raised NameError. TODO(review): error collection isn't wired up in this
    # function — confirm whether scrape_list_of_games exposes errors to report.
    errors = []

    for season in seasons:
        # An NHL season runs roughly Sept 1 -> July 1 of the following year
        from_date = "-".join([str(season), "9", "1"])
        to_date = "-".join([str(season + 1), "7", "1"])

        games = json_schedule.scrape_schedule(from_date, to_date, preseason)
        pbp_df, shifts_df = scrape_list_of_games(games, if_scrape_shifts)

        if data_format.lower() == "csv":
            # NOTE(review): this call signature differs from the one used in
            # scrape_date_range (pbp and shifts written separately there) —
            # verify against shared.to_csv's actual signature.
            shared.to_csv(
                str(season) + str(season + 1), pbp_df, shifts_df, "nhl")
        else:
            master_pbps.append(pbp_df)
            master_shifts.append(shifts_df)

    if data_format.lower() == "pandas":
        if if_scrape_shifts:
            return {
                "pbp": pd.concat(master_pbps),
                "shifts": pd.concat(master_shifts),
                "errors": errors,
            }
        return {"pbp": pd.concat(master_pbps), "errors": errors}