def scrape_game(game_id, date, if_scrape_shifts):
    """
    Scrape a single game: the pbp always, the shifts only when asked for.

    Games that can't be scraped are recorded in the module-level
    ``broken_pbp_games`` / ``broken_shifts_games`` lists as ``[game_id, date]``.

    :param game_id: game to scrape
    :param date: ex: 2016-10-24
    :param if_scrape_shifts: Boolean indicating whether to also scrape shifts

    :return: (pbp DataFrame, shifts DataFrame). The shifts entry is None when
             shifts weren't requested; both entries are None when the game
             prerequisites or the pbp itself couldn't be obtained.
    """
    print(' '.join(['Scraping Game ', game_id, date]))

    roster = playing_roster.scrape_roster(game_id)
    game_json = json_pbp.get_pbp(game_id)  # Contains both player info (id's) and plays
    players, teams = get_teams_and_players(game_json, roster, game_id)

    # Without every one of these the game can't be processed at all, so it is
    # flagged as broken for both the pbp and the shifts.
    if not (roster and game_json and teams and players):
        broken_pbp_games.append([game_id, date])
        broken_shifts_games.append([game_id, date])
        return None, None

    pbp_df = scrape_pbp(game_id, date, roster, game_json, players, teams)

    # A failed pbp only marks the pbp as broken; shifts are never attempted.
    if pbp_df is None:
        broken_pbp_games.append([game_id, date])
        return None, None

    shifts_df = scrape_shifts(game_id, players, date) if if_scrape_shifts else None

    return pbp_df, shifts_df
def test_combine_players_lists(players):
    """
    Check that the players from the json pbp and the players from the html
    roster are merged into the expected combined list.

    :param players: expected combined result (presumably a pytest fixture
                    defined elsewhere -- confirm)
    """
    game_id = "2017020891"

    # Players as listed in the json pbp feed
    pbp_players = json_pbp.get_pbp(game_id)['gameData']['players']
    json_players = game_scraper.get_players_json(pbp_players)

    # Players as listed in the html roster
    roster = playing_roster.scrape_roster(game_id)['players']

    combined = game_scraper.combine_players_lists(json_players, roster, game_id)
    assert players == combined
def test_get_pbp():
    """Scraping the json pbp should hand back a dictionary for each sample game."""
    for game_id in ("2016020001", "2008020768"):
        assert isinstance(json_pbp.get_pbp(game_id), dict)
def test_get_teams():
    """Check that the home and away teams are extracted from the json pbp."""
    game_json = json_pbp.get_pbp("2014020001")
    expected = {"Home": 'TOR', "Away": 'MTL'}

    assert json_pbp.get_teams(game_json) == expected
def scrape_live_game(self, force=False):
    """
    Scrape the live info for a given game and store the results on the instance.

    The previous statuses and DataFrames are rotated into the ``prev_*``
    attributes before anything new is scraped.

    :param force: Whether to scrape no matter what (used for intermission here)

    :return: None
    """
    game_json = json_pbp.get_pbp(str(self.game_id))

    # Nothing can be done without the json
    if game_json is None:
        return

    # Rotate the current statuses into the "previous" slots b4 we do anything
    self.prev_api_game_status = self.api_game_status
    self.prev_html_game_status = self.html_game_status

    # Same for the pbp & shift DataFrames
    self.prev_pbp_df = self.pbp_df
    self.prev_shifts_df = self.shifts_df

    # "Intermission" is our own game status. When the json reports an intermission
    # we record it along with the time remaining; otherwise the api status is
    # taken straight from the feed.
    intermission_info = game_json['liveData']['linescore']['intermissionInfo']
    if intermission_info['inIntermission']:
        self.api_game_status = "Intermission"
        self.intermission_time_remaining = intermission_info["intermissionTimeRemaining"]

        # When is_intermission() also agrees there is nothing to scrape, so leave.
        # It is false while the HTML hasn't caught up to the intermission yet.
        # With force we scrape no matter what.
        if self.is_intermission() and not force:
            return
    else:
        self.api_game_status = game_json["gameData"]["status"]["abstractGameState"]

    # Leave if b4 game started
    abstract_state = game_json["gameData"]["status"]["abstractGameState"]
    if abstract_state == "Preview":
        self.html_game_status = abstract_state
        self.api_game_status = abstract_state
        return

    # The players are fetched the 1st time the info is scraped (or when first available).
    # Not attempted any earlier as it may not be there or may be an old version.
    if not self.players:
        roster = playing_roster.scrape_roster(self.game_id)

        # If we try and still can't get it we leave - Termination Reason #2
        if roster is None:
            return

        self.players, _ = game_scraper.get_teams_and_players(game_json, roster, self.game_id)
        self.head_coaches = roster['head_coaches']

    # Don't bother with scraper warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Arguments for the pbp scrape: only the head coaches are passed in the
        # roster slot, and the teams come from the instance.
        coaches = {"head_coaches": self.head_coaches}
        teams = {"Home": self.home_team, "Away": self.away_team}

        self.pbp_df, self.html_game_status = game_scraper.scrape_pbp_live(
            self.game_id,
            self.date,
            coaches,
            game_json,
            self.players,
            teams,
            espn_id=self.espn_id,
        )

        # Get shifts if asked for
        if self.if_scrape_shifts:
            self.shifts_df = game_scraper.scrape_shifts(self.game_id, self.players, self.date)