def test_get_season():
    """ Tests that get_season returns the expected season for a handful of known dates """
    # (date, expected season) - note that the 2020 dates up to 2020-10-03 still map to 2019
    expected_seasons = (
        ("2017-10-01", 2017),
        ("2016-06-01", 2015),
        ("2020-08-29", 2019),
        ("2020-10-03", 2019),
        ("2020-11-15", 2020),
    )

    for date, season in expected_seasons:
        assert shared.get_season(date) == season
def scrape_dates(from_date, to_date):
    """
    Collect every game that falls between two dates (both ends included).

    The schedule of each season touched by the range is fetched and the games
    are then filtered down by their date.

    :param from_date: Date Scrape from (YYYY-MM-DD)
    :param to_date: Date scrape to (YYYY-MM-DD)

    :return: List of all games in the range
    """
    season_codes = get_season_codes()
    start_season = shared.get_season(from_date)
    end_season = shared.get_season(to_date)

    # Parse the bounds once so they can be compared directly to each game's date
    date_format = "%Y-%m-%d"
    lower_bound = datetime.strptime(from_date, date_format)
    upper_bound = datetime.strptime(to_date, date_format)

    games_in_range = []
    for year in range(start_season, end_season + 1):
        games_in_range.extend(
            game
            for game in get_season_games(year, season_codes[str(year)])
            if lower_bound <= datetime.strptime(game['date'], date_format) <= upper_bound
        )

    return games_in_range
def get_dates(from_date, to_date):
    """
    Get all the date pages on which a game occurs in the range

    :param from_date: Date Scrape from
    :param to_date: Date scrape to

    :return: List of Dates where games occurred
    """
    date_range = from_date + "-" + to_date

    # Get initial info
    # Just use 2015 season
    seed_url = "https://www.nwhl.zone/schedule/day/league_instance/46947"
    soup = BeautifulSoup(get_schedule(seed_url, date_range + "-seed"), "lxml")

    # By Season (e.g. 2017-2018)
    sub_seasons = {
        season['label']: season.find_all("option")
        for season in soup.find_all("optgroup")
    }

    # Add Current season (here 2015 subs) - not found in above dropdown
    # Both the links and the label live in the same div so look it up once
    current_season_div = soup.find_all("div", {"class": "currentSeason"})[0]
    cur_season_subs = [
        sub for sub in current_season_div.find_all("a")
        if sub['class'][0] != "close"
    ]
    cur_season = current_season_div.find("span").text.strip()[:9]
    sub_seasons[cur_season] = cur_season_subs

    # Season of first date to season of last date
    # There is no way to index by date so we start from the season
    from_season = shared.get_season(from_date)
    to_season = shared.get_season(to_date)

    # The bounds don't change while looping so convert them a single time
    range_start = date_obj(from_date)
    range_end = date_obj(to_date)

    # Get all dates for that season range (season of from_date and season of to_date)
    base = "https://www.nwhl.zone/"
    dates = []
    for season in range(from_season, to_season + 1):
        for sub in sub_seasons["-".join([str(season), str(season + 1)])]:
            # The dropdown options hold the link in 'value' while the current season anchors use 'href'.
            # Only the attribute lookup is guarded so a KeyError raised while scraping isn't
            # mistaken for a missing attribute and silently retried.
            try:
                link = sub['value']
            except KeyError:
                link = sub['href']

            # Get dates for season-sub_type combo and only keep the ones in range
            sub_dates = get_sub_dates(base + link, str(season), sub.text)
            dates.extend(
                sub_date for sub_date in sub_dates
                if range_start <= date_obj(sub_date['date']) <= range_end
            )

    return dates
def get_espn_game(date, home_team, away_team, game_id=None):
    """
    Gets the ESPN pbp feed
    Ex: http://www.espn.com/nhl/gamecast/data/masterFeed?lang=en&isAll=true&gameId=400885300

    :param date: date of the game
    :param home_team: home team
    :param away_team: away team
    :param game_id: Game id if we already have it - for live scraping. None if not there

    :return: raw xml

    :raises Exception: if no response could be obtained for the feed
    """
    # Look the id up ourselves when it wasn't provided
    if not game_id:
        game_id = get_espn_game_id(date, home_team.upper(), away_team.upper())

    file_info = {
        "url": 'http://www.espn.com/nhl/gamecast/data/masterFeed?lang=en&isAll=true&gameId={}'.format(game_id),
        "name": game_id,
        "type": "espn_pbp",
        "season": shared.get_season(date),
    }
    response = shared.get_file(file_info)

    if response is None:
        # Same exception type as before, but say which game failed
        raise Exception("Unable to get the ESPN pbp feed for game {} on {}".format(game_id, date))

    return response
def get_espn_date(date):
    """
    Get the page that contains all the games for that day

    :param date: YYYY-MM-DD

    :return: response

    :raises Exception: if the page couldn't be retrieved or came back empty
    """
    page_info = {
        # The url wants the date without the dashes (YYYYMMDD)
        "url": 'http://www.espn.com/nhl/scoreboard/_/date/{}'.format(date.replace('-', '')),
        "name": date,
        "type": "espn_scoreboard",
        "season": shared.get_season(date),
    }
    response = shared.get_file(page_info)

    # If can't get or not there throw an exception (same type as before, now naming the date)
    if not response:
        raise Exception("Unable to get the ESPN scoreboard page for {}".format(date))

    return response
def scrape_shifts(game_id, players, date):
    """
    Scrape the Shift charts (or TOI tables)

    The json shifts are tried first for the seasons that have them and the html
    shifts are used when that yields nothing. A game for which both fail is
    recorded in broken_shifts_games.

    :param game_id: json game id
    :param players: dict of players with numbers and id's
    :param date: date of game

    :return: DataFrame with info or None if it fails
    """
    shifts_df = None

    # Control for fact that shift json is only available from 2010 onwards
    if shared.get_season(date) >= 2010:
        shifts_df = json_shifts.scrape_game(game_id)

    # Fall back to the html version when the json gave us nothing
    if shifts_df is None or shifts_df.empty:
        shifts_df = html_shifts.scrape_game(game_id, players)

    if shifts_df is None or shifts_df.empty:
        # Format the id rather than concatenating so that reporting the failure
        # can't itself blow up with a TypeError when game_id isn't a str
        shared.print_error("Unable to scrape shifts for game {}".format(game_id))
        broken_shifts_games.append([game_id, date])
        return None

    shifts_df['Date'] = date

    return shifts_df
def get_schedule(date_from, date_to):
    """
    Fetch the schedule for a range of dates
    Ex: https://statsapi.web.nhl.com/api/v1/schedule?startDate=2010-10-03&endDate=2011-06-20

    :param date_from: scrape from this date
    :param date_to: scrape until this date

    :return: json of the schedule for the date range (as parsed by json.loads)
    """
    schedule_url = 'https://statsapi.web.nhl.com/api/v1/schedule?startDate={a}&endDate={b}'.format(
        a=date_from, b=date_to
    )
    page_info = dict(
        url=schedule_url,
        name=date_from + "_" + date_to,
        type="json_schedule",
        season=shared.get_season(date_from),
    )

    raw_schedule = shared.get_file(page_info)
    return json.loads(raw_schedule)
def get_dates(games):
    """
    Given a list game_ids it returns the dates for each game.

    We sort all the games and retrieve the schedule from the beginning of the season from the earliest game
    until the end of most recent season.

    :param games: list with game_id's ex: 2016020001

    :return: list with game_id and corresponding date for all games
    """
    # Work with strings so the year can be sliced off the front of each id
    games = sorted(map(str, games))

    # Nothing asked for -> nothing to look up (indexing games[0] below would raise otherwise)
    if not games:
        return []

    today = datetime.today()

    # Determine oldest and newest game
    # NOTE(review): the end bound below is run through strftime while the start bound is passed
    # along as returned - confirm what shared.season_start_bound returns
    date_from = shared.season_start_bound(games[0][:4])
    year_to = int(games[-1][:4])

    # If the last game is part of the ongoing season then only request the schedule until Today
    # We get strange errors if we don't do it like this
    if year_to == shared.get_season(datetime.strftime(today, "%Y-%m-%d")):
        date_to = '-'.join([str(today.year), str(today.month), str(today.day)])
    else:
        date_to = datetime.strftime(shared.season_end_bound(year_to + 1), "%Y-%m-%d")  # Newest game in sample

    # TODO: Assume true is live here -> Workaround
    schedule = scrape_schedule(date_from, date_to, preseason=True, not_over=True)

    # Only return games we want in range
    # A set keeps the membership test cheap no matter how many games were requested
    wanted_ids = set(games)
    return [game for game in schedule if str(game['game_id']) in wanted_ids]
def get_dates(games):
    """
    Given a list game_ids it returns the dates for each game.

    We go from the beginning of the earliest season in the sample to the end of the most recent

    :param games: list with game_id's ex: 2016020001

    :return: list with game_id and corresponding date for all games
    """
    # TODO: Needed??? Scared to change
    # Convert to str to avoid issues
    games = sorted(map(str, games))

    # Nothing asked for -> nothing to look up (indexing games[0] below would raise otherwise)
    if not games:
        return []

    # Determine oldest and newest game
    date_from = '-'.join([games[0][:4], '9', '1'])
    year_to = games[-1][:4]

    # Take "today" a single time so the year, month and day always belong to the same day
    today = datetime.today()

    # If the last game is part of the ongoing season then only request the schedule until that day
    # We get strange errors if we don't do it like this
    if int(year_to) == shared.get_season(datetime.strftime(today, "%Y-%m-%d")):
        date_to = '-'.join([str(today.year), str(today.month), str(today.day)])
    else:
        # Due to 2020 Global Pandemic, games may happen until end of August
        date_to = '-'.join([str(int(year_to) + 1), '8', '30'])  # Newest game in sample

    # TODO: Assume true is live here -> Workaround
    schedule = scrape_schedule(date_from, date_to, preseason=True, not_over=True)

    # Only return games we want in range
    # A set keeps the membership test cheap no matter how many games were requested
    wanted_ids = set(games)
    return [game for game in schedule if str(game['game_id']) in wanted_ids]
def parse_event(event, score, teams, date, game_id, players):
    """
    Build the play dictionary for a single event given in json format.

    The score at the time of the event is recorded on the play first; when the
    event is a goal the passed in ``score`` dict is then updated in place.

    :param event: json of event
    :param score: Current score of the game (dict with 'home' and 'away')
    :param teams: Teams dict - 'home' and 'away' entries that each hold an 'id' and a 'name'
    :param date: date of the game
    :param game_id: game id for game
    :param players: Dict of player ids to player names

    :return: dictionary with the info
    """
    # Basic info
    play = {
        'play_index': event['play_index'],
        'date': date,
        'game_id': game_id,
        'season': shared.get_season(date),
        'period': event['time_interval'],
    }

    clock = event['clock_time_string']
    play['seconds_elapsed'] = shared.convert_to_seconds(clock) if clock else None

    play['home_score'] = score['home']
    play['away_score'] = score['away']

    # If shootout go with 'play_by_play_string' field -> more descriptive
    play_type = event['play_type']
    if play_type == "Shootout":
        play['event'] = event['play_by_play_string'].strip()
    else:
        play['event'] = play_type

    # Teams
    play['home_team'] = teams['home']['name']
    play['away_team'] = teams['away']['name']

    # The event belongs to the home team when its id matches the offensive team, otherwise to the away team
    summary = event['play_summary']
    ev_side = 'home' if summary['off_team_id'] == teams['home']['id'] else 'away'
    play['ev_team'] = teams[ev_side]['name']

    # Player Id
    play['p1_id'] = event.get('primary_player_id')

    # Goalies - ids are taken from the first play action
    first_action = event['play_actions'][0]
    play['away_goalie_id'] = first_action.get('away_team_goalie')
    play['home_goalie_id'] = first_action.get('home_team_goalie')
    for side in ('away', 'home'):
        goalie_id = play[side + '_goalie_id']
        # A missing/blank id is looked up as 0
        lookup_id = 0 if goalie_id in ['', None] else int(goalie_id)
        play[side + '_goalie'] = players.get(lookup_id)

    # Event specific stuff
    if play_type == 'Faceoff':
        play['p2_id'] = summary.get("loser_id")
    elif play_type == 'Penalty':
        # TODO: Format better?
        penalty_fields = ("infraction_type", "penalty_type", "penalty_minutes")
        play['details'] = ",".join(str(summary.get(field, " ")) for field in penalty_fields)
    elif play_type == "Goal":
        get_goal_players(play, event, players)
        play['p2_id'] = summary.get("assist_1_id")
        play['p3_id'] = summary.get("assist_2_id")

        # Update Score
        scoring_side = 'home' if summary['off_team_id'] == teams['home']['id'] else 'away'
        score[scoring_side] += 1

    # Player Id's --> Player Names
    for slot in ('p1', 'p2', 'p3'):
        # Control for a missing or None id
        player_id = play.get(slot + '_id', 0) or 0
        play[slot + '_name'] = players.get(int(player_id))

    # Coords
    play['xC'] = summary.get('x_coord')
    play['yC'] = summary.get('y_coord')

    return play