def test_get_game_data_from_schedule(mocker):
    """Check that get_game_data_from_schedule filters the season schedule and converts the row to a dict."""
    schedule_mock = mocker.patch("scrapenhl2.scrape.schedules.get_season_schedule")

    get_game_data_from_schedule(2017, 1234)

    # The schedule should be fetched for the requested season...
    schedule_mock.assert_called_once_with(2017)
    # ...then filtered down to the requested game...
    schedule_mock().query.assert_called_once_with('Game == 1234')
    # ...and finally converted to a series-oriented dict.
    schedule_mock().query().to_dict.assert_called_once_with(orient='series')
def parse_game_toi_from_html(season, game, force_overwrite=False):
    """
    Parses TOI from the html shift log from this game.

    :param season: int, the season
    :param game: int, the game
    :param force_overwrite: bool. If True, will execute. If False, executes only if file does not exist yet.

    :return: bool, False if skipped because both shift logs already exist, True if parsing was attempted
    """
    # TODO force_overwrite support
    filenames = (scrape_toi.get_home_shiftlog_filename(season, game),
                 scrape_toi.get_road_shiftlog_filename(season, game))
    # Bug fix: the original checked the home shiftlog file twice and never the road file.
    # Skip only when BOTH the home and road shift logs are already on disk.
    if force_overwrite is False and all(os.path.exists(fname) for fname in filenames):
        return False

    gameinfo = schedules.get_game_data_from_schedule(season, game)
    try:
        parsedtoi = read_shifts_from_html_pages(
            scrape_toi.get_raw_html_toi(season, game, 'H'),
            scrape_toi.get_raw_html_toi(season, game, 'R'),
            gameinfo['Home'], gameinfo['Road'],
            season, game)
    except ValueError as ve:
        # Malformed shift logs happen for some older games; save None so we don't retry endlessly.
        # ed.print_and_log('Error with {0:d} {1:d}'.format(season, game), 'warning')
        # ed.print_and_log(str(ve), 'warning')
        parsedtoi = None

    save_parsed_toi(parsedtoi, season, game)
    # ed.print_and_log('Parsed shifts for {0:d} {1:d}'.format(season, game))
    return True
def update_player_logs_from_page(pbp, season, game):
    """
    Takes the game play by play and adds players to the master player log file, noting that they were on the
    roster for this game, which team they played for, and their status (P for played, S for scratch).

    :param pbp: json, the pbp of the game
    :param season: int, the season
    :param game: int, the game

    :return: nothing
    """
    # Boxscore location shared by all four lookups below
    teams_path = ('liveData', 'boxscore', 'teams')

    # "players" is a dict keyed like 'ID8471234'; "scratches" is a list of plain player ids
    home_played = helpers.try_to_access_dict(pbp, *teams_path, 'home', 'players')
    road_played = helpers.try_to_access_dict(pbp, *teams_path, 'away', 'players')
    home_scratches = helpers.try_to_access_dict(pbp, *teams_path, 'home', 'scratches')
    road_scratches = helpers.try_to_access_dict(pbp, *teams_path, 'away', 'scratches')

    # Strip the 'ID' prefix off the dict keys to get integer player ids
    home_played = [int(key[2:]) for key in home_played]
    road_played = [int(key[2:]) for key in road_played]

    # The played dicts may also list scratches, so subtract those out
    home_played = list(set(home_played) - set(home_scratches))
    road_played = list(set(road_played) - set(road_scratches))

    # Home and road team ids come from the schedule
    gameinfo = schedules.get_game_data_from_schedule(season, game)

    # Record played (P) and scratched (S) players for each side
    update_player_log_file(home_played, season, game, gameinfo['Home'], 'P')
    update_player_log_file(home_scratches, season, game, gameinfo['Home'], 'S')
    update_player_log_file(road_played, season, game, gameinfo['Road'], 'P')
    update_player_log_file(road_scratches, season, game, gameinfo['Road'], 'S')
def update_schedule_with_result_using_pbp(pbp, season, game):
    """
    Uses the PbP to update results for this game.

    :param pbp: json, the pbp for this game
    :param season: int, the season
    :param game: int, the game

    :return: nothing
    """
    gameinfo = schedules.get_game_data_from_schedule(season, game)

    # If game is not final yet, don't do anything
    if gameinfo['Status'] != 'Final':
        return False

    hscore = gameinfo['HomeScore']
    rscore = gameinfo['RoadScore']

    # Stays None for tied/incomplete data (e.g. 2006 10009 shows 0-0)
    result = None

    # A margin of two or more goals can only happen in regulation
    if hscore >= rscore + 2:
        result = 'W'
    elif rscore >= hscore + 2:
        result = 'L'
    else:
        # One-goal game: the final period played tells us SO vs OT vs regulation
        finalplayperiod = helpers.try_to_access_dict(pbp, 'liveData', 'linescore', 'currentPeriodOrdinal')
        if finalplayperiod is not None:
            if finalplayperiod == 'SO':
                prefix = 'SO'
            elif finalplayperiod[-2:] == 'OT':
                prefix = 'OT'
            else:
                prefix = ''
            if hscore > rscore:
                result = prefix + 'W'
            elif rscore > hscore:
                result = prefix + 'L'

    update_schedule_with_result(season, game, result)
def read_events_from_page(rawpbp, season, game):
    """
    This method takes the json pbp and returns a pandas dataframe with the following columns:

    * Index: int, index of event
    * Period: str, period of event. In regular season, could be 1, 2, 3, OT, or SO. In playoffs, 1, 2, 3, 4, 5...
    * MinSec: str, m:ss, time elapsed in period
    * Time: int, time elapsed in game
    * Event: str, the event name
    * Team: int, the team id. Note that this is switched to blocked team for blocked shots to ease Corsi calculations.
    * Actor: int, the acting player id. Switched with recipient for blocks (see above)
    * ActorRole: str, e.g. for faceoffs there is a "Winner" and "Loser". Switched with recipient for blocks (see above)
    * Recipient: int, the receiving player id. Switched with actor for blocks (see above)
    * RecipientRole: str, e.g. for faceoffs there is a "Winner" and "Loser". Switched with actor for blocks (see above)
    * X: int, the x coordinate of event (or NaN)
    * Y: int, the y coordinate of event (or NaN)
    * Note: str, additional notes, which may include penalty duration, assists on a goal, etc.

    :param rawpbp: json, the raw json pbp
    :param season: int, the season
    :param game: int, the game

    :return: pandas dataframe, the pbp in a nicer format
    """
    # Nothing to do if the plays list is missing entirely
    pbp = helpers.try_to_access_dict(rawpbp, 'liveData', 'plays', 'allPlays')
    if pbp is None:
        return

    gameinfo = schedules.get_game_data_from_schedule(season, game)
    pbpdf = _create_pbp_df_json(pbp, gameinfo)

    # An empty frame has nothing to enrich; hand it straight back
    if len(pbpdf) == 0:
        return pbpdf

    # Layer on running scores, then elapsed-time columns
    return _add_times_to_pbp(_add_scores_to_pbp(pbpdf, gameinfo))
def _finish_toidf_manipulations(df, season, game):
    """
    Takes dataframe of shifts (one row per shift) and makes into a matrix of players on ice for each second.

    Output columns (after processing): Time, H1..H6, HG, R1..R6, RG, HomeStrength, RoadStrength.

    :param df: dataframe, one row per shift with PlayerID/Start/End/Team/Duration columns
    :param season: int, the season
    :param game: int, the game

    :return: dataframe, one row per game second with on-ice players and strength labels
    """
    gameinfo = schedules.get_game_data_from_schedule(season, game)

    # TODO don't read end times. Use duration, which has good coverage, to infer end. Then end + 1200 not needed below.

    # Sometimes shifts have the same start and time.
    # By the time we're here, they'll have start = end + 1
    # So let's remove shifts with duration -1
    df = df[df.Start != df.End + 1]

    # Sometimes you see goalies with a shift starting in one period and ending in another
    # This is to help in those cases: push the end time forward by one period (1200 s).
    if sum(df.End < df.Start) > 0:
        # ed.print_and_log('Have to adjust a shift time', 'warn')
        # TODO I think I'm making a mistake with overtime shifts--end at 3900!
        # TODO also, maybe only go to the end of the period, not to 1200
        # ed.print_and_log(df[df.End < df.Start])
        df.loc[df.End < df.Start, 'End'] = df.loc[df.End < df.Start, 'End'] + 1200

    # One issue coming up is when the above line comes into play--missing times are filled in as 0:00
    tempdf = df[['PlayerID', 'Start', 'End', 'Team', 'Duration']].query("Duration > 0")
    tempdf = tempdf.assign(Time=tempdf.Start)
    # print(tempdf.head(20))

    # Let's filter out goalies for now. We can add them back in later.
    # This will make it easier to get the strength later
    pids = players.get_player_ids_file()
    tempdf = tempdf.merge(pids[['ID', 'Pos']], how='left',
                          left_on='PlayerID', right_on='ID')

    # One row per game second. NOTE(review): upper bound is max(df.End) without +1,
    # unlike the commented alternative below -- the final second may be dropped; confirm intended.
    # toi = pd.DataFrame({'Time': [i for i in range(0, max(df.End) + 1)]})
    toi = pd.DataFrame(
        {'Time': [i for i in range(0, int(round(max(df.End))))]})

    # Originally used a hacky way to fill in times between shift start and end: increment tempdf by one, filter, join
    # Faster to work with base structures
    # Or what if I join each player to full df, fill backward on start and end, and filter out rows where end > time
    # toidict = toi.to_dict(orient='list')
    # players_by_sec = [[] for _ in range(min(toidict['Start'], toidict['End'] + 1))]
    # for i in range(len(players_by_sec)):
    #     for j in range(toidict['Start'][i], toidict['End'][i] + 1):
    #         players_by_sec[j].append(toidict['PlayerID'][i])
    # Maybe I can create a matrix with rows = time and columns = players
    # Loop over start and end, and use iloc[] to set booleans en masse.
    # Then melt and filter

    # Create one row per second
    alltimes = toi.Time
    newdf = pd.DataFrame(index=alltimes)

    # Add rows and set times to True simultaneously.
    # NOTE(review): `pid` is deliberately unpacked twice -- after the merge above,
    # the row is (PlayerID, Start, End, Team, Duration, Time, ID, Pos) and ID equals PlayerID.
    for i, (pid, start, end, team, duration, time, pid, pos) in tempdf.iterrows():
        newdf.loc[start:end, pid] = True

    # Fill NAs to False
    for col in newdf:
        newdf.loc[:, col] = newdf[col].fillna(False)

    # Go wide to long and then drop unneeded rows
    newdf = helpers.melt_helper(
        newdf.reset_index(),
        id_vars='Time',
        # value_vars=newdf.columns,  # cols with num colnames causing errors
        var_name='PlayerID', value_name='OnIce')
    newdf = newdf[newdf.OnIce].drop('OnIce', axis=1)
    newdf = newdf.merge(tempdf.drop('Time', axis=1), how='left', on='PlayerID') \
        .query("Time <= End & Time >= Start") \
        .drop('ID', axis=1)

    # In case there were rows that were all missing, join onto TOI
    tempdf = toi.merge(newdf, how='left', on='Time')
    # TODO continue here--does newdf match tempdf after sort_values?

    # Old method
    # toidfs = []
    # while len(tempdf.index) > 0:
    #     temptoi = toi.merge(tempdf, how='inner', on='Time')
    #     toidfs.append(temptoi)
    #     tempdf = tempdf.assign(Time=tempdf.Time + 1)
    #     tempdf = tempdf.query('Time <= End')
    # tempdf = pd.concat(toidfs)
    # tempdf = tempdf.sort_values(by='Time')

    # Split skaters from goalies; goalies get their own HG/RG columns later
    goalies = tempdf[tempdf.Pos == 'G'].drop({'Pos'}, axis=1)
    tempdf = tempdf[tempdf.Pos != 'G'].drop({'Pos'}, axis=1)

    # Append team name to start of columns by team
    home = str(gameinfo['Home'])
    road = str(gameinfo['Road'])

    # Goalies
    # Let's assume we get only one goalie per second per team.
    # TODO: flag if there are multiple listed and pick only one
    goalies.loc[:, 'GTeam'] = goalies.Team.apply(
        lambda x: 'HG' if str(int(x)) == home else 'RG')
    try:
        goalies2 = goalies[['Time', 'PlayerID', 'GTeam']] \
            .pivot(index='Time', columns='GTeam', values='PlayerID') \
            .reset_index()
    except ValueError:
        # Duplicate entries in index error: more than one goalie listed for a team at some second.
        # ed.print_and_log('Multiple goalies for a team in {0:d} {1:d}, picking one with the most TOI'.format(
        #     season, game), 'warn')

        # Find times with multiple goalies
        too_many_goalies_h = goalies[goalies.GTeam == 'HG'][['Time']] \
            .assign(GoalieCount=1) \
            .groupby('Time').count() \
            .reset_index() \
            .query('GoalieCount > 1')

        too_many_goalies_r = goalies[goalies.GTeam == 'RG'][['Time']] \
            .assign(GoalieCount=1) \
            .groupby('Time').count() \
            .reset_index() \
            .query('GoalieCount > 1')

        # Find most common goalie for each team
        if len(too_many_goalies_h) == 0:
            problem_times_revised_h = goalies
        else:  # i.e. if len(too_many_goalies_h) > 0:
            top_goalie_h = goalies[goalies.GTeam == 'HG'][['PlayerID']] \
                .assign(GoalieCount=1) \
                .groupby('PlayerID').count() \
                .reset_index() \
                .sort_values('GoalieCount', ascending=False) \
                .PlayerID.iloc[0]
            # and now finally drop problem times: keep only the top goalie at contested seconds
            problem_times_revised_h = goalies \
                .merge(too_many_goalies_h[['Time']], how='outer', on='Time', indicator=True)
            problem_times_revised_h.loc[:, 'ToDrop'] = (problem_times_revised_h._merge == 'both') & \
                                                       (problem_times_revised_h.PlayerID != top_goalie_h)
            problem_times_revised_h = problem_times_revised_h[problem_times_revised_h.ToDrop == False] \
                .drop({'_merge', 'ToDrop'}, axis=1)

        # Same treatment for the road side, starting from the home-corrected frame
        if len(too_many_goalies_r) == 0:
            problem_times_revised_r = problem_times_revised_h
        else:  # i.e. if len(too_many_goalies_r) > 0:
            top_goalie_r = goalies[goalies.GTeam == 'RG'][['PlayerID']] \
                .assign(GoalieCount=1) \
                .groupby('PlayerID').count() \
                .reset_index() \
                .sort_values('GoalieCount', ascending=False) \
                .PlayerID.iloc[0]
            problem_times_revised_r = problem_times_revised_h \
                .merge(too_many_goalies_r[['Time']], how='outer', on='Time', indicator=True)
            problem_times_revised_r.loc[:, 'ToDrop'] = (problem_times_revised_r._merge == 'both') & \
                                                       (problem_times_revised_r.PlayerID != top_goalie_r)
            problem_times_revised_r = problem_times_revised_r[problem_times_revised_r.ToDrop == False] \
                .drop({'_merge', 'ToDrop'}, axis=1)

        # Pivot again
        goalies2 = problem_times_revised_r[['Time', 'PlayerID', 'GTeam']] \
            .pivot(index='Time', columns='GTeam', values='PlayerID') \
            .reset_index()

    # Home skaters, ranked per second by shift duration (longest first)
    hdf = tempdf.query('Team == "' + home + '"').sort_values(
        ['Time', 'Duration'], ascending=[True, False])
    if len(hdf) == 0:
        # Earlier versions of pandas can have diff behavior
        hdf = tempdf.query('Team == ' + home).sort_values(
            ['Time', 'Duration'], ascending=[True, False])
    hdf2 = hdf[['Time', 'Duration']].groupby('Time').rank(method='first', ascending=False)
    hdf2 = hdf2.rename(columns={'Duration': 'rank'})
    hdf2.loc[:, 'rank'] = hdf2['rank'].apply(lambda x: int(x))
    hdf.loc[:, 'rank'] = 'H' + hdf2['rank'].astype('str')

    # Road skaters, same approach
    rdf = tempdf.query('Team == "' + road + '"').sort_values(
        ['Time', 'Duration'], ascending=[True, False])
    if len(rdf) == 0:
        rdf = tempdf.query('Team == ' + road).sort_values(
            ['Time', 'Duration'], ascending=[True, False])
    rdf2 = rdf[['Time', 'Duration']].groupby('Time').rank(method='first', ascending=False)
    rdf2 = rdf2.rename(columns={'Duration': 'rank'})
    rdf2.loc[:, 'rank'] = rdf2['rank'].apply(lambda x: int(x))
    rdf.loc[:, 'rank'] = 'R' + rdf2['rank'].astype('str')

    # Remove values above 6--looking like there won't be many
    # But in those cases take shifts with longest durations
    # That's why we create hdf and rdf by also sorting by Time and Duration above, and select duration for rank()
    if len(hdf[hdf['rank'] == "H7"]) > 0:
        # ed.print_and_log('Some times from {0:d} {1:d} have too many home players; cutting off at 6'.format(
        #     season, game), 'warn')
        # ed.print_and_log('Longest shift being lost was {0:d} seconds'.format(
        #     hdf[hdf['rank'] == "H7"].Duration.max()), 'warn')
        pass
    if len(rdf[rdf['rank'] == "R7"]) > 0:
        # ed.print_and_log('Some times from {0:d} {1:d} have too many road players; cutting off at 6'.format(
        #     season, game), 'warn')
        # ed.print_and_log('Longest shift being lost was {0:d} seconds'.format(
        #     rdf[rdf['rank'] == "H7"].Duration.max()), 'warn')
        pass

    # One column per on-ice slot (H1..H6 / R1..R6), capped at six skaters
    hdf = hdf.pivot(index='Time', columns='rank', values='PlayerID').iloc[:, 0:6]
    hdf.reset_index(inplace=True)  # get time back as a column
    rdf = rdf.pivot(index='Time', columns='rank', values='PlayerID').iloc[:, 0:6]
    rdf.reset_index(inplace=True)

    toi = toi.merge(hdf, how='left', on='Time') \
        .merge(rdf, how='left', on='Time') \
        .merge(goalies2, how='left', on='Time')

    column_order = list(toi.columns.values)
    column_order = ['Time'] + [x for x in sorted(column_order[1:])]  # First entry is Time; sort rest
    toi = toi[column_order]  # Now should be Time, H1, H2, ... HG, R1, R2, ..., RG

    # For games in the first, HG and RG may not exist yet. Have dummy replacements in there.
    # Will be wrong for when goalie is pulled in first, but oh well...
    if 'HG' not in toi.columns:
        newcol = [0 for _ in range(len(toi))]
        toi.insert(loc=toi.columns.get_loc('R1'), column='HG', value=newcol)
    if 'RG' not in toi.columns:
        toi.loc[:, 'RG'] = 0

    # Count skaters per second; relies on the sorted column order established above
    toi.loc[:, 'HomeSkaters'] = 0
    for col in toi.loc[:, 'H1':'HG'].columns[:-1]:
        toi.loc[:, 'HomeSkaters'] = toi[col].notnull() + toi.HomeSkaters
    toi.loc[:, 'HomeSkaters'] = 100 * toi['HG'].notnull() + toi.HomeSkaters  # a hack to make it easy to recognize
    toi.loc[:, 'RoadSkaters'] = 0
    for col in toi.loc[:, 'R1':'RG'].columns[:-1]:
        toi.loc[:, 'RoadSkaters'] = toi[col].notnull() + toi.RoadSkaters
    toi.loc[:, 'RoadSkaters'] = 100 * toi['RG'].notnull() + toi.RoadSkaters  # a hack to make it easy to recognize

    # This is how we label strengths: 5 means 5 skaters plus goalie; five skaters w/o goalie is 4+1.
    toi.loc[:, 'HomeStrength'] = toi.HomeSkaters.apply(lambda x: '{0:d}'.format(
        x - 100) if x >= 100 else '{0:d}+1'.format(x - 1))
    toi.loc[:, 'RoadStrength'] = toi.RoadSkaters.apply(lambda x: '{0:d}'.format(
        x - 100) if x >= 100 else '{0:d}+1'.format(x - 1))

    toi.drop({'HomeSkaters', 'RoadSkaters'}, axis=1, inplace=True)

    # Also drop -1+1 and 0+1 cases, which are clearly errors, and the like.
    # Need at least 3 skaters apiece, 1 goalie apiece, time, and strengths to be non-NA = 11 non NA values
    toi2 = toi.dropna(axis=0, thresh=11)  # drop rows without at least 11 non-NA values
    if len(toi2) < len(toi):
        # ed.print_and_log('Dropped {0:d}/{1:d} times in {2:d} {3:d} because of invalid strengths'.format(
        #     len(toi) - len(toi2), len(toi), season, game), 'warn')
        pass

    # TODO data quality check that I don't miss times in the middle of the game

    return toi2
def read_shifts_from_page(rawtoi, season, game):
    """
    Turns JSON shift start-ends into TOI matrix with one row per second and one col per player

    :param rawtoi: dict, json from NHL API
    :param season: int, the season
    :param game: int, the game

    :return: dataframe, or None when the shift list is empty
    """
    toi = rawtoi['data']
    if len(toi) == 0:
        return

    # Pre-sized parallel lists, one slot per shift, with safe defaults
    ids = ['' for _ in range(len(toi))]
    periods = [0 for _ in range(len(toi))]
    starts = ['0:00' for _ in range(len(toi))]
    ends = ['0:00' for _ in range(len(toi))]
    teams = ['' for _ in range(len(toi))]
    durations = [0 for _ in range(len(toi))]

    # The shifts are ordered shortest duration to longest.
    for i, dct in enumerate(toi):
        ids[i] = helpers.try_to_access_dict(dct, 'playerId', default_return='')
        periods[i] = helpers.try_to_access_dict(dct, 'period', default_return=0)
        starts[i] = helpers.try_to_access_dict(dct, 'startTime', default_return='0:00')
        ends[i] = helpers.try_to_access_dict(dct, 'endTime', default_return='0:00')
        durations[i] = helpers.try_to_access_dict(dct, 'duration', default_return=0)
        teams[i] = helpers.try_to_access_dict(dct, 'teamId', default_return='')

    # NOTE(review): gameinfo is never used below -- possibly kept for a schedule-caching
    # side effect of the call; confirm before removing.
    gameinfo = schedules.get_game_data_from_schedule(season, game)

    # I originally took start times at face value and subtract 1 from end times
    # This caused problems with joining events--when there's a shot and the goalie freezes immediately
    # then, when you join this to the pbp, you'll get the players on the ice for the next draw as having
    # been on ice for the shot.
    # So I switch to adding 1 to start times, and leaving end times as-are.
    # That means that when joining on faceoffs, add 1 to faceoff times.
    # Exception: start time 1 --> start time 0

    # Convert "m:ss" within a period to seconds elapsed in the game (1200 s per period)
    startmin = [x[:x.index(':')] for x in starts]
    startsec = [x[x.index(':') + 1:] for x in starts]
    starttimes = [
        1200 * (p - 1) + 60 * int(m) + int(s) + 1
        for p, m, s in zip(periods, startmin, startsec)
    ]
    # starttimes = [0 if x == 1 else x for x in starttimes]
    endmin = [x[:x.index(':')] for x in ends]
    endsec = [x[x.index(':') + 1:] for x in ends]
    # There is an extra -1 in endtimes to avoid overlapping start/end
    endtimes = [
        1200 * (p - 1) + 60 * int(m) + int(s)
        for p, m, s in zip(periods, endmin, endsec)
    ]

    # Recompute duration from the adjusted times rather than trusting the API's value
    durationtime = [e - s for s, e in zip(starttimes, endtimes)]

    df = pd.DataFrame({
        'PlayerID': ids,
        'Period': periods,
        'Start': starttimes,
        'End': endtimes,
        'Team': teams,
        'Duration': durationtime
    })

    return _finish_toidf_manipulations(df, season, game)
def on_success(self, data):
    """
    Handles an incoming stream event: parses a season/game (or two team abbreviations) out of the
    tweet text, re-scrapes the game if it looks stale, and tweets back H2H and timeline charts.

    :param data: dict, the streaming event payload; only events with a 'text' key are processed

    :return: nothing
    """
    if 'text' in data:
        print(data['text'])
        # Skip tweets that carry media links or are retweets
        if r'https://t.co/' in data['text']:
            print('This looks like an image')
            return
        if data['text'][:3] == 'RT ':
            print('This looks like a retweet')
            return
        global LAST_UPDATE, SCRAPED_NEW
        try:
            # Player CF graph requests are handled elsewhere entirely
            if player_cf_graphs(data):
                return
            try:
                season, gameid = games.find_playoff_game(data['text'])
            except ValueError:
                season = None
                gameid = None

            # Get season with a 4-digit regex
            if season is None:
                text = data['text'] + ' '
                if re.search(r'\s\d{4}\s', text) is not None:
                    season = int(re.search(r'\s\d{4}\s', text).group(0))
                    if season < 2015 or season > schedules.get_current_season():
                        tweet_error("Sorry, I don't have data for this season yet", data)
                        print('Invalid season')
                        return
                else:
                    season = schedules.get_current_season()

            # Get game with a 5-digit regex
            if gameid is None:
                if re.search(r'\s\d{5}\s', text) is not None:
                    gameid = int(re.search(r'\s\d{5}\s', text).group(0))
                    if not schedules.check_valid_game(season, gameid):
                        tweet_error("Sorry, this game ID doesn't look right", data)
                        print('Game ID not right')
                        return
                else:
                    pass

            if gameid is None:
                # Get team names: any 3-letter token that maps to a known team id
                parts = data['text'].replace('@h2hbot', '').strip().split(' ')
                teams = []
                for part in parts:
                    if re.match(r'[A-z]{3}', part.strip()):
                        part = part.upper()
                        if team_info.team_as_id(part) is not None:
                            teams.append(part)
                if len(teams) == 0:
                    print('Think this was a tagged discussion')
                    return
                elif len(teams) != 2:
                    tweet_error("Sorry, I need 2 teams. Found {0:d}. Make sure abbreviations are correct"
                                .format(len(teams)), data)
                    return
                team1, team2 = teams[:2]
                gameid = games.most_recent_game_id(team1, team2)

            h2hfile = 'bot/{0:d}0{1:d}h2h.png'.format(season, gameid)
            tlfile = 'bot/{0:d}0{1:d}tl.png'.format(season, gameid)
            oldstatus = schedules.get_game_status(season, gameid)

            # Scrape only if:
            # Game is in current season AND
            # Game is today, and my schedule says it's "scheduled", OR
            # Game is today, and my schedule doesn't say it's final yet, and it's been at least
            # 5 min since last scrape, OR
            # Game was before today and my schedule doesn't say "final"
            # Update in these cases
            scrapeagain = False
            if season == schedules.get_current_season():
                today = datetime.datetime.now().strftime('%Y-%m-%d')
                gdata = schedules.get_game_data_from_schedule(season, gameid)
                if gdata['Date'] == today:
                    if gdata['Status'] == 'Scheduled':
                        scrapeagain = True
                    elif gdata['Status'] != 'Final' and \
                            (LAST_UPDATE is None or time.time() - LAST_UPDATE >= 60 * 5):
                        scrapeagain = True
                elif gdata['Date'] < today and gdata['Status'] != 'Final':
                    scrapeagain = True
            if scrapeagain:
                autoupdate.autoupdate(season, update_team_logs=False)
                LAST_UPDATE = time.time()
                SCRAPED_NEW = True

            hname = schedules.get_home_team(season, gameid)
            rname = schedules.get_road_team(season, gameid)
            status = schedules.get_game_status(season, gameid)

            # Regenerate and tweet charts when the game state changed or the chart file is missing
            if 'In Progress' in oldstatus or status != oldstatus or not os.path.exists(tlfile):
                try:
                    game_timeline.game_timeline(season, gameid, save_file=tlfile)
                    game_h2h.game_h2h(season, gameid, save_file=h2hfile)
                    tweet_game_images(h2hfile, tlfile, hname, rname, status, data)
                    print('Success!')
                except Exception as e:
                    print(data['text'], time.time(), e, e.args)
                    tweet_error("Sorry, there was an unknown error while making the charts (cc @muneebalamcu)", data)
        except Exception as e:
            # Broad catch keeps the stream listener alive on any unexpected failure
            print('Unexpected error')
            print(time.time(), data['text'], e, e.args)