def scrape_shifts(game_id, players, date):
    """
    Scrape the shift charts (or TOI tables)

    :param game_id: json game id
    :param players: dict of players with numbers and id's
    :param date: date of game

    :return: DataFrame with info or None if it fails
    """
    shifts_df = None

    # Control for the fact that the shift json is only available from 2010 onwards
    if shared.get_season(date) >= 2010:
        shifts_df = json_shifts.scrape_game(game_id)

    if shifts_df is None:
        shifts_df = html_shifts.scrape_game(game_id, players)

        if shifts_df is None:
            shared.print_warning("Unable to scrape shifts for game " + game_id)
            broken_shifts_games.extend([[game_id, date]])
            return None   # Both failed so just return nothing

    shifts_df['Date'] = date

    return shifts_df
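
# Hedged usage sketch (placeholders only, not from the source): the game id and date are
# made up, and `players` is assumed to be the per-game player dict built earlier in the
# scraping flow. Left commented out because the call hits the NHL endpoints.
#
#   shifts = scrape_shifts("2016020001", players, "2016-10-12")
#   if shifts is not None:
#       print(shifts.head())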
def get_players_json(players_json):
    """
    Return dict of players for that game

    :param players_json: players section of json

    :return: dict of players -> keys are the name (in uppercase)
    """
    players = dict()

    for key in players_json.keys():
        name = shared.fix_name(players_json[key]['fullName'].upper())
        players[name] = {
            'id': ' ',
            'last_name': players_json[key]['lastName'].upper()
        }

        try:
            players[name]['id'] = players_json[key]['id']
        except KeyError:
            shared.print_warning('{name} is missing an ID number in the pbp json'.format(name=name))
            players[name]['id'] = 'NA'

    return players
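
# Illustrative shape of the dict built above (made-up player and id): keys are the
# upper-cased full name, values hold the roster id and upper-cased last name.
_demo_players = {
    'JOHN SMITH': {'id': 8470000, 'last_name': 'SMITH'},
}
# A player missing an id in the json would instead carry {'id': 'NA', ...}.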
def parse_espn(espn_xml):
    """
    Parse the espn feed

    :param espn_xml: raw xml of feed

    :return: DataFrame with info
    """
    columns = ['period', 'time_elapsed', 'event', 'xC', 'yC']

    # Occasionally we get malformed XML because of the presence of \x13 characters
    # Let's just replace them with dashes
    espn_xml = espn_xml.replace(u'\x13', '-')

    try:
        tree = etree.fromstring(espn_xml)
    except etree.ParseError:
        shared.print_warning("Espn pbp isn't valid xml, therefore coordinates can't be obtained for this game")
        return pd.DataFrame([], columns=columns)

    events = tree[1]
    plays = [parse_event(event.text) for event in events]

    # Get rid of plays that are None
    plays = [play for play in plays if play is not None]

    return pd.DataFrame(plays, columns=columns)
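
# Minimal sketch of the \x13 clean-up above, using a made-up snippet (the play text format
# here is illustrative, not the real espn payload): the raw control character is not legal
# in XML 1.0, so etree.fromstring() would raise ParseError on it, while the dashed-out copy
# parses fine.
_demo_raw = u"<Plays><Play>1|45|GOAL\x13scored</Play></Plays>"
_demo_clean = _demo_raw.replace(u'\x13', '-')
_demo_tree = etree.fromstring(_demo_clean)
# _demo_tree[0].text == "1|45|GOAL-scored"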
def parse_json(game_json, game_id):
    """
    Parse the json for a game, including the on-ice players for each event

    :param game_json: raw json
    :param game_id: game id for game

    :return: DataFrame with info for the game, or an empty DataFrame if it can't be parsed
    """
    cols = ['game_id', 'date', 'season', 'period', 'seconds_elapsed', 'event', 'ev_team', 'home_team', 'away_team',
            'p1_name', 'p1_id', 'p2_name', 'p2_id', 'p3_name', 'p3_id',
            "homePlayer1", "homePlayer1_id", "homePlayer2", "homePlayer2_id", "homePlayer3", "homePlayer3_id",
            "homePlayer4", "homePlayer4_id", "homePlayer5", "homePlayer5_id", "homePlayer6", "homePlayer6_id",
            "awayPlayer1", "awayPlayer1_id", "awayPlayer2", "awayPlayer2_id", "awayPlayer3", "awayPlayer3_id",
            "awayPlayer4", "awayPlayer4_id", "awayPlayer5", "awayPlayer5_id", "awayPlayer6", "awayPlayer6_id",
            'home_goalie', 'home_goalie_id', 'away_goalie', 'away_goalie_id',
            'details', 'home_score', 'away_score', 'xC', 'yC', 'play_index']

    # Before anything else - if there are no plays we leave
    if len(game_json['plays']) == 0:
        shared.print_warning("The Json pbp for game {} contains no plays and therefore can't be parsed".format(game_id))
        return pd.DataFrame()

    # Get all the players in the game
    players = get_roster(game_json)

    # Initialize & update as we go along
    score = {"home": 0, "away": 0}
    teams = {"home": {"id": game_json['game']['home_team'], "name": game_json['team_instance'][0]['abbrev']},
             "away": {"id": game_json['game']['away_team'], "name": game_json['team_instance'][1]['abbrev']}
             }

    # Get the date from the UTC timestamp of the first play
    date = game_json['plays'][0]['created_at']
    date = datetime.datetime.strptime(date[:date.rfind("-")], "%Y-%m-%dT%H:%M:%S").strftime("%Y-%m-%d")

    try:
        events = [parse_event(play, score, teams, date, game_id, players) for play in game_json['plays']]
    except Exception as e:
        shared.print_warning('Error parsing Json pbp for game {} {}'.format(game_id, e))
        return pd.DataFrame()

    df = pd.DataFrame(events, columns=cols)

    # Get rid of null events and order by play index
    df = df[(~pd.isnull(df['event'])) & (df['event'] != "")]
    df = df.sort_values(by=['play_index'])
    df = df.drop(['play_index'], axis=1)

    return df.reset_index(drop=True)
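
# Small sketch of the timestamp handling above, using a made-up value: the trailing UTC
# offset ("-04:00" here) is cut off at the last "-" before strptime, and strftime then
# keeps only the calendar date.
_demo_ts = "2016-10-12T23:05:17-04:00"
_demo_day = datetime.datetime.strptime(_demo_ts[:_demo_ts.rfind("-")], "%Y-%m-%dT%H:%M:%S").strftime("%Y-%m-%d")
# _demo_day == "2016-10-12"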
def combine_espn_html_pbp(html_df, espn_df, game_id, date, away_team, home_team):
    """
    Merge the coordinates from the espn feed into the html DataFrame.

    Can't join here on event_id because the plays are often out of order and pre-2009 are often missing events.

    :param html_df: DataFrame with info from html pbp
    :param espn_df: DataFrame with info from espn pbp
    :param game_id: json game id
    :param date: ex: 2016-10-24
    :param away_team: away team
    :param home_team: home team

    :return: merged DataFrame, or None if the merge fails
    """
    if espn_df is not None:
        try:
            espn_df.period = espn_df.period.astype(int)

            game_df = pd.merge(html_df, espn_df,
                               left_on=['Period', 'Seconds_Elapsed', 'Event'],
                               right_on=['period', 'time_elapsed', 'event'],
                               how='left')

            # The merge keys aren't unique, so drop any duplicate rows it produces
            game_df = game_df.drop_duplicates(subset=['Period', 'Event', 'Description', 'Seconds_Elapsed'])

            df = game_df.drop(['period', 'time_elapsed', 'event'], axis=1)
        except Exception as e:
            shared.print_warning('Error combining espn and html pbp for game {} {}'.format(game_id, e))
            return None
    else:
        df = html_df

    df['Game_Id'] = game_id[-5:]
    df['Date'] = date
    df['Away_Team'] = away_team
    df['Home_Team'] = home_team

    return pd.DataFrame(df, columns=pbp_columns)
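
# Toy version of the coordinate merge above (made-up rows): a left join on
# period/seconds/event keeps every html play and attaches xC/yC only where espn has a match.
_demo_html = pd.DataFrame({'Period': [1, 1], 'Seconds_Elapsed': [45, 80],
                           'Event': ['SHOT', 'HIT'], 'Description': ['a', 'b']})
_demo_espn = pd.DataFrame({'period': [1], 'time_elapsed': [45], 'event': ['SHOT'],
                           'xC': [25], 'yC': [-10]})
_demo_merged = pd.merge(_demo_html, _demo_espn,
                        left_on=['Period', 'Seconds_Elapsed', 'Event'],
                        right_on=['period', 'time_elapsed', 'event'], how='left')
# Both html rows survive; only the SHOT row gets coordinates, the HIT row gets NaN.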
def get_teams_and_players(game_json, roster, game_id):
    """
    Get the list of players and teams for the game

    :param game_json: json pbp for game
    :param roster: players from roster html
    :param game_id: id for game

    :return: dict for both - players and teams
    """
    try:
        teams = json_pbp.get_teams(game_json)
        player_ids = get_players_json(game_json['gameData']['players'])
        players = combine_players_lists(player_ids, roster['players'], game_id)
    except Exception as e:
        shared.print_warning('Problem getting the teams or players for game {} {}'.format(game_id, e))
        return None, None

    return players, teams
def scrape_pbp(game_id):
    """
    Scrape the pbp data for a given game

    :param game_id: Given Game id (e.g. 18507472)

    :return: DataFrame with pbp info
    """
    game_json = get_pbp(game_id)

    if not game_json:
        shared.print_warning("Json pbp for game {} is either not there or can't be obtained".format(game_id))
        return None

    try:
        game_df = parse_json(game_json, game_id)
    except Exception as e:
        shared.print_warning('Error parsing Json pbp for game {} {}'.format(game_id, e))
        return pd.DataFrame()

    return game_df
def scrape_roster(game_id):
    """
    For a given game scrape the roster

    :param game_id: id for game

    :return: dict of players (home and away) and a dict of both head coaches
    """
    roster = get_roster(game_id)

    if not roster:
        shared.print_warning("Roster for game {} is either not there or can't be obtained".format(game_id))
        return None

    try:
        players, head_coaches = get_content(roster)
    except Exception as e:
        shared.print_warning('Error parsing Roster for game {} {}'.format(game_id, e))
        return None

    return {'players': players, 'head_coaches': head_coaches}
def scrape_pbp(game_html, game_id, players, teams):
    """
    Scrape the data for the pbp

    :param game_html: Html doc for the game
    :param game_id: game to scrape
    :param players: dict with player info
    :param teams: dict with home and away teams

    :return: DataFrame of game info or None if it fails
    """
    if not game_html:
        shared.print_warning("Html pbp for game {} is either not there or can't be obtained".format(game_id))
        return None

    cleaned_html = clean_html_pbp(game_html)

    if len(cleaned_html) == 0:
        shared.print_warning("Html pbp contains no plays, this game can't be scraped")
        return None

    try:
        game_df = parse_html(cleaned_html, players, teams)
    except Exception as e:
        shared.print_warning('Error parsing Html pbp for game {} {}'.format(game_id, e))
        return None

    return game_df
def scrape_game(date, home_team, away_team, game_id=None):
    """
    Scrape the game

    :param date: ex: 2016-10-24
    :param home_team: tricode
    :param away_team: tricode
    :param game_id: Only provided for live games.

    :return: DataFrame with info
    """
    try:
        shared.print_warning('Using espn for pbp')
        espn_xml = get_espn_game(date, home_team, away_team, game_id)
    except Exception as e:
        shared.print_warning("Espn pbp for game {a} {b} {c} is either not there or can't be obtained {d}".format(
            a=date, b=home_team, c=away_team, d=e))
        return None

    try:
        espn_df = parse_espn(espn_xml)
    except Exception as e:
        shared.print_warning("Error parsing Espn pbp for game {a} {b} {c} {d}".format(
            a=date, b=home_team, c=away_team, d=e))
        return None

    return espn_df
def scrape_game(game_id, players):
    """
    Scrape the game.

    :param game_id: id for game
    :param players: list of players

    :return: DataFrame with info for the game
    """
    columns = ['Game_Id', 'Period', 'Team', 'Player', 'Player_Id', 'Start', 'End', 'Duration']

    home_html, away_html = get_shifts(game_id)

    if home_html is None or away_html is None:
        shared.print_warning("Html shifts for game {} is either not there or can't be obtained".format(game_id))
        return None

    try:
        away_df = parse_html(away_html, players, game_id)
        home_df = parse_html(home_html, players, game_id)
    except Exception as e:
        shared.print_warning('Error parsing Html shifts for game {} {}'.format(game_id, e))
        return None

    # Combine the home and away shifts into one DataFrame
    game_df = pd.concat([away_df, home_df], ignore_index=True)
    game_df = pd.DataFrame(game_df, columns=columns)
    game_df = game_df.sort_values(by=['Period', 'Start'], ascending=[True, True])
    game_df = game_df.reset_index(drop=True)

    return game_df
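
# Toy version of the combine step above (made-up shift rows): ignore_index=True gives the
# stacked frame a fresh 0..n-1 index before it's re-ordered by period and shift start.
_demo_away = pd.DataFrame({'Period': [1], 'Player': ['AWAY GUY'], 'Start': [30]})
_demo_home = pd.DataFrame({'Period': [1], 'Player': ['HOME GUY'], 'Start': [0]})
_demo_shifts = pd.concat([_demo_away, _demo_home], ignore_index=True)
_demo_shifts = _demo_shifts.sort_values(by=['Period', 'Start']).reset_index(drop=True)
# The home shift starting at 0 now comes first, with a clean sequential index.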
def scrape_game(game_id):
    """
    Scrape the game.

    :param game_id: game

    :return: DataFrame with info for the game
    """
    shifts_json = get_shifts(game_id)

    if not shifts_json:
        shared.print_warning("Json shifts for game {} is either not there or can't be obtained".format(game_id))
        return None

    try:
        game_df = parse_json(shifts_json, game_id)
    except Exception as e:
        shared.print_warning('Error parsing Json shifts for game {} {}'.format(game_id, e))
        return None

    return game_df if not game_df.empty else None
def scrape_game(game_id):
    """
    Used for debugging. The html scraping depends on the json, so it can't follow this structure.

    :param game_id: game to scrape

    :return: DataFrame of game info
    """
    game_json = get_pbp(game_id)

    if not game_json:
        shared.print_warning("Json pbp for game {} is either not there or can't be obtained".format(game_id))
        return None

    try:
        game_df = parse_json(game_json, game_id)
    except Exception as e:
        shared.print_warning('Error parsing Json pbp for game {} {}'.format(game_id, e))
        return None

    return game_df
def parse_json(game_json, game_id):
    """
    Parse the json for a game

    :param game_json: raw json
    :param game_id: game id for game

    :return: Either a DataFrame with info for the game or None if it can't be parsed
    """
    columns = ['period', 'event', 'seconds_elapsed', 'p1_name', 'p1_ID', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID',
               'xC', 'yC']

    # 'PERIOD READY' & 'PERIOD OFFICIAL'..etc aren't found in the html...so get rid of them
    events_to_ignore = ['PERIOD_READY', 'PERIOD_OFFICIAL', 'GAME_READY', 'GAME_OFFICIAL', 'GAME_SCHEDULED']

    try:
        plays = game_json['liveData']['plays']['allPlays']
        events = [parse_event(play) for play in plays if play['result']['eventTypeId'] not in events_to_ignore]
    except Exception as e:
        shared.print_warning('Error parsing Json pbp for game {} {}'.format(game_id, e))
        return None

    # Sort by event id.
    # Sometimes it's not in order of the assigned id in the json. Like, 156...155 (not sure how this happens).
    sorted_events = sorted(events, key=itemgetter('event_id'))

    return pd.DataFrame(sorted_events, columns=columns)
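
# Tiny sketch of the re-ordering above with made-up events: the json occasionally hands
# back ids out of order (e.g. 156 before 155), and sorting on 'event_id' restores them.
_demo_events = [{'event_id': 156, 'event': 'HIT'}, {'event_id': 155, 'event': 'SHOT'}]
_demo_sorted = sorted(_demo_events, key=itemgetter('event_id'))
# [e['event_id'] for e in _demo_sorted] == [155, 156]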
def combine_html_json_pbp(json_df, html_df, game_id, date):
    """
    Join both data sources.

    First try merging on event id (which is the DataFrame index) if both DataFrames have the same number of rows.
    If they don't have the same number of rows, merge on: Period, Event, Seconds_Elapsed, p1_ID.

    :param json_df: json pbp DataFrame
    :param html_df: html pbp DataFrame
    :param game_id: id of game
    :param date: date of game

    :return: finished pbp
    """
    # Don't need those columns to merge in
    json_df = json_df.drop(['p1_name', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID'], axis=1)

    try:
        html_df.Period = html_df.Period.astype(int)

        # If they aren't equal it's usually due to the html containing a challenge event
        if html_df.shape[0] == json_df.shape[0]:
            json_df = json_df[['period', 'event', 'seconds_elapsed', 'xC', 'yC']]
            game_df = pd.merge(html_df, json_df, left_index=True, right_index=True, how='left')
        else:
            # We always merge if they aren't equal but we check if it's due to a challenge so we can print out a better
            # warning message for the user.
            # NOTE: May be slightly incorrect. It's possible for there to be a challenge and another issue for one game.
            if 'CHL' in list(html_df.Event):
                shared.print_warning("The number of rows in the Html and Json pbp are different because the"
                                     " Json pbp, for some reason, does not include challenges. Will instead merge on "
                                     "Period, Event, Time, and p1_id.")
            else:
                shared.print_warning("The number of rows in the Html and Json pbp are different because "
                                     "someone f****d up. Will instead merge on Period, Event, Time, and p1_id.")

            # Actual merging
            game_df = pd.merge(html_df, json_df,
                               left_on=['Period', 'Event', 'Seconds_Elapsed', 'p1_ID'],
                               right_on=['period', 'event', 'seconds_elapsed', 'p1_ID'],
                               how='left')

        # This is always done - because merge doesn't work well with shootouts
        game_df = game_df.drop_duplicates(subset=['Period', 'Event', 'Description', 'Seconds_Elapsed'])
    except Exception as e:
        shared.print_warning('Problem combining Html and Json pbp for game {} {}'.format(game_id, e))
        return

    game_df['Game_Id'] = game_id[-5:]
    game_df['Date'] = date

    return pd.DataFrame(game_df, columns=pbp_columns)
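
# Toy version of the happy path above (made-up rows): when the row counts match, the
# coordinates are attached purely by position via an index-on-index left merge.
_demo_html_pbp = pd.DataFrame({'Period': [1, 1], 'Event': ['FAC', 'SHOT'],
                               'Seconds_Elapsed': [0, 31], 'Description': ['a', 'b']})
_demo_json_pbp = pd.DataFrame({'period': [1, 1], 'event': ['FAC', 'SHOT'],
                               'seconds_elapsed': [0, 31], 'xC': [0, -62], 'yC': [0, 18]})
_demo_joined = pd.merge(_demo_html_pbp, _demo_json_pbp,
                        left_index=True, right_index=True, how='left')
# Row i of the html pbp picks up xC/yC from row i of the json pbp.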