Exemple #1
0
def scrape_game(date, home_team, away_team, game_id=None):
    """
    Fetch the Espn play-by-play feed for one game and parse it.

    :param date: date of the game, ex: 2016-10-24
    :param home_team: tricode
    :param away_team: tricode
    :param game_id: Only provided for live games.

    :return: DataFrame returned by parse_espn; an empty DataFrame when the feed
             can't be fetched or parsed
    """
    fetch_error = "Espn pbp for game {a} {b} {c} is either not there or can't be obtained {d}"
    parse_error = "Issue parsing Espn pbp for game {a} {b} {c} {d}"
    no_rows_error = "Espn is missing coordinates for game {a} {b} {c}"

    # Step 1: download the raw feed
    try:
        shared.print_warning('Using espn for pbp')
        raw_feed = get_espn_game(date, home_team, away_team, game_id)
    except Exception as e:
        shared.print_error(fetch_error.format(a=date, b=home_team, c=away_team, d=e))
        return pd.DataFrame()

    # Step 2: turn it into a DataFrame
    try:
        parsed = parse_espn(raw_feed)
    except Exception as e:
        shared.print_error(parse_error.format(a=date, b=home_team, c=away_team, d=e))
        return pd.DataFrame()

    # A frame without rows is still returned, but the user is told about it
    if not parsed.shape[0]:
        shared.print_error(no_rows_error.format(a=date, b=home_team, c=away_team))

    return parsed
Exemple #2
0
def parse_espn(espn_xml):
    """
    Convert the raw Espn feed into a DataFrame of plays.

    :param espn_xml: raw xml of feed

    :return: DataFrame with the columns period, time_elapsed, event, xC and yC
             (no rows when the xml can't be parsed)
    """
    columns = ['period', 'time_elapsed', 'event', 'xC', 'yC']

    # \x13 characters occasionally show up and make the XML malformed,
    # so they are swapped for dashes before parsing
    cleaned_xml = espn_xml.replace(u'\x13', '-')

    try:
        root = etree.fromstring(cleaned_xml)
    except etree.ParseError:
        shared.print_warning("Espn pbp isn't valid xml, therefore coordinates can't be obtained for this game")
        return pd.DataFrame([], columns=columns)

    # The second child of the root holds the events; keep every play that parse_event didn't reject
    plays = []
    for event in root[1]:
        parsed_play = parse_event(event.text)
        if parsed_play is not None:
            plays.append(parsed_play)

    return pd.DataFrame(plays, columns=columns)
def scrape_game(game_id):
    """
    Fetch and parse the json pbp for a single game.

    Used for debugging. HTML depends on json so can't follow this structure

    :param game_id: game to scrape

    :return: DataFrame of game info, or None if the json is unavailable or can't be parsed
    """
    raw_json = get_pbp(game_id)

    if raw_json:
        try:
            return parse_json(raw_json, game_id)
        except Exception as e:
            shared.print_warning("Error parsing Json pbp for game {} {}".format(game_id, e))
            return None

    # Nothing usable came back from get_pbp
    missing_msg = "Json pbp for game {} is not either not there or can't be obtained"
    shared.print_warning(missing_msg.format(game_id))
    return None
def scrape_shifts(game_id, players, date):
    """
    Scrape the Shift charts (or TOI tables)

    The json source is tried first (for seasons from 2010 onwards); the html source is
    used as a fallback. A game for which both fail is recorded in broken_shifts_games.

    :param game_id: json game id
    :param players: dict of players with numbers and id's
    :param date: date of game

    :return: DataFrame with info (with a 'Date' column added) or None if it fails
    """
    shifts_df = None

    # Control for fact that shift json is only available from 2010 onwards
    if shared.get_season(date) >= 2010:
        shifts_df = json_shifts.scrape_game(game_id)

    # Fall back to the html shifts when the json wasn't tried or came back empty-handed
    if shifts_df is None:
        shifts_df = html_shifts.scrape_game(game_id, players)

    if shifts_df is None:
        # Both failed so just return nothing.
        # format() keeps a space before the id and doesn't break on a non-str game_id.
        shared.print_warning("Unable to scrape shifts for game {}".format(game_id))
        broken_shifts_games.append([game_id, date])
        return None

    shifts_df['Date'] = date

    return shifts_df
def scrape_game(game_id, players):
    """
    Scrape the html shifts of both teams for a game and stack them into one table.

    :param game_id: id for game
    :param players: list of players

    :return: DataFrame with info for the game, ordered by period and shift start,
             or None if the html is unavailable or can't be parsed
    """
    columns = ['Game_Id', 'Period', 'Team', 'Player', 'Player_Id', 'Start', 'End', 'Duration']

    home_html, away_html = get_shifts(game_id)

    if any(page is None for page in (home_html, away_html)):
        shared.print_warning("Html shifts for game {} is either not there or can't be obtained".format(game_id))
        return None

    # Away team is parsed first, then the home team
    try:
        team_frames = [parse_html(page, players, game_id) for page in (away_html, home_html)]
    except Exception as e:
        shared.print_warning('Error parsing Html shifts for game {} {}'.format(game_id, e))
        return None

    # Stack both teams and keep only the expected columns
    stacked = pd.concat(team_frames, ignore_index=True)
    stacked = pd.DataFrame(stacked, columns=columns)

    ordered = stacked.sort_values(by=['Period', 'Start'], ascending=[True, True])
    return ordered.reset_index(drop=True)
def get_players_json(players_json):
    """
    Build a lookup of the players listed in the json for a game.

    :param players_json: players section of json

    :return: dict keyed by the (uppercase, fixed) full name; each value holds the
             player's 'id' ('NA' when the json has none) and uppercase 'last_name'
    """
    players = {}

    for info in players_json.values():
        name = shared.fix_name(info['fullName'].upper())
        entry = {'id': ' ', 'last_name': info['lastName'].upper()}
        players[name] = entry

        if 'id' in info:
            entry['id'] = info['id']
        else:
            # No id in the json for this player - tell the user and use a placeholder
            shared.print_warning(
                '{name} is missing an ID number in the pbp json'.format(name=name))
            entry['id'] = 'NA'

    return players
Exemple #7
0
def parse_json(game_json, game_id):
    """
    Parse the json pbp of a game into a table of events.

    :param game_json: raw json
    :param game_id: game id for game

    :return: DataFrame with info for the game, ordered by play index; an empty
             DataFrame if there are no plays or an event can't be parsed
    """
    cols = ['game_id', 'date', 'season', 'period', 'seconds_elapsed', 'event', 'ev_team', 'home_team', 'away_team',
            'p1_name', 'p1_id', 'p2_name', 'p2_id', 'p3_name', 'p3_id',
            "homePlayer1", "homePlayer1_id", "homePlayer2", "homePlayer2_id", "homePlayer3", "homePlayer3_id",
            "homePlayer4", "homePlayer4_id", "homePlayer5", "homePlayer5_id", "homePlayer6", "homePlayer6_id",
            "awayPlayer1", "awayPlayer1_id", "awayPlayer2", "awayPlayer2_id", "awayPlayer3", "awayPlayer3_id",
            "awayPlayer4", "awayPlayer4_id", "awayPlayer5", "awayPlayer5_id", "awayPlayer6", "awayPlayer6_id",
            'home_goalie', 'home_goalie_id', 'away_goalie', 'away_goalie_id', 'details', 'home_score', 'away_score',
            'xC', 'yC', 'play_index']

    # Nothing to do without plays
    if len(game_json['plays']) == 0:
        shared.print_warning("The Json pbp for game {} contains no plays and therefore can't be parsed".format(game_id))
        return pd.DataFrame()

    # Everybody who dressed for the game
    players = get_roster(game_json)

    # Running state handed to parse_event for every play
    score = {"home": 0, "away": 0}
    teams = {}
    for position, side in enumerate(("home", "away")):
        teams[side] = {
            "id": game_json['game'][side + '_team'],
            "name": game_json['team_instance'][position]['abbrev'],
        }

    # The game date comes from the first play's timestamp; everything from the last "-" on is cut off first
    stamp = game_json['plays'][0]['created_at']
    stamp = stamp[:stamp.rfind("-")]
    date = datetime.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S").strftime("%Y-%m-%d")

    try:
        events = []
        for play in game_json['plays']:
            events.append(parse_event(play, score, teams, date, game_id, players))
    except Exception as e:
        shared.print_warning('Error parsing Json pbp for game {} {}'.format(game_id, e))
        return pd.DataFrame()

    df = pd.DataFrame(events, columns=cols)

    # Keep rows with a real event, order them by play index, then drop the helper column
    has_event = df['event'].notnull() & (df['event'] != "")
    ordered = df[has_event].sort_values(by=['play_index']).drop(['play_index'], axis=1)

    return ordered.reset_index(drop=True)
def parse_json(game_json, game_id):
    """
    Parse the plays found under liveData -> plays -> allPlays into a table.

    :param game_json: raw json
    :param game_id: game id for game

    :return: DataFrame with info for the game (ordered by event id), or None if a play can't be parsed
    """
    columns = ["period", "event", "seconds_elapsed",
               "p1_name", "p1_ID", "p2_name", "p2_ID", "p3_name", "p3_ID",
               "xC", "yC"]

    # These events aren't found in the html, so they are left out
    skipped_events = ("PERIOD READY", "PERIOD OFFICIAL", "GAME READY", "GAME OFFICIAL", "GAME SCHEDULED")

    try:
        events = []
        for play in game_json["liveData"]["plays"]["allPlays"]:
            if play["result"]["event"].upper() in skipped_events:
                continue
            events.append(parse_event(play))
    except Exception as e:
        shared.print_warning("Error parsing Json pbp for game {} {}".format(game_id, e))
        return None

    # The json isn't always in order of the assigned event id (e.g. 156...155), so order it here
    events.sort(key=itemgetter("event_id"))

    return pd.DataFrame(events, columns=columns)
Exemple #9
0
def get_teams(response):
    """
    Extract the teams playing on a date from the scoreboard doc.

    Every div with class "ScoreCell__Team" is visited, and the team name is read from its
    "ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName truncate db" div.

    :param response: doc

    :return: list of games, each a list of (up to) two teams in document order
    """
    name_class = "ScoreCell__TeamName ScoreCell__TeamName--shortDisplayName truncate db"
    soup = BeautifulSoup(response, "lxml")

    teams = []
    for cell in soup.findAll("div", {"class": "ScoreCell__Team"}):
        tm = cell.find("div", {"class": name_class}).text

        # ESPN stores the name and not the city, so look for it inside the known team names.
        # When several names contain it, the last one wins.
        matches = [shared.TEAMS[real_tm] for real_tm in list(shared.TEAMS.keys()) if tm.upper() in real_tm]
        actual_tm = matches[-1] if matches else None

        # Unknown teams can happen - warn and fall back on the name as given
        if actual_tm is None:
            shared.print_warning(
                "The team {} in the espn pbp is unknown. We use the supplied team name"
                .format(tm))
            actual_tm = tm

        teams.append(actual_tm)

    # Consecutive teams belong to the same game
    games = []
    for start in range(0, len(teams), 2):
        games.append(teams[start:start + 2])

    print(games)

    return games
def combine_espn_html_pbp(html_df, espn_df, game_id, date, away_team, home_team):
    """
    Merge the coordinate from the espn feed into the html DataFrame

    Can't join here on event_id because the plays are often out of order and pre-2009 are often missing events.
    The frames are left-joined on period, seconds elapsed and event instead.

    NOTE: the 'period' column of espn_df is cast to int in place, and when espn_df is None the
    Game_Id/Date/Away_Team/Home_Team columns are written onto html_df itself.

    :param html_df: DataFrame with info from html pbp
    :param espn_df: DataFrame with info from espn pbp (None skips the merge)
    :param game_id: json game id (only its last five characters are stored)
    :param date: ex: 2016-10-24
    :param away_team: away team
    :param home_team: home team

    :return: merged DataFrame restricted to pbp_columns, or None if the merge fails
    """
    if espn_df is not None:
        try:
            espn_df.period = espn_df.period.astype(int)
            game_df = pd.merge(
                html_df,
                espn_df,
                left_on=["Period", "Seconds_Elapsed", "Event"],
                right_on=["period", "time_elapsed", "event"],
                how="left",
            )

            # The join can produce repeated html rows, so keep one row per play
            game_df = game_df.drop_duplicates(
                subset=["Period", "Event", "Description", "Seconds_Elapsed"]
            )

            # The espn join keys duplicate the html ones
            df = game_df.drop(["period", "time_elapsed", "event"], axis=1)
        except Exception as e:
            # Report the cause as well, otherwise the failure can't be diagnosed
            shared.print_warning(
                "Error for combining espn and html pbp for game {} {}".format(game_id, e)
            )
            return None
    else:
        df = html_df

    df["Game_Id"] = game_id[-5:]
    df["Date"] = date
    df["Away_Team"] = away_team
    df["Home_Team"] = home_team

    return pd.DataFrame(df, columns=pbp_columns)
def combine_espn_html_pbp(html_df, espn_df, game_id, date, away_team,
                          home_team):
    """
    Merge the coordinate from the espn feed into the html DataFrame

    Can't join here on event_id because the plays are often out of order and pre-2009 are often missing events.
    The frames are left-joined on period, seconds elapsed and event instead.

    NOTE: when espn_df is None the Game_Id/Date/Away_Team/Home_Team columns are written
    onto html_df itself.

    :param html_df: DataFrame with info from html pbp
    :param espn_df: DataFrame with info from espn pbp (None skips the merge)
    :param game_id: json game id (only its last five characters are stored)
    :param date: ex: 2016-10-24
    :param away_team: away team
    :param home_team: home team

    :return: merged DataFrame restricted to pbp_columns, or None if the merge fails
    """
    if espn_df is not None:
        try:
            game_df = pd.merge(html_df,
                               espn_df,
                               left_on=['Period', 'Seconds_Elapsed', 'Event'],
                               right_on=['period', 'time_elapsed', 'event'],
                               how='left')

            # The join can produce repeated html rows, so keep one row per play
            game_df = game_df.drop_duplicates(
                subset=['Period', 'Event', 'Description', 'Seconds_Elapsed'])

            # The espn join keys duplicate the html ones
            df = game_df.drop(['period', 'time_elapsed', 'event'], axis=1)
        except Exception as e:
            # Report the cause as well, otherwise the failure can't be diagnosed
            shared.print_warning(
                'Error for combining espn and html pbp for game {} {}'.format(
                    game_id, e))
            return None
    else:
        df = html_df

    df['Game_Id'] = game_id[-5:]
    df['Date'] = date
    df['Away_Team'] = away_team
    df['Home_Team'] = home_team

    return pd.DataFrame(df, columns=pbp_columns)
def get_teams_and_players(game_json, roster, game_id):
    """
    Get list of players and teams for game

    :param game_json: json pbp for game
    :param roster: players from roster html
    :param game_id: id for game

    :return: tuple (players, teams); (None, None) if either can't be obtained
    """
    try:
        teams = json_pbp.get_teams(game_json)
        # Players from the json are combined with the ones from the roster
        player_ids = get_players_json(game_json['gameData']['players'])
        players = combine_players_lists(player_ids, roster['players'], game_id)
    except Exception as e:
        # Say which game failed and why instead of dropping the exception
        shared.print_warning(
            'Problem with getting the teams or players for game {} {}'.format(game_id, e))
        return None, None

    return players, teams
Exemple #13
0
def scrape_pbp(game_id):
    """
    Fetch the json pbp for a game and parse it.

    :param game_id: Given Game id (e.g. 18507472)

    :return: DataFrame with pbp info; None if the json can't be obtained and an
             empty DataFrame if parsing raises
    """
    raw_json = get_pbp(game_id)

    if raw_json:
        try:
            return parse_json(raw_json, game_id)
        except Exception as e:
            shared.print_warning('Error parsing Json pbp for game {} {}'.format(game_id, e))
            return pd.DataFrame()

    # get_pbp gave nothing usable back
    missing_msg = "Json pbp for game {} is not either not there or can't be obtained"
    shared.print_warning(missing_msg.format(game_id))
    return None
Exemple #14
0
def scrape_pbp(game_html, game_id, players, teams):
    """
    Clean and parse the html pbp of a game.

    :param game_html: Html doc for the game
    :param game_id: game to scrape
    :param players: dict with player info
    :param teams: dict with home and away teams

    :return: DataFrame of game info or None if it fails
    """
    if not game_html:
        missing_msg = "Html pbp for game {} is either not there or can't be obtained"
        shared.print_warning(missing_msg.format(game_id))
        return None

    plays_html = clean_html_pbp(game_html)

    # Without any plays left after cleaning there is nothing to parse
    if len(plays_html) < 1:
        shared.print_warning("Html pbp contains no plays, this game can't be scraped")
        return None

    try:
        parsed = parse_html(plays_html, players, teams)
    except Exception as e:
        shared.print_warning("Error parsing Html pbp for game {} {}".format(game_id, e))
        parsed = None

    return parsed
Exemple #15
0
def scrape_game(date, home_team, away_team, game_id=None):
    """
    Fetch the Espn play-by-play feed for one game and parse it.

    :param date: date of the game, ex: 2016-10-24
    :param home_team: tricode
    :param away_team: tricode
    :param game_id: Only provided for live games.

    :return: DataFrame from parse_espn with the period column cast to int, or None
             when the feed can't be fetched or parsed
    """
    fetch_warning = "Espn pbp for game {a} {b} {c} is either not there or can't be obtained {d}"
    parse_warning = "Error parsing Espn pbp for game {a} {b} {c} {d}"

    # Step 1: download the raw feed
    try:
        shared.print_warning('Using espn for pbp')
        raw_feed = get_espn_game(date, home_team, away_team, game_id)
    except Exception as e:
        shared.print_warning(fetch_warning.format(a=date, b=home_team, c=away_team, d=e))
        return None

    # Step 2: turn it into a DataFrame
    try:
        parsed = parse_espn(raw_feed)
    except Exception as e:
        shared.print_warning(parse_warning.format(a=date, b=home_team, c=away_team, d=e))
        return None

    # Periods are stored as integers
    period_as_int = parsed.period.astype(int)
    parsed.period = period_as_int

    return parsed
def get_players_json(players_json):
    """
    Build a lookup of the players listed in the json for a game.

    :param players_json: players section of json

    :return: dict keyed by the (uppercase, fixed) full name; each value holds the
             player's "id" ("NA" when the json has none) and uppercase "last_name"
    """
    players = {}

    for key in players_json:
        record = players_json[key]
        name = shared.fix_name(record["fullName"].upper())

        # Start out with a blank id; it is replaced right below
        details = {"id": " ", "last_name": record["lastName"].upper()}
        players[name] = details

        if "id" not in record:
            shared.print_warning(
                "{name} is missing an ID number in the pbp json".format(name=name)
            )
            details["id"] = "NA"
            continue

        details["id"] = record["id"]

    return players
def scrape_roster(game_id):
    """
    Fetch the roster for a game and pull the players and head coaches out of it.

    :param game_id: id for game

    :return: dict with the keys "players" and "head_coaches", or None if the roster
             is unavailable or can't be parsed
    """
    raw_roster = get_roster(game_id)

    if raw_roster:
        try:
            players, head_coaches = get_content(raw_roster)
        except Exception as e:
            shared.print_warning("Error parsing Roster for game {} {}".format(
                game_id, e))
            return None

        return {"players": players, "head_coaches": head_coaches}

    # get_roster gave nothing usable back
    missing_msg = "Roster for game {} is either not there or can't be obtained"
    shared.print_warning(missing_msg.format(game_id))
    return None
def scrape_game(game_id):
    """
    Fetch the json shifts for a game and parse them.

    :param game_id: game

    :return: DataFrame with info for the game; None if the json is unavailable,
             can't be parsed, or parses to an empty DataFrame
    """
    raw_shifts = get_shifts(game_id)

    if not raw_shifts:
        missing_msg = "Json shifts for game {} is either not there or can't be obtained"
        shared.print_warning(missing_msg.format(game_id))
        return None

    try:
        parsed = parse_json(raw_shifts, game_id)
    except Exception as e:
        shared.print_warning("Error parsing Json shifts for game {} {}".format(
            game_id, e))
        return None

    # An empty table is reported as a failure
    if parsed.empty:
        return None

    return parsed
def combine_html_json_pbp(json_df, html_df, game_id, date):
    """
    Join both data sources. First try merging on event id (which is the DataFrame index) if both DataFrames have the
    same number of rows. If they don't have the same number of rows, merge on: Period, Event, Seconds_Elapsed, p1_ID.

    :param json_df: json pbp DataFrame
    :param html_df: html pbp DataFrame
    :param game_id: id of game (only its last five characters are stored)
    :param date: date of game

    :return: finished pbp restricted to pbp_columns, or None if the merge fails
    """
    # Don't need those columns to merge in
    json_df = json_df.drop(['p1_name', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID'],
                           axis=1)

    try:
        # If they aren't equal it's usually due to the HTML containing a challenge event
        if html_df.shape[0] == json_df.shape[0]:
            # Same length: rows are matched by index position
            json_df = json_df[[
                'period', 'event', 'seconds_elapsed', 'xC', 'yC'
            ]]
            game_df = pd.merge(html_df,
                               json_df,
                               left_index=True,
                               right_index=True,
                               how='left')
        else:
            # We always merge if they aren't equal but we check if it's due to a challenge so we can print out a better
            # warning message for the user.
            # NOTE: May be slightly incorrect. It's possible for there to be a challenge and another issue for one game.
            if 'CHL' in list(html_df.Event):
                shared.print_warning(
                    "The number of rows in the Html and Json pbp are different because the"
                    " Json pbp, for some reason, does not include challenges. Will instead merge on "
                    "Period, Event, Time, and p1_id.")
            else:
                shared.print_warning(
                    "The number of rows in the Html and json pbp are different because "
                    "someone f****d up. Will instead merge on Period, Event, Time, and p1_id."
                )

            # Actual Merging
            game_df = pd.merge(
                html_df,
                json_df,
                left_on=['Period', 'Event', 'Seconds_Elapsed', 'p1_ID'],
                right_on=['period', 'event', 'seconds_elapsed', 'p1_ID'],
                how='left')

        # This is always done - because merge doesn't work well with shootouts
        game_df = game_df.drop_duplicates(
            subset=['Period', 'Event', 'Description', 'Seconds_Elapsed'])
    except Exception as e:
        # Report the cause as well, otherwise the failure can't be diagnosed
        shared.print_error(
            'Problem combining Html Json pbp for game {} {}'.format(game_id, e))
        return None

    game_df['Game_Id'] = game_id[-5:]
    game_df['Date'] = date

    return pd.DataFrame(game_df, columns=pbp_columns)