Example #1
def scrape_date_range(start_date, end_date):
    """

    :param start_date: start date
    :param end_date: end date
    :return: full df with nhl.com game_ids added
    """

    # pull up nhl schedule so that we can incorporate official game_ids
    schedule_json = get_schedule(start_date, end_date)

    espn_coordinates = []

    for day in schedule_json['dates']:
        for game in day['games']:
            # Do not include preseason or all star games
            if 20000 <= int(str(game['gamePk'])[5:]) < 40000:
                date = day['date']
                away = fix_team(game['teams']['away']['team']['name'].upper())
                home = fix_team(game['teams']['home']['team']['name'].upper())
                coords = scrape_game(date, home, away)
                coords['Game_Id'] = game['gamePk']
                espn_coordinates.append(coords)

    espn_coordinates = pd.concat(espn_coordinates, ignore_index=False)

    return espn_coordinates
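
# A minimal, self-contained sketch of the gamePk filter used above, with
# hypothetical ids. An nhl gamePk looks like SSSSTTNNNN (season, two digit game
# type, game number), so str(game_pk)[5:] is the low digit of the type followed
# by the game number; values in [20000, 40000) keep regular season (type 02) and
# playoff (type 03) games while skipping preseason (01) and all star (04) games.
def _is_regular_or_playoff(game_pk):
    return 20000 <= int(str(game_pk)[5:]) < 40000

assert _is_regular_or_playoff(2018020001)      # regular season game
assert _is_regular_or_playoff(2018030111)      # playoff game
assert not _is_regular_or_playoff(2018010005)  # preseason game
assert not _is_regular_or_playoff(2018040001)  # all star game
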
def parse_player(player_list, player):
    """
    :param player_list = list of players from raw json
    :param player = player in player_list
    :return: dict of home & away playing rosters
    """

    players = dict()

    players['Player_Id'] = player_list[player]['id']
    players['Name'] = fix_name(player_list[player]['fullName'].upper())
    # often attributes are missing so we need to check first
    if 'primaryPosition' in player_list[player]:
        players['Pos'] = player_list[player]['primaryPosition']['abbreviation']
    if 'shootsCatches' in player_list[player]:
        players['Shoots'] = player_list[player]['shootsCatches']
    if 'birthDate' in player_list[player]:
        players['Birth_Date'] = player_list[player]['birthDate']
    if 'birthCity' in player_list[player]:
        players['Birth_City'] = player_list[player]['birthCity']
    if 'birthStateProvince' in player_list[player]:
        players['Birth_Region'] = player_list[player]['birthStateProvince']
    if 'birthCountry' in player_list[player]:
        players['Birth_Country'] = player_list[player]['birthCountry']
    if 'nationality' in player_list[player]:
        players['Nationality'] = player_list[player]['nationality']
    if 'height' in player_list[player]:
        players['Height'] = player_list[player]['height']
    if 'weight' in player_list[player]:
        players['Weight'] = player_list[player]['weight']

    # get draft info from player html page as it is not included in the json
    url = 'https://www.nhl.com/player/{}-{}-{}'.format(
        player_list[player]['firstName'], player_list[player]['lastName'],
        player_list[player]['id'])
    html = get_url(url)
    time.sleep(1)
    soup = BeautifulSoup(html.content, 'html.parser')

    # find the bio section and split its text into a list of tokens
    spans = soup.find_all('div', {'class': 'player-overview__bio'})
    bio = [i.get_text() for i in spans][0].split()
    try:
        # slice out the nine tokens that follow 'Draft:' in the bio text
        draft = bio[bio.index('Draft:'):bio.index('Draft:') + 9]
        players['Draft_Year'] = int(draft[1])
        players['Draft_Team'] = fix_team(draft[2].strip(','))
        players['Draft_Round'] = int(re.findall(r"\d+", draft[3])[0])
        players['Draft_Pick'] = int(re.findall(r"\d+", draft[5])[0])
        players['Draft_Overall'] = int(re.findall(r"\d+", draft[7])[0])
    except (ValueError, IndexError):
        pass  # undrafted players have no 'Draft:' section, so this is not an error and we simply skip it

    old_players.append(player_list[player]['id'])

    return players
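
# Self-contained sketch of the draft parsing above, using a made-up bio string in
# the "Draft: YEAR TEAM, Nth rd, Nth pk (Nth overall)" shape the slice expects
# (the exact wording on the nhl.com player page may differ):
import re

bio = "Born: 1997-09-17 Draft: 2015 TOR, 1st rd, 4th pk (4th overall)".split()
draft = bio[bio.index('Draft:'):bio.index('Draft:') + 9]
assert int(draft[1]) == 2015                        # Draft_Year
assert draft[2].strip(',') == 'TOR'                 # Draft_Team (before fix_team)
assert int(re.findall(r"\d+", draft[3])[0]) == 1    # Draft_Round
assert int(re.findall(r"\d+", draft[5])[0]) == 4    # Draft_Pick
assert int(re.findall(r"\d+", draft[7])[0]) == 4    # Draft_Overall
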
Example #3
def get_teams(response):
    """
    Extract Teams for date from doc

    :param response: doc

    :return: list of two-team lists, one per game
    """
    soup = BeautifulSoup(response.text, 'lxml')

    td = soup.findAll('td', {'class': "team"})
    teams = [fix_team(t.get_text().upper()) for t in td if t.get_text() != '']

    # Make a list of both teams for each game
    games = [teams[i:i + 2] for i in range(0, len(teams), 2)]

    return games
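
# Quick illustration of the pairing step with a hypothetical flattened team list:
# the td cells come in pairs, one per game, so chunking by two rebuilds the two
# teams for each game.
teams = ['TOR', 'MTL', 'BOS', 'NYR']
games = [teams[i:i + 2] for i in range(0, len(teams), 2)]
assert games == [['TOR', 'MTL'], ['BOS', 'NYR']]
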
def analyze_shifts(shift, name, team):
    """
    Analyze shifts for each player when using.
    Prior to this each player (in a dictionary) has a list with each entry being a shift.
    This function is only used for the html
    :param shift: info on shift
    :param name: player name
    :param team: given team
    :return: dict with info for shift
    """
    shifts = dict()

    shifts['Player'] = name.upper()
    shifts['Period'] = '4' if shift[1] == 'OT' else shift[1]
    shifts['Team'] = fix_team(team.strip(' '))
    shifts['Shift'] = shift[0]
    shifts['Start'] = convert_to_seconds(shift[2].split('/')[0])
    shifts['End'] = convert_to_seconds(shift[3].split('/')[0])
    shifts['Duration'] = convert_to_seconds(shift[4].split('/')[0])

    return shifts
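
# Sketch of the shift row shape analyze_shifts expects, using a hypothetical row.
# The helpers fix_team and convert_to_seconds come from this module; a minimal
# stand-in for the latter is shown just so the numbers can be followed:
def _to_seconds(minutes):  # stand-in for convert_to_seconds on a 'MM:SS' string
    m, s = minutes.split(':')
    return int(m) * 60 + int(s)

shift_row = ['1', '2', '5:36 / 14:24', '6:24 / 13:36', '0:48']
assert _to_seconds(shift_row[2].split('/')[0]) == 336  # Start
assert _to_seconds(shift_row[3].split('/')[0]) == 384  # End
assert _to_seconds(shift_row[4].split('/')[0]) == 48   # Duration
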
Example #5
def get_players(soup):
    """
    scrape roster for players, scratches, captains, teams
    :param soup: html
    :return: dict for home & away players, dict for home & away scratches, dict for home & away captains, dict for teams
    """
    players = dict()
    scratches = dict()
    captains = dict()
    team = dict()

    tables = soup.find_all(
        'table', {
            'align': 'center',
            'border': '0',
            'cellpadding': '0',
            'cellspacing': '0',
            'width': '100%'
        })
    """
    There are 5 tables which correspond to the above criteria.
    tables[0] is game info
    tables[1] is away starters
    tables[2] is home starters
    tables[3] is away scratches
    tables[4] is home scratches
    """

    del tables[0]
    player_info = [table.find_all('td') for table in tables]

    player_info = [[x.get_text() for x in group] for group in player_info]

    # Make list of list of 3 each. The three are: number, position, name (in that order)
    player_info = [[group[i:i + 3] for i in range(0, len(group), 3)]
                   for group in player_info]

    # Get rid of the header row in each group
    player_info = [[player for player in group if player[0] != '#']
                   for group in player_info]

    # Create dict that records captains for the given game
    # {'Away Captain': 'AWAY CAPTAIN', 'Away Assistants': 'AWAY ASSISTANTS',
    # 'Home Captain': 'HOME CAPTAIN', 'Home Assistants': 'HOME ASSISTANTS'}
    captains['Away Captain'] = [
        i for i in player_info[0] if i[0] != '\xa0' and i[2].find('(C)') != -1
    ]
    captains['Away Assistants'] = [
        i for i in player_info[0] if i[0] != '\xa0' and i[2].find('(A)') != -1
    ]
    captains['Home Captain'] = [
        i for i in player_info[1] if i[0] != '\xa0' and i[2].find('(C)') != -1
    ]
    captains['Home Assistants'] = [
        i for i in player_info[1] if i[0] != '\xa0' and i[2].find('(A)') != -1
    ]

    away_players = player_info[0] + player_info[2]
    home_players = player_info[1] + player_info[3]
    away_scratches = player_info[2]
    home_scratches = player_info[3]

    def fix_capt(player):
        """
        Sometimes a player had a (A) or (C) attached to their name
        :param player: list of player info -> [number, position, name]
        :return: fixed list
        """
        player[2] = player[2][:player[2].find('(')]
        player[2] = player[2].strip()

        return player

    # For those with (A) or (C) in name field get rid of it
    # First condition is to control when we get whitespace as one of the indices
    players['Away'] = [
        fix_capt(i) if i[0] != '\xa0' and i[2].find('(') != -1 else i
        for i in away_players
    ]
    players['Home'] = [
        fix_capt(i) if i[0] != '\xa0' and i[2].find('(') != -1 else i
        for i in home_players
    ]
    scratches['Away Scratch'] = [
        fix_capt(i) if i[0] != '\xa0' and i[2].find('(') != -1 else i
        for i in away_scratches
    ]
    scratches['Home Scratch'] = [
        fix_capt(i) if i[0] != '\xa0' and i[2].find('(') != -1 else i
        for i in home_scratches
    ]

    # Get rid of rows that are just whitespace. fix_capt modified the entries in
    # place above, so the cleaned names carry over into these filtered lists
    players['Away'] = [i for i in away_players if i[0] != u'\xa0']
    players['Home'] = [i for i in home_players if i[0] != u'\xa0']
    scratches['Away Scratch'] = [i for i in away_scratches if i[0] != u'\xa0']
    scratches['Home Scratch'] = [i for i in home_scratches if i[0] != u'\xa0']

    # Get the away and home team names
    teams = soup.find_all(class_='teamHeading')
    teams = [i.get_text() for i in teams]
    team['Away'] = fix_team(teams[0])
    team['Home'] = fix_team(teams[1])

    return players, scratches, captains, team
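
# Tiny illustration of the header/whitespace filtering and the (C)/(A) cleanup
# above, with a hypothetical starters group (each entry is [number, position, name]):
group = [['#', 'Pos', 'Player'],
         ['91', 'C', 'JOHN TAVARES (C)'],
         ['\xa0', '\xa0', '\xa0']]
group = [p for p in group if p[0] != '#']       # drop the header row
group = [p for p in group if p[0] != '\xa0']    # drop whitespace-only rows
for p in group:
    if p[2].find('(') != -1:
        p[2] = p[2][:p[2].find('(')].strip()    # strip the trailing (C)/(A)
assert group == [['91', 'C', 'JOHN TAVARES']]
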
Example #6
def scrape_schedule(date_from, date_to):
    """
    Calls get_schedule and scrapes the raw schedule JSON
    :param date_from: scrape from this date e.g. '2010-10-03'
    :param date_to: scrape until this date e.g. '2011-06-20'
    :return: pd DF with game data for given date range
    """
    games = []

    schedule_json = get_schedule(date_from, date_to)

    for day in schedule_json['dates']:
        for game in day['games']:
            if 20000 <= int(str(game['gamePk'])[5:]) < 40000: # do not include preseason or all star games
                schedule = dict()
                schedule['Date'] = day['date']
                schedule['Game_Id'] = game['gamePk']
                if game['gameType'] == 'R':
                    schedule['Game_Type'] = 'Regular Season'
                elif game['gameType'] == 'P':
                    schedule['Game_Type'] = 'Playoff'
                    schedule['Round'] = int(str(game['gamePk'])[7])
                    schedule['Series'] = int(str(game['gamePk'])[8])
                    schedule['Game'] = int(str(game['gamePk'])[9])
                schedule['Season'] = game['season']
                schedule['Game_State'] = game['status']['detailedState']

                schedule['Away_Team'] = fix_team(game['teams']['away']['team']['name'].upper())
                schedule['Away_Team_Id'] = game['teams']['away']['team']['id']
                schedule['Away_Score'] = game['teams']['away']['score']
                schedule['Away_Wins'] = game['teams']['away']['leagueRecord']['wins']
                schedule['Away_Losses'] = game['teams']['away']['leagueRecord']['losses']
                if 'ot' in game['teams']['away']['leagueRecord']:
                    schedule['Away_OT'] = game['teams']['away']['leagueRecord']['ot']

                schedule['Home_Team'] = fix_team(game['teams']['home']['team']['name'].upper())
                schedule['Home_Team_Id'] = game['teams']['home']['team']['id']
                schedule['Home_Score'] = game['teams']['home']['score']
                schedule['Home_Wins'] = game['teams']['home']['leagueRecord']['wins']
                schedule['Home_Losses'] = game['teams']['home']['leagueRecord']['losses']
                if 'ot' in game['teams']['home']['leagueRecord']:
                    schedule['Home_OT'] = game['teams']['home']['leagueRecord']['ot']

                schedule['Venue'] = game['venue']['name']

                try:
                    html = get_html(game['gamePk'])
                    time.sleep(1)
                    soup = BeautifulSoup(html.content, 'html.parser')
                    game_info = get_gameinfo(soup)
                    schedule['Attendance'] = game_info['Attendance']
                    schedule['Start'] = game_info['Start']
                    schedule['End'] = game_info['End']
                    schedule['Timezone'] = game_info['TZ']
                except Exception:
                    schedule['Attendance'] = np.nan
                    schedule['Start'] = np.nan
                    schedule['End'] = np.nan
                    schedule['Timezone'] = np.nan

                games.append(schedule)

    columns = ['Game_Id', 'Season', 'Date', 'Game_Type', 'Round', 'Series', 'Game', 'Game_State', 'Home_Team_Id',
               'Home_Team', 'Away_Team_Id', 'Away_Team', 'Home_Score', 'Away_Score', 'Home_Wins', 'Home_Losses',
               'Home_OT', 'Away_Wins', 'Away_Losses', 'Away_OT', 'Venue', 'Attendance', 'Start', 'End', 'Timezone']

    games = pd.DataFrame(games, columns=columns)

    return games
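
# Worked example of the playoff gamePk decoding above, using a hypothetical id:
# for a playoff game the last three digits of the gamePk are the round, the
# series within the round, and the game within the series.
game_pk = 2018030215
assert int(str(game_pk)[7]) == 2  # Round
assert int(str(game_pk)[8]) == 1  # Series
assert int(str(game_pk)[9]) == 5  # Game
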