import re
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Helper functions used below (get_schedule, fix_team, fix_name, scrape_game,
# convert_to_seconds, get_url, get_html, get_gameinfo) and the old_players list
# are assumed to be defined elsewhere in this module.


def scrape_date_range(start_date, end_date):
    """
    Scrape the ESPN coordinates for every game in the date range and attach the nhl.com game ids

    :param start_date: start date e.g. '2016-10-12'
    :param end_date: end date e.g. '2016-10-13'

    :return: full df with nhl.com game_ids added
    """
    # Pull up the nhl schedule so that we can incorporate official game_ids
    schedule_json = get_schedule(start_date, end_date)

    espn_coordinates = []
    for day in schedule_json['dates']:
        for game in day['games']:
            # Do not include preseason or all star games
            if 20000 <= int(str(game['gamePk'])[5:]) < 40000:
                date = day['date']
                away = fix_team(game['teams']['away']['team']['name'].upper())
                home = fix_team(game['teams']['home']['team']['name'].upper())

                coords = scrape_game(date, home, away)
                coords['Game_Id'] = game['gamePk']
                espn_coordinates.append(coords)

    espn_coordinates = pd.concat(espn_coordinates, ignore_index=False)

    return espn_coordinates
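
# The gamePk filter above relies on the nhl.com id layout: a 10-digit number of
# the form YYYYTTNNNN (season start year, two-digit game type, game number),
# where type 01 = preseason, 02 = regular season, 03 = playoffs, 04 = all-star.
# A minimal sketch of that decoding; _describe_game_pk is an illustrative
# helper, not part of the scraper itself:
def _describe_game_pk(game_pk):
    """Return (season_start_year, game_type) parsed from an nhl.com gamePk."""
    pk = str(game_pk)
    game_types = {'01': 'Preseason', '02': 'Regular Season', '03': 'Playoff', '04': 'All-Star'}
    return int(pk[:4]), game_types.get(pk[4:6], 'Unknown')

# e.g. _describe_game_pk(2016020001) -> (2016, 'Regular Season'); ids with
# int(str(pk)[5:]) in [20000, 40000) are exactly the 02 and 03 game types,
# which is what the range check above keeps.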
def parse_player(player_list, player):
    """
    Parse the raw json for a single player

    :param player_list: list of players from raw json
    :param player: player in player_list

    :return: dict of info for that player
    """
    players = dict()

    players['Player_Id'] = player_list[player]['id']
    players['Name'] = fix_name(player_list[player]['fullName'].upper())

    # Often attributes are missing so we need to check first
    if 'primaryPosition' in player_list[player]:
        players['Pos'] = player_list[player]['primaryPosition']['abbreviation']
    if 'shootsCatches' in player_list[player]:
        players['Shoots'] = player_list[player]['shootsCatches']
    if 'birthDate' in player_list[player]:
        players['Birth_Date'] = player_list[player]['birthDate']
    if 'birthCity' in player_list[player]:
        players['Birth_City'] = player_list[player]['birthCity']
    if 'birthStateProvince' in player_list[player]:
        players['Birth_Region'] = player_list[player]['birthStateProvince']
    if 'birthCountry' in player_list[player]:
        players['Birth_Country'] = player_list[player]['birthCountry']
    if 'nationality' in player_list[player]:
        players['Nationality'] = player_list[player]['nationality']
    if 'height' in player_list[player]:
        players['Height'] = player_list[player]['height']
    if 'weight' in player_list[player]:
        players['Weight'] = player_list[player]['weight']

    # Get draft info from the player's html page as it is not included in the json
    url = 'https://www.nhl.com/player/{}-{}-{}'.format(player_list[player]['firstName'],
                                                       player_list[player]['lastName'],
                                                       player_list[player]['id'])
    html = get_url(url)
    time.sleep(1)

    soup = BeautifulSoup(html.content, 'html.parser')
    bio_divs = soup.find_all('div', {'class': 'player-overview__bio'})  # Find bio section
    bio = [i.get_text() for i in bio_divs][0].split()                   # Split into a list of tokens

    try:
        # Find the index of the draft info and take the 9 tokens that describe it
        draft = bio[bio.index('Draft:'): bio.index('Draft:') + 9]
        players['Draft_Year'] = int(draft[1])
        players['Draft_Team'] = fix_team(draft[2].strip(','))
        players['Draft_Round'] = int(re.findall(r"\d+", draft[3])[0])
        players['Draft_Pick'] = int(re.findall(r"\d+", draft[5])[0])
        players['Draft_Overall'] = int(re.findall(r"\d+", draft[7])[0])
    except (ValueError, IndexError):
        # If the player is undrafted this section does not exist, so we just skip it
        pass

    # Record that we've parsed this player (old_players is a module-level list)
    old_players.append(player_list[player]['id'])

    return players
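
# A minimal sketch of what the draft parsing above expects, assuming the bio
# text renders like 'Draft: 2005 PIT, 1st rd, 1st pk (1st overall)'. The sample
# string and _example_parse_draft are illustrative only, not part of the scraper:
def _example_parse_draft():
    bio = "Ht: 6' 2\" Wt: 200 lb Draft: 2005 PIT, 1st rd, 1st pk (1st overall)".split()
    draft = bio[bio.index('Draft:'): bio.index('Draft:') + 9]
    # draft -> ['Draft:', '2005', 'PIT,', '1st', 'rd,', '1st', 'pk', '(1st', 'overall)']
    return {
        'Draft_Year': int(draft[1]),                           # 2005
        'Draft_Team': draft[2].strip(','),                     # 'PIT'
        'Draft_Round': int(re.findall(r"\d+", draft[3])[0]),   # 1
        'Draft_Pick': int(re.findall(r"\d+", draft[5])[0]),    # 1
        'Draft_Overall': int(re.findall(r"\d+", draft[7])[0])  # 1
    }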
def get_teams(response):
    """
    Extract Teams for date from doc

    :param response: doc

    :return: list of teams
    """
    soup = BeautifulSoup(response.text, 'lxml')

    td = soup.find_all('td', {'class': "team"})
    teams = [fix_team(t.get_text().upper()) for t in td if t.get_text() != '']

    # Make a list of both teams for each game
    games = [teams[i:i + 2] for i in range(0, len(teams), 2)]

    return games
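
# The grouping above just chunks the flat list of teams two at a time, one
# chunk per game. An illustrative trace (team abbreviations hypothetical):
#
#   teams = ['TOR', 'MTL', 'NYR', 'BOS']
#   [teams[i:i + 2] for i in range(0, len(teams), 2)]  # -> [['TOR', 'MTL'], ['NYR', 'BOS']]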
def analyze_shifts(shift, name, team):
    """
    Analyze shifts for each player. Prior to this, each player (in a dictionary) has a list
    with each entry being a shift. This function is only used for the html

    :param shift: info on shift
    :param name: player name
    :param team: given team

    :return: dict with info for shift
    """
    shifts = dict()

    shifts['Player'] = name.upper()
    shifts['Period'] = '4' if shift[1] == 'OT' else shift[1]
    shifts['Team'] = fix_team(team.strip(' '))
    shifts['Shift'] = shift[0]
    shifts['Start'] = convert_to_seconds(shift[2].split('/')[0])
    shifts['End'] = convert_to_seconds(shift[3].split('/')[0])
    shifts['Duration'] = convert_to_seconds(shift[4].split('/')[0])

    return shifts
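
# An illustrative trace of the html row this expects (values hypothetical):
# each time field holds 'elapsed / remaining', so only the part before the
# slash is converted. Assumes convert_to_seconds turns 'MM:SS' into seconds:
#
#   shift = ['1', '1', '0:45 / 19:15', '1:30 / 18:30', '0:45']
#   analyze_shifts(shift, 'Sidney Crosby', 'PIT ')
#   # -> {'Player': 'SIDNEY CROSBY', 'Period': '1', 'Team': 'PIT',
#   #     'Shift': '1', 'Start': 45, 'End': 90, 'Duration': 45}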
def get_players(soup):
    """
    scrape roster for players, scratches, captains, teams

    :param soup: html

    :return: dict for home & away players, dict for home & away scratches,
             dict for home & away captains, dict for teams
    """
    players = dict()
    scratches = dict()
    captains = dict()
    team = dict()

    tables = soup.find_all('table', {'align': 'center', 'border': '0', 'cellpadding': '0',
                                     'cellspacing': '0', 'width': '100%'})

    """
    There are 5 tables which correspond to the above criteria.
    tables[0] is game info
    tables[1] is away starters
    tables[2] is home starters
    tables[3] is away scratches
    tables[4] is home scratches
    """
    del tables[0]

    player_info = [table.find_all('td') for table in tables]
    player_info = [[x.get_text() for x in group] for group in player_info]

    # Make list of list of 3 each. The three are: number, position, name (in that order)
    player_info = [[group[i:i + 3] for i in range(0, len(group), 3)] for group in player_info]

    # Get rid of the header column
    player_info = [[player for player in group if player[0] != '#'] for group in player_info]

    # Create dict that records captains for the given game
    # {'Away Captain': 'AWAY CAPTAIN', 'Away Assistants': 'AWAY ASSISTANTS',
    #  'Home Captain': 'HOME CAPTAIN', 'Home Assistants': 'HOME ASSISTANTS'}
    captains['Away Captain'] = [i for i in player_info[0] if i[0] != '\xa0' and i[2].find('(C)') != -1]
    captains['Away Assistants'] = [i for i in player_info[0] if i[0] != '\xa0' and i[2].find('(A)') != -1]
    captains['Home Captain'] = [i for i in player_info[1] if i[0] != '\xa0' and i[2].find('(C)') != -1]
    captains['Home Assistants'] = [i for i in player_info[1] if i[0] != '\xa0' and i[2].find('(A)') != -1]

    away_players = player_info[0] + player_info[2]
    home_players = player_info[1] + player_info[3]
    away_scratches = player_info[2]
    home_scratches = player_info[3]

    def fix_capt(player):
        """
        Sometimes a player has a (A) or (C) attached to their name

        :param player: list of player info -> [number, position, name]

        :return: fixed list
        """
        player[2] = player[2][:player[2].find('(')]
        player[2] = player[2].strip()

        return player

    # For those with (A) or (C) in the name field get rid of it
    # The first condition controls for when we get whitespace as one of the indices
    # Note: fix_capt edits the name in place, so the whitespace-filtered lists below keep the fix
    players['Away'] = [fix_capt(i) if i[0] != '\xa0' and i[2].find('(') != -1 else i for i in away_players]
    players['Home'] = [fix_capt(i) if i[0] != '\xa0' and i[2].find('(') != -1 else i for i in home_players]
    scratches['Away Scratch'] = [fix_capt(i) if i[0] != '\xa0' and i[2].find('(') != -1 else i
                                 for i in away_scratches]
    scratches['Home Scratch'] = [fix_capt(i) if i[0] != '\xa0' and i[2].find('(') != -1 else i
                                 for i in home_scratches]

    # Get rid of rows that are just whitespace
    players['Away'] = [i for i in away_players if i[0] != '\xa0']
    players['Home'] = [i for i in home_players if i[0] != '\xa0']
    scratches['Away Scratch'] = [i for i in away_scratches if i[0] != '\xa0']
    scratches['Home Scratch'] = [i for i in home_scratches if i[0] != '\xa0']

    # Returns home and away team
    teams = soup.find_all(class_='teamHeading')
    teams = [i.get_text() for i in teams]
    team['Away'] = fix_team(teams[0])
    team['Home'] = fix_team(teams[1])

    return players, scratches, captains, team
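
# A quick illustrative trace of fix_capt (the row values are hypothetical):
#
#   fix_capt(['87', 'C', 'SIDNEY CROSBY (C)'])  # -> ['87', 'C', 'SIDNEY CROSBY']
#   fix_capt(['71', 'C', 'EVGENI MALKIN (A)'])  # -> ['71', 'C', 'EVGENI MALKIN']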
def scrape_schedule(date_from, date_to):
    """
    Calls get_schedule and scrapes the raw schedule JSON

    :param date_from: scrape from this date e.g. '2010-10-03'
    :param date_to: scrape until this date e.g. '2011-06-20'

    :return: pd DF with game data for given date range
    """
    games = []
    schedule_json = get_schedule(date_from, date_to)

    for day in schedule_json['dates']:
        for game in day['games']:
            # Do not include preseason or all star games
            if 20000 <= int(str(game['gamePk'])[5:]) < 40000:
                schedule = dict()

                schedule['Date'] = day['date']
                schedule['Game_Id'] = game['gamePk']

                if game['gameType'] == 'R':
                    schedule['Game_Type'] = 'Regular Season'
                elif game['gameType'] == 'P':
                    schedule['Game_Type'] = 'Playoff'
                    # For playoffs the last three digits of the gamePk encode round/series/game
                    schedule['Round'] = int(str(game['gamePk'])[7])
                    schedule['Series'] = int(str(game['gamePk'])[8])
                    schedule['Game'] = int(str(game['gamePk'])[9])

                schedule['Season'] = game['season']
                schedule['Game_State'] = game['status']['detailedState']

                schedule['Away_Team'] = fix_team(game['teams']['away']['team']['name'].upper())
                schedule['Away_Team_Id'] = game['teams']['away']['team']['id']
                schedule['Away_Score'] = game['teams']['away']['score']
                schedule['Away_Wins'] = game['teams']['away']['leagueRecord']['wins']
                schedule['Away_Losses'] = game['teams']['away']['leagueRecord']['losses']
                if 'ot' in game['teams']['away']['leagueRecord']:
                    schedule['Away_OT'] = game['teams']['away']['leagueRecord']['ot']

                schedule['Home_Team'] = fix_team(game['teams']['home']['team']['name'].upper())
                schedule['Home_Team_Id'] = game['teams']['home']['team']['id']
                schedule['Home_Score'] = game['teams']['home']['score']
                schedule['Home_Wins'] = game['teams']['home']['leagueRecord']['wins']
                schedule['Home_Losses'] = game['teams']['home']['leagueRecord']['losses']
                if 'ot' in game['teams']['home']['leagueRecord']:
                    schedule['Home_OT'] = game['teams']['home']['leagueRecord']['ot']

                schedule['Venue'] = game['venue']['name']

                # The html report holds attendance, start/end times and timezone;
                # fall back to NaN if the report is missing or malformed
                try:
                    html = get_html(game['gamePk'])
                    time.sleep(1)
                    soup = BeautifulSoup(html.content, 'html.parser')
                    game_info = get_gameinfo(soup)    # Parse once and reuse
                    schedule['Attendance'] = game_info['Attendance']
                    schedule['Start'] = game_info['Start']
                    schedule['End'] = game_info['End']
                    schedule['Timezone'] = game_info['TZ']
                except Exception:
                    schedule['Attendance'] = np.NaN
                    schedule['Start'] = np.NaN
                    schedule['End'] = np.NaN
                    schedule['Timezone'] = np.NaN

                games.append(schedule)

    columns = ['Game_Id', 'Season', 'Date', 'Game_Type', 'Round', 'Series', 'Game', 'Game_State',
               'Home_Team_Id', 'Home_Team', 'Away_Team_Id', 'Away_Team', 'Home_Score', 'Away_Score',
               'Home_Wins', 'Home_Losses', 'Home_OT', 'Away_Wins', 'Away_Losses', 'Away_OT',
               'Venue', 'Attendance', 'Start', 'End', 'Timezone']
    games = pd.DataFrame(games, columns=columns)

    return games
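
# A minimal usage sketch (dates are illustrative; requires network access and
# sleeps ~1s per game while fetching the html reports):
#
#   df = scrape_schedule('2016-10-12', '2016-10-13')
#   df[['Game_Id', 'Date', 'Home_Team', 'Away_Team', 'Home_Score', 'Away_Score']].head()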