def restructure_ncaa_schedule(df, year):
    '''
    Purpose: Reshape a raw season schedule from collegefootballdata.com into
        the column layout used by the rest of the project.

    Inputs
    ------
        df : Pandas DataFrame
            Schedule of all games from a single NCAA season. Note that the
            'home_team' and 'away_team' columns of this frame are overwritten
            in place before a new frame is built.
        year : int
            Season being restructured (accepted for interface parity; the
            body does not read it)

    Outputs
    -------
        df : Pandas DataFrame
            Schedule limited to season, week, date (YYYYMMDD integer in
            US/Eastern time), home/away team, points and code, neutral-site
            flag and venue
    '''
    def _patch_bad_char(name):
        # swap the garbled special character for a plain 'e'
        return name.replace('�', 'e')

    team_columns = ('home_team', 'away_team')

    # repair special characters in every team column first ...
    for column in team_columns:
        df[column] = df[column].apply(_patch_bad_char)

    # ... and only then map the names onto their standardized spelling
    for column in team_columns:
        df[column] = rename_teams(df[column], 'ncaa', 'Team')

    # discard the columns that play no part in the output
    unwanted = [
        'id', 'season_type', 'conference_game', 'attendance', 'venue_id',
        'home_conference', 'home_line_scores', 'away_conference'
    ]
    df = df.drop(columns=unwanted)

    # express kickoff in US Eastern time, then as a YYYYMMDD integer
    kickoff = pd.to_datetime(df['start_date'])
    kickoff_eastern = kickoff.dt.tz_convert('US/Eastern')
    df['date'] = kickoff_eastern.dt.strftime('%Y%m%d').astype(int)

    # look up the team code for each side (away first, then home)
    df['away_code'] = rename_teams(df['away_team'], 'ncaa', 'TeamCode')
    df['home_code'] = rename_teams(df['home_team'], 'ncaa', 'TeamCode')

    # keep only the final columns, in their final order
    final_layout = [
        'season', 'week', 'date', 'home_team', 'home_points', 'home_code',
        'away_team', 'away_points', 'away_code', 'neutral_site', 'venue'
    ]
    return df[final_layout]
예제 #2
0
def scrape_team(url_team, league):
    '''
    Purpose: Scrapes the name, colors and logo of a single team.

    Inputs
    ------
        url_team : string
            hyperlink to an individual team's color page
        league : string
            Specifies the type of teams being scraped (i.e. 'nfl' or 'ncaa');
            forwarded to rename_teams when standardizing the team name

    Outputs
    -------
        dict_team : dictionary
            'Team'   : standardized team name
            'colors' : list of dictionaries, one per color, holding the
                       color's name ('color'), 'hex' value and 'rgb' value
            'logo'   : src attribute of the first image in the colors section
    '''
    # fetch the page, retrying every 10 seconds for as long as the response
    #   contains a "503 Service" message
    while True:
        page = soupify_url(url_team)
        if '503 Service' not in str(page):
            break
        print('Connection error: Sleeping 10 seconds')
        time.sleep(10)

    # the heading reads "<h2><name> color codes..."; keep only the name
    heading = page.find('div', {'class': 'description'}).find('h2')
    heading_markup = html.unescape(str(heading))
    raw_name = heading_markup.split(' color codes')[0].replace('<h2>', '')
    dict_team = {'Team': rename_teams([raw_name], league)[0]}

    # the "colors" div carries both the color tables and the logo image
    color_section = page.find('div', {'class': 'colors'})

    palette = []
    for swatch in color_section.find_all('tbody'):
        label = swatch.find('th').text
        rows = swatch.find_all('tr')
        # first row holds the hex value, second row the rgb value; in both
        #   the value sits in the second cell
        hex_value = rows[0].find_all('td')[1].text.strip()
        rgb_value = rows[1].find_all('td')[1].text.strip()
        palette.append({
            'color': label,
            'hex': hex_value,
            'rgb': rgb_value.replace(' ', ', '),
        })
    dict_team['colors'] = palette

    dict_team['logo'] = color_section.find('img')['src']

    return dict_team
def restructure_nfl_schedule(df, year):
    '''
    Purpose: Reshape a season schedule from pro-football-reference.com into
        the column layout used by the rest of the project.

    Inputs
    ------
        df : Pandas DataFrame
            Schedule of all games from a single NFL season. The frame is
            modified in place (date and home/away columns) before the final
            column selection is returned.
        year : int
            Year the season started in; games dated in January or February
            are assigned to year + 1

    Outputs
    -------
        df : Pandas DataFrame
            Schedule limited to season, week, date (YYYYMMDD integer) and the
            home/away team, points and code
    '''
    # the raw dates carry no year: append it, rolling over to the following
    #   year for games played in January or February
    df['date'] = [
        raw_date + ', ' + str(
            year + 1
            if ('January' in raw_date) or ('February' in raw_date) else year)
        for raw_date in df['date']
    ]

    # parse the date strings, then store them as YYYYMMDD integers
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates.dt.strftime('%Y%m%d').astype(int)

    # work out which side was at home: an empty 'at' cell means the winner
    #   hosted, anything else means the loser hosted
    home_names, home_scores = [], []
    away_names, away_scores = [], []
    for _, game in df.iterrows():
        winner_hosted = pd.isna(game['at'])
        winner = (game['Winner/tie'], game['PtsW'])
        loser = (game['Loser/tie'], game['PtsL'])
        host, visitor = (winner, loser) if winner_hosted else (loser, winner)
        home_names.append(host[0])
        home_scores.append(host[1])
        away_names.append(visitor[0])
        away_scores.append(visitor[1])
    df['home_team'] = home_names
    df['home_points'] = home_scores
    df['away_team'] = away_names
    df['away_points'] = away_scores

    # standardize the team names, then derive each team's code from them
    for side in ('home', 'away'):
        df[side + '_team'] = rename_teams(df[side + '_team'], 'nfl', 'Team')
    for side in ('home', 'away'):
        df[side + '_code'] = rename_teams(df[side + '_team'], 'nfl',
                                          'TeamCode')

    # keep only the final columns, in their final order
    final_layout = [
        'season', 'week', 'date', 'home_team', 'home_points', 'home_code',
        'away_team', 'away_points', 'away_code'
    ]
    return df[final_layout]
예제 #4
0
def process_player_teams(soup):
    '''
    Purpose: Builds a year-by-year list of the teams a player played for,
        read from the last table inside the "content" div of the player's
        sports-reference.com profile page.

    Inputs
    ------
        soup : BeautifulSoup Data
            A BeautifulSoup representation of a player's profile page

    Outputs
    -------
        list_teams : list of tuples
            (year, team) pairs sorted by year, with one entry for every year
            between the player's first and last listed season. Team names
            are standardized through rename_teams(..., 'nfl', 'Team'). An
            empty list is returned when the page has no usable table.
    '''
    from io import StringIO

    # scrape the last table on the page; this is best-effort, so any failure
    #   (missing div/table, unparsable markup) means "no data"
    try:
        table = soup.find('div', {'id': 'content'}).find_all('table')[-1]
        # wrap the markup in StringIO: newer pandas releases deprecate
        #   passing a literal HTML string to read_html
        df_body = pd.read_html(StringIO(str(table)))[0]
    except Exception:
        return []

    # flatten two-level column headers
    if isinstance(df_body.columns, pd.MultiIndex):
        df_body.columns = df_body.columns.droplevel()
    if ('Year' not in df_body.columns) or ('Tm' not in df_body.columns):
        return []

    # remove any rows subsequent to (and including) "Career", if present
    career_labels = df_body.index[df_body['Year'] == 'Career']
    if len(career_labels) > 0:
        df_body = df_body[:career_labels[0]]

    raw_years = df_body['Year'].tolist()

    # a team cell longer than 3 characters (or a blank cell) is a note rather
    #   than a team code, e.g. a season missed due to injury: default to the
    #   previous team
    list_codes = []
    prev_team = ''
    for cell in df_body['Tm'].tolist():
        if isinstance(cell, str) and len(cell) <= 3:
            prev_team = cell
        list_codes.append(prev_team)

    dict_teams = {}
    for position, (raw_year, code) in enumerate(zip(raw_years, list_codes)):
        # a player with several teams in one year gets a summary row ("2TM",
        #   "3TM", ...) followed by one row per team: credit the year to the
        #   last team listed
        if (len(code) == 3 and code[0] in '23456789'
                and code[1:] == 'TM'):
            code = list_codes[position + int(code[0])]

        # numeric cells: NaN marks a row without a year, otherwise drop the
        #   trailing ".0" so it is not mistaken for an extra digit
        if isinstance(raw_year, float):
            if pd.isna(raw_year):
                continue
            raw_year = int(raw_year)

        # only rows that carry a full year are seasons
        if pd.isna(raw_year) or len(str(raw_year)) <= 3:
            continue

        # remove any non integer characters (i.e. pro bowl indicators)
        year_digits = ''.join(
            [char for char in str(raw_year) if char in digits])
        if not year_digits:
            continue

        dict_teams[int(year_digits)] = rename_teams([code], 'nfl',
                                                    'Team')[0]

    if not dict_teams:
        return []

    # fill in missing years for a player with their previous team (i.e. if
    #   a player plays in 2012 and 2014, assign their 2013 season to their
    #   2012 team)
    for year in range(min(dict_teams), max(dict_teams) + 1):
        if year not in dict_teams:
            dict_teams[year] = dict_teams[year - 1]

    # convert dictionary to list of tuples for easier writing-to-disk
    return sorted(dict_teams.items())
def process_player_teams(soup):
    '''
    Purpose: Builds a year-by-year list of the schools a player played for,
        read from the first "table_outer_container" table on the player's
        sports-reference.com profile page.

    Inputs
    ------
        soup : BeautifulSoup Data
            A BeautifulSoup representation of a player's profile page

    Outputs
    -------
        list_teams : list of tuples
            (year, school) pairs sorted by year, with one entry for every
            year between the player's first and last listed season. School
            names are standardized through rename_teams(..., 'ncaa', 'Team').
            An empty list is returned when the page has no usable table.
    '''
    from io import StringIO

    # scrape the player's data; this is best-effort, so any failure (missing
    #   container/table, unparsable markup, header mismatch) means "no data"
    try:
        table = soup.find('div', {
            'class': 'table_outer_container'
        }).find('table')
        # the last header row holds the actual column names
        header_cells = table.find('thead').find_all('tr')[-1].find_all('th')
        list_headers = [cell.text for cell in header_cells]
        # wrap the markup in StringIO: newer pandas releases deprecate
        #   passing a literal HTML string to read_html
        df_body = pd.read_html(StringIO(str(table)))[0]
        df_body.columns = list_headers
    except Exception:
        return []

    if ('Year' not in df_body.columns) or ('School' not in df_body.columns):
        return []

    # remove any rows subsequent to (and including) "Career", if present
    career_labels = df_body.index[df_body['Year'] == 'Career']
    if len(career_labels) > 0:
        df_body = df_body[:career_labels[0]]

    dict_teams = {}
    for raw_year, school in zip(df_body['Year'].tolist(),
                                df_body['School'].tolist()):
        # numeric cells: NaN marks a row without a year, otherwise drop the
        #   trailing ".0" so it is not mistaken for an extra digit
        if isinstance(raw_year, float):
            if pd.isna(raw_year):
                continue
            raw_year = int(raw_year)

        # remove any non integer characters from the year (i.e. award
        #   indicators); rows without any digits are not seasons
        year_digits = ''.join(
            [char for char in str(raw_year) if char in digits])
        if not year_digits:
            continue

        # standardize the name of the player's school
        dict_teams[int(year_digits)] = rename_teams([school], 'ncaa',
                                                    'Team')[0]

    if not dict_teams:
        return []

    # fill in missing years for a player with their previous team (i.e. if
    #   a player plays in 2012 and 2014, assign their 2013 season to their
    #   2012 team)
    for year in range(min(dict_teams), max(dict_teams) + 1):
        if year not in dict_teams:
            dict_teams[year] = dict_teams[year - 1]

    # convert dictionary to list of tuples for easier writing-to-disk
    return sorted(dict_teams.items())