def restructure_ncaa_schedule(df, year):
    """
    Purpose: Restructure a raw NCAA schedule into the desired format.

    Inputs
    ------
        df : Pandas DataFrame
            Contains the schedule of all games from a single NCAA season.
            The body reads the columns 'home_team', 'away_team',
            'start_date', 'season', 'week', 'home_points', 'away_points',
            'neutral_site' and 'venue', plus the columns named in the drop
            step. The 'home_team' / 'away_team' columns of the frame passed
            in are overwritten in place before a new frame is derived.
        year : int
            Desired year in which to restructure a desired schedule.
            Kept for interface compatibility; the body does not read it.

    Outputs
    -------
        df : Pandas DataFrame
            Cleaned up version of original dataframe, limited to the columns
            season, week, date, home_team, home_points, home_code,
            away_team, away_points, away_code, neutral_site and venue.
    """
    # fix problems with special characters
    df['home_team'] = df['home_team'].apply(lambda x: x.replace('�', 'e'))
    df['away_team'] = df['away_team'].apply(lambda x: x.replace('�', 'e'))

    # standardize team names
    df['home_team'] = rename_teams(df['home_team'], 'ncaa', 'Team')
    df['away_team'] = rename_teams(df['away_team'], 'ncaa', 'Team')

    # drop unnecessary columns (the final column selection below discards
    # whatever else is left over, so this list does not need to be exhaustive)
    df = df.drop(columns=[
        'id', 'season_type', 'conference_game', 'attendance', 'venue_id',
        'home_conference', 'home_line_scores', 'away_conference'
    ])

    # convert date to Eastern Standard Time (US) and then to YYYYMMDD format
    # NOTE(review): the original statement was cut off after `tz_convert(`;
    # the zone name and the integer YYYYMMDD formatting are reconstructed
    # from the comment above and from the NFL variant of this function --
    # confirm. tz_convert requires 'start_date' to parse as timezone-aware.
    df['date'] = pd.to_datetime(df['start_date']).dt.tz_convert(
        'America/New_York').dt.strftime('%Y%m%d').astype(int)

    # insert team codes
    df['away_code'] = rename_teams(df['away_team'], 'ncaa', 'TeamCode')
    df['home_code'] = rename_teams(df['home_team'], 'ncaa', 'TeamCode')

    # remove columns that are no longer needed and reorder in desired sequence
    df = df[[
        'season', 'week', 'date', 'home_team', 'home_points', 'home_code',
        'away_team', 'away_points', 'away_code', 'neutral_site', 'venue'
    ]]

    return df
# Example #2
def scrape_team(url_team, league):
    """
    Purpose: Scrapes the colors for an individual team.

    Inputs
    ------
        url_team : string
            hyperlink to an individual team's color page
        league : string
            Specifies the type of teams being scraped (i.e. 'nfl' or 'ncaa')

    Outputs
    -------
        dict_team : dictionary
            Contains the name of the team ('Team'), a list of color
            dictionaries ('colors', each with 'color', 'hex' and 'rgb'
            entries) and the url of the team's logo ('logo')
    """
    # local import so the block does not depend on a module-level import
    import time

    # scrape the team's page, backing off while the server answers with 503
    soup = soupify_url(url_team)
    while '503 Service' in str(soup):
        print('Connection error: Sleeping 10 seconds')
        # actually wait before retrying -- the message promised a pause but
        # the loop previously re-requested the page immediately
        time.sleep(10)
        soup = soupify_url(url_team)

    dict_team = {}

    # extract the full team name and standardize it
    team_name = soup.find('div', {'class': 'description'}).find('h2')
    team_name = html.unescape(str(team_name)).split(' color codes')[0].replace(
        '<h2>', '')
    dict_team['Team'] = rename_teams([team_name], league)[0]

    # isolate the colors and logo information
    team_info = soup.find('div', {'class': 'colors'})

    list_colors = []
    colors = team_info.find_all('tbody')
    for color in colors:
        rows = color.find_all('tr')
        dict_color = {}
        dict_color['color'] = color.find('th').text
        # NOTE(review): both lookups below were cut off after `find_all(` in
        # the original text. Taking the text of the last <td> of the row is
        # an assumption about the page layout -- confirm against a live page.
        dict_color['hex'] = rows[0].find_all('td')[-1].text.strip()
        dict_color['rgb'] = rows[1].find_all('td')[-1].text.strip()
        # turn a space-separated triple into a comma-separated one
        dict_color['rgb'] = dict_color['rgb'].replace(' ', ', ')
        # keep the color; without this the returned 'colors' list stays empty
        list_colors.append(dict_color)
    dict_team['colors'] = list_colors

    dict_team['logo'] = team_info.find('img')['src']

    return dict_team
def restructure_nfl_schedule(df, year):
    # Reshapes a schedule DataFrame: appends a year to each date string,
    # converts the dates to integer YYYYMMDD values, assigns home/away team
    # and points columns from four lists, standardizes team names and adds
    # team codes via rename_teams, and returns a fixed selection of columns.
    #
    # NOTE(review): this block does not parse as written. The lines from
    # "Purpose:" through "Cleaned up version of original dataframe" read like
    # a docstring whose triple-quote delimiters were lost, and several
    # statements further down look truncated (flagged inline). The code is
    # left untouched here.
    # NOTE(review): the description says "NCAA season" although every
    # rename_teams call below passes 'nfl' -- presumably a copy/paste slip.
    Purpose: Restructure a schedule from into the 
        desired format.

        df : Pandas DataFrame
            Contains the schedule of all games from a single NCAA season
        year : int
            Desired year in which to restructure a desired schedule
        df : Pandas DataFrame
            Cleaned up version of original dataframe
    # add the year to all dates as it does not exist
    list_dates = []
    for row in df['date']:
        # Account for the change of year
        # NOTE(review): an `else:` appears to be missing between the two
        # appends below. As written both sit under the `if`, so a
        # January/February row is appended twice (year + 1, then year) and
        # every other row is not appended at all.
        if (('January' in row) or ('February' in row)):
            list_dates.append(row + ', ' + str(year + 1))
            list_dates.append(row + ', ' + str(year))
    df['date'] = list_dates

    # convert dates from strings to datetimes format
    df['date'] = pd.to_datetime(df['date'])

    # convert dates to 'YYYYMMDD' format
    # (stored as an integer, e.g. 20190908)
    df['date'] = df['date'].dt.strftime('%Y%m%d').astype(int)

    # create home and away teams and home/away points
    list_home = []
    list_home_pts = []
    list_away = []
    list_away_pts = []

    # NOTE(review): the body of this loop is missing from the text -- nothing
    # visible appends to any of the four lists above. Presumably a missing
    # value in row['at'] decides which side is treated as the home team;
    # confirm against the original source.
    for index, row in df.iterrows():
        if pd.isna(row['at']):
    df['home_team'] = list_home
    df['home_points'] = list_home_pts
    df['away_team'] = list_away
    df['away_points'] = list_away_pts

    # standardize team names
    df['home_team'] = rename_teams(df['home_team'], 'nfl', 'Team')
    df['away_team'] = rename_teams(df['away_team'], 'nfl', 'Team')

    # create team IDs for each team
    df['home_code'] = rename_teams(df['home_team'], 'nfl', 'TeamCode')
    df['away_code'] = rename_teams(df['away_team'], 'nfl', 'TeamCode')

    # remove unnecessary columns and reorder variables
    # NOTE(review): the closing `]]` of this column selection appears to be
    # missing. 'season' and 'week' are not created in the visible code, so
    # they are presumably expected to exist in the incoming frame.
    df = df[[
        'season', 'week', 'date', 'home_team', 'home_points', 'home_code',
        'away_team', 'away_points', 'away_code'

    return df
# Example #4
def process_player_teams(soup):
    # Builds a year -> team mapping from a table on a player's page, fills
    # gap years with the previous year's team, and returns the mapping as a
    # list of (year, team) tuples sorted by year. Team names are passed
    # through rename_teams with the 'nfl' league key.
    #
    # NOTE(review): this block does not parse as written. The lines from
    # "Purpose:" through "played for)" read like a docstring whose
    # triple-quote delimiters were lost (the Purpose sentence itself is cut
    # off), and several statements below are truncated (flagged inline).
    # The code is left untouched here.
    Purpose: Scrapes player metadata from

        soup : BeautifulSoup Data
            A BeautifulSoup representation of a player's profile page
        list_teams : list of tuples
            Contains a series of year-team key-value pairs for each year in a 
            player's career (i.e. year the player played and the team they 
            played for)
    # NOTE(review): the extra indentation and the second assignment suggest
    # a try/except (read the table, fall back to an empty DataFrame) whose
    # `try:` / `except` lines were lost; the read_html call is also never
    # closed.
        df_body = pd.read_html(
            str(soup.find('div', {
                'id': 'content'
        df_body = pd.DataFrame()

    # remove any rows subsequent to (and including) "Career"
    # A MultiIndex header is flattened by dropping its first level.
    # NOTE(review): the slice and the trailing `if` are indented as though
    # they belonged to other statements that are missing, and the `if` has
    # no body.
    if isinstance(df_body.columns, pd.MultiIndex):
        df_body.columns = df_body.columns.droplevel()
        df_body = df_body[:df_body[df_body['Year'] == 'Career'].index[0]]
        if 'Career' not in df_body['Year']:

    # if a player misses a year due to injury, default to the previous team
    # NOTE(review): nothing visible appends to list_teams, so the assignment
    # to df_body['Tm'] after the loop cannot work as written; lines appear to
    # be missing inside the loop.
    list_teams = []
    prev_team = ''
    for index, row in df_body.iterrows():
        if len(row['Tm']) > 3:
            prev_team = row['Tm']
    df_body['Tm'] = list_teams

    # correct issues where a player plays for multiple teams in one year
    #   only keep the total and make the last team played for the year's team
    # For an 'NTM' total row, 'Tm' and 'No.' are copied from the row N
    # positions further down. iloc is given the iterrows index label, so this
    # relies on labels matching positions -- presumably a default RangeIndex;
    # confirm.
    list_rows = []
    for index, row in df_body.iterrows():
        if row['Tm'] == '2TM':
            row['Tm'] = df_body.iloc[index + 2]['Tm']
            row['No.'] = df_body.iloc[index + 2]['No.']
        elif row['Tm'] == '3TM':
            row['Tm'] = df_body.iloc[index + 3]['Tm']
            row['No.'] = df_body.iloc[index + 3]['No.']
        elif row['Tm'] == '4TM':
            row['Tm'] = df_body.iloc[index + 4]['Tm']
            row['No.'] = df_body.iloc[index + 4]['No.']
        # NOTE(review): this `if` has no body and nothing visible appends to
        # list_rows -- presumably qualifying rows were appended here; confirm.
        if (not pd.isna(row['Year'])) and (len(str(row['Year'])) > 3):
    df_body = pd.DataFrame(list_rows)

    # remove any non integer characters from year values (i.e. pro bowl indicators)
    # `digits` is not defined in this block -- presumably string.digits
    # imported at module level; confirm.
    df_body['Year'] = df_body['Year'].apply(
        lambda x: int(''.join([char for char in str(x) if char in digits])))

    # standardize the name of the player's school
    # (here the value standardized is the 'Tm' column, keyed by 'Year')
    dict_teams = {}
    for index, row in df_body.iterrows():
        team = rename_teams([row['Tm']], 'nfl', 'Team')[0]
        dict_teams[row['Year']] = team

    # isolate the teams the player played for each year
    # NOTE(review): this call is truncated after its first argument. The loop
    # below reads the year from each row's `.name`, so the dictionary keys
    # presumably became the index; confirm.
    df_teams = pd.DataFrame.from_dict(dict_teams,

    # iterate over each year and fill in missing years for a player with their
    #   previous team (i.e. if a player plays in 2012 and 2014, assign their
    #   2013 season to their 2012 team)
    for year in range(df_teams.iloc[0].name, df_teams.iloc[-1].name + 1):
        if year not in dict_teams.keys():
            dict_teams[year] = dict_teams[year - 1]

    # convert dictionary to list of tuples for easier writing-to-disk
    list_teams = sorted([(year, team) for year, team in dict_teams.items()],
                        key=(lambda x: x[0]))

    return list_teams
def process_player_teams(soup):
    # Builds a year -> school mapping from a table on a player's page, fills
    # gap years with the previous year's school, and returns the mapping as a
    # list of (year, school) tuples sorted by year. School names are passed
    # through rename_teams with the 'ncaa' league key.
    #
    # NOTE(review): this block does not parse as written. The lines from
    # "Purpose:" through "played for)" read like a docstring whose
    # triple-quote delimiters were lost (the Purpose sentence itself is cut
    # off), and several statements below are truncated (flagged inline).
    # The code is left untouched here.
    Purpose: Scrapes player metadata from

        soup : BeautifulSoup Data
            A BeautifulSoup representation of a player's profile page
        list_teams : list of tuples
            Contains a series of year-team key-value pairs for each year in a 
            player's career (i.e. year the player played and the team they 
            played for)
    # scrape the player's data
    # NOTE(review): the extra indentation and the bare `return []` suggest a
    # try/except (return an empty list when the table cannot be read) whose
    # `try:` / `except` lines were lost; the find(...) and read_html(...)
    # calls are also never closed.
        headers = soup.find('div', {
            'class': 'table_outer_container'
        # convert the column headers to a list of strings
        list_headers = [x.text for x in headers]
        # extract the player's statistical data (in table format)
        df_body = pd.read_html(
                soup.find('div', {
                    'class': 'table_outer_container'
        # insert the column headers
        df_body.columns = list_headers
        return []

    # remove any rows subsequent to (and including) "Career"
    # A MultiIndex header is flattened by dropping its first level; the
    # slice then keeps the rows before the first row whose 'Year' equals
    # 'Career' and raises IndexError when no such row exists.
    if isinstance(df_body.columns, pd.MultiIndex):
        df_body.columns = df_body.columns.droplevel()
    df_body = df_body[:df_body[df_body['Year'] == 'Career'].index[0]]

    # remove any non integer characters from year values (i.e. pro bowl indicators)
    # `digits` is not defined in this block -- presumably string.digits
    # imported at module level; confirm. Each value is iterated directly, so
    # 'Year' entries are expected to be strings here.
    df_body['Year'] = df_body['Year'].apply(
        lambda x: int(''.join([char for char in x if char in digits])))

    # standardize the name of the player's school
    dict_teams = {}
    for index, row in df_body.iterrows():
        team = rename_teams([row['School']], 'ncaa', 'Team')[0]
        dict_teams[row['Year']] = team

    # isolate the teams the player played for each year
    # NOTE(review): this call is truncated after its first argument. The loop
    # below reads the year from each row's `.name`, so the dictionary keys
    # presumably became the index; confirm.
    df_teams = pd.DataFrame.from_dict(dict_teams,
    # iterate over each year and fill in missing years for a player with their
    #   previous team (i.e. if a player plays in 2012 and 2014, assign their
    #   2013 season to their 2012 team)
    for year in range(df_teams.iloc[0].name, df_teams.iloc[-1].name + 1):
        if year not in dict_teams.keys():
            dict_teams[year] = dict_teams[year - 1]

    # convert dictionary to list of tuples for easier writing-to-disk
    list_teams = sorted([(year, school)
                         for year, school in dict_teams.items()],
                        key=(lambda x: x[0]))

    return list_teams