Ejemplo n.º 1
0
def get_all_players(players_soups):
    '''Builds a single df describing every player found in the given bref soups.

    Keyword arguments:
    players_soups -- A list of all the soups to be processed. The
        'all_players' table is read from each one.

    Returns a df with a fresh 0..n-1 index. Compared with the scraped table it
    gains pos1/pos2, college1/college2, first_name and last_name columns and
    loses the raw 'colleges', 'player' and 'pos' columns.
    '''
    frames = [
        get_bref_tables(soup, ['all_players'])['all_players']
        for soup in players_soups
    ]
    players = pd.concat(frames)

    # Split the '-' separated positions and ',' separated colleges into
    # their first and second components.
    for n in (1, 2):
        part = n - 1
        players.loc[:, 'pos%d' % n] = players['pos'].apply(
            lambda value: get_split(value, '-', part))
        players.loc[:, 'college%d' % n] = players['colleges'].apply(
            lambda value: get_split(value, ',', part))

    def reformat_birth_date(raw):
        # Blank birth dates stay blank; everything else becomes YYYY-MM-DD.
        if raw == '':
            return ''
        parsed = dt.datetime.strptime(raw, '%B %d, %Y')
        return parsed.date().strftime('%Y-%m-%d')

    players['birth_date'] = players['birth_date'].apply(reformat_birth_date)

    for numeric_column in ('year_max', 'year_min', 'weight'):
        players[numeric_column] = players[numeric_column].apply(to_int)

    def blank_current_season(season):
        # A final year equal to CURRENT_SEASON is replaced by NaN.
        if season == CURRENT_SEASON:
            return np.nan
        return season

    players['year_max'] = players['year_max'].apply(blank_current_season)

    # split_first_last is evaluated once per output column, per row.
    for position, name_column in enumerate(['first_name', 'last_name']):
        players.loc[:, name_column] = players.apply(
            lambda row: split_first_last(row['player'],
                                         row['bref'])[position],
            axis=1)

    players['height'] = players['height'].apply(convert_feet)

    trimmed = players.drop(['colleges', 'player', 'pos'], axis=1)
    return trimmed.reset_index(drop=True)
Ejemplo n.º 2
0
def get_boxscore_htmls_month(year, month, headers=None, url_template=None):
    '''Returns a df containing info for all played games in the given month.

    Keyword arguments:
    year -- the year the season ends in (must not exceed CURRENT_YEAR + 1)
    month -- the month as an integer between 1 and 12
    headers -- override headers to use for the soup object (default None)
    url_template -- override template to use for url (default None). The
        placeholders %year% and %month% are substituted with the year and the
        lower-cased English month name.

    Returns the df with date_game formatted as YYYY-MM-DD and the home/visitor
    points cast to int, restricted to rows whose home_pts is not blank.
    Returns None when get_soup yields a falsy value or when looking up the
    'all_schedule' table raises KeyError (that case is logged).

    Raises AssertionError when the arguments fail the checks above.
    '''
    # Validation is kept as assertions so callers still see AssertionError.
    assert type(year) is int and type(
        month) is int, 'Year and month must be int'
    assert year <= CURRENT_YEAR + 1, 'Year must be before %s' % (CURRENT_YEAR +
                                                                 1)
    assert 1 <= month <= 12, 'Month must be between 1 and 12'

    if url_template is None:
        url_template = "https://www.basketball-reference.com/leagues/NBA_%year%_games-%month%.html"
    month_url = url_template.replace('%year%', str(year)).replace(
        '%month%', calendar.month_name[month].lower())
    soup = get_soup(month_url, headers)

    if not soup:
        return None

    try:
        boxscores_month = get_bref_tables(soup, ['all_schedule'],
                                          'box_score_text')['all_schedule']
    except KeyError:
        logger_build.info(
            "Games table does not exist. Year: %s, month: %s." %
            (year, month))
        return None

    # Drop only the columns that are actually present (the same approach
    # get_boxscore takes), so a table lacking one of them does not raise
    # KeyError.
    drop_columns = [
        column for column in
        ['attendance', 'box_score_text', 'game_remarks', 'overtimes']
        if column in boxscores_month.columns
    ]
    boxscores_month = boxscores_month.drop(drop_columns, axis=1)
    boxscores_month = boxscores_month.rename(
        columns={
            'game_start_time': 'start_time',
            'home_team_name': 'home_team',
            'visitor_team_name': 'visitor_team'
        })
    boxscores_month['date_game'] = boxscores_month['date_game'].apply(
        lambda x: dt.datetime.strptime(x, '%a, %b %d, %Y').date().strftime(
            '%Y-%m-%d'))
    if 'start_time' in boxscores_month.columns:
        boxscores_month['start_time'] = boxscores_month['start_time'].apply(
            lambda x: column_time(x))

    # Keep only games that have been played. The explicit copy makes the
    # result an independent frame, so the column assignments below do not
    # write into a slice of the unfiltered table (SettingWithCopyWarning).
    boxscores_month = boxscores_month[
        boxscores_month['home_pts'] != ''].copy()

    for home_visitor in ['home', 'visitor']:
        pts_column = home_visitor + '_pts'
        boxscores_month[pts_column] = boxscores_month[pts_column].astype(int)

    return boxscores_month
Ejemplo n.º 3
0
def get_teams(url=None, headings=None):
    '''Returns a df containing the abbreviation and team name of all teams from bref page.

    Keyword arguments:
    url -- the url to scrape, bref team page (BREF_HTML + '/teams/') if none
        given (default None)
    headings -- the headings to use when scraping, passed straight to
        get_soup (default None)

    Returns a df with the columns 'abbreviation' and 'team', one row per
    distinct team name, built from the 'all_teams_active' and
    'all_teams_defunct' tables.

    Raises IndexError if a string bref value contains no '/teams/XXX' segment.
    '''
    if url is None:
        url = BREF_HTML + '/teams/'
    team_soup = get_soup(url, headings)

    tables = get_bref_tables(team_soup,
                             ['all_teams_active', 'all_teams_defunct'],
                             'franch_name')

    for table in tables.values():
        table.loc[:, 'team'] = table.apply(
            lambda row: combine_columns(row['franch_name'], row['team_name']),
            axis=1)

    # The first occurrence of each team name wins; the index is rebuilt once
    # after de-duplication.
    teams = pd.concat(tables).drop_duplicates('team').reset_index(drop=True)

    abbreviation_pattern = re.compile(r'(?<=/teams/)[A-Z]{3}')

    def extract_abbreviation(bref):
        # isinstance (rather than an exact type comparison) so that str
        # subclasses are parsed as well; non-string values (e.g. missing
        # links) are passed through unchanged.
        if isinstance(bref, str):
            return abbreviation_pattern.findall(bref)[0]
        return bref

    teams.loc[:, 'abbreviation'] = teams['bref'].apply(extract_abbreviation)

    return teams[['abbreviation', 'team']]
Ejemplo n.º 4
0
def get_boxscore(boxscore_soup, advanced=False):
    '''Returns a df containing boxscore data for both teams, given the soup of the boxscore url.
    For the basic box score every column whose name contains 'pct' is removed,
    as these can be inferred from data.
    Advanced box score option is in development stage. Will return df but formatting not refined.

    Keyword arguments:
    boxscore_soup -- A soup object of the boxscore url
    advanced -- If True, returns the advanced box score (Default False)

    Returns a df in which the original 'player' column has been dropped and
    the 'bref' column renamed to 'player'. An empty df is returned when
    concatenating the tables raises ValueError (e.g. no matching tables).

    Raises IndexError if more box score tables are found than teams returned
    by get_away_home_teams.
    '''
    table_id = ('all_box-[A-Z]{3}-game-advanced'
                if advanced else 'all_box-[A-Z]{3}-game-basic')

    tables = get_bref_tables(boxscore_soup, [re.compile(table_id)])
    teams = get_away_home_teams(boxscore_soup)

    # Tables are matched to teams by position; indexing (instead of popping
    # from the front) leaves the list returned by get_away_home_teams intact.
    for position, table in enumerate(tables.values()):
        if 'reason' in table.keys():
            table.loc[:, 'starter'] = table.apply(
                lambda row: is_starter(row.name, row.reason), axis=1)
        else:
            table.loc[:, 'starter'] = table.apply(
                lambda row: is_starter(row.name), axis=1)
        table.loc[:, 'team'] = teams[position]

    try:
        boxscore = pd.concat(list(tables.values()),
                             sort=False).reset_index(drop=True)
    except ValueError:
        return pd.DataFrame()

    # Remove the 'Reserves' separator rows. The copy makes the filtered frame
    # independent, so the assignments below do not write into a slice
    # (SettingWithCopyWarning).
    boxscore = boxscore[boxscore['player'] != 'Reserves'].copy()

    if advanced:
        column_drops = [
            'reason', 'player', 'efg_pct', 'ts_pct', 'fg3a_per_fga_pct',
            'fta_per_fga_pct', 'starter', 'team', 'mp', 'bpm'
        ]  # bpm newly added, should add at some point
    else:
        column_drops = ['reason', 'player'] + [
            header for header in boxscore.columns if 'pct' in header
        ]
        boxscore['mp'] = boxscore['mp'].apply(lambda x: convert_mp(x))

    # Only drop what is actually there; the available columns vary by table.
    column_drops = [x for x in column_drops if x in boxscore.columns]
    non_number = ['mp', 'player', 'starter', 'team']
    boxscore = boxscore.drop(column_drops, axis=1)
    boxscore = boxscore.rename(columns={'bref': 'player'})

    for column in boxscore.columns:
        if column not in non_number:
            # The second argument tells to_int whether this is a 'pct' column.
            is_pct = 'pct' in column
            boxscore[column] = boxscore[column].apply(
                lambda x: to_int(x, is_pct))

    return boxscore