def get_players_urls(players_url=None):
    '''Returns soup objects of bref player pages (a-z).

    Keyword arguments:
    players_url -- the url used to scrape the soups (default None)
    '''
    players_soups = []
    if players_url is None:
        players_url = BREF_HTML + '/players/'
    # one index page per surname initial, 'a' through 'z'
    letters = [chr(n) for n in range(97, 123)]
    success_count, http_error_count = 0, 0
    start_time = time.time()
    for letter in letters:
        players_soup = get_soup(players_url + letter)
        if players_soup is not None:
            players_soups.append(players_soup)
            success_count += 1
        else:
            http_error_count += 1
    end_time = time.time()
    logger_build.info('Per run: {}, Successes: {}, Failures: {}'.format(
        (end_time - start_time) / (success_count + http_error_count),
        success_count, http_error_count))
    return players_soups
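# Usage sketch: a run fetches all 26 letter pages from
# basketball-reference.com, so expect 26 live requests.
#
#     letter_soups = get_players_urls()
#     print(len(letter_soups))  # up to 26, one soup per surname initial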
def get_boxscore_htmls_month(year, month, headers=None, url_template=None):
    '''Returns a df containing info for all games in the given month.

    Keyword arguments:
    year -- the year the season ends in
    month -- the month as an integer
    headers -- override headers to use for the soup object (default None)
    url_template -- override template to use for the url (default None)
    '''
    assert isinstance(year, int) and isinstance(month, int), \
        'Year and month must be int'
    assert year <= CURRENT_YEAR + 1, \
        'Year must be %s or earlier' % (CURRENT_YEAR + 1)
    assert 1 <= month <= 12, 'Month must be between 1 and 12'
    if url_template is None:
        url_template = ('https://www.basketball-reference.com/leagues/'
                        'NBA_%year%_games-%month%.html')
    month_url = url_template.replace('%year%', str(year)).replace(
        '%month%', calendar.month_name[month].lower())
    soup = get_soup(month_url, headers)
    if not soup:
        return None
    try:
        boxscores_month = get_bref_tables(soup, ['all_schedule'],
                                          'box_score_text')['all_schedule']
    except KeyError:
        logger_build.info('Games table does not exist. Year: %s, month: %s.' %
                          (year, month))
        return None
    drop_columns = [
        'attendance', 'box_score_text', 'game_remarks', 'overtimes'
    ]
    boxscores_month.drop(drop_columns, inplace=True, axis=1)
    boxscores_month.rename(columns={
        'game_start_time': 'start_time',
        'home_team_name': 'home_team',
        'visitor_team_name': 'visitor_team'
    }, inplace=True)
    # normalise dates like 'Fri, Oct 22, 2021' to ISO format '2021-10-22'
    boxscores_month.date_game = boxscores_month.date_game.apply(
        lambda x: dt.datetime.strptime(x, '%a, %b %d, %Y').date().strftime(
            '%Y-%m-%d'))
    if 'start_time' in boxscores_month.columns:
        boxscores_month.start_time = boxscores_month.start_time.apply(
            column_time)
    # keep only games that have been played
    boxscores_month = boxscores_month[
        boxscores_month.loc[:, 'home_pts'] != '']
    for home_visitor in ['home', 'visitor']:
        boxscores_month[home_visitor + '_pts'] = boxscores_month[
            home_visitor + '_pts'].astype(int)
    return boxscores_month
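# Usage sketch; the season straddles the new year, so games played in
# December 2019 are filed under the season ending in 2020:
#
#     dec_games = get_boxscore_htmls_month(2020, 12)
#     if dec_games is not None:
#         print(dec_games[['date_game', 'home_team', 'home_pts']].head())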
def get_teams(url=None, headings=None):
    '''Returns a df containing the abbreviation and team name of all teams
    from the bref page.

    Keyword arguments:
    url -- the url to scrape, bref team page if none given (default None)
    headings -- the headings to use when scraping, if none given uses the
        default behaviour of get_soup (default None)
    '''
    if url is None:
        url = BREF_HTML + '/teams/'
    team_soup = get_soup(url, headings)
    tables = get_bref_tables(team_soup,
                             ['all_teams_active', 'all_teams_defunct'],
                             'franch_name')
    for key in tables.keys():
        tables[key].loc[:, 'team'] = tables[key].apply(
            lambda row: combine_columns(row['franch_name'], row['team_name']),
            axis=1)
    teams = pd.concat(tables).reset_index(drop=True)
    teams = teams.drop_duplicates('team').reset_index(drop=True)
    # pull the three-letter abbreviation out of the bref link, e.g.
    # '/teams/BOS/' -> 'BOS'
    teams.loc[:, 'abbreviation'] = teams.bref.apply(
        lambda x: re.findall('(?<=/teams/)[A-Z]{3}', x)[0]
        if isinstance(x, str) else x)
    return teams[['abbreviation', 'team']]
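# Usage sketch:
#
#     teams = get_teams()
#     print(teams.head())  # two columns: 'abbreviation' (e.g. 'BOS') and
#                          # 'team' (the franchise name)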
def get_game_soups(games_table,
                   check_tables=['boxscores', 'fourfactors'],
                   limit=500,
                   crawl_sleep=True,
                   max_errors=3):
    """Returns a list containing the game id, bref and soup of all games in
    games_table not already in the check tables.

    A game is skipped only if it already appears in every check table.

    Keyword arguments:
    games_table -- the df of games, includes game id and bref
    check_tables -- the tables to check if the game has already been added
        (default ['boxscores', 'fourfactors'])
    limit -- the max number of new entries in the list (default 500)
    crawl_sleep -- whether to sleep for CRAWL_DELAY between requests
        (default True)
    max_errors -- the number of scraping errors tolerated before raising
        (default 3)
    """
    current_ids = []
    start_time = time.time()
    for table in check_tables:
        game_ids = list(stats_db.read_table(
            get_str='SELECT DISTINCT game_id from {};'.format(table)).game_id)
        if table == check_tables[0]:  # for first iteration
            current_ids = game_ids
        else:
            # keep only ids already present in every table checked so far
            current_ids = [
                game_id for game_id in current_ids if game_id in game_ids
            ]
    new_games = games_table[~games_table.game_id.isin(current_ids)]
    # read most recent games first
    new_games = new_games.sort_values('game_id', ascending=False)
    if len(new_games) == 0:
        logger_build.info('No new games to add to database')
        return []
    id_bref = zip(new_games.game_id, new_games.bref)
    id_bref_soup = []
    count = 0
    logger_build.info('Finished prep: {:.1f} seconds since start'.format(
        time.time() - start_time))
    start_time = time.time()
    pbar = progressbar.ProgressBar(max_value=limit,
                                   widgets=[
                                       ' [', progressbar.Timer(), '] ',
                                       progressbar.Percentage(),
                                       progressbar.Bar(),
                                       ' (', progressbar.ETA(), ') ',
                                   ])
    handshake_errors = 0
    pbar.start()
    for game_id, bref in id_bref:
        if count >= limit:
            break
        try:
            soup = get_soup(BREF_HTML + bref, timeout=10)
        except Exception as e:
            if handshake_errors < max_errors:
                logger_build.error('Scraping soup error: %s', e)
                handshake_errors += 1
                time.sleep(60)
                continue
            else:
                logger_build.error(
                    'Exceeded handshake error limit. Errors: %s, Max: %s' %
                    (handshake_errors, max_errors))
                raise
        id_bref_soup.append((game_id, bref, soup))
        count += 1
        if crawl_sleep:
            time.sleep(CRAWL_DELAY)
        pbar.update(count)
    pbar.finish()
    end_time = time.time()
    if count:  # avoid dividing by zero when nothing was scraped
        logger_build.info('Average run time excluding sleep: %s' %
                          ((end_time - start_time - count * CRAWL_DELAY) /
                           count))
    return id_bref_soup
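# Usage sketch with a hypothetical games_df carrying the 'game_id' and
# 'bref' columns the function expects:
#
#     new_soups = get_game_soups(games_df, limit=50)
#     for game_id, bref, soup in new_soups:
#         ...  # parse each soup and insert the rows into the check tables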