def scrape_games(loops, count_max, action):
    """Scrape boxscore stats in batches of `count_max` games, up to `loops` passes."""
    stats_db = SqlDataframes()
    games_table = stats_db.read_table('games', ['game_id', 'bref'])
    for i in range(loops):
        logger.info('Current Loop: {}'.format(i + 1))
        logger.info('Running game soups...')
        cfg = get_dbconfig(section='scraping')
        checktables_cfg = cfg['check_tables'].split(',')
        id_bref_soup = get_game_soups(games_table, limit=count_max,
                                      check_tables=checktables_cfg)
        if not id_bref_soup:
            break
        add_basic_gamestats(id_bref_soup, commit_changes=action)
        if i == loops - 1:
            stats_db = SqlDataframes()
            games_max = stats_db.read_max('games', 'game_id')
            boxs_max = stats_db.read_max('boxscores', 'game_id')
            logger.info('FINISHED...Games remaining to scrape: {}'.format(games_max - boxs_max))
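# Example usage (a minimal sketch; the argument values below are illustrative,
# not defaults taken from this module):
#
#     scrape_games(loops=3, count_max=50, action=True)
#
# Each pass fetches up to `count_max` game soups and writes the parsed
# boxscore stats when `action` is truthy.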
def periodic_scrape(action):
    stats_db = SqlDataframes()
    add_allnbateams(action)
def update_games(year):
    """Refresh player/college tables and insert the given season's games, including playoffs."""
    stats_db = SqlDataframes()
    players_soups = get_players_urls()
    players = get_all_players(players_soups)
    colleges = get_colleges(players)
    teams = get_teams()
    stats_db.add_to_db(colleges, 'colleges', check_column='college')
    players_ids = stats_db.apply_mappings(players, 'colleges', ['college1', 'college2'])
    stats_db.add_to_db(players_ids, 'players', check_column='bref')
    season_boxscore_htmls = get_boxscore_htmls_year(year, regular_length=False)
    games_ids = stats_db.apply_mappings(season_boxscore_htmls, 'teams',
                                        ['home_team', 'visitor_team'])
    stats_db.add_to_db(games_ids, 'games', 'bref', 'date_game')
    playoffs_ids = get_playoff_games((year, year))
    if playoffs_ids.empty:
        logger.info('No playoff games added for season: {}'.format(year))
    else:
        stats_db.add_to_db(playoffs_ids, 'playoffgames', 'game_id', 'game_id')
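# Example usage (a minimal sketch; the season year is illustrative):
#
#     update_games(2019)
#
# This refreshes the players and colleges tables, inserts the season's
# boxscore links into `games`, and adds any playoff games found for that year.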
import logging
import datetime as dt

from nba_stats.scraping.base_functions import get_soup, get_bref_soup, get_bref_tables, get_table
from nba_stats.scraping.functions import (split_first_last, get_split, convert_feet,
                                          combine_columns, is_starter, to_int, convert_mp,
                                          include_comments, column_time)
from nba_stats.read_write.db_insert import SqlDataframes
from nba_stats.read_write.functions import export_txt, create_schema_str

CURRENT_YEAR = dt.datetime.now().year
CURRENT_SEASON = CURRENT_YEAR + 1 if dt.datetime.now().month > 7 else CURRENT_YEAR
BREF_HTML = 'https://www.basketball-reference.com'
CRAWL_DELAY = 3
SEASON_TEAMS = {1977: 22, 1981: 23, 1989: 25, 1990: 27, 1996: 29, 2005: 30}
PLAYOFF_TEAMS = {1954: 6, 1967: 8, 1975: 10, 1977: 12, 1984: 16}

stats_db = SqlDataframes()

logger_build = logging.getLogger(__name__)
# handler = logging.StreamHandler()
# file_handler = logging.FileHandler("logging\\%s.log" % dt.datetime.today().strftime('%Y%m%d'))
# formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-10s %(message)s')
# for a_handler in [handler, file_handler]:
#     a_handler.setFormatter(formatter)
# logger_build.addHandler(handler)
# logger_build.addHandler(file_handler)
# logger_build.setLevel(logging.INFO)


def get_players_urls(players_url=None):
    '''Returns soup objects of bref player pages (a-z)