import os
import warnings
from io import StringIO

import pandas as pd
from github import Github
from github.GithubException import RateLimitExceededException

# The URL templates (roster_url, gamelog_url, ...), column lists
# (roster_columns, gamelog_columns, ...) and the get_text_file helper are
# assumed to be defined elsewhere in this module.


def _roster(team, season, checked=False):
    """
    Pulls retrosheet roster files
    """
    GH_TOKEN = os.getenv('GH_TOKEN', '')
    if not checked:
        g = Github(GH_TOKEN)
        try:
            # Confirm the roster file exists in the chadwickbureau/retrosheet repo
            repo = g.get_repo('chadwickbureau/retrosheet')
            tree = repo.get_git_tree('master')
            for t in tree.tree:
                if t.path == 'rosters':
                    subtree = t
            rosters = [t.path for t in repo.get_git_tree(subtree.sha).tree]
            file_name = f'{team}{season}.ROS'
            if file_name not in rosters:
                raise ValueError(
                    f'Roster not available for {team} in {season}')
        except RateLimitExceededException:
            warnings.warn(
                'Github rate limit exceeded. Cannot check if the file you want exists.',
                UserWarning)
    s = get_text_file(roster_url.format(team, season))
    data = pd.read_csv(StringIO(s), header=None, sep=',', quotechar='"')
    data.columns = roster_columns
    return data
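
# Example usage (a sketch; assumes network access and that 'NYA' is the
# Retrosheet team code you want. Pass checked=True to skip the GitHub
# existence check and avoid spending API calls):
#
#     yankees_2019 = _roster('NYA', 2019, checked=True)
#     print(yankees_2019.head())
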

def lcs_logs():
    """
    Pull Retrosheet LCS Game Logs
    """
    s = get_text_file(gamelog_url.format('LC'))
    data = pd.read_csv(StringIO(s), header=None, sep=',', quotechar='"')
    data.columns = gamelog_columns
    return data

def division_series_logs():
    """
    Pull Retrosheet Division Series Game Logs
    """
    s = get_text_file(gamelog_url.format('DV'))
    data = pd.read_csv(StringIO(s), header=None, sep=',', quotechar='"')
    data.columns = gamelog_columns
    return data

def all_star_game_logs():
    """
    Pull Retrosheet All Star Game Logs
    """
    s = get_text_file(gamelog_url.format('AS'))
    data = pd.read_csv(StringIO(s), header=None, sep=',', quotechar='"')
    data.columns = gamelog_columns
    return data
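
# Example usage for the postseason/All-Star wrappers above (a sketch; each
# returns a DataFrame with the same gamelog_columns layout, so the results
# can be combined directly):
#
#     postseason = pd.concat([lcs_logs(), division_series_logs()],
#                            ignore_index=True)
#     asg = all_star_game_logs()
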

def park_codes():
    """
    Pulls retrosheet Park IDs
    """
    s = get_text_file(parkid_url)
    data = pd.read_csv(StringIO(s), sep=',', quotechar='"')
    data.columns = parkcode_columns
    return data
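
# Example usage (a sketch; the returned frame is labeled with parkcode_columns,
# which is assumed to include a park ID column):
#
#     parks = park_codes()
#     print(parks.head())
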

def season_game_logs(season):
    """
    Pull Retrosheet game logs for a given season
    """
    GH_TOKEN = os.getenv('GH_TOKEN', '')
    # validate input
    g = Github(GH_TOKEN)
    repo = g.get_repo('chadwickbureau/retrosheet')
    gamelogs = [
        f.path[f.path.rfind('/') + 1:] for f in repo.get_contents('gamelog')
    ]
    file_name = f'GL{season}.TXT'
    if file_name not in gamelogs:
        raise ValueError(f'Season game logs not available for {season}')
    s = get_text_file(gamelog_url.format(season))
    data = pd.read_csv(StringIO(s), header=None, sep=',', quotechar='"')
    data.columns = gamelog_columns
    return data
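
# Example usage (a sketch; downloads the full 2018 regular-season game log,
# which is a fairly large DataFrame):
#
#     logs_2018 = season_game_logs(2018)
#     print(logs_2018.shape)
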

def schedules(season):
    """
    Pull retrosheet schedule for a given season
    """
    GH_TOKEN = os.getenv('GH_TOKEN', '')
    # validate input
    g = Github(GH_TOKEN)
    repo = g.get_repo('chadwickbureau/retrosheet')
    schedules = [
        f.path[f.path.rfind('/') + 1:] for f in repo.get_contents('schedule')
    ]
    file_name = f'{season}SKED.TXT'
    if file_name not in schedules:
        raise ValueError(f'Schedule not available for {season}')
    s = get_text_file(schedule_url.format(season))
    data = pd.read_csv(StringIO(s), header=None, sep=',', quotechar='"')
    data.columns = schedule_columns
    return data
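
# Example usage (a sketch):
#
#     sked_2019 = schedules(2019)
#     print(sked_2019.head())
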

def events(season, type='regular', export_dir='.'):
    """
    Pulls retrosheet event files for an entire season. The `type` argument
    specifies whether to pull regular season, postseason, or asg files. Valid
    arguments are 'regular', 'post', and 'asg'.

    Right now, pybaseball does not parse the retrosheet files but downloads
    and saves them.
    """
    GH_TOKEN = os.getenv('GH_TOKEN', '')

    if not os.path.exists(export_dir):
        os.mkdir(export_dir)

    event_files = []
    try:
        # Walk the repo tree: event/ -> event/<type>/ -> files matching the season
        g = Github(GH_TOKEN)
        repo = g.get_repo('chadwickbureau/retrosheet')
        tree = repo.get_git_tree('master')
        for t in tree.tree:
            if t.path == 'event':
                subtree = t
        subtree = repo.get_git_tree(subtree.sha)
        for t in subtree.tree:
            if t.path == type:
                subsubtree = t
        event_files = [
            t.path for t in repo.get_git_tree(subsubtree.sha).tree
            if str(season) in t.path
        ]
        if len(event_files) == 0:
            raise ValueError(f'Event files not available for {season}')
    except RateLimitExceededException:
        # Without the GitHub listing we do not know the file names, so the
        # download loop below has nothing to fetch.
        warnings.warn(
            'Github rate limit exceeded. Cannot check if the file you want exists.',
            UserWarning)

    for filename in event_files:
        print(f'Downloading {filename}')
        s = get_text_file(event_url.format(type, filename))
        with open(os.path.join(export_dir, filename), 'w') as f:
            f.write(s)
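
# Example usage (a sketch; writes the raw Retrosheet event files for the season
# into the given directory, with file names taken from the GitHub listing):
#
#     events(1975, type='regular', export_dir='./retrosheet_events')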