def get_team_info(team_name, year_of_interest=None, team_soup=None):
    """Look up the multi-year park factors for the given team.

    :param team_name: full name of the team of interest
    :param year_of_interest: int season year (default: current year)
    :param team_soup: pre-fetched BeautifulSoup of the team page (optional)
    :return: (hitter_factor, pitcher_factor) tuple of ints, or None when the
        "multi-year:" park-adjustment entry cannot be found on the page
    """
    park_adjust_href = "/about/parkadjust.shtml"
    try:
        abbreviation = BaseballReference.team_dict.inv[team_name]
    except KeyError:
        raise BaseballReference.InvalidTeamName(team_name)

    if year_of_interest is None:
        year_of_interest = date.today().year

    if team_soup is None:
        team_page_url = (BaseballReference.BASE_URL + "/teams/" +
                         abbreviation + "/" + str(year_of_interest) + ".shtml")
        team_soup = BeautifulSoupHelper.get_soup_from_url(team_page_url)

    # The park factors live in <strong> nodes near the parkadjust anchor.
    anchor = team_soup.find("a", {"href": park_adjust_href})
    for strong_node in anchor.parent.parent.findAll("strong"):
        for content in strong_node.contents:
            if "multi-year:" not in content:
                continue
            # Sibling text looks like "... - <hitter> ..., ... - <pitcher> ..."
            factor_fields = strong_node.next_sibling.split(",")
            hitter_factor = int(
                factor_fields[0].split("-")[1].strip().split(" ")[0])
            pitcher_factor = int(
                factor_fields[1].split("-")[1].strip().split(" ")[0])
            return hitter_factor, pitcher_factor

    return None
def get_pitcher_soup(year=None):
    """Fetch the league-wide standard-pitching page for a season.

    :param year: int season year (default: current year)
    :return: BeautifulSoup of the MLB standard-pitching page
    """
    if year is None:
        year = date.today().year
    standard_pitching_url = (BaseballReference.BASE_URL + "/leagues/MLB/" +
                             str(year) + "-standard-pitching.shtml")
    return BeautifulSoupHelper.get_soup_from_url(standard_pitching_url)
def get_vs_pitcher_stats(batter_id, pitcher_id, soup=None):
    """Get a batter's head-to-head stats against a particular pitcher.

    :param batter_id: Baseball Reference ID of the batter
    :param pitcher_id: Baseball Reference ID of the pitcher
    :param soup: pre-fetched BeautifulSoup of the batter-vs-pitcher page
        (optional; fetched when None)
    :return: row dict from BaseballReference.get_vs_table_row_dict
    """
    if soup is None:
        url = (BaseballReference.BASE_URL +
               "/play-index/batter_vs_pitcher.cgi?batter=" + str(batter_id))
        # Parenthesized print is valid under both Python 2 and Python 3
        # (the original `print url` statement is Python-2-only syntax).
        print(url)
        soup = BeautifulSoupHelper.get_soup_from_url(url)
    return BaseballReference.get_vs_table_row_dict(soup, batter_id, pitcher_id)
def get_team_info(team_name, year_of_interest=None, team_soup=None):
    """Return the team's multi-year park factors from its Baseball Reference page.

    :param team_name: full name of the team of interest
    :param year_of_interest: int season year (default: current year)
    :param team_soup: pre-fetched BeautifulSoup of the team page (optional)
    :return: (hitter_factor, pitcher_factor) tuple of ints, or None if the
        "multi-year:" entry is not present
    """
    park_adjust_link = "/about/parkadjust.shtml"
    try:
        team_code = BaseballReference.team_dict.inv[team_name]
    except KeyError:
        raise BaseballReference.InvalidTeamName(team_name)

    if year_of_interest is None:
        year_of_interest = date.today().year

    if team_soup is None:
        team_soup = BeautifulSoupHelper.get_soup_from_url(
            BaseballReference.BASE_URL + "/teams/" + team_code + "/" +
            str(year_of_interest) + ".shtml")

    strong_nodes = team_soup.find(
        "a", {"href": park_adjust_link}).parent.parent.findAll("strong")
    for strong_node in strong_nodes:
        if not any("multi-year:" in piece for piece in strong_node.contents):
            continue
        # Text after the node: "... - <hitter> ..., ... - <pitcher> ..."
        fields = strong_node.next_sibling.split(",")
        hitter = int(fields[0].split("-")[1].strip().split(" ")[0])
        pitcher = int(fields[1].split("-")[1].strip().split(" ")[0])
        return hitter, pitcher

    return None
def get_name_from_id(rotowire_id):
    """Resolve a player's full name from his RotoWire ID.

    The daily lineups page abbreviates long names, so the player's own page
    is used as the authoritative source for the name.

    :param rotowire_id: unique ID for a player in RotoWire
    :return: str representation of the name of the player
    """
    player_soup = BeautifulSoupHelper.get_soup_from_url(
        PLAYER_PAGE_BASE_URL + str(rotowire_id))
    name_heading = player_soup.find(
        "div", {"class": PLAYER_PAGE_LABEL}).find("h1")
    return name_heading.text.strip()
def get_recent_pitcher_stats(baseball_reference_id, soup=None):
    """Get a pitcher's splits over the last 14 days.

    :param baseball_reference_id: Baseball Reference ID of the pitcher
    :param soup: pre-fetched BeautifulSoup of the career split page (optional)
    :return: row dict for the "Last 14 days" split
    """
    if soup is None:
        split_url = (BaseballReference.BASE_URL + "/players/split.cgi?id=" +
                     str(baseball_reference_id) + "&year=Career&t=p")
        soup = BeautifulSoupHelper.get_soup_from_url(split_url)
    return BaseballReference.get_table_row_dict(
        soup, "total_extra", "Last 14 days", "Split")
def get_career_hitting_stats(baseball_reference_id, soup=None):
    """Get a hitter's career totals from his split page.

    :param baseball_reference_id: Baseball Reference ID of the hitter
    :param soup: pre-fetched BeautifulSoup of the career split page (optional)
    :return: row dict for the "Career Totals" split
    """
    if soup is None:
        split_url = (BaseballReference.BASE_URL + "/players/split.cgi?id=" +
                     str(baseball_reference_id) + "&year=Career&t=b")
        soup = BeautifulSoupHelper.get_soup_from_url(split_url)
    return BaseballReference.get_table_row_dict(
        soup, "total", "Career Totals", "Split")
def get_yesterdays_hitting_game_log(baseball_reference_id, soup=None):
    """Fetch the hitting game-log row for yesterday's game.

    :param baseball_reference_id: Baseball Reference ID of the hitter
    :param soup: pre-fetched BeautifulSoup of the game-log page (optional)
    :return: row dict for yesterday's date in the batting game log
    """
    yesterday = date.today() - timedelta(days=1)
    if soup is None:
        gamelog_url = (BaseballReference.BASE_URL + "/players/gl.cgi?id=" +
                       str(baseball_reference_id) + "&t=b&year=" +
                       str(yesterday.year))
        soup = BeautifulSoupHelper.get_soup_from_url(gamelog_url)
    # Row label, e.g. "Jun 3"
    row_label = (BaseballReference.date_abbreviations[yesterday.month] +
                 " " + str(yesterday.day))
    return BaseballReference.get_table_row_dict(
        soup, "batting_gamelogs", row_label, "Date")
def get_vs_pitcher_stats(batter_id, pitcher_id, soup=None):
    """Get a batter's head-to-head stats against a particular pitcher.

    :param batter_id: Baseball Reference ID of the batter
    :param pitcher_id: Baseball Reference ID of the pitcher
    :param soup: pre-fetched BeautifulSoup of the batter-vs-pitcher page
        (optional; fetched when None)
    :return: row dict from BaseballReference.get_vs_table_row_dict
    """
    if soup is None:
        url = (BaseballReference.BASE_URL +
               "/play-index/batter_vs_pitcher.cgi?batter=" + str(batter_id))
        # Parenthesized print works under both Python 2 and Python 3;
        # the original `print url` statement is Python-2-only syntax.
        print(url)
        soup = BeautifulSoupHelper.get_soup_from_url(url)
    return BaseballReference.get_vs_table_row_dict(soup, batter_id, pitcher_id)
def get_season_pitcher_stats(baseball_reference_id, year=None, soup=None):
    """Get a pitcher's totals for a particular season.

    :param baseball_reference_id: Baseball Reference ID of the pitcher
    :param year: int season year (default: current year)
    :param soup: pre-fetched BeautifulSoup of the season split page (optional)
    :return: row dict for the "<year> Totals" split
    """
    if year is None:
        year = date.today().year
    if soup is None:
        url = (BaseballReference.BASE_URL + "/players/split.cgi?id=" +
               str(baseball_reference_id) + "&year=" + str(year) + "&t=p")
        # Parenthesized print works under both Python 2 and Python 3;
        # the original `print url` statement is Python-2-only syntax.
        print(url)
        soup = BeautifulSoupHelper.get_soup_from_url(url)
    return BaseballReference.get_table_row_dict(
        soup, "total_extra", str(year) + " Totals", "Split")
def get_name_from_id(rotowire_id):
    """Look up a player's full name on his RotoWire player page.

    Names on the daily lineups page may be abbreviated, so the player's own
    page is consulted to get the unabbreviated name.

    :param rotowire_id: unique ID for a player in RotoWire
    :return: str representation of the name of the player
    """
    page_url = PLAYER_PAGE_BASE_URL + str(rotowire_id)
    soup = BeautifulSoupHelper.get_soup_from_url(page_url)
    label_div = soup.find("div", {"class": PLAYER_PAGE_LABEL})
    return label_div.find("h1").text.strip()
def get_yesterdays_hitting_game_log(baseball_reference_id, soup=None):
    """Fetch the hitting game-log row for yesterday's game, retrying once.

    :param baseball_reference_id: Baseball Reference ID of the hitter
    :param soup: pre-fetched BeautifulSoup of the game-log page (optional)
    :return: row dict for yesterday's date in the batting game log
    """
    yesterdays_date = date.today() - timedelta(days=1)
    if soup is None:
        soup = BeautifulSoupHelper.get_soup_from_url(
            BaseballReference.BASE_URL + "/players/gl.cgi?id=" +
            str(baseball_reference_id) + "&t=b&year=" +
            str(yesterdays_date.year))
    # Row label, e.g. "Jun 3" -- hoisted so both attempts agree.
    row_label = (BaseballReference.date_abbreviations[yesterdays_date.month] +
                 " " + str(yesterdays_date.day))
    try:
        return BaseballReference.get_table_row_dict(
            soup, "batting_gamelogs", row_label, "Date")
    # TODO: just try again for now, explore BeautifulSoup built-in options
    except BaseballReference.TableNotFound as e:
        # Parenthesized print works under both Python 2 and Python 3;
        # the original `print e` statement is Python-2-only syntax.
        print(e)
        return BaseballReference.get_table_row_dict(
            soup, "batting_gamelogs", row_label, "Date")
def get_vs_hand_hitting_stats(baseball_reference_id, hand_value, soup=None):
    """Get a hitter's career splits versus left- or right-handed pitching.

    :param baseball_reference_id: Baseball Reference ID of the hitter
    :param hand_value: BaseballReference.HandEnum member (LHP or RHP)
    :param soup: pre-fetched BeautifulSoup of the career split page (optional)
    :return: row dict for the "vs LHP"/"vs RHP" split, or None on a bad enum
    """
    if soup is None:
        soup = BeautifulSoupHelper.get_soup_from_url(
            BaseballReference.BASE_URL + "/players/split.cgi?id=" +
            str(baseball_reference_id) + "&year=Career&t=b")
    if hand_value is BaseballReference.HandEnum.LHP:
        hand = "vs LHP"
    elif hand_value is BaseballReference.HandEnum.RHP:
        hand = "vs RHP"
    else:
        # Parenthesized print works under both Python 2 and Python 3;
        # the original Python-2-only print statement would not.
        print("Invalid hand enum.")
        return None
    return BaseballReference.get_table_row_dict(soup, "plato", hand, "Split")
def get_pitching_game_log(baseball_reference_id, soup=None, game_date=None):
    """Fetch a pitcher's game-log row for a given date.

    :param baseball_reference_id: Baseball Reference ID of the pitcher
    :param soup: pre-fetched BeautifulSoup of the game-log page (optional)
    :param game_date: date of the game of interest (default: yesterday)
    :return: row dict for the given date in the pitching game log
    """
    if game_date is None:
        game_date = date.today() - timedelta(days=1)
    if soup is None:
        gamelog_url = (BaseballReference.BASE_URL + "/players/gl.cgi?id=" +
                       str(baseball_reference_id) + "&t=p&year=" +
                       str(game_date.year))
        soup = BeautifulSoupHelper.get_soup_from_url(gamelog_url)
    # Row label, e.g. "Jun 3"
    row_label = (BaseballReference.date_abbreviations[game_date.month] +
                 " " + str(game_date.day))
    return BaseballReference.get_table_row_dict(
        soup, "pitching_gamelogs", row_label, "Date")
def get_vs_hand_hitting_stats(baseball_reference_id, hand_value, soup=None):
    """Get a hitter's career splits against pitchers of the given handedness.

    :param baseball_reference_id: Baseball Reference ID of the hitter
    :param hand_value: BaseballReference.HandEnum member (LHP or RHP)
    :param soup: pre-fetched BeautifulSoup of the career split page (optional)
    :return: row dict for the "vs LHP"/"vs RHP" split, or None on a bad enum
    """
    if soup is None:
        soup = BeautifulSoupHelper.get_soup_from_url(
            BaseballReference.BASE_URL + "/players/split.cgi?id=" +
            str(baseball_reference_id) + "&year=Career&t=b")
    if hand_value is BaseballReference.HandEnum.LHP:
        hand = "vs LHP"
    elif hand_value is BaseballReference.HandEnum.RHP:
        hand = "vs RHP"
    else:
        # Parenthesized print works under both Python 2 and Python 3;
        # the original Python-2-only print statement would not.
        print("Invalid hand enum.")
        return None
    return BaseballReference.get_table_row_dict(soup, "plato", hand, "Split")
def get_yesterdays_hitting_game_log(baseball_reference_id, soup=None):
    """Fetch the hitting game-log row for yesterday, retrying once on failure.

    :param baseball_reference_id: Baseball Reference ID of the hitter
    :param soup: pre-fetched BeautifulSoup of the game-log page (optional)
    :return: row dict for yesterday's date in the batting game log
    """
    yesterdays_date = date.today() - timedelta(days=1)
    if soup is None:
        soup = BeautifulSoupHelper.get_soup_from_url(
            BaseballReference.BASE_URL + "/players/gl.cgi?id=" +
            str(baseball_reference_id) + "&t=b&year=" +
            str(yesterdays_date.year))
    # Row label, e.g. "Jun 3" -- hoisted so both attempts agree.
    row_label = (BaseballReference.date_abbreviations[yesterdays_date.month] +
                 " " + str(yesterdays_date.day))
    try:
        return BaseballReference.get_table_row_dict(
            soup, "batting_gamelogs", row_label, "Date")
    # TODO: just try again for now, explore BeautifulSoup built-in options
    except BaseballReference.TableNotFound as e:
        # Parenthesized print works under both Python 2 and Python 3;
        # the original `print e` statement is Python-2-only syntax.
        print(e)
        return BaseballReference.get_table_row_dict(
            soup, "batting_gamelogs", row_label, "Date")
def get_recent_pitcher_stats(baseball_reference_id, soup=None):
    """Return a pitcher's "Last 14 days" split row.

    :param baseball_reference_id: Baseball Reference ID of the pitcher
    :param soup: pre-fetched BeautifulSoup of the career split page (optional)
    :return: row dict for the "Last 14 days" split
    """
    if soup is None:
        career_split_url = (BaseballReference.BASE_URL +
                            "/players/split.cgi?id=" +
                            str(baseball_reference_id) + "&year=Career&t=p")
        soup = BeautifulSoupHelper.get_soup_from_url(career_split_url)
    return BaseballReference.get_table_row_dict(
        soup, "total_extra", "Last 14 days", "Split")
def get_hitter_page_career_soup(baseball_reference_id):
    """Fetch the career split page for a hitter.

    :param baseball_reference_id: Baseball Reference ID of the hitter
    :return: BeautifulSoup of the hitter's career split page
    """
    career_url = (BaseballReference.BASE_URL + "/players/split.cgi?id=" +
                  str(baseball_reference_id) + "&year=Career&t=b")
    return BeautifulSoupHelper.get_soup_from_url(career_url)
def get_pitcher_page_career_soup(baseball_reference_id):
    """Fetch the career split page for a pitcher.

    :param baseball_reference_id: Baseball Reference ID of the pitcher
    :return: BeautifulSoup of the pitcher's career split page
    """
    url = (BaseballReference.BASE_URL + "/players/split.cgi?id=" +
           str(baseball_reference_id) + "&year=Career&t=p")
    # Parenthesized print works under both Python 2 and Python 3;
    # the original `print url` statement is Python-2-only syntax.
    print(url)
    return BeautifulSoupHelper.get_soup_from_url(url)
def get_hitter_page_career_soup(baseball_reference_id):
    """Return the BeautifulSoup of a hitter's career split page.

    :param baseball_reference_id: Baseball Reference ID of the hitter
    :return: BeautifulSoup of the hitter's career split page
    """
    split_page_url = "%s/players/split.cgi?id=%s&year=Career&t=b" % (
        BaseballReference.BASE_URL, str(baseball_reference_id))
    return BeautifulSoupHelper.get_soup_from_url(split_page_url)
def get_pitcher_page_career_soup(baseball_reference_id):
    """Return the BeautifulSoup of a pitcher's career split page.

    :param baseball_reference_id: Baseball Reference ID of the pitcher
    :return: BeautifulSoup of the pitcher's career split page
    """
    url = (BaseballReference.BASE_URL + "/players/split.cgi?id=" +
           str(baseball_reference_id) + "&year=Career&t=p")
    # Parenthesized print works under both Python 2 and Python 3;
    # the original `print url` statement is Python-2-only syntax.
    print(url)
    return BeautifulSoupHelper.get_soup_from_url(url)
def get_game_lineups(database_session):
    """Mine the RotoWire daily lineups page and get the players' name, team,
    and RotoWire ID.

    Note: longer names are abbreviated by RotoWire and need to be resolved by
    another source.

    :param database_session: SQLAlchemy session used to persist GameEntry rows
    :return: list of Game objects representing the lineups for the day
    """
    # TODO: add feature to look if it's going to rain
    lineup_soup = BeautifulSoupHelper.get_soup_from_url(DAILY_LINEUPS_URL)
    header_nodes = lineup_soup.findAll("div", {"class": TEAM_REGION_LABEL})
    games = list()
    for header_node in header_nodes:
        game_node = header_node.parent
        home_team_lineup = list()
        away_team_lineup = list()
        away_team_abbreviation = game_node.find(
            "div", {"class": AWAY_TEAM_REGION_LABEL}).text.split()[0]
        home_team_abbreviation = game_node.find(
            "div", {"class": HOME_TEAM_REGION_LABEL}).text.split()[0]
        game_main_soup = game_node.find("div", {"class": LINEUPS_CLASS_LABEL})
        for away_player in game_main_soup.findAll(
                "div", {"class": AWAY_TEAM_PLAYER_LABEL}):
            away_team_lineup.append(
                get_hitter(away_player, away_team_abbreviation,
                           database_session))
        for home_player in game_main_soup.findAll(
                "div", {"class": HOME_TEAM_PLAYER_LABEL}):
            home_team_lineup.append(
                get_hitter(home_player, home_team_abbreviation,
                           database_session))
        try:
            pitchers = game_node.find("div",
                                      PITCHERS_REGION_LABEL).findAll("div")
            away_team_pitcher = get_pitcher(pitchers[0],
                                            away_team_abbreviation,
                                            database_session)
            home_team_pitcher = get_pitcher(pitchers[1],
                                            home_team_abbreviation,
                                            database_session)
        # No pitchers present on page
        except AttributeError:
            # Rejoined the print statement that was broken across a line
            # break (a syntax error as written); parenthesized print works
            # under both Python 2 and Python 3.
            print("Game between %s and %s is not valid." %
                  (away_team_abbreviation, home_team_abbreviation))
            continue
        current_game = Game(away_team_lineup, away_team_pitcher,
                            home_team_lineup, home_team_pitcher)
        # TODO: since they only release the ump data ~1 hour before the game,
        # we'll have to make this robust later
        try:
            game_time = game_node.find(
                "div", {"class": TIME_REGION_LABEL}).find(
                    "a").text.replace("ET", "").strip()
            game_time = datetime.strptime(game_time,
                                          '%I:%M %p').strftime("%H:%M")
            game_entry = GameEntry(date.today(), game_time,
                                   home_team_abbreviation,
                                   away_team_abbreviation)
            game_entry.wind_speed = get_wind_speed(game_node)
            game_entry.ump_ks_per_game = get_ump_ks_per_game(game_node)
            game_entry.ump_runs_per_game = get_ump_runs_per_game(game_node)
            game_entry.park_hitter_score, game_entry.park_pitcher_score = \
                BaseballReference.get_team_info(
                    team_dict[home_team_abbreviation])
            database_session.add(game_entry)
            database_session.commit()
        except IntegrityError:
            database_session.rollback()
            print("Warning: attempt to duplicate game entry: %s %s %s %s" %
                  (str(home_team_abbreviation), str(away_team_abbreviation),
                   str(game_entry.game_date), str(game_entry.game_time)))
        except Exception as e:
            # Best-effort: ump/park data may simply not be published yet.
            print(e)
        if current_game.is_valid():
            games.append(current_game)
        else:
            print("Game between %s and %s is not valid." %
                  (away_team_abbreviation, home_team_abbreviation))
    return games
def get_pitcher_soup(year=None):
    """Return the BeautifulSoup of the MLB standard-pitching page.

    :param year: int season year (default: current year)
    :return: BeautifulSoup of the league-wide standard pitching page
    """
    season = date.today().year if year is None else year
    url = "%s/leagues/MLB/%s-standard-pitching.shtml" % (
        BaseballReference.BASE_URL, str(season))
    return BeautifulSoupHelper.get_soup_from_url(url)
def get_game_lineups(database_session):
    """Mine the RotoWire daily lineups page and get the players' name, team,
    and RotoWire ID.

    Note: longer names are abbreviated by RotoWire and need to be resolved by
    another source.

    :param database_session: SQLAlchemy session used to persist GameEntry rows
    :return: list of Game objects representing the lineups for the day
    """
    # TODO: add feature to look if it's going to rain
    lineup_soup = BeautifulSoupHelper.get_soup_from_url(DAILY_LINEUPS_URL)
    header_nodes = lineup_soup.findAll("div", {"class": TEAM_REGION_LABEL})
    games = list()
    for header_node in header_nodes:
        game_node = header_node.parent
        home_team_lineup = list()
        away_team_lineup = list()
        away_team_abbreviation = game_node.find("div", {
            "class": AWAY_TEAM_REGION_LABEL
        }).text.split()[0]
        home_team_abbreviation = game_node.find("div", {
            "class": HOME_TEAM_REGION_LABEL
        }).text.split()[0]
        game_main_soup = game_node.find("div", {"class": LINEUPS_CLASS_LABEL})
        for away_player in game_main_soup.findAll(
                "div", {"class": AWAY_TEAM_PLAYER_LABEL}):
            away_team_lineup.append(
                get_hitter(away_player, away_team_abbreviation,
                           database_session))
        for home_player in game_main_soup.findAll(
                "div", {"class": HOME_TEAM_PLAYER_LABEL}):
            home_team_lineup.append(
                get_hitter(home_player, home_team_abbreviation,
                           database_session))
        try:
            pitchers = game_node.find("div",
                                      PITCHERS_REGION_LABEL).findAll("div")
            away_team_pitcher = get_pitcher(pitchers[0],
                                            away_team_abbreviation,
                                            database_session)
            home_team_pitcher = get_pitcher(pitchers[1],
                                            home_team_abbreviation,
                                            database_session)
        # No pitchers present on page
        except AttributeError:
            # Rejoined the print statement that was broken across a line
            # break (a syntax error as written); parenthesized print works
            # under both Python 2 and Python 3.
            print("Game between %s and %s is not valid." %
                  (away_team_abbreviation, home_team_abbreviation))
            continue
        current_game = Game(away_team_lineup, away_team_pitcher,
                            home_team_lineup, home_team_pitcher)
        # TODO: since they only release the ump data ~1 hour before the game,
        # we'll have to make this robust later
        try:
            game_time = game_node.find("div", {
                "class": TIME_REGION_LABEL
            }).find("a").text.replace("ET", "").strip()
            game_time = datetime.strptime(game_time,
                                          '%I:%M %p').strftime("%H:%M")
            game_entry = GameEntry(date.today(), game_time,
                                   home_team_abbreviation,
                                   away_team_abbreviation)
            game_entry.wind_speed = get_wind_speed(game_node)
            game_entry.ump_ks_per_game = get_ump_ks_per_game(game_node)
            game_entry.ump_runs_per_game = get_ump_runs_per_game(game_node)
            game_entry.park_hitter_score, game_entry.park_pitcher_score = \
                BaseballReference.get_team_info(
                    team_dict[home_team_abbreviation])
            database_session.add(game_entry)
            database_session.commit()
        except IntegrityError:
            database_session.rollback()
            print("Warning: attempt to duplicate game entry: %s %s %s %s" %
                  (str(home_team_abbreviation), str(away_team_abbreviation),
                   str(game_entry.game_date), str(game_entry.game_time)))
        except Exception as e:
            # Best-effort: ump/park data may simply not be published yet.
            print(e)
        if current_game.is_valid():
            games.append(current_game)
        else:
            print("Game between %s and %s is not valid." %
                  (away_team_abbreviation, home_team_abbreviation))
    return games
def get_career_hitting_stats(baseball_reference_id, soup=None):
    """Return a hitter's "Career Totals" split row.

    :param baseball_reference_id: Baseball Reference ID of the hitter
    :param soup: pre-fetched BeautifulSoup of the career split page (optional)
    :return: row dict for the "Career Totals" split
    """
    if soup is None:
        career_split_url = (BaseballReference.BASE_URL +
                            "/players/split.cgi?id=" +
                            str(baseball_reference_id) + "&year=Career&t=b")
        soup = BeautifulSoupHelper.get_soup_from_url(career_split_url)
    return BaseballReference.get_table_row_dict(
        soup, "total", "Career Totals", "Split")