def __init__(self):
    """
    Set the default scraper options and create the parser and manager
    objects this scraper works with.
    """
    # Option defaults.
    self.source = "site"
    self.all_players = self.scrape_bvp = False
    self.player_id = self.url = None

    # Parser instances.
    self.player_list_parser = PlayerListParser()
    self.player_season_stats_parser = PlayerSeasonStatsParser()
    self.player_gamelog_parser = PlayerGamelogParser()
    self.player_splits_parser = PlayerSplitsParser()
    self.player_bvp_parser = PlayerBvPParser()

    # Manager used to read and save player documents.
    self.player_manager = PlayerManager()
def __init__(self, sleep_time=2, testing=False):
    """
    Create the player-link regex, the managers and the Baseball Reference
    scraper used by this object.

    sleep_time -- value copied onto the scraper's sleep_time attribute.
    testing    -- forwarded as the "testing" flag to PlayerManager,
                  LineupManager and NameMappingManager.
    """
    self.source = "site"

    # Raw string so the regex escapes (\? and \d) are passed to the re
    # module verbatim instead of relying on Python leaving unknown string
    # escapes alone.  The compiled pattern is unchanged.
    self.player_regex = re.compile(r"/baseball/player.htm\?id=\d+")

    self.player_manager = PlayerManager(testing=testing)
    self.lineup_manager = LineupManager(testing=testing)
    self.name_mapping_manager = NameMappingManager(testing=testing)

    self.bbr_scraper = BaseballReferenceScraper()
    self.bbr_scraper.sleep_time = sleep_time
def __init__(self):
    """
    Create the lineup/player managers, grab a handle on the ballpark
    factors collection in the local MongoDB, and record today's date and
    the season (the current year, as a string).
    """
    self.lineup_manager = LineupManager()
    self.player_manager = PlayerManager()

    # Connection -> database -> collection, one step at a time.
    mongo_client = MongoClient('localhost', 27017)
    mlb_db = mongo_client[MLBConstants.MONGO_MLB_DB_NAME]
    self.ballpark_collection = mlb_db[
        MLBConstants.MONGO_MLB_BALLPARK_FACTORS_COLLECTION]

    self.game_date = date.today()
    self.season = str(self.game_date.year)
def __init__(self):
    """
    Set the default options and create the managers and the Baseball
    Reference scraper, which is given the same sleep time as this object.
    """
    # Option defaults.
    self.source = "site"
    self.scrape_yesterdays_players = True
    self.scrape_bvp = False
    self.sleep_time = 2

    self.player_manager = PlayerManager()
    self.lineup_manager = LineupManager()

    # Configure the scraper before publishing it on the instance.
    scraper = BaseballReferenceScraper()
    scraper.sleep_time = self.sleep_time
    self.bbr_scraper = scraper
class TestPlayerManager(unittest.TestCase):
    """
    Save/read round-trip tests for PlayerManager, run with testing=True.
    """

    def setUp(self):
        self.player_manager = PlayerManager(testing=True)

    def tearDown(self):
        # Drop whatever the test wrote so each test starts from empty.
        self.player_manager.drop_collection()
        self.player_manager = None

    def test_db_operations(self):
        """
        A player can be inserted, read back, and updated in place without
        creating a second document.
        """
        query = {MLBConstants.PLAYER_ID: "dmaclean"}
        players = self.player_manager.players_collection

        # Nothing stored yet.  The lookup uses the same query dict as the
        # reads below so the "not found" check exercises the same path.
        self.assertEqual(players.count(), 0)
        self.assertIsNone(self.player_manager.read(query))

        # Insert.
        new_player = {
            MLBConstants.PLAYER_ID: "dmaclean",
            MLBConstants.NAME: "Dan MacLean",
            MLBConstants.POSITION: "Pitcher"
        }
        self.player_manager.save(new_player)

        player = self.player_manager.read(query)
        self.assertEqual(player[MLBConstants.PLAYER_ID], "dmaclean")
        self.assertEqual(player[MLBConstants.NAME], "Dan MacLean")
        self.assertEqual(player[MLBConstants.POSITION], "Pitcher")
        self.assertEqual(players.count(), 1)

        # Update: only the name changes and the document count stays at 1.
        player[MLBConstants.NAME] = "Steph MacLean"
        self.player_manager.save(player)

        player = self.player_manager.read(query)
        self.assertEqual(player[MLBConstants.PLAYER_ID], "dmaclean")
        self.assertEqual(player[MLBConstants.NAME], "Steph MacLean")
        self.assertEqual(player[MLBConstants.POSITION], "Pitcher")
        self.assertEqual(players.count(), 1)
def __init__(self, testing=False):
    """
    Initialization for the LineupManager.  Here we make a connection to
    MongoDB, grab a database handle, and also a handle on the lineups
    collection.

    testing -- when True the test database is used instead of the live
               one; the flag is also forwarded to PlayerManager.
    """
    self.testing_mode = testing

    self.client = MongoClient('localhost', 27017)

    # Get a database handle.  It'll either be live data or the testing
    # database depending on the "testing" flag that gets passed in.
    if testing:
        self.db = self.client[MLBConstants.MONGO_MLB_TEST_DB_NAME]
    else:
        self.db = self.client[MLBConstants.MONGO_MLB_DB_NAME]

    # Get a handle on the lineups collection.
    self.lineups_collection = self.db[
        MLBConstants.MONGO_MLB_LINEUPS_COLLECTION]

    self.player_manager = PlayerManager(testing=self.testing_mode)

    # Starts unset; nothing in this constructor populates it.
    self.processed_players = None
def setUp(self):
    """
    Create a PlayerManager in testing mode for the test to use.
    """
    testing_manager = PlayerManager(testing=True)
    self.player_manager = testing_manager
class BaseballReferenceScraper:
    """
    Scraper for baseball-reference.com

    NOTE(review): season, yesterday_only and sleep_time are only assigned
    by readCLI(); __init__ gives them no default.
    """

    # Host passed to every MLBUtilities.fetch_data call.
    BBR_HOST = "www.baseball-reference.com"

    def __init__(self):
        self.source = "site"
        self.all_players = False
        self.scrape_bvp = False
        self.player_id = None
        self.url = None

        self.player_list_parser = PlayerListParser()
        self.player_season_stats_parser = PlayerSeasonStatsParser()
        self.player_gamelog_parser = PlayerGamelogParser()
        self.player_splits_parser = PlayerSplitsParser()
        self.player_bvp_parser = PlayerBvPParser()
        self.player_manager = PlayerManager()

    def process(self):
        """
        Entry point: read the command-line options, then scrape either the
        single requested player (treated as active) or every player.
        """
        self.readCLI()

        if self.player_id:
            self.process_player(self.player_id, self.url, True)
        else:
            self.process_players()

    def readCLI(self):
        """
        Apply "key=value" arguments from sys.argv to this scraper.

        Recognised keys: season, all_players, yesterday_only, sleep,
        player_id and url.  The value is everything after the first "=",
        so a url that itself contains "=" is kept whole.  Arguments with
        no "=" (such as the script name) and unknown keys are ignored.
        """
        for arg in sys.argv:
            if arg == "bbr_scraper.py":
                continue

            key, separator, value = arg.partition("=")
            if not separator:
                # No "=" at all, so there is no value to read.
                continue

            if key == "season":
                self.season = int(value)
            elif key == "all_players":
                self.all_players = value == "true"
            elif key == "yesterday_only":
                self.yesterday_only = value == "true"
            elif key == "sleep":
                self.sleep_time = int(value)
            elif key == "player_id":
                self.player_id = value
            elif key == "url":
                self.url = value

    def process_players(self):
        """
        Performs fetching of player data.

        For each letter a-z the player list page is fetched and parsed,
        then the players the list parser reports are processed: odd ids
        always, retired players only when all_players is set, and active
        players always.
        """
        # Go through all players by letter
        for letter in "abcdefghijklmnopqrstuvwxyz":
            url = "/players/{}/".format(letter)
            data = MLBUtilities.fetch_data(self.BBR_HOST, url, True)
            self.player_list_parser.parse(data, letter)

            # NOTE(review): the loops below reuse this letter's url, so
            # they are assumed to run once per letter with the parser's id
            # lists holding that letter's players -- confirm against
            # PlayerListParser.

            # Go through odd player ids.  Each entry is indexed as
            # (url letter, player id).
            for player_id in self.player_list_parser.odd_player_ids:
                odd_url = "/players/{}/".format(player_id[0])
                self.process_player(player_id[1], odd_url)

            if self.all_players:
                # Go through retired players
                for player_id in self.player_list_parser.retired_player_ids:
                    self.process_player(player_id, url)

            # Go through active players
            for player_id in self.player_list_parser.active_player_ids:
                self.process_player(player_id, url, True)

    def process_player(self, player_id, url, active=False):
        """
        Scrape one player and save the result through the player manager.

        player_id -- id used to build the player's page urls.
        url       -- url prefix the player's pages live under.
        active    -- when False, a stored player that already has
                     batter-vs-pitcher data is skipped.

        Fetches the main page and the detailed season stats page, then the
        gamelogs and splits, and the batter-vs-pitcher page when
        scrape_bvp is set.
        """
        start = time.time()
        player = self.player_manager.read({MLBConstants.PLAYER_ID: player_id})
        end = time.time()
        print("\t\tDEBUG: Read from player_manager in {} seconds".format(end - start))

        if player is None:
            player = {MLBConstants.PLAYER_ID: player_id}
        # We can skip this player
        elif MLBConstants.BATTER_VS_PITCHER in player and not active:
            logging.info(
                "Looks like all info for {} has already been scraped.  Moving on..."
                .format(player_id))
            return

        player_url = "{}{}.shtml".format(url, player_id)
        player_page_data = MLBUtilities.fetch_data(
            self.BBR_HOST, player_url, True)

        self.player_season_stats_parser.player_data = player
        self.player_season_stats_parser.parse(player_page_data)

        #############################################
        # Should we get the pitcher or batter page?
        #############################################
        if self.player_season_stats_parser.player_data[
                MLBConstants.POSITION] == "Pitcher":
            player_season_stats_detail_url = "{}{}-pitch.shtml".format(
                url, player_id)
        else:
            player_season_stats_detail_url = "{}{}-bat.shtml".format(
                url, player_id)

        ###############################################
        # Fetch detailed season stats for the player.
        ###############################################
        start = time.time()
        player_season_stats_detail_data = MLBUtilities.fetch_data(
            self.BBR_HOST, player_season_stats_detail_url, True)
        end = time.time()
        print("\t\tDEBUG: Fetched detailed season stats in {} seconds".format(
            end - start))

        start = time.time()
        self.player_season_stats_parser.parse(player_season_stats_detail_data)
        end = time.time()
        print("\t\tDEBUG: Parsed detailed season stats in {} seconds".format(
            end - start))

        active_seasons = self.determine_active_seasons(player)
        # "p" or "b" for the t= query parameter of the gamelog/split urls.
        stat_type = "p" if player[MLBConstants.POSITION] == "Pitcher" else "b"

        #################
        # Grab gamelogs
        #################
        for season in active_seasons:
            # NOTE(review): only the 2014 season is scraped (hard-coded).
            if season not in ["2014"]:
                continue

            player_gamelog_url = "/players/gl.cgi?id={}&t={}&year={}".format(
                player_id, stat_type, season)
            data = MLBUtilities.fetch_data(
                self.BBR_HOST, player_gamelog_url, True)

            self.player_gamelog_parser.player_data = player
            if player[MLBConstants.POSITION] == "Pitcher":
                self.player_gamelog_parser.type = MLBConstants.PITCHER_TYPE
            else:
                self.player_gamelog_parser.type = MLBConstants.BATTER_TYPE
            self.player_gamelog_parser.season = season
            self.player_gamelog_parser.parse(data)

        ###############
        # Grab splits
        ###############
        active_seasons.append("Career")
        for season in active_seasons:
            # NOTE(review): only 2014 and Career are scraped (hard-coded).
            if season not in ["2014", "Career"]:
                continue

            player_split_url = "/players/split.cgi?id={}&t={}&year={}".format(
                player_id, stat_type, season)
            data = MLBUtilities.fetch_data(
                self.BBR_HOST, player_split_url, True)

            self.player_splits_parser.player_data = player
            self.player_splits_parser.season = season
            self.player_splits_parser.parse(data, season)

        #####################
        # Grab BvP (or PvB)
        #####################
        if self.scrape_bvp:
            if self.player_season_stats_parser.player_data[
                    MLBConstants.POSITION] == "Pitcher":
                self.player_bvp_parser.type = MLBConstants.PITCHER_TYPE
                bvp_url = "/play-index/batter_vs_pitcher.cgi?pitcher={}".format(
                    player_id)
            else:
                self.player_bvp_parser.type = MLBConstants.BATTER_TYPE
                bvp_url = "/play-index/batter_vs_pitcher.cgi?batter={}".format(
                    player_id)

            # This class has no fetch_data method of its own; fetch through
            # MLBUtilities like every other request in this method.
            data = MLBUtilities.fetch_data(self.BBR_HOST, bvp_url, True)
            self.player_bvp_parser.player_data = player
            self.player_bvp_parser.parse(data)

        start = time.time()
        self.player_manager.save(player)
        end = time.time()
        print("\t\tDEBUG: Saved player stats in {} seconds".format(end - start))

    def determine_active_seasons(self, player):
        """
        Convenience method for determining the seasons that a player has
        been active.

        Returns a new list holding the keys of the player's standard
        pitching stats (for a "Pitcher") or standard batting stats (for
        anyone else), or an empty list when that entry is missing.
        """
        if player[MLBConstants.POSITION] == "Pitcher":
            stats_key = MLBConstants.STANDARD_PITCHING
        else:
            stats_key = MLBConstants.STANDARD_BATTING

        if stats_key not in player:
            return []

        # A fresh list: the caller appends to the result.
        return list(player[stats_key])
class TestLineupManager(unittest.TestCase):
    """
    Tests for LineupManager, run against the managers in testing mode.
    """

    def setUp(self):
        self.lineup_manager = LineupManager(testing=True)
        self.player_manager = PlayerManager(testing=True)

    def tearDown(self):
        # Drop both collections so each test starts from empty.
        self.lineup_manager.lineups_collection.drop()
        self.player_manager.players_collection.drop()
        self.lineup_manager = None
        self.player_manager = None

    def _save_recent_lineups(self):
        """
        Save one single-player lineup for each of the last three days:
        dmaclean (BOS, P) yesterday, asmith (CLE, 2B) two days ago and
        bjohnson (CLE, 3B) three days ago.
        """
        today = date.today()
        recent = [
            (1, "dmaclean", "BOS", "P"),
            (2, "asmith", "CLE", "2B"),
            (3, "bjohnson", "CLE", "3B"),
        ]
        for days_ago, player_id, team, position in recent:
            lineup = {
                "date": str(today - timedelta(days=days_ago)),
                "players": {
                    player_id: {
                        "team": team,
                        "position": position
                    }
                }
            }
            self.lineup_manager.lineups_collection.save(lineup)

    def test_is_processed_true(self):
        self.assertFalse(self.lineup_manager.is_processed("dmaclean"))

        d = str(date.today())
        lineup = {"date": d, "players": {"dmaclean": True}}
        self.lineup_manager.lineups_collection.save(lineup)

        # Reset the processed map.
        self.lineup_manager.processed_players = None

        self.assertTrue(self.lineup_manager.is_processed("dmaclean"))

    def test_is_processed_false(self):
        self.assertFalse(self.lineup_manager.is_processed("dmaclean"))

    def test_add_player_to_lineup_none_yet(self):
        self.assertFalse(self.lineup_manager.is_processed("dmaclean"))

        self.lineup_manager.add_player_to_lineup("dmaclean", {})
        self.assertTrue(self.lineup_manager.is_processed("dmaclean"))

        self.lineup_manager.add_player_to_lineup("dmaclean2", {})
        self.assertTrue(self.lineup_manager.is_processed("dmaclean2"))

    def test_get_id_for_player_name(self):
        self.assertIsNone(
            self.lineup_manager.get_id_for_player_name("Dan MacLean"))

        player_data = {"name": "Dan MacLean", "player_id": "dmaclean"}
        self.player_manager.save(player_data)

        self.assertEqual(
            self.lineup_manager.get_id_for_player_name("Dan MacLean"),
            "dmaclean")

    def test_find_team_last_game(self):
        self._save_recent_lineups()

        players = self.lineup_manager.find_team_last_game("BOS")
        self.assertEqual(len(players), 1)
        self.assertEqual(players[0][MLBConstants.PLAYER_ID], "dmaclean")
        self.assertEqual(players[0][MLBConstants.POSITION], "P")

        # CLE appears in two lineups; only the more recent one is expected.
        players = self.lineup_manager.find_team_last_game("CLE")
        self.assertEqual(len(players), 1)
        self.assertEqual(players[0][MLBConstants.PLAYER_ID], "asmith")
        self.assertEqual(players[0][MLBConstants.POSITION], "2B")

    def test_find_player_position_last_game(self):
        self._save_recent_lineups()

        position = self.lineup_manager.find_player_position_last_game("asmith")
        self.assertEqual(position, "2B")

        position = self.lineup_manager.find_player_position_last_game(
            "bjohnson")
        self.assertEqual(position, "3B")
def setUp(self):
    """
    Create the lineup and player managers, both in testing mode.
    """
    lineups = LineupManager(testing=True)
    players = PlayerManager(testing=True)
    self.lineup_manager = lineups
    self.player_manager = players