Ejemplo n.º 1
0
    def __init__(self):
        """Set the default option values and create the parsers and player manager."""
        # Option defaults.
        self.source = "site"
        self.all_players = self.scrape_bvp = False
        self.player_id = self.url = None

        # One parser object per page type.
        self.player_list_parser = PlayerListParser()
        self.player_season_stats_parser = PlayerSeasonStatsParser()
        self.player_gamelog_parser = PlayerGamelogParser()
        self.player_splits_parser = PlayerSplitsParser()
        self.player_bvp_parser = PlayerBvPParser()

        # Created last, exactly as before.
        self.player_manager = PlayerManager()
Ejemplo n.º 2
0
	def __init__(self, sleep_time=2, testing=False):
		"""
		Set up the scraper's managers and its BaseballReferenceScraper.

		:param sleep_time: value stored on the BaseballReferenceScraper's
			``sleep_time`` attribute.
		:param testing: forwarded as ``testing=`` to the PlayerManager,
			LineupManager and NameMappingManager created here.
		"""
		self.source = "site"
		# Raw string so that \? and \d reach the regex engine untouched
		# instead of relying on invalid string escapes being passed through.
		self.player_regex = re.compile(r"/baseball/player.htm\?id=\d+")
		self.player_manager = PlayerManager(testing=testing)
		self.lineup_manager = LineupManager(testing=testing)
		self.name_mapping_manager = NameMappingManager(testing=testing)
		self.bbr_scraper = BaseballReferenceScraper()
		self.bbr_scraper.sleep_time = sleep_time
Ejemplo n.º 3
0
 def __init__(self):
     """Create the managers, open the ballpark-factors collection and record today's date."""
     self.lineup_manager = LineupManager()
     self.player_manager = PlayerManager()

     # Walk client -> database -> collection one step at a time.
     mongo = MongoClient('localhost', 27017)
     database = mongo[MLBConstants.MONGO_MLB_DB_NAME]
     self.ballpark_collection = database[
         MLBConstants.MONGO_MLB_BALLPARK_FACTORS_COLLECTION]

     # The season is the year of the game date, kept as a string.
     today = date.today()
     self.game_date = today
     self.season = str(today.year)
Ejemplo n.º 4
0
 def __init__(self):
     """Set the default option values and create the managers and the scraper."""
     # Option defaults.
     self.source = "site"
     self.sleep_time = 2
     self.scrape_bvp = False
     self.scrape_yesterdays_players = True

     self.player_manager = PlayerManager()
     self.lineup_manager = LineupManager()

     # Configure the scraper before publishing it on the instance.
     scraper = BaseballReferenceScraper()
     scraper.sleep_time = self.sleep_time
     self.bbr_scraper = scraper
Ejemplo n.º 5
0
class TestPlayerManager(unittest.TestCase):
    """Round-trip test for PlayerManager created with ``testing=True``."""

    def setUp(self):
        self.player_manager = PlayerManager(testing=True)

    def tearDown(self):
        # Drop whatever the test wrote so each test starts from an empty collection.
        self.player_manager.drop_collection()
        self.player_manager = None

    def test_db_operations(self):
        """Save a player, read it back, update it, and check the count stays at one."""
        query = {MLBConstants.PLAYER_ID: "dmaclean"}

        self.assertEqual(self.player_manager.players_collection.count(), 0)

        # Use the same query form as every other read in this test, so the
        # "not found" check exercises the same lookup that later succeeds.
        player = self.player_manager.read(query)
        self.assertIsNone(player)

        new_player = {
            MLBConstants.PLAYER_ID: "dmaclean",
            MLBConstants.NAME: "Dan MacLean",
            MLBConstants.POSITION: "Pitcher"
        }
        self.player_manager.save(new_player)

        player = self.player_manager.read(query)
        self.assertEqual(player[MLBConstants.PLAYER_ID], "dmaclean")
        self.assertEqual(player[MLBConstants.NAME], "Dan MacLean")
        self.assertEqual(player[MLBConstants.POSITION], "Pitcher")

        self.assertEqual(self.player_manager.players_collection.count(), 1)

        # Saving the document that was read back must update, not insert.
        player[MLBConstants.NAME] = "Steph MacLean"
        self.player_manager.save(player)

        player = self.player_manager.read(query)
        self.assertEqual(player[MLBConstants.PLAYER_ID], "dmaclean")
        self.assertEqual(player[MLBConstants.NAME], "Steph MacLean")
        self.assertEqual(player[MLBConstants.POSITION], "Pitcher")

        self.assertEqual(self.player_manager.players_collection.count(), 1)
Ejemplo n.º 6
0
    def __init__(self, testing=False):
        """
        Initialization for the LineupManager.  Here we make a connection to MongoDB, grab a database handle,
        and also a handle on the lineups collection.

        :param testing: when true the test database is used instead of the
            live one; the flag is also passed to the PlayerManager created here.
        """
        self.testing_mode = testing
        self.client = MongoClient('localhost', 27017)

        # Get a database handle.  It'll either be live data or the testing database
        # depending on the "testing" flag that gets passed in.
        if testing:
            self.db = self.client[MLBConstants.MONGO_MLB_TEST_DB_NAME]
        else:
            self.db = self.client[MLBConstants.MONGO_MLB_DB_NAME]

        # Get a handle on the lineups collection.
        self.lineups_collection = self.db[
            MLBConstants.MONGO_MLB_LINEUPS_COLLECTION]

        self.player_manager = PlayerManager(testing=self.testing_mode)

        # Starts out unset.
        self.processed_players = None
Ejemplo n.º 7
0
 def setUp(self):
     """Create the PlayerManager used by the tests, with ``testing=True``."""
     self.player_manager = PlayerManager(testing=True)
Ejemplo n.º 8
0
class BaseballReferenceScraper:
    """
    Scraper for baseball-reference.com

    Either processes one player (when ``player_id`` is set) or walks the
    per-letter player index pages.  Every fetched page is handed to the
    matching parser, and the resulting player document is saved through
    the PlayerManager.
    """

    # Host that every request is sent to.
    _HOST = "www.baseball-reference.com"

    # Only these seasons are fetched; all other active seasons are skipped.
    # NOTE(review): the "season" option read by readCLI is not consulted here.
    _GAMELOG_SEASONS = ("2014",)
    _SPLIT_SEASONS = ("2014", "Career")

    def __init__(self):
        """Set the default option values and create the parsers and player manager."""
        self.source = "site"
        self.all_players = False
        self.scrape_bvp = False
        self.player_id = None
        self.url = None

        self.player_list_parser = PlayerListParser()
        self.player_season_stats_parser = PlayerSeasonStatsParser()
        self.player_gamelog_parser = PlayerGamelogParser()
        self.player_splits_parser = PlayerSplitsParser()
        self.player_bvp_parser = PlayerBvPParser()
        self.player_manager = PlayerManager()

    def process(self):
        """Read the command-line options, then scrape one player or all of them."""
        self.readCLI()

        if self.player_id:
            self.process_player(self.player_id, self.url, True)
        else:
            self.process_players()

    def readCLI(self):
        """
        Copy ``key=value`` command-line arguments onto the instance.

        Recognised keys: ``season`` and ``sleep`` (converted with ``int``),
        ``all_players`` and ``yesterday_only`` (true only for the literal
        ``"true"``), ``player_id`` and ``url`` (kept as strings).  Arguments
        without an ``=`` (such as the script name) and unknown keys are
        ignored.

        :raises ValueError: if ``season`` or ``sleep`` is not an integer.
        """
        for arg in sys.argv:
            # Split on the first "=" only, so values that themselves contain
            # "=" (e.g. a URL with a query string) are kept whole.
            key, separator, value = arg.partition("=")
            if not separator:
                continue

            if key == "season":
                self.season = int(value)
            elif key == "all_players":
                self.all_players = value == "true"
            elif key == "yesterday_only":
                self.yesterday_only = value == "true"
            elif key == "sleep":
                self.sleep_time = int(value)
            elif key == "player_id":
                self.player_id = value
            elif key == "url":
                self.url = value

    def process_players(self):
        """
        Performs fetching of player data.

        For each letter a-z the player index page is fetched and parsed, then
        the parser's odd, retired (only when ``all_players`` is set) and
        active player ids are processed in that order.
        """
        for letter in "abcdefghijklmnopqrstuvwxyz":
            url = "/players/{}/".format(letter)
            data = MLBUtilities.fetch_data(self._HOST, url, True)

            self.player_list_parser.parse(data, letter)

            # Odd entries carry their own index directory: element 0 builds
            # the URL, element 1 is the player id.
            for player_id in self.player_list_parser.odd_player_ids:
                odd_url = "/players/{}/".format(player_id[0])
                self.process_player(player_id[1], odd_url)

            if self.all_players:
                for player_id in self.player_list_parser.retired_player_ids:
                    self.process_player(player_id, url)

            for player_id in self.player_list_parser.active_player_ids:
                self.process_player(player_id, url, True)

    def process_player(self, player_id, url, active=False):
        """
        Scrape everything for one player and save the result.

        :param player_id: id used to build the player's page URLs and to look
            the player up in the PlayerManager.
        :param url: URL prefix that the player's page names are appended to.
        :param active: when false, a stored player that already has
            batter-vs-pitcher data is skipped entirely.
        """
        start = time.time()
        player = self.player_manager.read({MLBConstants.PLAYER_ID: player_id})
        print("\t\tDEBUG: Read from player_manager in {} seconds".format(
            time.time() - start))

        if player is None:
            player = {MLBConstants.PLAYER_ID: player_id}
        # We can skip this player
        elif MLBConstants.BATTER_VS_PITCHER in player and not active:
            logging.info(
                "Looks like all info for {} has already been scraped.  Moving on..."
                .format(player_id))
            return

        stats_parser = self.player_season_stats_parser

        player_page_data = MLBUtilities.fetch_data(
            self._HOST, "{}{}.shtml".format(url, player_id), True)
        stats_parser.player_data = player
        stats_parser.parse(player_page_data)

        # The main page tells us the position, which decides whether the
        # pitching or the batting detail page is fetched next.
        if stats_parser.player_data[MLBConstants.POSITION] == "Pitcher":
            detail_url = "{}{}-pitch.shtml".format(url, player_id)
        else:
            detail_url = "{}{}-bat.shtml".format(url, player_id)

        start = time.time()
        detail_data = MLBUtilities.fetch_data(self._HOST, detail_url, True)
        print("\t\tDEBUG: Fetched detailed season stats in {} seconds".format(
            time.time() - start))

        start = time.time()
        stats_parser.parse(detail_data)
        print("\t\tDEBUG: Parsed detailed season stats in {} seconds".format(
            time.time() - start))

        active_seasons = self.determine_active_seasons(player)
        is_pitcher = player[MLBConstants.POSITION] == "Pitcher"

        self._scrape_gamelogs(player, player_id, is_pitcher, active_seasons)
        self._scrape_splits(player, player_id, is_pitcher,
                            active_seasons + ["Career"])
        if self.scrape_bvp:
            self._scrape_bvp(player, player_id, is_pitcher)

        # Everything above only accumulated data on ``player``; persist once.
        start = time.time()
        self.player_manager.save(player)
        print("\t\tDEBUG: Saved player stats in {} seconds".format(
            time.time() - start))

    def _scrape_gamelogs(self, player, player_id, is_pitcher, seasons):
        """Fetch and parse the gamelog page for each allowed season."""
        type_code = "p" if is_pitcher else "b"
        for season in seasons:
            if season not in self._GAMELOG_SEASONS:
                continue
            gamelog_url = "/players/gl.cgi?id={}&t={}&year={}".format(
                player_id, type_code, season)
            data = MLBUtilities.fetch_data(self._HOST, gamelog_url, True)
            self.player_gamelog_parser.player_data = player
            self.player_gamelog_parser.type = (
                MLBConstants.PITCHER_TYPE if is_pitcher
                else MLBConstants.BATTER_TYPE)
            self.player_gamelog_parser.season = season
            self.player_gamelog_parser.parse(data)

    def _scrape_splits(self, player, player_id, is_pitcher, seasons):
        """Fetch and parse the splits page for each allowed season."""
        type_code = "p" if is_pitcher else "b"
        for season in seasons:
            if season not in self._SPLIT_SEASONS:
                continue
            split_url = "/players/split.cgi?id={}&t={}&year={}".format(
                player_id, type_code, season)
            data = MLBUtilities.fetch_data(self._HOST, split_url, True)
            self.player_splits_parser.player_data = player
            self.player_splits_parser.season = season
            self.player_splits_parser.parse(data, season)

    def _scrape_bvp(self, player, player_id, is_pitcher):
        """Fetch and parse the batter-vs-pitcher page from the player's side."""
        if is_pitcher:
            self.player_bvp_parser.type = MLBConstants.PITCHER_TYPE
            bvp_url = "/play-index/batter_vs_pitcher.cgi?pitcher={}".format(
                player_id)
        else:
            self.player_bvp_parser.type = MLBConstants.BATTER_TYPE
            bvp_url = "/play-index/batter_vs_pitcher.cgi?batter={}".format(
                player_id)

        # Fetched through MLBUtilities like every other page; this class has
        # no fetch_data method of its own.
        data = MLBUtilities.fetch_data(self._HOST, bvp_url, True)
        self.player_bvp_parser.player_data = player
        self.player_bvp_parser.parse(data)

    def determine_active_seasons(self, player):
        """
        Convenience method for determining the seasons that a player has been active.

        :param player: player document; must contain the position key.
        :return: new list of the entries found under the standard pitching
            stats (for a "Pitcher") or the standard batting stats (any other
            position); empty when that stats entry is absent.
        """
        if player[MLBConstants.POSITION] == "Pitcher":
            stats_key = MLBConstants.STANDARD_PITCHING
        else:
            stats_key = MLBConstants.STANDARD_BATTING

        if stats_key not in player:
            return []
        return list(player[stats_key])
Ejemplo n.º 9
0
class TestLineupManager(unittest.TestCase):
    """Tests for LineupManager, run with ``testing=True`` managers."""

    def setUp(self):
        self.lineup_manager = LineupManager(testing=True)
        self.player_manager = PlayerManager(testing=True)

    def tearDown(self):
        # Drop both collections so every test starts from empty data.
        self.lineup_manager.lineups_collection.drop()
        self.player_manager.players_collection.drop()
        self.lineup_manager = None
        self.player_manager = None

    def _save_recent_lineups(self):
        """
        Save one lineup for each of the last three days.

        Yesterday: dmaclean (BOS, P); two days ago: asmith (CLE, 2B);
        three days ago: bjohnson (CLE, 3B).  Saved in that order.
        """
        today = date.today()
        fixtures = (
            (1, "dmaclean", "BOS", "P"),
            (2, "asmith", "CLE", "2B"),
            (3, "bjohnson", "CLE", "3B"),
        )
        for days_ago, player_id, team, position in fixtures:
            self.lineup_manager.lineups_collection.save({
                "date": str(today - timedelta(days=days_ago)),
                "players": {
                    player_id: {
                        "team": team,
                        "position": position
                    }
                }
            })

    def test_is_processed_true(self):
        self.assertFalse(self.lineup_manager.is_processed("dmaclean"))

        d = str(date.today())
        lineup = {"date": d, "players": {"dmaclean": True}}
        self.lineup_manager.lineups_collection.save(lineup)

        # Reset the processed map.
        self.lineup_manager.processed_players = None

        self.assertTrue(self.lineup_manager.is_processed("dmaclean"))

    def test_is_processed_false(self):
        self.assertFalse(self.lineup_manager.is_processed("dmaclean"))

    def test_add_player_to_lineup_none_yet(self):
        self.assertFalse(self.lineup_manager.is_processed("dmaclean"))

        self.lineup_manager.add_player_to_lineup("dmaclean", {})
        self.assertTrue(self.lineup_manager.is_processed("dmaclean"))

        self.lineup_manager.add_player_to_lineup("dmaclean2", {})
        self.assertTrue(self.lineup_manager.is_processed("dmaclean2"))

    def test_get_id_for_player_name(self):
        self.assertIsNone(
            self.lineup_manager.get_id_for_player_name("Dan MacLean"))

        player_data = {"name": "Dan MacLean", "player_id": "dmaclean"}
        self.player_manager.save(player_data)

        self.assertEqual(
            self.lineup_manager.get_id_for_player_name("Dan MacLean"),
            "dmaclean")

    def test_find_team_last_game(self):
        self._save_recent_lineups()

        players = self.lineup_manager.find_team_last_game("BOS")
        self.assertEqual(len(players), 1)
        self.assertEqual(players[0][MLBConstants.PLAYER_ID], "dmaclean")
        self.assertEqual(players[0][MLBConstants.POSITION], "P")

        # CLE appears on two days; only the more recent lineup is expected.
        players = self.lineup_manager.find_team_last_game("CLE")
        self.assertEqual(len(players), 1)
        self.assertEqual(players[0][MLBConstants.PLAYER_ID], "asmith")
        self.assertEqual(players[0][MLBConstants.POSITION], "2B")

    def test_find_player_position_last_game(self):
        self._save_recent_lineups()

        position = self.lineup_manager.find_player_position_last_game("asmith")
        self.assertEqual(position, "2B")

        position = self.lineup_manager.find_player_position_last_game(
            "bjohnson")
        self.assertEqual(position, "3B")
Ejemplo n.º 10
0
 def setUp(self):
     """Create the LineupManager and PlayerManager used by the tests, both with ``testing=True``."""
     self.lineup_manager = LineupManager(testing=True)
     self.player_manager = PlayerManager(testing=True)