def test():
    """Manual smoke check for the innings_all fetch/parse path.

    Fetches the innings_all page for one hard-coded game directory with
    ``scrape.fetch_inning_all``, parses it with ``parse_inning_all`` and
    prints the parsed result. Progress messages are printed before each step.
    Only the innings_all path is exercised here.
    """
    from pygameday import scrape

    # Hard-coded sample game directory used for this check
    sample_game_dir = '/components/game/mlb/year_2014/month_04/day_04/gid_2014_04_04_anamlb_houmlb_1'

    print("Fetching innings_all page")
    raw_innings_page = scrape.fetch_inning_all(sample_game_dir)

    print("Parsing innings_all page")
    parsed_at_bats = parse_inning_all(raw_innings_page)

    print(parsed_at_bats)
def process_game(self, game_xml_node):
    """Ingests a single game's GameDay data

    The game is skipped (with a logged warning) when its ID is already in
    ``self.gameday_ids``, when ``parse.parse_game`` returns ``None``, or when
    it is a spring training/exhibition game and ``self.ingest_spring_training``
    is false. Otherwise the hit chart, players and inning pages are fetched
    and parsed, players are inserted one at a time, and the game (with its
    at-bats and hits in play appended) is inserted.

    Database errors during insertion are rolled back and logged, not raised.
    The database session is always closed before returning.

    Parameters
    ----------
    game_xml_node : lxml node
        The XML node of the game to process
    """
    game_dir = game_xml_node.get("game_data_directory")
    gameday_id = game_xml_node.get("id")

    if gameday_id in self.gameday_ids:
        # The game has been processed and should already be in the database
        logger.warning("Skipping game: {}. It's already in the DB.".format(gameday_id))
        return

    # Parse the game
    db_game = parse.parse_game(game_xml_node)

    # If no data comes back, the game probably wasn't a Final. Abort.
    if db_game is None:
        msg = ("Skipping game: {}. It contained no data, "
               "probably because its status isn't Final").format(gameday_id)
        logger.warning(msg)
        return

    # If the game is a spring training game, skip it if ingest_spring_training is False
    if not self.ingest_spring_training and db_game.game_type in ("S", "E"):
        msg = "Skipping game: {}. It's a spring training or exhibition game.".format(gameday_id)
        logger.warning(msg)
        return

    msg = "Processing game ID {}".format(gameday_id)
    logger.info(msg)

    # The session is opened only once we know the game will be ingested, and
    # it is closed in the ``finally`` clause so that no exit path leaks it.
    session = self.Session()
    try:
        # ----------------------------------------------------------------------
        # Fetch game data
        #
        hit_chart_page = scrape.fetch_hit_chart(game_dir)
        players_page = scrape.fetch_players(game_dir)
        inning_all_page = scrape.fetch_inning_all(game_dir)

        # Do some error checking.
        # NOTE(review): a missing page is only logged; parsing below still
        # receives None. Confirm the parse helpers tolerate None input.
        if hit_chart_page is None:
            msg = "Error fetching hit chart page for game {}".format(gameday_id)
            logger.error(msg)
        if players_page is None:
            msg = "Error fetching players page for game {}".format(gameday_id)
            logger.error(msg)
        if inning_all_page is None:
            msg = "Error fetching inning events page for game {}".format(gameday_id)
            logger.error(msg)

        # ----------------------------------------------------------------------
        # Parse AtBats (including Pitches), HitsInPlay, Players
        #
        db_at_bats = parse.parse_inning_all(inning_all_page)  # Appends Pitches to AtBats
        db_hips = parse.parse_hit_chart(hit_chart_page)
        db_players = parse.parse_players(players_page)

        # ----------------------------------------------------------------------
        # Append the AtBats to the Game. Note that Pitches are appended to AtBats
        # when the AtBats are parsed, so we don't have to do anything with Pitches.
        #
        for db_ab in db_at_bats:
            db_game.at_bats.append(db_ab)

        # ----------------------------------------------------------------------
        # Append the hits in play to the Game
        #
        for db_h in db_hips:
            db_game.hits_in_play.append(db_h)

        # ----------------------------------------------------------------------
        # Add the players using the database session and commit
        # This has to be done one at a time (instead of using session.add_all)
        # because add_all will fail if ANY of the players in the list are
        # duplicated, which could lead to some players being excluded from
        # the database.
        #
        for player in db_players:
            # Use the same int key for both the membership test and the
            # cache update so that newly inserted players are recognized.
            player_key = int(player.player_id)

            if player_key in self.player_ids:
                # The player has been processed and should already be in the database
                logger.debug("Skipping player: {} because it has already been processed.".format(player.player_id))
                continue

            # We haven't inserted this player yet
            try:
                session.add(player)
                session.commit()
            except IntegrityError:
                # If an IntegrityError occurs, it's probably because the data
                # has already been inserted.
                session.rollback()
                msg = ("IntegrityError when inserting player: {}, "
                       "probably because it's already in the database".format(str(player)))
                logger.error(msg)
            except Exception as ex:
                # Just log other exceptions for now, and continue
                session.rollback()
                logger.error(str(ex))
            else:
                self.player_ids.add(player_key)

        # ----------------------------------------------------------------------
        # Add the game data
        #
        if db_game.gameday_id in self.gameday_ids:
            # The game has been processed and should already be in the database
            logger.info("Skipping game: {} because it has already been ingested.".format(db_game.gameday_id))
        else:
            # We haven't inserted this game yet
            try:
                session.add(db_game)
                session.commit()
            except IntegrityError:
                # If an IntegrityError occurs, it's probably because the data
                # has already been inserted.
                session.rollback()
                msg = ("IntegrityError when inserting game: {}, "
                       "probably because it's already in the database".format(db_game.gameday_id))
                logger.error(msg)
            except Exception as ex:
                # Just log other exceptions for now, and continue
                session.rollback()
                logger.error(str(ex))
            else:
                self.gameday_ids.add(db_game.gameday_id)
    finally:
        # We are done
        session.close()