def execute(self, trim_data_sets=True):
    self.events.find_eligible_games_start()
    game_ids = db.Season_Game_PitchApp_View.get_all_bbref_game_ids_combined_no_missing_pfx(self.db_engine)
    if not game_ids:
        return Result.Fail("No games meet the requirements for this process.")
    self.events.find_eligible_games_complete(game_ids)
    self.events.calculate_pitch_metrics_start()
    pitch_samples = []
    at_bat_samples = []
    inning_samples = []
    for num, game_id in enumerate(game_ids, start=1):
        combined_data = self.scraped_data.get_combined_game_data(game_id)
        if not combined_data:
            continue
        result = self.calc_pitch_metrics(combined_data)
        pitch_samples.extend(result[0])
        at_bat_samples.extend(result[2])
        inning_samples.extend(result[4])
        self.events.calculate_pitch_metrics_progress(num)
    self.events.calculate_pitch_metrics_complete()
    metrics = {
        "time_between_pitches": self.process_data_set(pitch_samples, trim=trim_data_sets),
        "time_between_at_bats": self.process_data_set(at_bat_samples, trim=trim_data_sets),
        "time_between_innings": self.process_data_set(inning_samples, trim=trim_data_sets),
    }
    return Result.Ok(metrics)

def parse_pitch_log(scraped_html, game, pitcher_id, url):
    page_content = html.fromstring(scraped_html, base_url=url)
    pitch_log = _initialize_pitch_log(game, pitcher_id, url)
    result = _parse_pitcher_details(page_content, game, pitcher_id)
    if result.failure:
        return Result.Ok(pitch_log)
    pitcher_dict = result.value
    pitch_log.pitcher_name = pitcher_dict["name"]
    pitch_log.pitcher_team_id_bb = pitcher_dict["team_id"]
    pitch_log.opponent_team_id_bb = pitcher_dict["opponent_id"]
    parsed = page_content.xpath(PITCHFX_URL_XPATH)
    if not parsed:
        return Result.Ok(pitch_log)
    rel_url = parsed[0]
    pitch_log.pitchfx_url = Template(T_PITCHFX_URL).substitute(rel_url=rel_url)
    result = _parse_pitch_counts(page_content)
    if result.failure:
        return Result.Ok(pitch_log)
    pitch_log.pitch_count_by_inning = result.value
    total_pitches = sum(pitch_log.pitch_count_by_inning.values())
    pitch_log.total_pitch_count = int(total_pitches)
    pitch_log.parsed_all_info = True
    return Result.Ok(pitch_log)

def update_player_data(self, task, data_set, no_prompts):
    subprocess.run(["clear"])
    print_heading(f"Update {data_set}", fg="bright_yellow")
    spinner = Halo(spinner=get_random_dots_spinner(), color=get_random_cli_color())
    spinner.text = "Updating player data..."
    spinner.start()
    result = task.execute()
    if result.failure:
        spinner.stop()
        return result
    spinner.succeed(f"{data_set} was successfully updated!")
    if no_prompts:
        return Result.Ok()
    updated_players = result.value or []
    if not updated_players:
        pause(message="Press any key to continue...")
        return Result.Ok(updated_players)
    heading = f"Updated {data_set}: Results"
    message = f"{len(updated_players)} changes total:"
    table_viewer = DictListTableViewer(
        dict_list=updated_players,
        prompt="Press Enter to continue",
        confirm_only=True,
        table_color="bright_yellow",
        heading=heading,
        heading_color="bright_yellow",
        message=message,
        message_color="blue",
    )
    table_viewer.launch()
    return Result.Ok(updated_players)

def create_urls_for_brooks_pitch_logs_for_date(db_job, scraped_data, game_date):
    data_set = DataSet.BROOKS_PITCH_LOGS
    req_data_set = DataSet.BROOKS_GAMES_FOR_DATE
    games_for_date = scraped_data.get_brooks_games_for_date(game_date)
    if not games_for_date:
        return Result.Fail(get_unscraped_data_error(data_set, req_data_set, game_date))
    urls = []
    for game in games_for_date.games:
        if game.might_be_postponed:
            continue
        for pitcher_id, pitch_log_url in game.pitcher_appearance_dict.items():
            pitch_app_id = f"{game.bbref_game_id}_{pitcher_id}"
            url_data = {
                "url": pitch_log_url,
                "url_id": pitch_app_id,
                "fileName": get_filename(scraped_data, data_set, pitch_app_id),
                "cachedHtmlFolderPath": get_cached_html_folderpath(scraped_data, data_set, game_date),
                "scrapedHtmlFolderpath": get_scraped_html_folderpath(db_job, data_set),
            }
            urls.append(from_dict(data_class=UrlDetails, data=url_data))
    return Result.Ok(urls)

def _get_summary_report_for_date_range(start_date, end_date, status_date_range):
    start_str = start_date.strftime(DATE_MONTH_NAME)
    end_str = end_date.strftime(DATE_MONTH_NAME)
    heading = f"### STATUS REPORT FOR {start_str} - {end_str} ###"
    if not status_date_range:
        pages = [
            DisplayPage(
                ["All data has been scraped for all dates in the requested range"],
                heading,
                wrap=False,
            )
        ]
        return Result.Ok(_create_report_viewer(pages, text_color="bright_magenta"))
    dict_list = [
        {"game_date": ds.game_date_str, "status": ds.scrape_status_description}
        for ds in status_date_range
    ]
    date_report = DictListTableViewer(
        dict_list,
        prompt="Press Enter to dismiss report",
        confirm_only=True,
        heading=heading,
        heading_color="bright_magenta",
        message=None,
        table_color="bright_magenta",
    )
    return Result.Ok(date_report)

def decode_json_response(self, response):
    query_results = ""
    try:
        resp_json = response.json()
        query_results = resp_json["search_player_all"]["queryResults"]
        num_results = int(query_results["totalSize"])
        if not num_results:
            result = self.try_alternate_url().on_success(request_url_with_retries)
            if result.failure:
                return result
            response = result.value
            resp_json = response.json()
            query_results = resp_json["search_player_all"]["queryResults"]
            num_results = int(query_results["totalSize"])
            if not num_results:
                return Result.Fail(
                    f"Failed to retrieve any results for player name: {self.name} (Tried 2 URLs)"
                )
        return Result.Ok((query_results, num_results))
    except (JSONDecodeError, KeyError) as e:
        # Neither JSONDecodeError nor KeyError carries a .response attribute, so the
        # response object in scope is used to include the raw body in the error message.
        error = f"Failed to decode HTTP response as JSON: {repr(e)}\n{response.text}"
        return Result.Fail(error)
    except ValueError:  # pragma: no cover
        error = f"Failed to parse number of results from search response: {query_results}"
        return Result.Fail(error)

def launch(self): subprocess.run(["clear"]) print_heading(self.menu_heading, fg="bright_yellow") if not node_is_installed(): print_message(INSTALL_ERROR, fg="bright_red", bold=True) pause(message="Press any key to continue...") return if not NODEJS_INBOX.exists(): NODEJS_INBOX.mkdir(parents=True, exist_ok=True) if node_modules_folder_exists(): message = UPDATE_MESSAGE prompt = UPDATE_PROMPT temp_folder = None command = "npm update --timeout=9999999" else: message = INSTALL_MESSAGE prompt = INSTALL_PROMPT temp_folder = TemporaryDirectory(dir=NIGHTMAREJS_FOLDER) command = f"npm install --timeout=9999999 --cache={temp_folder.name}" print_message(message, fg="bright_yellow") if not yes_no_prompt(prompt, wrap=False): return Result.Ok(self.exit_menu) subprocess.run(["clear"]) print_heading(self.menu_heading, fg="bright_yellow") result = run_command(command, cwd=str(NIGHTMAREJS_FOLDER)) if result.failure: return result if temp_folder: temp_folder.cleanup() pause(message="\nPress any key to continue...") return Result.Ok(self.exit_menu)
def validate_brooks_game_id(input_str):
    match = BB_GAME_ID_REGEX.search(input_str)
    if not match:
        raise ValueError(f"String is not a valid bb game id: {input_str}")
    captured = match.groupdict()
    year = int(captured["year"])
    month = int(captured["month"])
    day = int(captured["day"])
    game_number = int(captured["game_num"])
    try:
        game_date = datetime(year, month, day)
    except Exception as e:
        error = f"Failed to parse game_date from game_id:\n{repr(e)}"
        return Result.Fail(error)
    away_team_id = captured["away_team"].upper()
    home_team_id = captured["home_team"].upper()
    game_dict = {
        "game_id": input_str,
        "game_date": game_date,
        "away_team_id": away_team_id,
        "home_team_id": home_team_id,
        "game_number": game_number,
    }
    return Result.Ok(game_dict)

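# Illustrative usage of validate_brooks_game_id (the game id below is hypothetical; the exact
# accepted format is whatever BB_GAME_ID_REGEX defines, which is not shown here):
# result = validate_brooks_game_id("gid_2018_06_17_wasmlb_tormlb_1")
# if result.success:
#     game_dict = result.value  # contains game_id, game_date, team ids, and game_number
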
def delete_from_s3(self, s3_key):  # pragma: no cover
    try:
        self.s3_resource.Object(self.bucket_name, s3_key).delete()
        return Result.Ok()
    except botocore.exceptions.ClientError as ex:
        error_code = ex.response["Error"]["Code"]
        return Result.Fail(f"{repr(ex)} (Error Code: {error_code})")

def get_pitch_app_status_record(db_session, pitch_app_id):
    pitch_app_status = db.PitchAppScrapeStatus.find_by_pitch_app_id(db_session, pitch_app_id)
    if pitch_app_status:
        return Result.Ok(pitch_app_status)
    error = f"scrape_status_pitch_app does not contain an entry for pitch_app_id: {pitch_app_id}"
    return Result.Fail(error)

def validate_new_url_delay_setting(self, new_value):
    is_enabled, is_random, delay_uniform, delay_min, delay_max = new_value
    if not is_enabled:
        return Result.Fail("URL delay cannot be disabled!")
    if (not is_random and delay_uniform < 3) or (is_random and delay_min < 3):
        return Result.Fail("URL delay min value must be greater than 2 seconds!")
    return Result.Ok()

def patch_invalid_pfx_single_game(self):
    result = self.patch_invalid_pfx.execute(self.game_id)
    if result.failure:
        header = f"Invalid PitchFX Data for {self.game_id}\n"
        subprocess.run(["clear"])
        print_message(header, wrap=False, bold=True, underline=True)
        print_message(result.error, fg="bright_yellow")
        pause(message="Press any key to continue...")
        return Result.Ok(True)
    if not self.prompt_user_create_patch_list():
        return Result.Ok(True)
    result = self.patch_invalid_pfx.match_missing_pfx_data()
    if result.failure:
        return result
    for result, matches in self.patch_invalid_pfx.match_results.items():
        if result == "success":
            for num, match_dict in enumerate(matches, start=1):
                match_dict["patch"] = self.prompt_user_create_patch(num, len(matches), match_dict)
        if result == "no_matches":
            self.display_no_match_found(matches)
        if result == "many_matches":
            self.display_many_matches_found(matches)
    if "success" not in self.patch_invalid_pfx.match_results:
        header = f"Invalid PitchFX Data for {self.game_id}\n"
        message = (
            "Unable to identify missing data that matches the invalid PitchFX data for this "
            "game. You should inspect the combined data JSON file for this game and "
            "investigate the invalid data manually.\n"
        )
        subprocess.run(["clear"])
        print_message(header, wrap=False, bold=True, underline=True)
        print_message(message, fg="bright_yellow")
        pause(message="Press any key to continue...")
        return Result.Ok(True)
    result = self.patch_invalid_pfx.create_patch_list()
    if result.failure:
        return result
    if not self.patch_invalid_pfx.patch_list or not self.prompt_user_apply_patch_list():
        return Result.Ok(True)
    result = self.patch_invalid_pfx.apply_patch_list()
    if result.failure:
        return result
    self.patch_results = result.value
    print()
    if self.patch_results["fixed_all_errors"]:
        patch_result = (
            f"PitchFX data for {self.game_id} is now completely reconciled "
            "(no errors of any type)!\n"
        )
        print_success(patch_result)
    if self.patch_results["invalid_pfx"]:
        patch_result = (
            f"{self.game_id} still contains invalid PitchFX data after applying the patch list.\n"
        )
        print_error(patch_result)
    if self.patch_results["pfx_errors"]:
        patch_result = (
            f"{self.game_id} still contains PitchFX data errors associated with valid at bats.\n"
        )
        print_error(patch_result)
    pause(message="Press any key to continue...")
    subprocess.run(["clear"])
    if self.prompt_user_view_patched_data():
        self.display_patched_data_tables(**self.patch_results["patch_diff_report"])
    return Result.Ok()

def match_missing_pfx_data(self):
    if not self.game_contains_invalid_pfx():
        return Result.Ok()
    self.events.match_missing_pfx_data_start()
    for pfx in self.get_invalid_pfx_dict_list():
        matches = self.find_game_events_matching_this_pfx(pfx)
        if not matches:
            match_dict = {"success": False, "invalid_pfx": pfx, "missing_pfx": {}}
            self.match_results["no_matches"].append(match_dict)
            continue
        if len(matches) > 1:
            result = self.check_for_exact_match(pfx)
            if result.failure:
                match_dict = {"success": False, "invalid_pfx": pfx, "missing_pfx": matches}
                self.match_results["many_matches"].append(match_dict)
                continue
            exact_match = result.value
        else:
            exact_match = matches[0]
        match_dict = self.found_successful_match(pfx, exact_match)
        self.match_results["success"].append(match_dict)
    self.events.match_missing_pfx_data_complete(self.match_results)
    return Result.Ok()

def check_for_exact_match(self, pfx):
    # given the invalid pfx data passed as an argument, the process to find an exact match is:
    exact_match = [
        self.get_event_dict(event)
        # iterate through all game events, for each game event:
        for event in self.game_events
        # if game event is missing pitchfx data
        if event["at_bat_pitchfx_audit"]["missing_pitchfx_count"] > 0
        and (
            # AND game event and invalid pfx took place in the same inning
            event["inning_id"][-5:] == pfx["inning_id"]
            and (
                # AND game event and invalid pfx have the same batter OR the same pitcher
                event["batter_id_mlb"] == pfx["batter_id"]
                or event["pitcher_id_mlb"] == pfx["pitcher_id"]
            )
            # AND number of pitches missing is the same as the number of invalid pfx
            and event["at_bat_pitchfx_audit"]["missing_pitchfx_count"] == pfx["pitch_count"]
            # AND invalid pfx pitch seq. numbers are the same as the missing pitches
            and all(
                p_num in pfx["invalid_pfx"]
                for p_num in event["at_bat_pitchfx_audit"]["missing_pitch_numbers"]
            )
        )
    ]
    if not exact_match:
        # zero game events matched all the criteria -> NO EXACT MATCH
        return Result.Fail("")
    if len(exact_match) != 1:
        # more than one game event matched all the criteria -> NO EXACT MATCH
        return Result.Fail("")
    # one game event matched all the criteria -> EXACT MATCH
    return Result.Ok(exact_match[0])

def view_boxscore(self, team_id, player_type):
    boxscore = self.get_boxscore(team_id, player_type)
    while True:
        subprocess.run(["clear"])
        result = self.select_player_prompt(player_type, boxscore)
        if result.failure:
            return Result.Ok()
        mlb_id = result.value
        if player_type == "BAT":
            self.view_at_bats_for_player(player_type, mlb_id)
        else:
            subprocess.run(["clear"])
            result = self.select_pitcher_data_prompt(mlb_id)
            if result.failure:
                return Result.Ok()
            pitcher_data = result.value
            if pitcher_data == "AT_BATS":
                self.view_at_bats_for_player(player_type, mlb_id)
            if pitcher_data == "BAT_STATS":
                self.view_bat_stats_by_pitch_type_for_player(mlb_id)
            if pitcher_data == "PITCH_MIX_BY_STANCE":
                self.view_pitch_mix_batter_stance_splits(mlb_id)
            if pitcher_data == "PITCH_MIX_BY_SEASON":
                self.view_pitch_mix_season_splits(mlb_id)
            if pitcher_data == "PLATE_DISCIPLINE":
                self.view_pd_pitch_type_splits_for_pitcher(mlb_id)
            if pitcher_data == "BATTED_BALL":
                self.view_bb_pitch_type_splits_for_pitcher(mlb_id)

def validate_date_range(cls, db_session, start, end):
    if start.year != end.year:
        error = [
            "Start and end dates must both be in the same year and within "
            "the scope of that year's MLB Regular Season."
        ]
        return Result.Fail(error)
    if start > end:
        start_str = start.strftime(DATE_ONLY)
        end_str = end.strftime(DATE_ONLY)
        error = [
            '"start" must be a date before (or the same date as) "end":',
            f"start..: {start_str}",
            f"end....: {end_str}",
        ]
        return Result.Fail(error)
    season = cls.find_by_year(db_session, start.year)
    start_date_valid = cls.is_date_in_season(db_session, start).success
    end_date_valid = cls.is_date_in_season(db_session, end).success
    if not start_date_valid or not end_date_valid:
        error = [
            f"Start and end date must both be within the {season.name}:",
            f"{season.name} Start Date..: {season.start_date_str}",
            f"{season.name} End Date....: {season.end_date_str}",
        ]
        return Result.Fail(error)
    return Result.Ok(season)

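# Illustrative usage, assuming validate_date_range is a classmethod on db.Season (the cls
# parameter above and the db.Season references elsewhere in this code suggest as much;
# the dates and session object are hypothetical):
# result = db.Season.validate_date_range(db_session, datetime(2019, 4, 1), datetime(2019, 9, 29))
# season = result.value if result.success else None
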
def import_id_map_csv(app, csv_folder):
    try:
        id_map_task = tasks.UpdatePlayerIdMapTask(app, csv_folder.joinpath(PLAYER_ID_MAP_CSV))
        player_id_map = id_map_task.read_bbref_player_id_map_from_file()
        with tqdm(
            total=len(player_id_map),
            desc="Populating player_id table.....",
            unit="row",
            mininterval=0.12,
            maxinterval=5,
            unit_scale=True,
            ncols=90,
        ) as pbar:
            for id_map in player_id_map:
                app.db_session.add(
                    db.PlayerId(
                        mlb_id=int(id_map.mlb_ID),
                        mlb_name=id_map.name_common,
                        bbref_id=id_map.player_ID,
                        bbref_name=None,
                    )
                )
                pbar.update()
        return Result.Ok()
    except Exception as e:
        error = f"Error: {repr(e)}"
        app.db_session.rollback()
        return Result.Fail(error)

def launch(self): subprocess.run(["clear"]) print_message(f"Variable Name: {self.setting_name}\n", fg="bright_magenta", bold=True) print_message(f"Current Value: {self.current_setting}\n", fg="bright_yellow", bold=True) if not yes_no_prompt(prompt="\nChange current setting?"): return Result.Ok(self.exit_menu) user_confirmed, new_value = False, None while not user_confirmed: subprocess.run(["clear"]) prompt = f"Enter a new value for {self.setting_name}:\n" new_value = Input( prompt, word_color=colors.foreground["default"]).launch() result = self.confirm_new_value(new_value) if result.failure: return Result.Ok(self.exit_menu) user_confirmed = result.value result = self.dotenv.change_value(self.setting_name, new_value) if not self.restart_required: return result print_message(RESTART_WARNING, fg="bright_magenta", bold=True) pause(message="Press any key to continue...") exit(0)
def synchronize_files(self):
    if self.all_files_are_in_sync:
        message = "All files for selected data sets are in sync!"
        print_message(message, fg="bright_green", bold=True)
        pause(message="Press any key to continue...")
        return Result.Ok()
    for file_type, file_type_dict in self.sync_files.items():
        for data_set, (out_of_sync, missing_files, outdated_files) in file_type_dict.items():
            if not out_of_sync:
                continue
            all_sync_files = []
            missing_count = 0
            outdated_count = 0
            if missing_files:
                all_sync_files.extend(missing_files)
                missing_count = len(missing_files)
            if outdated_files:
                all_sync_files.extend(outdated_files)
                outdated_count = len(outdated_files)
            table_viewer = self.create_table_viewer(
                all_sync_files, data_set, file_type, missing_count, outdated_count
            )
            apply_changes = table_viewer.launch()
            if apply_changes:
                self.apply_pending_changes(file_type, data_set, missing_files, outdated_files)
    return Result.Ok()

def execute(self, year):
    self.subscribe_to_events()
    all_patch_results = {}
    audit_report = self.scraped_data.get_audit_report()
    self.audit_report_before = deepcopy(audit_report)
    if year not in self.audit_report_before:
        return Result.Fail(f"No games for MLB {year} season have been scraped.")
    game_ids = self.audit_report_before[year].get("invalid_pfx", [])
    if not game_ids:
        return Result.Fail(f"No games for MLB {year} season have invalid pitchfx data.")
    self.events.patch_all_invalid_pitchfx_started()
    self.initialize_spinner(game_ids)
    for num, game_id in enumerate(game_ids, start=1):
        self.spinner.text = self.get_spinner_text(game_id, num, len(game_ids))
        result = self.patch_invalid_pfx.execute(game_id, no_prompts=True)
        if result.failure:
            self.spinner.stop()
            return result
        patch_results = result.value
        all_patch_results[game_id] = patch_results
        self.spinner.text = self.get_spinner_text(game_id, num + 1, len(game_ids))
    self.spinner.stop()
    audit_report = self.scraped_data.get_audit_report()
    self.audit_report_after = deepcopy(audit_report)
    (successful_change, invalid_pfx_change) = self.calculate_games_changed(year)
    self.events.patch_all_invalid_pitchfx_complete()
    self.unsubscribe_from_events()
    return Result.Ok(
        {
            "all_patch_results": all_patch_results,
            "successful_change": successful_change,
            "invalid_pfx_change": invalid_pfx_change,
        }
    )

def check_current_status(self, game_date):
    if self.scrape_condition == ScrapeCondition.ALWAYS:
        return Result.Ok()
    scraped_bbref_boxscores = db.DateScrapeStatus.verify_all_bbref_boxscores_scraped_for_date(
        self.db_session, game_date
    )
    return Result.Ok() if not scraped_bbref_boxscores else Result.Fail("skip")

def apply(self, data):
    inning_matches = [
        inning for inning in data.innings_list if inning.inning_id == self.inning_id
    ]
    if not inning_matches:
        error = f"Unable to locate the inning identified in this patch: {self.inning_id}"
        return Result.Fail(error)
    if len(inning_matches) > 1:
        error = (
            "More than one inning was found that matches the inning identified in this "
            f"patch: {self.inning_id}"
        )
        return Result.Fail(error)
    inning = inning_matches[0]
    event_matches = [
        event
        for event in inning.game_events
        if event.pbp_table_row_number == self.pbp_table_row_number
    ]
    if not event_matches:
        error = (
            "Unable to locate the game event identified by pbp_table_row_number in this "
            f"patch: {self.pbp_table_row_number}"
        )
        return Result.Fail(error)
    if len(event_matches) > 1:
        error = (
            "More than one game event was found that matches the pbp_table_row_number "
            f"identified in this patch: {self.pbp_table_row_number}"
        )
        return Result.Fail(error)
    event_matches[0].pitch_sequence = self.new_pitch_sequence
    return Result.Ok(data)

def _parse_pitcher_details(page_content, game, pitcher_id):
    query = Template(T_PITCHER_NAME_XPATH).substitute(id=pitcher_id)
    parsed = page_content.xpath(query)
    if not parsed:
        error = "Failed to parse pitcher name from game log page."
        return Result.Fail(error)
    selected_pitcher = parsed[0]
    indices = [
        n for n in range(len(selected_pitcher)) if selected_pitcher.find("-", n) == n
    ]
    if not indices or len(indices) < 2:
        error = "Failed to parse pitcher name from game log page."
        return Result.Fail(error)
    indices.reverse()
    name = selected_pitcher[:indices[1]].strip()
    result = _parse_team_ids(game, selected_pitcher)
    if result.failure:
        return result
    id_dict = result.value
    pitcher_dict = {
        "name": name,
        "team_id": id_dict["team_id"],
        "opponent_id": id_dict["opponent_id"],
    }
    return Result.Ok(pitcher_dict)

def check_current_status(self, game_date):
    if self.scrape_condition == ScrapeCondition.ALWAYS:
        return Result.Ok()
    brooks_games_for_date = db.DateScrapeStatus.verify_brooks_daily_dashboard_scraped_for_date(
        self.db_session, game_date
    )
    return Result.Ok() if not brooks_games_for_date else Result.Fail("skip")

def update_player_id_map(self):
    if not self.db_initialized:
        return Result.Ok()
    result = db.Season.is_date_in_season(self.db_session, datetime.now())
    if result.failure:
        return Result.Ok()
    subprocess.run(["clear"])
    return self.update_id_map_task.launch(no_prompts=True)

def change_value(self, env_var_name, new_value):
    if env_var_name not in ENV_VAR_NAMES:
        return Result.Fail(f"{env_var_name} is not a recognized environment variable.")
    self.env_var_dict[env_var_name] = new_value
    self.write_dotenv_file()
    self.read_dotenv_file()
    return Result.Ok()

def try_alternate_url(self):
    split = self.name.split()
    if len(split) <= 2:
        return Result.Fail(f"Failed to retrieve any results for player name: {self.name}")
    name_part = split[-1].upper()
    url = f"{MLB_PLAYER_SEARCH_URL}?sport_code='mlb'&name_part='{name_part}%25'&active_sw='Y'"
    return Result.Ok(url)

def delete_html(self, data_set, url_id):
    result_local = Result.Ok()
    result_s3 = Result.Ok()
    if self.html_stored_local(data_set):
        result_local = self.delete_html_local(data_set, url_id)
    if self.html_stored_s3(data_set):  # pragma: no cover
        result_s3 = self.delete_html_s3(data_set, url_id)
    return Result.Combine([result_local, result_s3])

def write_config_file(self):
    try:
        config_json = json.dumps(self.config_json, indent=2, sort_keys=False)
        self.config_filepath.write_text(config_json)
        return Result.Ok()
    except Exception as e:
        error = f"Error: {repr(e)}"
        return Result.Fail(error)

def get_search_url(self, name):
    self.events.scrape_player_info_start(name)
    split = name.split()
    if not split or len(split) <= 1:  # pragma: no cover
        return Result.Fail(f"Name was not in an expected format: {name}")
    name_part = ("%20".join(split[1:])).upper()
    url = f"{MLB_PLAYER_SEARCH_URL}?sport_code='mlb'&name_part='{name_part}%25'&active_sw='Y'"
    return Result.Ok(url)
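
# Illustrative result, derived by tracing get_search_url with a hypothetical name:
# get_search_url("Mike Trout") builds name_part "TROUT" and returns
# f"{MLB_PLAYER_SEARCH_URL}?sport_code='mlb'&name_part='TROUT%25'&active_sw='Y'"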