def _parse_pitcher_details(page_content, game, pitcher_id):
    """Parse a pitcher's name and team/opponent ids from a game log page.

    Returns Result.Ok with a dict containing "name", "team_id" and
    "opponent_id", or Result.Fail when the page cannot be parsed.
    """
    query = Template(T_PITCHER_NAME_XPATH).substitute(id=pitcher_id)
    parsed = page_content.xpath(query)
    if not parsed:
        return Result.Fail("Failed to parse pitcher name from game log page.")
    selected_pitcher = parsed[0]
    # Collect the index of every "-" in the string; the name is everything
    # before the second-to-last dash.
    dash_indices = [
        n for n in range(len(selected_pitcher)) if selected_pitcher.find("-", n) == n
    ]
    # Single check covers both the empty and one-dash cases (the original
    # "not indices or len(indices) < 2" was redundant).
    if len(dash_indices) < 2:
        return Result.Fail("Failed to parse pitcher name from game log page.")
    # Negative indexing replaces the original reverse()-then-[1] idiom.
    name = selected_pitcher[: dash_indices[-2]].strip()
    result = _parse_team_ids(game, selected_pitcher)
    if result.failure:
        return result
    id_dict = result.value
    pitcher_dict = {
        "name": name,
        "team_id": id_dict["team_id"],
        "opponent_id": id_dict["opponent_id"],
    }
    return Result.Ok(pitcher_dict)
def validate_new_url_delay_setting(self, new_value):
    """Validate a proposed URL-delay setting tuple.

    The delay must be enabled and the effective minimum delay (uniform value,
    or range minimum when randomized) must be at least 3 seconds.
    """
    is_enabled, is_random, delay_uniform, delay_min, delay_max = new_value
    if not is_enabled:
        return Result.Fail("URL delay cannot be disabled!")
    # Which value acts as the minimum depends on whether delays are randomized.
    effective_min = delay_min if is_random else delay_uniform
    if effective_min < 3:
        return Result.Fail("URL delay min value must be greater than 2 seconds!")
    return Result.Ok()
def execute(self, year):
    """Patch invalid PitchFx data for every affected game in the given MLB season.

    Runs the patch task for each game id the audit report flags as having
    invalid PitchFx data, then re-generates the audit report to measure how
    game statuses changed.

    Returns Result.Ok with a dict of per-game patch results plus the
    season-level changes, or Result.Fail if no games qualify or any patch
    task fails.
    """
    self.subscribe_to_events()
    all_patch_results = {}
    # Snapshot the audit report before patching so before/after deltas can be
    # computed by calculate_games_changed().
    audit_report = self.scraped_data.get_audit_report()
    self.audit_report_before = deepcopy(audit_report)
    if year not in self.audit_report_before:
        return Result.Fail(f"No games for MLB {year} season have been scraped.")
    game_ids = self.audit_report_before[year].get("invalid_pfx", [])
    if not game_ids:
        return Result.Fail(f"No games for MLB {year} season have invalid pitchfx data.")
    self.events.patch_all_invalid_pitchfx_started()
    self.initialize_spinner(game_ids)
    for num, game_id in enumerate(game_ids, start=1):
        self.spinner.text = self.get_spinner_text(game_id, num, len(game_ids))
        result = self.patch_invalid_pfx.execute(game_id, no_prompts=True)
        if result.failure:
            # Stop the spinner before propagating the failure so the terminal
            # is left in a usable state.
            self.spinner.stop()
            return result
        patch_results = result.value
        all_patch_results[game_id] = patch_results
        # Advance the spinner to the next game (num + 1 exceeds len(game_ids)
        # on the final iteration; presumably get_spinner_text tolerates this
        # -- TODO confirm).
        self.spinner.text = self.get_spinner_text(game_id, num + 1, len(game_ids))
    self.spinner.stop()
    # Second snapshot: the audit report after patching.
    audit_report = self.scraped_data.get_audit_report()
    self.audit_report_after = deepcopy(audit_report)
    (successful_change, invalid_pfx_change) = self.calculate_games_changed(year)
    self.events.patch_all_invalid_pitchfx_complete()
    self.unsubscribe_from_events()
    return Result.Ok(
        {
            "all_patch_results": all_patch_results,
            "successful_change": successful_change,
            "invalid_pfx_change": invalid_pfx_change,
        }
    )
def validate_date_range(cls, db_session, start, end):
    """Validate that start/end form a proper range within a single MLB season.

    Returns Result.Ok(season) when both dates fall inside the same season,
    Result.Fail(error_list) otherwise.
    """
    if start.year != end.year:
        error = [
            "Start and end dates must both be in the same year and within "
            "the scope of that year's MLB Regular Season."
        ]
        return Result.Fail(error)
    if start > end:
        start_str = start.strftime(DATE_ONLY)
        end_str = end.strftime(DATE_ONLY)
        error = [
            '"start" must be a date before (or the same date as) "end":',
            f"start..: {start_str}",
            f"end....: {end_str}",
        ]
        return Result.Fail(error)
    season = cls.find_by_year(db_session, start.year)
    # BUGFIX: when no season exists for start.year, "season" is None and the
    # error message below raised AttributeError; fail explicitly instead.
    if not season:
        return Result.Fail([f"Database does not contain info for the MLB {start.year} season."])
    start_date_valid = cls.is_date_in_season(db_session, start).success
    end_date_valid = cls.is_date_in_season(db_session, end).success
    if not start_date_valid or not end_date_valid:
        error = [
            f"Start and end date must both be within the {season.name}:",
            f"{season.name} Start Date..: {season.start_date_str}",
            f"{season.name} End Date....: {season.end_date_str}",
        ]
        return Result.Fail(error)
    return Result.Ok(season)
def check_for_exact_match(self, pfx):
    """Find the single game event that exactly accounts for the invalid pfx data.

    Returns Result.Ok(event_dict) only when exactly one game event matches
    every criterion below; Result.Fail("") when zero or multiple events match.
    """
    # given the invalid pfx data passed as an argument, the process to find an exact match is:
    exact_match = [
        self.get_event_dict(event)
        # iterate through all game events, for each game event:
        for event in self.game_events
        # if game event is missing pitchfx data
        if event["at_bat_pitchfx_audit"]["missing_pitchfx_count"] > 0
        and (
            # AND game_event and invalid pfx took place in the same inning
            event["inning_id"][-5:] == pfx["inning_id"]
            and (
                # AND game event and invalid pfx have the same batter OR the same pitcher
                event["batter_id_mlb"] == pfx["batter_id"]
                or event["pitcher_id_mlb"] == pfx["pitcher_id"])
            # AND number of pitches missing is the same as the number of invalid pfx
            and event["at_bat_pitchfx_audit"]["missing_pitchfx_count"] == pfx["pitch_count"]
            # AND invalid pfx pitch seq. numbers are the same as the missing pitches
            and all(p_num in pfx["invalid_pfx"]
                    for p_num in event["at_bat_pitchfx_audit"]["missing_pitch_numbers"]))
    ]
    if not exact_match:
        # zero game events matched all the criteria -> NO EXACT MATCH
        return Result.Fail("")
    if len(exact_match) != 1:
        # more than one game event matched all the criteria -> NO EXACT MATCH
        return Result.Fail("")
    # one game event matched all the criteria -> EXACT MATCH
    return Result.Ok(exact_match[0])
def apply(self, data):
    """Apply this patch: overwrite the pitch sequence of one game event.

    Locates the inning by self.inning_id and the event within it by
    self.pbp_table_row_number, then sets its pitch_sequence to
    self.new_pitch_sequence. Returns Result.Ok(data) or Result.Fail(error)
    when the inning/event cannot be identified unambiguously.
    """
    inning_matches = [
        inning for inning in data.innings_list if inning.inning_id == self.inning_id
    ]
    if not inning_matches:
        error = f"Unable to locate the inning identified in this patch: {self.inning_id}"
        return Result.Fail(error)
    if len(inning_matches) > 1:
        # BUGFIX: the original literal was missing the f-prefix, so the text
        # "{self.inning_id}" appeared verbatim in the error message.
        error = f"More than one inning was found that matches the inning identified in this patch: {self.inning_id}"
        return Result.Fail(error)
    inning = inning_matches[0]
    event_matches = [
        event
        for event in inning.game_events
        if event.pbp_table_row_number == self.pbp_table_row_number
    ]
    if not event_matches:
        error = (
            "Unable to locate the game event identified by pbp_table_row_number in this "
            f"patch: {self.pbp_table_row_number}")
        return Result.Fail(error)
    if len(event_matches) > 1:
        error = (
            "More than one game event was found that matches the pbp_table_row_number "
            f"identified in this patch: {self.pbp_table_row_number}")
        return Result.Fail(error)
    event_matches[0].pitch_sequence = self.new_pitch_sequence
    return Result.Ok(data)
def decode_json_response(self, response):
    """Decode the player-search JSON response, retrying one alternate URL.

    Returns Result.Ok((query_results, num_results)) when at least one player
    matched, Result.Fail otherwise.
    """
    query_results = ""
    try:
        resp_json = response.json()
        query_results = resp_json["search_player_all"]["queryResults"]
        num_results = int(query_results["totalSize"])
        if not num_results:
            # First URL returned zero results; try the alternate URL once.
            result = self.try_alternate_url().on_success(request_url_with_retries)
            if result.failure:
                return result
            response = result.value
            resp_json = response.json()
            query_results = resp_json["search_player_all"]["queryResults"]
            num_results = int(query_results["totalSize"])
            if not num_results:
                return Result.Fail(
                    f"Failed to retrieve any results for player name: {self.name} (Tried 2 URLs)"
                )
        return Result.Ok((query_results, num_results))
    except (JSONDecodeError, KeyError) as e:
        # BUGFIX: JSONDecodeError and KeyError have no ".response" attribute, so
        # the original "e.response.text" raised AttributeError while handling
        # the failure; use the response object already in scope instead.
        error = f"Failed to decode HTTP response as JSON: {repr(e)}\n{response.text}"
        return Result.Fail(error)
    except ValueError:  # pragma: no cover
        error = f"Failed to parse number of results from search response: {query_results}"
        return Result.Fail(error)
def execute_nodejs_script(script_file_path, script_args):  # pragma: no cover
    """Run a Node.js script using whichever node runtime is installed.

    Prefers the "node" binary, falls back to "nodejs", and fails when neither
    is available. Returns Result.Ok() on success, Result.Fail otherwise.
    """
    result = validate_file_path(script_file_path)
    if result.failure:
        return result
    valid_filepath = result.value
    if program_is_installed("node"):
        script_succeeded = execute_js(str(valid_filepath), arguments=script_args)
    elif program_is_installed("nodejs"):
        script_succeeded = execute_shell_command(f"nodejs {valid_filepath} {script_args}")
    else:
        return Result.Fail("Node.js is NOT installed!")
    if script_succeeded:
        return Result.Ok()
    return Result.Fail("nodejs script failed")
def parse_player_data_v2(self, player_data, bbref_id, debut_limit=None):
    """Convert an MLB stats-api player record into a flat dict for the db.

    Args:
        player_data: dict decoded from the stats-api JSON response.
        bbref_id: baseball-reference player id stored alongside the mlb id.
        debut_limit: optional year; players who debuted before it are rejected.

    Returns Result.Ok(player_dict) or Result.Fail on malformed input.
    """
    try:
        debut = datetime.strptime(player_data.get("mlbDebutDate", ""), DATE_ONLY).date()
    except ValueError:  # pragma: no cover
        debut = date.min
    # NOTE(review): a missing/unparseable debut date falls back to date.min,
    # which this check also rejects whenever debut_limit is set -- presumably
    # intentional, TODO confirm.
    if debut_limit and debut.year < debut_limit:
        return Result.Fail("Player debuted before the debut limit")
    try:
        birth_date = datetime.strptime(player_data.get("birthDate", ""), DATE_ONLY).date()
    except ValueError:  # pragma: no cover
        birth_date = date.min
    match = HEIGHT_REGEX.search(player_data.get("height", r"0' 0\""))
    if not match:
        return Result.Fail("Response JSON was not in the expected format")
    groups = match.groupdict()
    height_total_inches = int(groups["feet"]) * 12 + int(groups["inches"])
    first = player_data.get("firstName", "")
    # Include the middle name only when the record provides one.
    name_given = (
        f'{first} {player_data["middleName"]}' if "middleName" in player_data else first
    )
    # Prefer the player's "use name" (nickname) over the legal first name.
    # (Single-lookup form of the original LBYL 'in'-check + .get pattern.)
    first_name = player_data.get("useName", first)
    bats = player_data.get("batSide", {})
    throws = player_data.get("pitchHand", {})
    player_dict = {
        "name_first": first_name,
        "name_last": player_data.get("lastName", ""),
        "name_given": name_given,
        "bats": bats.get("code", ""),
        "throws": throws.get("code", ""),
        "weight": player_data.get("weight"),
        "height": height_total_inches,
        "debut": debut,
        "birth_year": birth_date.year,
        "birth_month": birth_date.month,
        "birth_day": birth_date.day,
        "birth_country": player_data.get("birthCountry", ""),
        "birth_state": player_data.get("birthStateProvince", ""),
        "birth_city": player_data.get("birthCity", ""),
        "bbref_id": bbref_id,
        "mlb_id": player_data["id"],
        "add_to_db_backup": True,
    }
    return Result.Ok(player_dict)
def add_data_for_year(self, year):
    """Import PitchFx data for every successfully-scraped game in one season."""
    season_report = self.app.audit_report.get(year)
    if not season_report:
        return Result.Fail(
            f"Audit report could not be generated for MLB Season {year}")
    eligible_game_ids = season_report.get("successful")
    if not eligible_game_ids:
        return Result.Fail(
            f"No games for MLB Season {year} qualify to have PitchFx data imported."
        )
    self.events.add_data_to_db_start(year, eligible_game_ids)
    self.add_data_for_games(year, eligible_game_ids)
    self.events.add_data_to_db_complete(year)
    return Result.Ok()
def is_date_in_season(cls, db_session, check_date, season_type=SeasonType.REGULAR_SEASON):
    """Verify that check_date falls within the season for that year.

    Returns Result.Ok(season) when the date is inside the season's date range,
    Result.Fail(error) when no season exists or the date is out of range.
    """
    season = cls.find_by_year(db_session, check_date.year)
    if not season:
        season_label = season_type.replace("_", " ").title()
        return Result.Fail(
            f"Database does not contain info for the MLB {check_date.year} {season_label}"
        )
    # Chained comparison replaces the original two-sided out-of-range test.
    if season.start_date <= check_date <= season.end_date:
        return Result.Ok(season)
    date_str = check_date.strftime(DATE_ONLY)
    return Result.Fail(f"{date_str} is not within the scope of the {season.name}")
def _validate_single_date(db_session, game_date):
    """Verify game_date falls in a known season and has a scrape-status row.

    Returns Result.Ok(date_status) or Result.Fail(error).
    """
    season = db.Season.find_by_year(db_session, game_date.year)
    date_is_valid = db.Season.is_date_in_season(db_session, game_date).success
    date_str = game_date.strftime(DATE_ONLY)
    if not date_is_valid:
        # BUGFIX: when no season exists for game_date.year, "season" is None and
        # the original error message raised AttributeError; fail explicitly.
        if not season:
            return Result.Fail(
                f"Database does not contain info for the MLB {game_date.year} season"
            )
        error = (f"'{date_str}' is not within the {season.name}:\n"
                 f"season_start_date: {season.start_date_str}\n"
                 f"season_end_date: {season.end_date_str}")
        return Result.Fail(error)
    date_status = db.DateScrapeStatus.find_by_date(db_session, game_date)
    if not date_status:
        error = f"scrape_status_date does not contain an entry for date: {date_str}"
        return Result.Fail(error)
    return Result.Ok(date_status)
def get_sync_parameters(self):
    """Prompt the user for season, file types, data sets and sync direction.

    Stores the answers on self; returns Result.Ok(), or Result.Fail("") when
    the user cancels either prompt.
    """
    season_result = season_prompt(self.db_session, "Select a season to synchronize scraped data:")
    if season_result.failure:
        return Result.Fail("")
    self.year = season_result.value.year
    self.file_types = file_types_prompt("Select one or multiple file types to synchronize:")
    # For each chosen file type, record which data sets should be synced.
    for chosen_type in self.file_types:
        self.sync_tasks[chosen_type] = self.get_data_sets_to_sync(chosen_type)
    direction_result = self.sync_direction_prompt()
    if direction_result.failure:
        return Result.Fail("")
    self.sync_direction = direction_result.value
    return Result.Ok()
def validate_file_path(input_path: Union[Path, str]):
    """Coerce input_path to a Path and verify it refers to an existing file.

    Returns Result.Ok(Path) on success, Result.Fail for empty input, wrong
    type, a missing path, or a path that is not a regular file.
    """
    if not input_path:
        return Result.Fail("NoneType or empty string is not a valid file path.")
    if isinstance(input_path, Path):
        filepath = input_path
    elif isinstance(input_path, str):
        filepath = Path(input_path)
    else:
        error = f'"input_path" parameter must be str or Path value (not "{type(input_path)}").'
        return Result.Fail(error)
    if not filepath.exists():
        return Result.Fail(f'File does not exist: "{filepath}"')
    if not filepath.is_file():
        return Result.Fail(f'The provided path is NOT a file: "{filepath}"')
    return Result.Ok(filepath)
def initialize(self):
    """Build the URL set for this scrape job.

    Returns Result.Fail("skip") when the scrape condition is NEVER or no URLs
    were produced, the tracker's failure result on error, else Result.Ok().
    """
    if self.scrape_condition == ScrapeCondition.NEVER:
        return Result.Fail("skip")
    # Ensure Ctrl-C cleans up the job, session and spinner.
    signal(
        SIGINT,
        partial(user_cancelled, self.db_session, self.db_job, self.spinner))
    self.spinner.text = "Building URL List..."
    self.spinner.start()
    self.url_tracker = UrlTracker(self.db_job, self.data_set, self.scraped_data)
    result = self.url_tracker.create_url_set(self.start_date, self.end_date)
    # Explicit branches replace the original nested conditional expression.
    if result.failure:
        return result
    if self.url_tracker.total_urls:
        return Result.Ok()
    return Result.Fail("skip")
def download_file(url: str, local_folder: Path):
    """Download url into local_folder, resuming a partial download if present.

    A HEAD request determines the remote file size; the body is then streamed
    (with a Range header when resuming) and the final size is verified.

    Returns Result.Ok(local_file_path) on success, Result.Fail(error) when the
    size is unknown or the downloaded size does not match.
    """
    file_name = get_file_name_from_url(url)
    local_file_path = local_folder.joinpath(file_name)
    r = requests.head(url)
    remote_file_size = int(r.headers.get("content-length", 0))
    if not remote_file_size:
        return Result.Fail(
            f'Request for "{file_name}" did not return a response containing the file size.'
        )
    local_file_size = 0
    resume_header = None
    fopen_mode = "wb"
    if not local_file_path.exists():
        print(f'"{file_name}" does not exist. Downloading...')
    else:
        local_file_size = local_file_path.stat().st_size
        if local_file_size == remote_file_size:
            print(f'"{file_name}" is complete. Skipping...')
            return Result.Ok(local_file_path)
        print(f'"{file_name}" is incomplete. Resuming...')
        resume_header = {"Range": f"bytes={local_file_size}-"}
        fopen_mode = "ab"
    r = requests.get(url, stream=True, headers=resume_header)
    with open(local_file_path, fopen_mode) as f:
        with tqdm(
            total=remote_file_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            desc=local_file_path.name,
            initial=local_file_size,
            ascii=True,
            miniters=1,
        ) as pbar:
            for chunk in r.iter_content(32 * CHUNK_SIZE):
                f.write(chunk)
                pbar.update(len(chunk))
    local_file_size = local_file_path.stat().st_size
    if local_file_size == remote_file_size:
        return Result.Ok(local_file_path)
    more_or_fewer = "more" if local_file_size > remote_file_size else "fewer"
    # BUGFIX: corrected "Recieved" -> "Received" typo in the error message.
    error = (
        f'Received {more_or_fewer} bytes than expected for "{file_name}"!\n'
        f"Expected File Size: {remote_file_size:,} bytes\n"
        f"Received File Size: {local_file_size:,} bytes")
    return Result.Fail(error)
def validate_brooks_game_id(input_str):
    """Parse a brooksbaseball game id into its component fields.

    Returns Result.Ok(game_dict) containing the game id, date, team ids and
    game number.

    Raises:
        ValueError: when input_str does not match BB_GAME_ID_REGEX.
        (Note the mixed error styles: a regex mismatch raises, while a bad
        date returns Result.Fail.)
    """
    match = BB_GAME_ID_REGEX.search(input_str)
    if not match:
        raise ValueError(f"String is not a valid bb game id: {input_str}")
    captured = match.groupdict()
    year = int(captured["year"])
    month = int(captured["month"])
    day = int(captured["day"])
    game_number = int(captured["game_num"])
    try:
        game_date = datetime(year, month, day)
    except Exception as e:
        error = f"Failed to parse game_date from game_id:\n{repr(e)}"
        return Result.Fail(error)
    # NOTE(review): the "home_team" regex group is assigned to away_team_id
    # and vice-versa. This may deliberately correct the group naming inside
    # BB_GAME_ID_REGEX, or it may be a swap bug -- confirm against the regex
    # definition before changing.
    away_team_id = captured["home_team"].upper()
    home_team_id = captured["away_team"].upper()
    game_dict = {
        "game_id": input_str,
        "game_date": game_date,
        "away_team_id": away_team_id,
        "home_team_id": home_team_id,
        "game_number": game_number,
    }
    return Result.Ok(game_dict)
def create_urls_for_brooks_pitch_logs_for_date(db_job, scraped_data, game_date):
    """Build UrlDetails for every pitch-log page of every game on game_date.

    Fails when the prerequisite BROOKS_GAMES_FOR_DATE data has not been
    scraped; possibly-postponed games are skipped.
    """
    data_set = DataSet.BROOKS_PITCH_LOGS
    req_data_set = DataSet.BROOKS_GAMES_FOR_DATE
    games_for_date = scraped_data.get_brooks_games_for_date(game_date)
    if not games_for_date:
        return Result.Fail(get_unscraped_data_error(data_set, req_data_set, game_date))
    urls = []
    # One URL per pitcher appearance, for games that were definitely played.
    playable_games = (g for g in games_for_date.games if not g.might_be_postponed)
    for game in playable_games:
        for pitcher_id, pitch_log_url in game.pitcher_appearance_dict.items():
            pitch_app_id = f"{game.bbref_game_id}_{pitcher_id}"
            url_data = {
                "url": pitch_log_url,
                "url_id": pitch_app_id,
                "fileName": get_filename(scraped_data, data_set, pitch_app_id),
                "cachedHtmlFolderPath": get_cached_html_folderpath(scraped_data, data_set, game_date),
                "scrapedHtmlFolderpath": get_scraped_html_folderpath(db_job, data_set),
            }
            urls.append(from_dict(data_class=UrlDetails, data=url_data))
    return Result.Ok(urls)
def status_date_range(app, start, end, verbosity):
    """Report status for each date in a specified range. Dates can be provided in any format that is recognized by dateutil.parser. For example, all of the following strings are valid ways to represent the same date: "2018-5-13" -or- "05/13/2018" -or- "May 13 2018" """
    # Map verbosity levels to report types; any level above 4 produces the
    # most detailed (missing-pitchfx) report.
    verbosity_report_map = {
        1: StatusReport.DATE_SUMMARY_MISSING_DATA,
        2: StatusReport.DATE_SUMMARY_ALL_DATES,
        3: StatusReport.DATE_DETAIL_MISSING_DATA,
        4: StatusReport.DATE_DETAIL_ALL_DATES,
    }
    if verbosity <= 0:
        error = f"Invalid value for verbosity: {verbosity}. Value must be greater than zero."
        return exit_app(app, Result.Fail(error))
    report_type = verbosity_report_map.get(verbosity, StatusReport.DATE_DETAIL_MISSING_PITCHFX)
    result = report_date_range_status(app.db_session, start, end, report_type)
    if result.success:
        report_viewer = result.value
        report_viewer.launch()
    return exit_app(app, result)
def ui(app):  # pragma: no cover
    """Menu-driven UI powered by Bullet."""
    # Top-level boundary: any uncaught error is converted into a failed
    # Result so the app always exits cleanly.
    try:
        menu_result = MainMenu(app).launch()
        return exit_app(app, menu_result)
    except Exception as e:
        return exit_app(app, Result.Fail(f"Error: {repr(e)}"))
def import_id_map_csv(app, csv_folder):
    """Populate the player_id table from the bbref player-id-map CSV.

    Adds one db.PlayerId row per CSV entry (commit is left to the caller);
    rolls back the session and returns Result.Fail on any error.
    """
    try:
        id_map_task = tasks.UpdatePlayerIdMapTask(app, csv_folder.joinpath(PLAYER_ID_MAP_CSV))
        player_id_map = id_map_task.read_bbref_player_id_map_from_file()
        progress_opts = dict(
            total=len(player_id_map),
            desc="Populating player_id table.....",
            unit="row",
            mininterval=0.12,
            maxinterval=5,
            unit_scale=True,
            ncols=90,
        )
        with tqdm(**progress_opts) as pbar:
            for id_map in player_id_map:
                new_row = db.PlayerId(
                    mlb_id=int(id_map.mlb_ID),
                    mlb_name=id_map.name_common,
                    bbref_id=id_map.player_ID,
                    bbref_name=None,
                )
                app.db_session.add(new_row)
                pbar.update()
        return Result.Ok()
    except Exception as e:
        # Undo the partial insert before reporting the failure.
        app.db_session.rollback()
        return Result.Fail(f"Error: {repr(e)}")
def get_pitch_app_status_record(db_session, pitch_app_id):
    """Look up the scrape-status row for a single pitch appearance."""
    pitch_app_status = db.PitchAppScrapeStatus.find_by_pitch_app_id(
        db_session, pitch_app_id)
    # Guard clause on the missing-record case; success path falls through.
    if not pitch_app_status:
        error = f"scrape_status_pitch_app does not contain an entry for pitch_app_id: {pitch_app_id}"
        return Result.Fail(error)
    return Result.Ok(pitch_app_status)
def check_current_status(self, game_date):
    """Decide whether bbref boxscores for game_date still need scraping.

    Result.Ok() when scraping should proceed (ALWAYS condition, or data not
    yet scraped); Result.Fail("skip") when the date is already complete.
    """
    if self.scrape_condition == ScrapeCondition.ALWAYS:
        return Result.Ok()
    already_scraped = db.DateScrapeStatus.verify_all_bbref_boxscores_scraped_for_date(
        self.db_session, game_date)
    if already_scraped:
        return Result.Fail("skip")
    return Result.Ok()
def execute(self, trim_data_sets=True):
    """Compute pitch/at-bat/inning timing metrics across all eligible games.

    Returns Result.Ok with a dict of three processed data sets, or
    Result.Fail when no game qualifies.
    """
    self.events.find_eligible_games_start()
    game_ids = db.Season_Game_PitchApp_View.get_all_bbref_game_ids_combined_no_missing_pfx(
        self.db_engine)
    if not game_ids:
        return Result.Fail("No games meet the requirements for this process.")
    self.events.find_eligible_games_complete(game_ids)
    self.events.calculate_pitch_metrics_start()
    pitch_samples, at_bat_samples, inning_samples = [], [], []
    for game_num, game_id in enumerate(game_ids, start=1):
        combined_data = self.scraped_data.get_combined_game_data(game_id)
        if not combined_data:
            continue
        metrics_tuple = self.calc_pitch_metrics(combined_data)
        # NOTE(review): only the even indices (0/2/4) of the returned tuple are
        # consumed here; presumably the odd indices hold counts -- confirm
        # against calc_pitch_metrics.
        pitch_samples.extend(metrics_tuple[0])
        at_bat_samples.extend(metrics_tuple[2])
        inning_samples.extend(metrics_tuple[4])
        self.events.calculate_pitch_metrics_progress(game_num)
    self.events.calculate_pitch_metrics_complete()
    metrics = {
        "time_between_pitches": self.process_data_set(pitch_samples, trim=trim_data_sets),
        "time_between_at_bats": self.process_data_set(at_bat_samples, trim=trim_data_sets),
        "time_between_innings": self.process_data_set(inning_samples, trim=trim_data_sets),
    }
    return Result.Ok(metrics)
def check_current_status(self, game_date):
    """Decide whether the brooks daily dashboard for game_date needs scraping.

    Result.Ok() when scraping should proceed (ALWAYS condition, or data not
    yet scraped); Result.Fail("skip") when the date is already complete.
    """
    if self.scrape_condition == ScrapeCondition.ALWAYS:
        return Result.Ok()
    already_scraped = db.DateScrapeStatus.verify_brooks_daily_dashboard_scraped_for_date(
        self.db_session, game_date)
    if already_scraped:
        return Result.Fail("skip")
    return Result.Ok()
def delete_from_s3(self, s3_key):  # pragma: no cover
    """Delete the object stored at s3_key from the configured S3 bucket."""
    try:
        self.s3_resource.Object(self.bucket_name, s3_key).delete()
    except botocore.exceptions.ClientError as ex:
        # Surface the AWS error code alongside the exception details.
        error_code = ex.response["Error"]["Code"]
        return Result.Fail(f"{repr(ex)} (Error Code: {error_code})")
    return Result.Ok()
def validate_file(local_file_path: Path, hash_file_path: Path) -> Result:
    """Verify local_file_path's MD5 digest against the value in hash_file_path.

    BUGFIX: the original computed the digest and then fell off the end of the
    function, implicitly returning None despite the -> Result annotation. The
    comparison and return are now implemented.

    Returns Result.Ok(local_file_path) when the digests match, Result.Fail
    otherwise.
    """
    if not local_file_path.exists():
        return Result.Fail(f"Unable to locate file: {local_file_path}")
    if not hash_file_path.exists():
        return Result.Fail(f"Unable to locate file: {hash_file_path}")
    md5 = hashlib.md5()
    with open(local_file_path, "rb") as f:
        while chunk := f.read(CHUNK_SIZE):
            md5.update(chunk)
    # Hash files are typically "<hexdigest>  <filename>"; compare only the
    # digest token, case-insensitively. TODO confirm the hash file format.
    tokens = hash_file_path.read_text().split()
    if not tokens:
        return Result.Fail(f"Hash file is empty: {hash_file_path}")
    expected = tokens[0].lower()
    computed = md5.hexdigest().lower()
    if computed == expected:
        return Result.Ok(local_file_path)
    error = (
        f'MD5 hash of "{local_file_path.name}" does not match the expected value!\n'
        f"Expected: {expected}\n"
        f"Computed: {computed}")
    return Result.Fail(error)
def validate_folder_path(input_path: Union[Path, str]):
    """Coerce input_path to a Path and verify it is an existing, usable directory.

    Returns Result.Ok(Path) on success, Result.Fail for empty input, wrong
    type, a missing path, a non-directory, or a Windows-reserved name.
    """
    if not input_path:
        return Result.Fail("NoneType or empty string is not a valid folder path.")
    if isinstance(input_path, Path):
        folderpath = input_path
    elif isinstance(input_path, str):
        folderpath = Path(input_path)
    else:
        error = f'"input_path" parameter must be str or Path value (not "{type(input_path)}").'
        return Result.Fail(error)
    if not folderpath.exists():
        return Result.Fail(f'Directory does NOT exist: "{folderpath}"')
    if not folderpath.is_dir():
        return Result.Fail(f'The provided path is NOT a directory: "{folderpath}"')
    if is_windows() and folderpath.is_reserved():
        return Result.Fail(f'The provided path is reserved under Windows: "{folderpath}"')
    return Result.Ok(folderpath)
def change_value(self, env_var_name, new_value):
    """Set a recognized environment variable and persist it to the .env file."""
    if env_var_name not in ENV_VAR_NAMES:
        return Result.Fail(
            f"{env_var_name} is not a recognized environment variable.")
    self.env_var_dict[env_var_name] = new_value
    # Write, then re-read, so the in-memory state reflects what was persisted.
    self.write_dotenv_file()
    self.read_dotenv_file()
    return Result.Ok()
def write_config_file(self):
    """Serialize self.config_json to the config file as pretty-printed JSON.

    Any failure (serialization or file I/O) is converted into Result.Fail.
    """
    try:
        serialized = json.dumps(self.config_json, indent=2, sort_keys=False)
        self.config_filepath.write_text(serialized)
    except Exception as e:
        return Result.Fail(f"Error: {repr(e)}")
    return Result.Ok()