def _clean(self, data: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw stats table and return it in the final column layout.

    Steps, in order: drop the ``Rk`` column, rename ``gp``/``rcs``/``rcs%``
    to ``g``/``cs``/``cspct``, replace ``"-"`` placeholders with ``"0"`` in
    the counting columns and ``"-"``/``"INF"`` with NaN in the rate columns,
    add ``Season`` (plus ``Date`` when ``self._inseason`` is truthy), select
    the final columns and lower-case their names.

    No dtype conversion happens here; placeholder values are only swapped.

    Args:
        data: Raw table. It must contain ``Rk``, ``Name`` and every stat
            column referenced below, otherwise pandas raises ``KeyError``.
            The frame passed in is not modified.

    Returns:
        A new DataFrame with lower-cased column names.
    """
    unnecessaryCols = ["Rk"]
    renameCols = {"gp": "g", "rcs": "cs", "rcs%": "cspct"}
    intCols = ["g", "tc", "po", "a", "e", "dp", "sba", "cs", "pb", "ci"]
    floatCols = ["fpct", "cspct"]
    finalColNames = [
        "Name",
        "Season",
        "g",
        "tc",
        "po",
        "a",
        "e",
        "fpct",
        "dp",
        "sba",
        "cs",
        "cspct",
        "pb",
        "ci",
    ]
    if self._inseason:
        # in-season output carries the scrape date right after the season
        finalColNames.insert(finalColNames.index("Season") + 1, "Date")

    # drop()/rename() without inplace=True return new frames, so the
    # caller's DataFrame is left untouched and this method can be re-run
    # on the same input.
    data = data.drop(columns=unnecessaryCols).rename(columns=renameCols)

    data[intCols] = data[intCols].replace("-", "0")
    data[floatCols] = (
        data[floatCols].replace("-", np.nan).replace("INF", np.nan)
    )

    data["Season"] = str(utils.year_to_season(self._year))
    if self._inseason:
        data["Date"] = str(date.today())

    data = data[finalColNames]
    data.columns = data.columns.str.lower()
    return data
def _clean_conference(self, data: pd.DataFrame, team: str) -> pd.DataFrame:
    """Return a cleaned copy of a raw conference stats table.

    The ``Rk`` column is dropped, ``gp``/``k``/``k/9`` become
    ``g``/``so``/``so_9``, ``"-"`` placeholders are replaced (``"0"`` in the
    counting columns, NaN in the rate columns, where ``"INF"`` is also
    replaced by NaN), ``Season`` is added (plus ``Date`` when
    ``self._inseason`` is truthy), and the result is restricted to the final
    column order with lower-cased names.

    Args:
        data: Raw table containing ``Rk``, ``Name``, ``ip`` and the stat
            columns referenced below.
        team: Not used by this method.

    Returns:
        A new DataFrame with lower-cased column names.
    """
    count_fields = ["g", "h", "r", "er", "bb", "so", "hr"]
    rate_fields = ["so_9", "era"]

    output_order = [
        "Name",
        "Season",
        "g",
        "ip",
        "h",
        "r",
        "er",
        "bb",
        "so",
        "so_9",
        "hr",
        "era",
    ]
    if self._inseason:
        # the scrape date sits directly after the season column
        output_order.insert(2, "Date")

    frame = data.drop(columns=["Rk"])
    frame = frame.rename(columns={"gp": "g", "k": "so", "k/9": "so_9"})

    frame[count_fields] = frame[count_fields].replace("-", "0")
    without_dashes = frame[rate_fields].replace("-", np.nan)
    frame[rate_fields] = without_dashes.replace("INF", np.nan)

    frame["Season"] = str(utils.year_to_season(self._year))
    if self._inseason:
        frame["Date"] = str(date.today())

    frame = frame[output_order]
    frame.columns = frame.columns.str.lower()
    return frame
def test_year_to_season(self, year, expected):
    """Check that ``utils.year_to_season(year)`` equals ``expected``."""
    actual = utils.year_to_season(year)
    assert actual == expected
def _clean_overall(self, data: pd.DataFrame, team: str) -> pd.DataFrame:
    """Clean a raw overall stats table and return the final column layout.

    Steps, in order: drop the columns listed in ``unnecessaryCols`` (this
    includes the raw ``Name`` column), rename the remaining raw headers via
    ``renameCols``, replace ``"-"`` placeholders with ``"0"`` in the counting
    columns and ``"-"``/``"INF"`` with NaN in the rate columns, set ``Name``
    to ``team``, add ``Season`` (plus ``Date`` when ``self._inseason`` is
    truthy), select the final columns and lower-case their names.

    No dtype conversion happens here; placeholder values are only swapped.

    Args:
        data: Raw table. Every column named in ``unnecessaryCols`` and every
            stat column referenced below must be present, otherwise pandas
            raises ``KeyError``. The frame passed in is not modified.
        team: Value written into the ``Name`` column of every row.

    Returns:
        A new DataFrame with lower-cased column names.
    """
    unnecessaryCols = [
        "No.",
        "Name",
        "Pos",
        "Yr",
        "app",
        "gs",
        "GS",
        "w",
        "l",
        "sv",
        "cg",
        "ip",
        "h",
        "r",
        "er",
        "bb",
        "k",
        "hr",
        "era",
    ]
    intCols = [
        "G",
        "W",
        "L",
        "SV",
        "CG",
        "SHO",
        "IP",
        "H",
        "R",
        "ER",
        "BB",
        "SO",
        "x2B",
        "x3B",
        "HR",
        "AB",
        "WP",
        "HBP",
        "BK",
        "SF",
        "SH",
    ]
    floatCols = ["ERA", "AVG", "SO_9"]
    renameCols = {
        "APP": "G",
        "2B": "x2B",
        "3B": "x3B",
        "B/AVG": "AVG",
        "SFA": "SF",
        "SHA": "SH",
        "k/9": "SO_9",
    }
    finalColNames = [
        "Name",
        "Season",
        "G",
        "W",
        "L",
        "SV",
        "CG",
        "SHO",
        "IP",
        "H",
        "R",
        "ER",
        "BB",
        "SO",
        "ERA",
        "x2B",
        "x3B",
        "HR",
        "AB",
        "AVG",
        "WP",
        "HBP",
        "BK",
        "SF",
        "SH",
        "SO_9",
    ]
    if self._inseason:
        # in-season output carries the scrape date right after the season
        finalColNames.insert(finalColNames.index("Season") + 1, "Date")

    # drop()/rename() without inplace=True return new frames, so the
    # caller's DataFrame is left untouched and this method can be re-run
    # on the same input.
    data = data.drop(columns=unnecessaryCols).rename(columns=renameCols)

    data[intCols] = data[intCols].replace("-", "0")
    data[floatCols] = (
        data[floatCols].replace("-", np.nan).replace("INF", np.nan)
    )

    # the raw Name column was dropped above; every row is labelled with team
    data["Name"] = team
    data["Season"] = str(utils.year_to_season(self._year))
    if self._inseason:
        data["Date"] = str(date.today())

    data = data[finalColNames]
    data.columns = data.columns.str.lower()
    return data
def _clean(self, data: pd.DataFrame, team_id: str) -> pd.DataFrame:
    """Clean a raw per-player stats table and return the final layout.

    Works on a copy of ``data``. Steps, in order: replace ``"-"``
    placeholders with ``"0"`` in the counting columns and ``"-"``/``"INF"``
    with NaN in the rate columns, overwrite the column names positionally
    with ``newColNames``, add ``Team`` and ``Season`` (plus ``Date`` when
    ``self._inseason`` is truthy), strip trailing dots from ``Yr``
    (``Fr.`` -> ``Fr``), turn empty ``Pos`` strings into NaN, select the
    final columns and lower-case their names.

    No dtype conversion happens here; placeholder values are only swapped.

    Args:
        data: Raw table. Its columns must match ``newColNames`` in number
            and order because the rename is positional. The frame passed in
            is not modified.
        team_id: Value written into the ``Team`` column of every row.

    Returns:
        A new DataFrame with lower-cased column names.
    """
    data = data.copy()
    intCols = [
        "No.",
        "g",
        "ab",
        "r",
        "h",
        "2b",
        "3b",
        "hr",
        "rbi",
        "bb",
        "k",
        "sb",
        "cs",
        "hbp",
        "sf",
        "sh",
        "tb",
        "xbh",
        "hdp",
        "go",
        "fo",
        "pa",
    ]
    floatCols = ["avg", "obp", "slg", "go/fo"]
    # Column names cannot start with a digit in PostgreSQL, so the raw
    # names no., 2b, 3b and go/fo are replaced by the names below.
    newColNames = [
        "No",
        "Name",
        "Yr",
        "Pos",
        "G",
        "AB",
        "R",
        "H",
        "x2B",
        "x3B",
        "HR",
        "RBI",
        "BB",
        "SO",
        "SB",
        "CS",
        "AVG",
        "OBP",
        "SLG",
        "HBP",
        "SF",
        "SH",
        "TB",
        "XBH",
        "GDP",
        "GO",
        "FO",
        "GO_FO",
        "PA",
    ]
    finalColNames = [
        "No",
        "Name",
        "Team",
        "Season",
        "Yr",
        "Pos",
        "G",
        "PA",
        "AB",
        "R",
        "H",
        "x2B",
        "x3B",
        "HR",
        "RBI",
        "BB",
        "SO",
        "SB",
        "CS",
        "AVG",
        "OBP",
        "SLG",
        "HBP",
        "SF",
        "SH",
        "TB",
        "XBH",
        "GDP",
        "GO",
        "FO",
        "GO_FO",
    ]
    if self._inseason:
        # in-season output carries the scrape date right after the season
        finalColNames.insert(finalColNames.index("Season") + 1, "Date")

    data[intCols] = data[intCols].replace("-", "0")
    # "INF" is treated as missing as well, matching how the sibling
    # cleaners in this file handle their rate columns.
    data[floatCols] = (
        data[floatCols].replace("-", np.nan).replace("INF", np.nan)
    )

    # convert column names to a friendlier format (positional assignment)
    data.columns = newColNames

    data["Team"] = team_id
    data["Season"] = str(utils.year_to_season(self._year))
    if self._inseason:
        data["Date"] = str(date.today())
    data["Yr"] = data["Yr"].str.rstrip(".")
    data["Pos"] = data["Pos"].replace("", np.nan)

    data = data[finalColNames]
    data.columns = data.columns.str.lower()
    return data
def _clean(self, data: pd.DataFrame, team: str) -> pd.DataFrame:
    """Clean a raw game-log table for the split held in ``self._split``.

    Steps, in order: rename raw headers for the active split, replace
    ``"-"`` placeholders with ``"0"`` in the counting columns and
    ``"-"``/``"INF"`` with NaN in the rate columns, tidy the ``Opponent``
    and ``Date`` text, add ``Name``, ``Season`` (plus ``scrape_date`` when
    ``self._inseason`` is truthy), drop rows whose ``Score`` is empty,
    number the remaining rows in ``game_num`` (as strings, starting at
    ``"1"``), reorder the columns and lower-case their names.

    Final column order: ``game_num``, ``scrape_date`` (in-season only), the
    first remaining input column, ``Season``, ``Name``, then the other
    input columns in their existing order.

    Args:
        data: Raw table containing ``Opponent``, ``Date``, ``Score`` and the
            stat columns of the active split. The frame passed in is not
            modified.
        team: Value written into the ``Name`` column of every row.

    Returns:
        A new DataFrame with lower-cased column names.

    Raises:
        ValueError: If ``self._split`` is not one of the HITTING, PITCHING
            or FIELDING members of ``GameLogSplit``.
    """
    if self._split == GameLogSplit.HITTING:
        intCols = [
            "ab",
            "r",
            "h",
            "x2b",
            "x3b",
            "hr",
            "rbi",
            "bb",
            "so",
            "sb",
            "cs",
            "hbp",
            "sf",
            "sh",
            "tb",
            "xbh",
            "gdp",
            "go",
            "fo",
            "pa",
        ]
        floatCols = ["go_fo"]
        renameCols = {
            "2b": "x2b",
            "3b": "x3b",
            "k": "so",
            "hdp": "gdp",
            "go/fo": "go_fo",
        }
    elif self._split == GameLogSplit.PITCHING:
        intCols = ["w", "l", "sv", "h", "r", "er", "bb", "so", "hr"]
        floatCols = ["era"]
        renameCols = {"k": "so"}
    elif self._split == GameLogSplit.FIELDING:
        intCols = ["tc", "po", "a", "e", "dp", "sba", "cs", "pb", "ci"]
        floatCols = ["fpct", "cspct"]
        renameCols = {"rcs": "cs", "rcs%": "cspct"}
    else:
        # Without this branch the column lists would be unbound and the
        # failure would surface later as an UnboundLocalError.
        raise ValueError(f"unsupported game log split: {self._split!r}")

    # rename() without inplace=True returns a new frame, so the caller's
    # DataFrame is left untouched by everything below.
    data = data.rename(columns=renameCols)
    data[intCols] = data[intCols].replace("-", "0")
    data[floatCols] = (
        data[floatCols].replace("-", np.nan).replace("INF", np.nan)
    )

    # remove tabs first, then collapse runs of whitespace to single spaces
    data["Opponent"] = [
        " ".join(x.replace("\t", "").split()) for x in data["Opponent"]
    ]
    # replace strange # in Date column (Maranatha 2012)
    data["Date"] = [x.replace("#", "").strip() for x in data["Date"]]

    data["Name"] = team
    data["Season"] = str(utils.year_to_season(self._year))
    if self._inseason:
        data["scrape_date"] = str(date.today())

    # Filter out cancelled games that don't have a result. The explicit
    # copy makes the filtered frame independent, so adding game_num below
    # does not raise SettingWithCopyWarning.
    data = data[data["Score"] != ""].copy()
    data["game_num"] = [str(i) for i in range(1, len(data) + 1)]

    # Pull the columns added above out of the list, then rebuild the order
    # described in the docstring.
    finalColNames = data.columns.tolist()
    for added in ("Season", "Name", "game_num"):
        finalColNames.remove(added)
    leading = ["game_num"]
    if self._inseason:
        finalColNames.remove("scrape_date")
        leading.append("scrape_date")
    finalColNames = (
        leading + finalColNames[:1] + ["Season", "Name"] + finalColNames[1:]
    )

    data = data[finalColNames]
    data.columns = data.columns.str.lower()
    return data
def _clean(self, data: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw stats table and return it in the final column layout.

    Steps, in order: drop the ``Rk`` column, replace ``"-"`` placeholders
    with ``"0"`` in the counting columns and ``"-"``/``"INF"`` with NaN in
    the rate columns, overwrite the column names positionally with
    ``newColNames``, add ``Season`` (plus ``Date`` when ``self._inseason``
    is truthy), select the final columns and lower-case their names.

    No dtype conversion happens here; placeholder values are only swapped.

    Args:
        data: Raw table. After ``Rk`` is removed its columns must match
            ``newColNames`` in number and order because the rename is
            positional. The frame passed in is not modified.

    Returns:
        A new DataFrame with lower-cased column names.
    """
    unnecessaryCols = ["Rk"]
    intCols = [
        "gp",
        "ab",
        "r",
        "h",
        "2b",
        "3b",
        "hr",
        "rbi",
        "bb",
        "k",
        "sb",
        "cs",
        "hbp",
        "sf",
        "sh",
        "tb",
        "xbh",
        "hdp",
        "go",
        "fo",
        "pa",
    ]
    floatCols = ["avg", "obp", "slg", "go/fo"]
    newColNames = [
        "Name",
        "G",
        "AB",
        "R",
        "H",
        "x2B",
        "x3B",
        "HR",
        "RBI",
        "BB",
        "SO",
        "SB",
        "CS",
        "AVG",
        "OBP",
        "SLG",
        "HBP",
        "SF",
        "SH",
        "TB",
        "XBH",
        "GDP",
        "GO",
        "FO",
        "GO_FO",
        "PA",
    ]
    finalColNames = [
        "Name",
        "Season",
        "G",
        "PA",
        "AB",
        "R",
        "H",
        "x2B",
        "x3B",
        "HR",
        "RBI",
        "BB",
        "SO",
        "SB",
        "CS",
        "AVG",
        "OBP",
        "SLG",
        "HBP",
        "SF",
        "SH",
        "TB",
        "XBH",
        "GDP",
        "GO",
        "FO",
        "GO_FO",
    ]
    if self._inseason:
        # in-season output carries the scrape date right after the season
        finalColNames.insert(finalColNames.index("Season") + 1, "Date")

    # drop() without inplace=True returns a new frame, so the caller's
    # DataFrame is left untouched and this method can be re-run on the
    # same input.
    data = data.drop(columns=unnecessaryCols)

    data[intCols] = data[intCols].replace("-", "0")
    # "INF" is treated as missing as well, matching how the sibling
    # cleaners in this file handle their rate columns.
    data[floatCols] = (
        data[floatCols].replace("-", np.nan).replace("INF", np.nan)
    )

    # convert column names to a friendlier format (positional assignment)
    data.columns = newColNames

    data["Season"] = str(utils.year_to_season(self._year))
    if self._inseason:
        data["Date"] = str(date.today())

    data = data[finalColNames]
    data.columns = data.columns.str.lower()
    return data
def _clean_conference(self, data: pd.DataFrame, team_id: str) -> pd.DataFrame:
    """Clean a raw per-player conference stats table.

    Steps, in order: rename ``No.``/``app``/``k``/``k/9`` to
    ``No``/``g``/``so``/``so_9``, replace ``"-"`` placeholders with ``"0"``
    in the counting columns and ``"-"``/``"INF"`` with NaN in the rate
    columns, add ``Team`` and ``Season`` (plus ``Date`` when
    ``self._inseason`` is truthy), strip trailing dots from ``Yr``, turn
    empty ``Pos`` strings into NaN, select the final columns and lower-case
    their names.

    No dtype conversion happens here; placeholder values are only swapped.

    Args:
        data: Raw table containing ``Name``, ``Yr``, ``Pos``, ``ip`` and the
            stat columns referenced below; a missing column makes pandas
            raise ``KeyError``. The frame passed in is not modified.
        team_id: Value written into the ``Team`` column of every row.

    Returns:
        A new DataFrame with lower-cased column names.
    """
    renameCols = {"No.": "No", "app": "g", "k": "so", "k/9": "so_9"}
    intCols = [
        "No",
        "g",
        "gs",
        "w",
        "l",
        "sv",
        "cg",
        "h",
        "r",
        "er",
        "bb",
        "so",
        "hr",
    ]
    floatCols = ["so_9", "era"]
    finalColNames = [
        "No",
        "Name",
        "Team",
        "Season",
        "Yr",
        "Pos",
        "g",
        "gs",
        "w",
        "l",
        "sv",
        "cg",
        "ip",
        "h",
        "r",
        "er",
        "bb",
        "so",
        "so_9",
        "hr",
        "era",
    ]
    if self._inseason:
        # in-season output carries the scrape date right after the season
        finalColNames.insert(finalColNames.index("Season") + 1, "Date")

    # rename() without inplace=True returns a new frame, so none of the
    # assignments below touch the caller's DataFrame.
    data = data.rename(columns=renameCols)

    data[intCols] = data[intCols].replace("-", "0")
    data[floatCols] = (
        data[floatCols].replace("-", np.nan).replace("INF", np.nan)
    )

    data["Team"] = team_id
    data["Season"] = str(utils.year_to_season(self._year))
    if self._inseason:
        data["Date"] = str(date.today())
    data["Yr"] = data["Yr"].str.rstrip(".")
    data["Pos"] = data["Pos"].replace("", np.nan)

    data = data[finalColNames]
    data.columns = data.columns.str.lower()
    return data