def _clean(self, data: pd.DataFrame) -> pd.DataFrame:
        """Normalise a scraped stats table into the final column layout.

        The column set handled here (tc, po, a, e, fpct, ...) looks like
        fielding statistics -- NOTE(review): confirm against the caller.

        Steps, in order:
          1. drop the ``Rk`` column and rename ``gp``/``rcs``/``rcs%``;
          2. replace the ``"-"`` placeholder with ``"0"`` in the count
             columns and with NaN in the rate columns (``"INF"`` also
             becomes NaN there); no dtype casting is performed here;
          3. add ``Season`` (string form of
             ``utils.year_to_season(self._year)``) and, when
             ``self._inseason`` is truthy, ``Date`` (today's date string);
          4. select the final columns and lower-case their names.

        Args:
            data: Raw table; must contain ``Rk``, ``Name`` and the stat
                columns under their pre-rename names.

        Returns:
            A new DataFrame. ``data`` itself is left unmodified, so the
            method can safely be called again on the same input.

        Raises:
            KeyError: If an expected column is missing from ``data``.
        """
        unnecessaryCols = ["Rk"]
        renameCols = {"gp": "g", "rcs": "cs", "rcs%": "cspct"}
        intCols = ["g", "tc", "po", "a", "e", "dp", "sba", "cs", "pb", "ci"]
        floatCols = ["fpct", "cspct"]
        statCols = [
            "g",
            "tc",
            "po",
            "a",
            "e",
            "fpct",
            "dp",
            "sba",
            "cs",
            "cspct",
            "pb",
            "ci",
        ]

        # "Date" is only part of the layout for in-season scrapes.
        leadCols = ["Name", "Season"]
        if self._inseason:
            leadCols.append("Date")
        finalColNames = leadCols + statCols

        # drop()/rename() without inplace=True return new frames, so every
        # assignment below lands on our own object, not on the caller's.
        data = data.drop(columns=unnecessaryCols).rename(columns=renameCols)

        data[intCols] = data[intCols].replace("-", "0")
        data[floatCols] = data[floatCols].replace(["-", "INF"], np.nan)

        data["Season"] = str(utils.year_to_season(self._year))
        if self._inseason:
            data["Date"] = str(date.today())

        data = data[finalColNames]
        data.columns = data.columns.to_series().str.lower()
        return data
Exemple #2
0
    def _clean_conference(self, data: pd.DataFrame, team: str) -> pd.DataFrame:
        """Tidy a conference stats table (pitching-style columns).

        Drops ``Rk``, renames ``gp``/``k``/``k/9``, swaps the ``"-"``
        placeholder for ``"0"`` in the count columns and for NaN in the rate
        columns (``"INF"`` becomes NaN there too), stamps ``Season`` (and
        ``Date`` when ``self._inseason`` is truthy), then returns the
        selected columns with lower-cased names.

        Args:
            data: Raw table containing ``Rk``, ``Name``, ``ip`` and the
                pre-rename stat columns.
            team: Accepted for signature compatibility; not used here.

        Returns:
            The cleaned DataFrame.
        """
        leading = ["Name", "Season"]
        if self._inseason:
            leading = leading + ["Date"]
        stats = ["g", "ip", "h", "r", "er", "bb", "so", "so_9", "hr", "era"]
        countCols = ["g", "h", "r", "er", "bb", "so", "hr"]
        rateCols = ["so_9", "era"]

        cleaned = data.drop(columns=["Rk"])
        cleaned = cleaned.rename(columns={"gp": "g", "k": "so", "k/9": "so_9"})

        cleaned[countCols] = cleaned[countCols].replace("-", "0")
        # Same two passes as before: "-" first, then "INF".
        for placeholder in ("-", "INF"):
            cleaned[rateCols] = cleaned[rateCols].replace(placeholder, np.nan)

        season = utils.year_to_season(self._year)
        cleaned["Season"] = str(season)
        if self._inseason:
            cleaned["Date"] = str(date.today())

        cleaned = cleaned[leading + stats]
        cleaned.columns = cleaned.columns.str.lower()
        return cleaned
Exemple #3
0
 def test_year_to_season(self, year, expected):
     """Check that ``utils.year_to_season(year)`` equals ``expected``."""
     actual = utils.year_to_season(year)
     assert actual == expected
Exemple #4
0
    def _clean_overall(self, data: pd.DataFrame, team: str) -> pd.DataFrame:
        """Normalise an overall (team-level) stats table.

        The upper-case columns kept here (W, L, SV, ERA, ...) look like
        pitching totals -- NOTE(review): confirm against the caller.

        Steps, in order:
          1. drop the roster columns and the lower-case stat columns listed
             in ``unnecessaryCols`` (including the scraped ``Name``);
          2. rename the remaining columns to their final names;
          3. replace the ``"-"`` placeholder with ``"0"`` in the count
             columns and with NaN in the rate columns (``"INF"`` also
             becomes NaN there); no dtype casting is performed here;
          4. set ``Name`` to ``team``, add ``Season`` and, when
             ``self._inseason`` is truthy, ``Date`` (today's date string);
          5. select the final columns and lower-case their names.

        Args:
            data: Raw table containing every column in ``unnecessaryCols``
                plus the pre-rename stat columns.
            team: Value written to the ``Name`` column of every row.

        Returns:
            A new DataFrame. ``data`` itself is left unmodified.

        Raises:
            KeyError: If an expected column is missing from ``data``.
        """
        unnecessaryCols = [
            "No.",
            "Name",
            "Pos",
            "Yr",
            "app",
            "gs",
            "GS",
            "w",
            "l",
            "sv",
            "cg",
            "ip",
            "h",
            "r",
            "er",
            "bb",
            "k",
            "hr",
            "era",
        ]
        intCols = [
            "G",
            "W",
            "L",
            "SV",
            "CG",
            "SHO",
            "IP",
            "H",
            "R",
            "ER",
            "BB",
            "SO",
            "x2B",
            "x3B",
            "HR",
            "AB",
            "WP",
            "HBP",
            "BK",
            "SF",
            "SH",
        ]
        floatCols = ["ERA", "AVG", "SO_9"]
        renameCols = {
            "APP": "G",
            "2B": "x2B",
            "3B": "x3B",
            "B/AVG": "AVG",
            "SFA": "SF",
            "SHA": "SH",
            "k/9": "SO_9",
        }
        statCols = [
            "G",
            "W",
            "L",
            "SV",
            "CG",
            "SHO",
            "IP",
            "H",
            "R",
            "ER",
            "BB",
            "SO",
            "ERA",
            "x2B",
            "x3B",
            "HR",
            "AB",
            "AVG",
            "WP",
            "HBP",
            "BK",
            "SF",
            "SH",
            "SO_9",
        ]

        # "Date" is only part of the layout for in-season scrapes.
        leadCols = ["Name", "Season"]
        if self._inseason:
            leadCols.append("Date")
        finalColNames = leadCols + statCols

        # drop()/rename() without inplace=True return new frames, so every
        # assignment below lands on our own object, not on the caller's.
        data = data.drop(columns=unnecessaryCols).rename(columns=renameCols)

        data[intCols] = data[intCols].replace("-", "0")
        data[floatCols] = data[floatCols].replace(["-", "INF"], np.nan)

        data["Name"] = team
        data["Season"] = str(utils.year_to_season(self._year))
        if self._inseason:
            data["Date"] = str(date.today())

        data = data[finalColNames]
        data.columns = data.columns.to_series().str.lower()
        return data
Exemple #5
0
    def _clean(self, data: pd.DataFrame, team_id: str) -> pd.DataFrame:
        """Tidy a per-player stats table and tag it with team and season.

        Works on a copy of ``data``. Replaces the ``"-"`` placeholder
        (``"0"`` in count columns, NaN in rate columns), renames all columns
        positionally, adds ``Team``/``Season`` (and ``Date`` when
        ``self._inseason`` is truthy), strips trailing dots from ``Yr``
        (``Fr.`` -> ``Fr``), turns empty ``Pos`` strings into NaN, and
        returns the selected columns with lower-cased names.

        The positional rename exists because, per the original author's
        note, names such as ``no.``, ``2b``, ``3b`` and ``go/fo`` are not
        usable as PostgreSQL column names.

        Args:
            data: Raw table whose columns are in the same order as
                ``friendlyNames`` below (the rename is positional).
            team_id: Value written to the ``Team`` column of every row.

        Returns:
            The cleaned DataFrame.
        """
        frame = data.copy()

        countCols = [
            "No.", "g", "ab", "r", "h", "2b", "3b", "hr", "rbi", "bb", "k",
            "sb", "cs", "hbp", "sf", "sh", "tb", "xbh", "hdp", "go", "fo",
            "pa",
        ]
        rateCols = ["avg", "obp", "slg", "go/fo"]

        # One friendly name per incoming column, in the incoming order.
        friendlyNames = [
            "No", "Name", "Yr", "Pos", "G", "AB", "R", "H", "x2B", "x3B",
            "HR", "RBI", "BB", "SO", "SB", "CS", "AVG", "OBP", "SLG", "HBP",
            "SF", "SH", "TB", "XBH", "GDP", "GO", "FO", "GO_FO", "PA",
        ]

        idCols = ["No", "Name", "Team", "Season"]
        if self._inseason:
            idCols = idCols + ["Date"]
        outputCols = idCols + [
            "Yr", "Pos", "G", "PA", "AB", "R", "H", "x2B", "x3B", "HR",
            "RBI", "BB", "SO", "SB", "CS", "AVG", "OBP", "SLG", "HBP", "SF",
            "SH", "TB", "XBH", "GDP", "GO", "FO", "GO_FO",
        ]

        # Placeholders are replaced while the raw column names still apply.
        frame[countCols] = frame[countCols].replace("-", "0")
        frame[rateCols] = frame[rateCols].replace("-", np.nan)

        frame.columns = friendlyNames

        frame["Team"] = team_id
        season = utils.year_to_season(self._year)
        frame["Season"] = str(season)
        if self._inseason:
            frame["Date"] = str(date.today())
        frame["Yr"] = frame["Yr"].str.rstrip(".")
        frame["Pos"] = frame["Pos"].replace("", np.nan)

        frame = frame[outputCols]
        frame.columns = frame.columns.str.lower()
        return frame
Exemple #6
0
    def _clean(self, data: pd.DataFrame, team: str) -> pd.DataFrame:
        """Normalise a scraped game-log table for the configured split.

        Steps, in order:
          1. pick the count/rate columns and renames for ``self._split``;
          2. rename columns, replace the ``"-"`` placeholder with ``"0"`` in
             count columns and with NaN in rate columns (``"INF"`` also
             becomes NaN there); no dtype casting is performed here;
          3. clean ``Opponent`` (remove tabs, collapse whitespace) and
             ``Date`` (remove ``#``, strip);
          4. add ``Name`` (= ``team``), ``Season`` and, when
             ``self._inseason`` is truthy, ``scrape_date``;
          5. drop rows whose ``Score`` is empty (cancelled games) and number
             the remaining rows 1..n as strings in ``game_num``;
          6. reorder to ``game_num``, [``scrape_date``], first original
             column, ``Season``, ``Name``, remaining original columns, and
             lower-case all column names.

        Args:
            data: Raw game log containing ``Opponent``, ``Date``, ``Score``
                and the pre-rename stat columns for the split.
            team: Value written to the ``Name`` column of every row.

        Returns:
            A new DataFrame. ``data`` itself is left unmodified.

        Raises:
            ValueError: If ``self._split`` is not a handled
                ``GameLogSplit`` member.
            KeyError: If an expected column is missing from ``data``.
        """
        if self._split == GameLogSplit.HITTING:
            intCols = [
                "ab",
                "r",
                "h",
                "x2b",
                "x3b",
                "hr",
                "rbi",
                "bb",
                "so",
                "sb",
                "cs",
                "hbp",
                "sf",
                "sh",
                "tb",
                "xbh",
                "gdp",
                "go",
                "fo",
                "pa",
            ]
            floatCols = ["go_fo"]
            renameCols = {
                "2b": "x2b",
                "3b": "x3b",
                "k": "so",
                "hdp": "gdp",
                "go/fo": "go_fo",
            }
        elif self._split == GameLogSplit.PITCHING:
            intCols = ["w", "l", "sv", "h", "r", "er", "bb", "so", "hr"]
            floatCols = ["era"]
            renameCols = {"k": "so"}
        elif self._split == GameLogSplit.FIELDING:
            intCols = ["tc", "po", "a", "e", "dp", "sba", "cs", "pb", "ci"]
            floatCols = ["fpct", "cspct"]
            renameCols = {"rcs": "cs", "rcs%": "cspct"}
        else:
            # Previously this fell through and failed later with an
            # unbound-local error on intCols; fail early and clearly instead.
            raise ValueError(
                "unsupported game log split: %r" % (self._split,)
            )

        # rename() without inplace=True returns a new frame, so every
        # assignment below lands on our own object, not on the caller's.
        data = data.rename(columns=renameCols)

        data[intCols] = data[intCols].replace("-", "0")
        data[floatCols] = data[floatCols].replace(["-", "INF"], np.nan)

        # Tabs are removed outright (not turned into spaces) before the
        # remaining whitespace runs are collapsed to single spaces.
        data["Opponent"] = [
            " ".join(x.replace("\t", "").split()) for x in data["Opponent"]
        ]

        # replace strange # in Date column (Maranatha 2012)
        data["Date"] = [x.replace("#", "").strip() for x in data["Date"]]

        data["Name"] = team
        data["Season"] = str(utils.year_to_season(self._year))
        if self._inseason:
            data["scrape_date"] = str(date.today())

        # filter out cancelled games that don't have a result; copy so the
        # game_num assignment below is not a chained assignment on a slice
        data = data[data["Score"] != ""].copy()
        data["game_num"] = [str(i) for i in range(1, len(data) + 1)]

        # Column order: game_num, [scrape_date], first original column,
        # Season, Name, then the rest in their existing order.
        movedCols = ["game_num", "Season", "Name"]
        leadCols = ["game_num"]
        if self._inseason:
            movedCols.append("scrape_date")
            leadCols.append("scrape_date")
        otherCols = [col for col in data.columns if col not in movedCols]
        finalColNames = (
            leadCols + otherCols[:1] + ["Season", "Name"] + otherCols[1:]
        )

        data = data[finalColNames]
        data.columns = data.columns.to_series().str.lower()
        return data
Exemple #7
0
    def _clean(self, data: pd.DataFrame) -> pd.DataFrame:
        """Normalise a scraped stats table (hitting-style columns).

        Steps, in order:
          1. drop the ``Rk`` column;
          2. replace the ``"-"`` placeholder with ``"0"`` in the count
             columns and with NaN in the rate columns; no dtype casting is
             performed here;
          3. rename all columns positionally to ``newColNames``;
          4. add ``Season`` (string form of
             ``utils.year_to_season(self._year)``) and, when
             ``self._inseason`` is truthy, ``Date`` (today's date string);
          5. select the final columns and lower-case their names.

        Args:
            data: Raw table containing ``Rk`` followed by columns in the
                same order as ``newColNames`` (the rename is positional).

        Returns:
            A new DataFrame. ``data`` itself is left unmodified, so the
            method can safely be called again on the same input.

        Raises:
            KeyError: If an expected column is missing from ``data``.
            ValueError: If the column count does not match ``newColNames``
                (raised by pandas on the positional rename).
        """
        unnecessaryCols = ["Rk"]
        intCols = [
            "gp",
            "ab",
            "r",
            "h",
            "2b",
            "3b",
            "hr",
            "rbi",
            "bb",
            "k",
            "sb",
            "cs",
            "hbp",
            "sf",
            "sh",
            "tb",
            "xbh",
            "hdp",
            "go",
            "fo",
            "pa",
        ]
        floatCols = ["avg", "obp", "slg", "go/fo"]
        newColNames = [
            "Name",
            "G",
            "AB",
            "R",
            "H",
            "x2B",
            "x3B",
            "HR",
            "RBI",
            "BB",
            "SO",
            "SB",
            "CS",
            "AVG",
            "OBP",
            "SLG",
            "HBP",
            "SF",
            "SH",
            "TB",
            "XBH",
            "GDP",
            "GO",
            "FO",
            "GO_FO",
            "PA",
        ]
        statCols = [
            "G",
            "PA",
            "AB",
            "R",
            "H",
            "x2B",
            "x3B",
            "HR",
            "RBI",
            "BB",
            "SO",
            "SB",
            "CS",
            "AVG",
            "OBP",
            "SLG",
            "HBP",
            "SF",
            "SH",
            "TB",
            "XBH",
            "GDP",
            "GO",
            "FO",
            "GO_FO",
        ]

        # "Date" is only part of the layout for in-season scrapes.
        leadCols = ["Name", "Season"]
        if self._inseason:
            leadCols.append("Date")
        finalColNames = leadCols + statCols

        # drop() without inplace=True returns a new frame, so every
        # assignment below lands on our own object, not on the caller's.
        data = data.drop(columns=unnecessaryCols)

        data[intCols] = data[intCols].replace("-", "0")
        data[floatCols] = data[floatCols].replace("-", np.nan)

        # convert column names to a friendlier format
        data.columns = newColNames

        data["Season"] = str(utils.year_to_season(self._year))
        if self._inseason:
            data["Date"] = str(date.today())

        data = data[finalColNames]
        data.columns = data.columns.to_series().str.lower()
        return data
Exemple #8
0
    def _clean_conference(self, data: pd.DataFrame,
                          team_id: str) -> pd.DataFrame:
        """Normalise a per-player conference stats table (pitching-style).

        Steps, in order:
          1. rename ``No.``/``app``/``k``/``k/9`` to their final names;
          2. replace the ``"-"`` placeholder with ``"0"`` in the count
             columns and with NaN in the rate columns (``"INF"`` also
             becomes NaN there); no dtype casting is performed here;
          3. add ``Team`` (= ``team_id``), ``Season`` and, when
             ``self._inseason`` is truthy, ``Date`` (today's date string);
          4. strip trailing dots from ``Yr`` and turn empty ``Pos`` strings
             into NaN;
          5. select the final columns and lower-case their names.

        Args:
            data: Raw table containing ``Name``, ``Yr``, ``Pos``, ``ip`` and
                the pre-rename stat columns.
            team_id: Value written to the ``Team`` column of every row.

        Returns:
            A new DataFrame. ``data`` itself is left unmodified.

        Raises:
            KeyError: If an expected column is missing from ``data``.
        """
        renameCols = {"No.": "No", "app": "g", "k": "so", "k/9": "so_9"}
        intCols = [
            "No",
            "g",
            "gs",
            "w",
            "l",
            "sv",
            "cg",
            "h",
            "r",
            "er",
            "bb",
            "so",
            "hr",
        ]
        floatCols = ["so_9", "era"]
        statCols = [
            "Yr",
            "Pos",
            "g",
            "gs",
            "w",
            "l",
            "sv",
            "cg",
            "ip",
            "h",
            "r",
            "er",
            "bb",
            "so",
            "so_9",
            "hr",
            "era",
        ]

        # "Date" is only part of the layout for in-season scrapes.
        leadCols = ["No", "Name", "Team", "Season"]
        if self._inseason:
            leadCols.append("Date")
        finalColNames = leadCols + statCols

        # rename() without inplace=True returns a new frame, so every
        # assignment below lands on our own object, not on the caller's.
        data = data.rename(columns=renameCols)

        data[intCols] = data[intCols].replace("-", "0")
        data[floatCols] = data[floatCols].replace(["-", "INF"], np.nan)

        data["Team"] = team_id
        data["Season"] = str(utils.year_to_season(self._year))
        if self._inseason:
            data["Date"] = str(date.today())
        data["Yr"] = data["Yr"].str.rstrip(".")
        data["Pos"] = data["Pos"].replace("", np.nan)

        data = data[finalColNames]
        data.columns = data.columns.to_series().str.lower()
        return data