def test_connect_db(self, db_config):
    """Check that ``utils.connect_db`` yields an open connection that closes on exit.

    Inside the ``with`` block the connection must report itself as open and
    be able to run a trivial query; once the block is left it must report
    itself as closed.
    """
    with utils.connect_db(db_config) as connection:
        # The connection is usable while the context is active.
        assert not connection.closed
        cursor = connection.execute("select version()")
        row = cursor.fetchone()
        # The query returns a single-column row.
        assert len(row) == 1
        print(row)
    # Leaving the context must have closed the connection.
    assert connection.closed
def cli(load: bool, clear: bool, season: Optional[int], dir: str) -> None:
    """Script entry point.

    Reads the raw batter and pitcher tables, combines them into one list of
    players, applies name corrections, generates IDs, and then either loads
    the result into the ``player_id`` table or writes it to
    ``player_id.csv``.

    :param load: Load the result into the database instead of dumping a csv
    :param clear: When loading, delete every row of ``player_id`` first
    :param season: If truthy, keep only rows whose ``season`` equals this value
    :param dir: Directory ``player_id.csv`` is written to when not loading
    """
    config = utils.init_config()
    utils.init_logging(config["LOGGING"])
    conn = utils.connect_db(config["DB"])
    # try/finally guarantees the connection is released even when a read,
    # a transformation or the load step raises.
    try:
        batters = pd.read_sql_table("raw_batters_overall", conn)
        pitchers = pd.read_sql_table("raw_pitchers_overall", conn)
        corrections = pd.read_sql_table("name_corrections", conn)
        duplicates = get_duplicates(conn)

        batters = CleanFunctions.normalize_names(batters)
        pitchers = CleanFunctions.normalize_names(pitchers)
        batters = batters[["lname", "fname", "team", "season"]]
        pitchers = pitchers[["lname", "fname", "team", "season"]]

        # All batters and pitchers
        # (remove duplicates where a player batted and pitched in the same season)
        data = pd.merge(
            batters, pitchers, on=["fname", "lname", "team", "season"], how="outer"
        )
        data = data.sort_values(by=["lname", "fname", "team", "season"])
        data = CleanFunctions.apply_corrections(data, corrections)
        data = generate_ids(data, duplicates)

        # The season filter is applied after ID generation, so IDs are
        # computed over all seasons regardless of the filter.
        if season:
            data = data[data["season"] == season]

        if load:
            if clear:
                print("Clearing database table")
                conn.execute("DELETE FROM player_id")
            print("Loading data into database")
            utils.db_load_data(
                data, "player_id", conn, if_exists="append", index=False
            )
        else:
            print("Dumping to csv")
            data.to_csv(os.path.join(dir, "player_id.csv"), index=False)
    finally:
        conn.close()
def cli():
    """Print the per-table summaries for every table in the database.

    The frames returned by ``table_count`` for each table name are
    concatenated, sorted by their ``table`` column and printed in four
    groups: raw tables (names starting with ``raw``) and all other tables,
    each split into names that do not / do end with ``inseason``.
    """
    config = utils.init_config()
    # utils.init_logging(config["LOGGING"])
    conn = utils.connect_db(config["DB"])
    # Only the fetch needs the connection; close it even if a query fails.
    try:
        tables = get_all_table_names(conn)
        data = pd.concat([table_count(conn, table) for table in tables])
    finally:
        conn.close()

    data = data.sort_values("table").reset_index(drop=True)

    # Evaluate each name predicate once and combine the masks per group.
    is_raw = data["table"].str.startswith("raw")
    is_inseason = data["table"].str.endswith("inseason")

    print("***** Raw ******")
    print(data[is_raw & ~is_inseason])
    print(data[is_raw & is_inseason])
    print("**** Clean *****")
    print(data[~is_raw & ~is_inseason])
    print(data[~is_raw & is_inseason])
def final(
    year: List[int], stat: Tuple[int], split: str, load: bool, verbose: bool
) -> None:
    """Run ETLs for the final subcommand

    :param year: Years to process; ``run_etls`` is called once per entry,
        in the given order
    :param stat: Stat identifiers, forwarded to ``run_etls`` as a list
    :param split: ``"all"`` to run every member of ``Split``, otherwise a
        single value accepted by ``Split(...)``
    :param load: Forwarded unchanged to ``run_etls``
    :param verbose: Accepted for interface compatibility; not used here
    :raises ValueError: If ``split`` is neither ``"all"`` nor a valid
        ``Split`` value (assuming ``Split`` is a standard ``Enum``)
    """
    config = utils.init_config()
    utils.init_logging(config["LOGGING"])
    logging.info("Initializing cleaning controller script")

    # Resolve the splits before connecting so that an invalid value fails
    # without ever opening a database connection.
    splits = list(Split) if split == "all" else [Split(split)]
    stats = list(stat)

    conn = utils.connect_db(config["DB"])
    # Release the connection even if one of the ETL runs raises.
    try:
        for year_ in year:
            logging.info("Running ETLs for %s", year_)
            run_etls(stats, year_, splits, load, conn)
    finally:
        conn.close()
    logging.info("Cleaning completed")
def cli(
    corrections: bool,
    fname: int,
    lname: int,
    nicknames: bool,
    duplicates: bool,
    dir: str,
) -> None:
    """Script entry point.

    Reads the raw batter and pitcher tables, combines their name columns
    and runs the requested name analyses. Each analysis that yields at
    least one candidate is written to its own csv in ``dir``; the full
    list of names is always written to ``all_names.csv``.

    :param corrections: Apply the ``name_corrections`` table before analysing
    :param fname: Forwarded to ``levenshtein_analysis`` for first names; the
        analysis runs when this or ``lname`` is truthy
    :param lname: Forwarded to ``levenshtein_analysis`` for last names
    :param nicknames: Run the nickname analysis using the ``nicknames`` table
    :param duplicates: Run the duplicate names analysis
    :param dir: Directory the csv files are written to
    """
    print(fname, lname)
    config = utils.init_config()
    utils.init_logging(config["LOGGING"])

    # Bound up front so the names exist regardless of which flags are set.
    corrections_df = None
    nicknames_df = None

    conn = utils.connect_db(config["DB"])
    # All database reads happen here; close the connection even if one fails.
    try:
        batters = pd.read_sql_table("raw_batters_overall", conn)
        pitchers = pd.read_sql_table("raw_pitchers_overall", conn)
        if corrections:
            corrections_df = pd.read_sql_table("name_corrections", conn)
        if nicknames:
            nicknames_df = pd.read_sql_table("nicknames", conn)
    finally:
        conn.close()

    batters = CleanFunctions.normalize_names(batters)
    pitchers = CleanFunctions.normalize_names(pitchers)
    batters = batters[["lname", "fname", "team", "season", "pos"]]
    pitchers = pitchers[["lname", "fname", "team", "season", "pos"]]

    data = pd.concat([batters, pitchers], ignore_index=True)
    data = data.sort_values(by=["lname", "fname", "team", "season"])
    if corrections:
        data = CleanFunctions.apply_corrections(data, corrections_df)
        # Re-sort after the corrections have been applied.
        data = data.sort_values(by=["lname", "fname", "team", "season"])

    if lname or fname:
        print("Performing levenshtein analysis")
        output = levenshtein_analysis(data, fname, lname)
        _report_candidates(output, dir, "levenshtein_analysis.csv")

    if nicknames:
        print("Performing nickname analysis")
        output = nickname_analysis(data, nicknames_df)
        _report_candidates(output, dir, "nickname_analysis.csv")

    if duplicates:
        print("Performing duplicate names analysis")
        output = duplicate_names_analysis(data)
        _report_candidates(output, dir, "duplicate_names_analysis.csv")

    print("Dumping all names to csv")
    filename = os.path.join(dir, "all_names.csv")
    data.to_csv(filename, index=False)


def _report_candidates(output, dir: str, basename: str) -> None:
    """Print the candidate count and, if non-empty, write ``output`` to ``dir/basename``."""
    print("Found", len(output), "candidates")
    if len(output) > 0:
        print("Dumping to csv")
        filename = os.path.join(dir, basename)
        output.to_csv(filename, index=False)