Example #1
def test_connect_db(self, db_config):
    """connect_db should yield an open connection that is closed on exit."""
    with utils.connect_db(db_config) as conn:
        assert not conn.closed
        result = conn.execute("select version()").fetchone()
        assert len(result) == 1
        print(result)
    assert conn.closed
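
Every example on this page obtains its connection from utils.connect_db. As a minimal sketch only, assuming the project sits on SQLAlchemy and that the DB config section carries USER, PASSWORD, HOST, PORT, and NAME keys (both assumptions, not the project's actual code), the helper could look roughly like this:

import sqlalchemy


def connect_db(db_config):
    """Open a database connection from a config mapping (hypothetical sketch)."""
    # Assumed config keys; the real layout may differ.
    url = "postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}".format(**db_config)
    engine = sqlalchemy.create_engine(url)
    # A SQLAlchemy Connection works as a context manager and exposes .closed,
    # which is exactly what the test above relies on.
    return engine.connect()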
Example #2
def cli(load: bool, clear: bool, season: Optional[int], dir: str) -> None:
    """Script entry point"""

    config = utils.init_config()
    utils.init_logging(config["LOGGING"])
    conn = utils.connect_db(config["DB"])

    batters = pd.read_sql_table("raw_batters_overall", conn)
    pitchers = pd.read_sql_table("raw_pitchers_overall", conn)
    corrections = pd.read_sql_table("name_corrections", conn)
    duplicates = get_duplicates(conn)

    batters = CleanFunctions.normalize_names(batters)
    pitchers = CleanFunctions.normalize_names(pitchers)

    batters = batters[["lname", "fname", "team", "season"]]
    pitchers = pitchers[["lname", "fname", "team", "season"]]

    # All batters and pitchers
    # (remove duplicates where a player batted and pitched in the same season)
    data = pd.merge(batters,
                    pitchers,
                    on=["fname", "lname", "team", "season"],
                    how="outer")
    data = data.sort_values(by=["lname", "fname", "team", "season"])

    data = CleanFunctions.apply_corrections(data, corrections)
    data = generate_ids(data, duplicates)

    if season:
        data = data[data["season"] == season]

    if load:
        if clear:
            print("Clearing database table")
            conn.execute("DELETE FROM player_id")

        print("Loading data into database")
        utils.db_load_data(data,
                           "player_id",
                           conn,
                           if_exists="append",
                           index=False)

    else:
        print("Dumping to csv")
        data.to_csv(os.path.join(dir, "player_id.csv"), index=False)
    conn.close()
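
utils.db_load_data is called here with to_sql-style keyword arguments (if_exists="append", index=False), so it is plausibly a thin wrapper around pandas.DataFrame.to_sql. A hedged sketch under that assumption, not the project's actual implementation:

import pandas as pd


def db_load_data(data: pd.DataFrame, table_name: str, conn, **kwargs) -> None:
    """Load a DataFrame into a database table (hypothetical wrapper)."""
    # if_exists="append" and index=False are forwarded straight to to_sql.
    data.to_sql(table_name, conn, **kwargs)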
Example #3
def cli():
    """Print row counts for every table, grouped into raw and clean tables."""
    config = utils.init_config()
    # utils.init_logging(config["LOGGING"])
    conn = utils.connect_db(config["DB"])

    tables = get_all_table_names(conn)
    data = pd.concat([table_count(conn, table) for table in tables])
    data.sort_values("table", inplace=True)
    data.reset_index(drop=True, inplace=True)
    raw = data[data["table"].str.startswith("raw")]
    clean = data[~data["table"].str.startswith("raw")]
    print("***** Raw ******")
    print(raw[~raw["table"].str.endswith("inseason")])
    print(raw[raw["table"].str.endswith("inseason")])
    print("**** Clean *****")
    print(clean[~clean["table"].str.endswith("inseason")])
    print(clean[clean["table"].str.endswith("inseason")])
    conn.close()
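
table_count evidently returns a one-row DataFrame with at least a "table" column, since the results are concatenated and then filtered on that column. A plausible sketch; the "rows" column name is an assumption:

import pandas as pd


def table_count(conn, table: str) -> pd.DataFrame:
    """Count the rows in one table and return a one-row DataFrame (sketch)."""
    count = conn.execute("SELECT COUNT(*) FROM {}".format(table)).fetchone()[0]
    return pd.DataFrame({"table": [table], "rows": [count]})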
Example #4
def final(
    year: List[int], stat: Tuple[int], split: str, load: bool, verbose: bool
) -> None:
    """Run ETLs for the final subcommand

    :param args: Arguments for the ETLs
    :param conn: Database connection object
    """
    config = utils.init_config()
    utils.init_logging(config["LOGGING"])
    logging.info("Initializing cleaning controller script")
    conn = utils.connect_db(config["DB"])

    if split == "all":
        splits = list(Split)
    else:
        splits = [Split(split)]

    for year_ in year:
        logging.info("Running ETLs for %s", year_)
        run_etls(list(stat), year_, splits, load, conn)

    conn.close()
    logging.info("Cleaning completed")
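
The split handling assumes Split is an Enum whose values are the split names, so that Split(split) looks a member up by value and list(Split) enumerates every member. An illustrative definition only (the project's real member names are not shown in this example):

from enum import Enum


class Split(Enum):
    # Illustrative members only; the actual splits may differ.
    OVERALL = "overall"
    CONFERENCE = "conference"


# Split("conference") is Split.CONFERENCE, and list(Split) yields all members,
# which is how the split == "all" branch above expands.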
Example #5
def cli(
    corrections: bool,
    fname: int,
    lname: int,
    nicknames: bool,
    duplicates: bool,
    dir: str,
) -> None:
    """Script entry point"""
    levenshtein_first = fname
    levenshtein_last = lname
    print(fname, lname)
    config = utils.init_config()
    utils.init_logging(config["LOGGING"])
    conn = utils.connect_db(config["DB"])

    batters = pd.read_sql_table("raw_batters_overall", conn)
    pitchers = pd.read_sql_table("raw_pitchers_overall", conn)
    if corrections:
        corrections_df = pd.read_sql_table("name_corrections", conn)
    if nicknames:
        nicknames_df = pd.read_sql_table("nicknames", conn)

    conn.close()

    batters = CleanFunctions.normalize_names(batters)
    pitchers = CleanFunctions.normalize_names(pitchers)

    batters = batters[["lname", "fname", "team", "season", "pos"]]
    pitchers = pitchers[["lname", "fname", "team", "season", "pos"]]

    data = pd.concat([batters, pitchers], ignore_index=True)
    data = data.sort_values(by=["lname", "fname", "team", "season"])

    if corrections:
        data = CleanFunctions.apply_corrections(data, corrections_df)
        data = data.sort_values(by=["lname", "fname", "team", "season"])

    if levenshtein_last or levenshtein_first:
        print("Performing levenshtein analysis")
        output = levenshtein_analysis(data, levenshtein_first,
                                      levenshtein_last)
        print("Found", len(output), "candidates")
        if len(output) > 0:
            print("Dumping to csv")
            filename = os.path.join(dir, "levenshtein_analysis.csv")
            output.to_csv(filename, index=False)

    if nicknames:
        print("Performing nickname analysis")
        output = nickname_analysis(data, nicknames_df)
        print("Found", len(output), "candidates")
        if len(output) > 0:
            print("Dumping to csv")
            filename = os.path.join(dir, "nickname_analysis.csv")
            output.to_csv(filename, index=False)

    if duplicates:
        print("Performing duplicate names analysis")
        output = duplicate_names_analysis(data)
        print("Found", len(output), "candidates")
        if len(output) > 0:
            print("Dumping to csv")
            filename = os.path.join(dir, "duplicate_names_analysis.csv")
            output.to_csv(filename, index=False)

    print("Dumping all names to csv")
    filename = os.path.join(dir, "all_names.csv")
    data.to_csv(filename, index=False)
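
The levenshtein_analysis step presumably flags pairs of names whose edit distance falls within the fname/lname thresholds passed on the command line; the helper itself is not shown here. A self-contained edit-distance function it could build on:

def levenshtein(a: str, b: str) -> int:
    """Classic dynamic-programming edit distance between two strings."""
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            current.append(min(previous[j] + 1,                # deletion
                               current[j - 1] + 1,             # insertion
                               previous[j - 1] + (ca != cb)))  # substitution
        previous = current
    return previous[-1]


# For example, levenshtein("Jon", "John") == 1, which an fname threshold of 1 would flag.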