def test_write_csv_empty():
    """An empty DataFrame yields a header-only CSV after its index is repaired."""
    empty_df = pd.DataFrame(
        [], columns=[CommonFields.DATE, CommonFields.FIPS, CommonFields.CASES]
    )
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as captured:
        write_df_as_csv(empty_df, tmp.path, structlog.get_logger())
        assert tmp.file.read() == "fips,date,cases\n"
        events = [record["event"] for record in captured]
        assert events == ["Fixing DataFrame index", "Writing DataFrame"]
def test_remove_index_column():
    """A stray column literally named 'index' is dropped before writing."""
    frame = pd.DataFrame(
        [("99", "2020-04-01", "a", 123)],
        columns=["fips", "date", "index", "cases"],
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as captured:
        write_df_as_csv(frame, tmp.path, structlog.get_logger())
        assert tmp.file.read() == "fips,date,cases\n99,2020-04-01,123\n"
        events = [record["event"] for record in captured]
        assert events == ["Dropping column named 'index'", "Writing DataFrame"]
def test_write_csv_columns_are_sorted_in_output_with_extras():
    """Value columns appear sorted by name in the CSV regardless of input order."""
    frame = pd.DataFrame(
        [],
        columns=[
            CommonFields.DATE,
            CommonFields.FIPS,
            "extra2",
            CommonFields.CASES,
            "extra1",
        ],
    )
    frame = frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as captured:
        logger = structlog.get_logger()
        write_df_as_csv(frame, tmp.path, logger)
        assert tmp.file.read() == "fips,date,cases,extra1,extra2\n"
        assert [record["event"] for record in captured] == ["Writing DataFrame"]
def main(replace_local_mirror: bool):
    """Optionally refresh the local AWS data-lake mirror, then transform each
    source and write it out as a per-source timeseries CSV under DATA_ROOT."""
    common_init.configure_logging()
    copier = AwsDataLakeCopier.make_with_data_root(DATA_ROOT)
    if replace_local_mirror:
        copier.replace_local_mirror()
    lake_transformer = AwsDataLakeTransformer.make_with_data_root(DATA_ROOT)
    for source_name, source_files in copier.get_sources():
        # One logger per source so every log line carries the source name.
        source_log = structlog.get_logger(source_name=source_name)
        output_path = DATA_ROOT / "aws-lake" / f"timeseries-{source_name}.csv"
        write_df_as_csv(
            lake_transformer.transform(source_files, source_log),
            output_path,
            source_log,
        )
def test_write_csv_extra_columns_dropped():
    """only_common_columns strips columns outside CommonFields before writing."""
    frame = pd.DataFrame(
        [],
        columns=[
            CommonFields.DATE,
            CommonFields.FIPS,
            "extra1",
            CommonFields.CASES,
            "extra2",
        ],
    )
    frame = frame.set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as captured:
        logger = structlog.get_logger()
        write_df_as_csv(only_common_columns(frame, logger), tmp.path, logger)
        assert tmp.file.read() == "fips,date,cases\n"
        events = [record["event"] for record in captured]
        assert events == ["Dropping columns not in CommonFields", "Writing DataFrame"]
def test_float_na_formatting():
    """NA/None values become empty cells and whole floats drop the trailing .0."""
    frame = pd.DataFrame(
        [
            ("99", "2020-04-01", 1.0, 2, 3),
            ("99", "2020-04-02", pd.NA, pd.NA, None),
        ],
        columns="fips date metric_a metric_b metric_c".split(),
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    expected_csv = (
        "fips,date,metric_a,metric_b,metric_c\n"
        "99,2020-04-01,1,2,3\n"
        "99,2020-04-02,,,\n"
    )
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as captured:
        write_df_as_csv(frame, tmp.path, structlog.get_logger())
        assert tmp.file.read() == expected_csv
        assert [record["event"] for record in captured] == ["Writing DataFrame"]
def test_float_formatting():
    """Round-tripping through read/write normalizes float rendering:
    whole floats lose the decimal point, tiny values go scientific."""
    input_csv = (
        "fips,date,col_1,col_2,col_3,col_4,col_5,col_6\n"
        "99123,2020-04-01,1,2.0000000,3,0.0004,0.00005,6000000000\n"
        "99123,2020-04-02,,,,,,\n"
        "99123,2020-04-03,1,2,3.1234567,4,5,6.0\n"
    )
    input_df = read_csv_to_indexed_df(StringIO(input_csv))
    expected_csv = (
        "fips,date,col_1,col_2,col_3,col_4,col_5,col_6\n"
        "99123,2020-04-01,1,2,3,0.0004,5e-05,6000000000\n"
        "99123,2020-04-02,,,,,,\n"
        "99123,2020-04-03,1,2,3.1234567,4,5,6\n"
    )
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as captured:
        write_df_as_csv(input_df, tmp.path, structlog.get_logger())
        assert tmp.file.read() == expected_csv
        assert [record["event"] for record in captured] == ["Writing DataFrame"]
def test_write_csv():
    """write_df_as_csv produces the same CSV for any index arrangement of the
    input, logging an index fix whenever the index is not ["fips", "date"]."""
    df = pd.DataFrame({
        CommonFields.DATE: ["2020-04-01", "2020-04-02"],
        CommonFields.FIPS: ["06045", "45123"],
        CommonFields.CASES: [234, 456],
    })
    df_original = df.copy()
    expected_csv = (
        "fips,date,cases\n"
        "06045,2020-04-01,234\n"
        "45123,2020-04-02,456\n"
    )

    def write_and_check(frame, expected_events):
        # Write `frame` and verify both the file contents and the log events.
        with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as captured:
            write_df_as_csv(frame, tmp.path, structlog.get_logger())
            assert tmp.file.read() == expected_csv
            assert [record["event"] for record in captured] == expected_events

    # Index already set to ["fips", "date"], the expected normal index: no fixing needed.
    write_and_check(df.set_index(["fips", "date"]), ["Writing DataFrame"])
    # Other index arrangements are repaired first, but the written output is identical.
    write_and_check(df, ["Fixing DataFrame index", "Writing DataFrame"])
    write_and_check(df.set_index(["date", "cases"]), ["Fixing DataFrame index", "Writing DataFrame"])
    write_and_check(df.set_index(["date", "fips"]), ["Fixing DataFrame index", "Writing DataFrame"])
    # The caller's frame must not have been mutated by any of the writes.
    assert repr(df) == repr(df_original)
*sorted(rename.items(), key=lambda f_c: common_order[f_c[1]])) # Copy only columns in `rename.keys()` to a new DataFrame and rename. data = data.loc[:, list(names_in)].rename(columns=rename) if col_not_in_fields_or_common: self.log.warning("Removing columns not in CommonFields", columns=col_not_in_fields_or_common) return data def remove_duplicate_city_data(data): # City data before 3-23 was not duplicated, copy the city name to the county field. select_pre_march_23 = data.date < "2020-03-23" data.loc[select_pre_march_23, "county"] = data.loc[select_pre_march_23].apply( fill_missing_county_with_city, axis=1) # Don't want to return city data because it's duplicated in county return data.loc[select_pre_march_23 | ((~select_pre_march_23) & data["city"].isnull())].copy() if __name__ == "__main__": common_init.configure_logging() log = structlog.get_logger() transformer = CovidDataScraperTransformer.make_with_data_root(DATA_ROOT) write_df_as_csv( only_common_columns(transformer.transform(), log), DATA_ROOT / "cases-cds" / "timeseries-common.csv", log, )