Beispiel #1
0
def test_write_csv_empty():
    """An empty DataFrame with only the common columns writes just a header row."""
    empty = pd.DataFrame(
        [], columns=[CommonFields.DATE, CommonFields.FIPS, CommonFields.CASES]
    )
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        write_df_as_csv(empty, tmp.path, structlog.get_logger())
        assert tmp.file.read() == "fips,date,cases\n"
    # The unindexed input triggers an index fix-up before the write is logged.
    events = [entry["event"] for entry in logs]
    assert events == ["Fixing DataFrame index", "Writing DataFrame"]
Beispiel #2
0
def test_remove_index_column():
    """A column literally named 'index' is dropped before the CSV is written."""
    frame = pd.DataFrame(
        [("99", "2020-04-01", "a", 123)],
        columns=["fips", "date", "index", "cases"],
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        write_df_as_csv(frame, tmp.path, structlog.get_logger())
        assert tmp.file.read() == "fips,date,cases\n99,2020-04-01,123\n"

    events = [entry["event"] for entry in logs]
    assert events == ["Dropping column named 'index'", "Writing DataFrame"]
Beispiel #3
0
def test_write_csv_columns_are_sorted_in_output_with_extras():
    """Output puts the common index/fields first, then extra columns in sorted order."""
    frame = pd.DataFrame(
        [],
        columns=[
            CommonFields.DATE,
            CommonFields.FIPS,
            "extra2",
            CommonFields.CASES,
            "extra1",
        ],
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        logger = structlog.get_logger()
        write_df_as_csv(frame, tmp.path, logger)
        # extra2 appeared before extra1 in the input but is written after it.
        assert tmp.file.read() == "fips,date,cases,extra1,extra2\n"
    assert [entry["event"] for entry in logs] == ["Writing DataFrame"]
Beispiel #4
0
def main(replace_local_mirror: bool):
    """Transform every AWS data-lake source into a per-source timeseries CSV.

    Args:
        replace_local_mirror: when True, refresh the local mirror of the data
            lake before transforming.
    """
    common_init.configure_logging()

    copier = AwsDataLakeCopier.make_with_data_root(DATA_ROOT)
    if replace_local_mirror:
        copier.replace_local_mirror()

    transformer = AwsDataLakeTransformer.make_with_data_root(DATA_ROOT)
    for source_name, source_files in copier.get_sources():
        # Bind the source name so every log line identifies its source.
        source_log = structlog.get_logger(source_name=source_name)
        output_path = DATA_ROOT / "aws-lake" / f"timeseries-{source_name}.csv"
        transformed = transformer.transform(source_files, source_log)
        write_df_as_csv(transformed, output_path, source_log)
Beispiel #5
0
def test_write_csv_extra_columns_dropped():
    """only_common_columns strips columns outside CommonFields before writing."""
    frame = pd.DataFrame(
        [],
        columns=[
            CommonFields.DATE,
            CommonFields.FIPS,
            "extra1",
            CommonFields.CASES,
            "extra2",
        ],
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)
    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        logger = structlog.get_logger()
        write_df_as_csv(only_common_columns(frame, logger), tmp.path, logger)
        assert tmp.file.read() == "fips,date,cases\n"
    assert [entry["event"] for entry in logs] == [
        "Dropping columns not in CommonFields",
        "Writing DataFrame",
    ]
Beispiel #6
0
def test_float_na_formatting():
    """NA/None metric values serialize as empty fields; 1.0 is written as 1."""
    frame = pd.DataFrame(
        [
            ("99", "2020-04-01", 1.0, 2, 3),
            ("99", "2020-04-02", pd.NA, pd.NA, None),
        ],
        columns="fips date metric_a metric_b metric_c".split(),
    ).set_index(COMMON_FIELDS_TIMESERIES_KEYS)

    expected_csv = """fips,date,metric_a,metric_b,metric_c
99,2020-04-01,1,2,3
99,2020-04-02,,,
"""

    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        write_df_as_csv(frame, tmp.path, structlog.get_logger())
        assert tmp.file.read() == expected_csv

    assert [entry["event"] for entry in logs] == ["Writing DataFrame"]
Beispiel #7
0
def test_float_formatting():
    """Whole-number floats round-trip as ints; fractional values keep their digits."""
    input_csv = """fips,date,col_1,col_2,col_3,col_4,col_5,col_6
99123,2020-04-01,1,2.0000000,3,0.0004,0.00005,6000000000
99123,2020-04-02,,,,,,
99123,2020-04-03,1,2,3.1234567,4,5,6.0
"""
    frame = read_csv_to_indexed_df(StringIO(input_csv))

    # 2.0000000 and 6.0 collapse to integers; 0.00005 is rendered as 5e-05.
    expected_csv = """fips,date,col_1,col_2,col_3,col_4,col_5,col_6
99123,2020-04-01,1,2,3,0.0004,5e-05,6000000000
99123,2020-04-02,,,,,,
99123,2020-04-03,1,2,3.1234567,4,5,6
"""

    with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
        write_df_as_csv(frame, tmp.path, structlog.get_logger())
        assert tmp.file.read() == expected_csv

    assert [entry["event"] for entry in logs] == ["Writing DataFrame"]
Beispiel #8
0
def test_write_csv():
    """The same CSV is produced regardless of the input DataFrame's index."""
    frame = pd.DataFrame({
        CommonFields.DATE: ["2020-04-01", "2020-04-02"],
        CommonFields.FIPS: ["06045", "45123"],
        CommonFields.CASES: [234, 456],
    })
    frame_before = frame.copy()
    expected_csv = """fips,date,cases
06045,2020-04-01,234
45123,2020-04-02,456
"""

    def write_and_get_events(df_to_write):
        # Write `df_to_write`, verify the CSV contents, and return the logged events.
        with temppathlib.NamedTemporaryFile("w+") as tmp, structlog.testing.capture_logs() as logs:
            write_df_as_csv(df_to_write, tmp.path, structlog.get_logger())
            assert tmp.file.read() == expected_csv
        return [entry["event"] for entry in logs]

    # With the expected normal ["fips", "date"] index no fix-up is logged.
    assert write_and_get_events(frame.set_index(["fips", "date"])) == [
        "Writing DataFrame"
    ]

    # Any other index (including the default RangeIndex) is fixed before writing,
    # and the file contents are identical.
    for bad_index in (None, ["date", "cases"], ["date", "fips"]):
        candidate = frame if bad_index is None else frame.set_index(bad_index)
        assert write_and_get_events(candidate) == [
            "Fixing DataFrame index",
            "Writing DataFrame",
        ]

    # The input DataFrame must not have been mutated by any of the writes.
    assert repr(frame) == repr(frame_before)
            *sorted(rename.items(), key=lambda f_c: common_order[f_c[1]]))
        # Copy only columns in `rename.keys()` to a new DataFrame and rename.
        data = data.loc[:, list(names_in)].rename(columns=rename)
        if col_not_in_fields_or_common:
            self.log.warning("Removing columns not in CommonFields",
                             columns=col_not_in_fields_or_common)

        return data


def remove_duplicate_city_data(data):
    """Fold pre-2020-03-23 city rows into counties and drop duplicated city rows.

    City data before 2020-03-23 was not duplicated into county rows, so for
    those rows the city name is copied into the missing county field.  From
    2020-03-23 onward city rows duplicate county data and are removed.

    Args:
        data: DataFrame with at least `date`, `county`, and `city` columns.

    Returns:
        A new DataFrame; the input DataFrame is left unmodified.
    """
    # Work on a copy: the original assigned into the caller's DataFrame via
    # `data.loc[...] = ...`, silently mutating it even though a copy was returned.
    data = data.copy()
    select_pre_march_23 = data.date < "2020-03-23"
    data.loc[select_pre_march_23, "county"] = data.loc[select_pre_march_23].apply(
        fill_missing_county_with_city, axis=1
    )
    # City rows on/after 3-23 duplicate county data, so keep only rows that are
    # either pre-3-23 or have no city value.
    return data.loc[
        select_pre_march_23 | ((~select_pre_march_23) & data["city"].isnull())
    ].copy()


if __name__ == "__main__":
    # Script entry point: transform the scraped CDS data and write the
    # common-fields timeseries CSV under the data root.
    common_init.configure_logging()
    logger = structlog.get_logger()
    cds_transformer = CovidDataScraperTransformer.make_with_data_root(DATA_ROOT)
    common_df = only_common_columns(cds_transformer.transform(), logger)
    output_path = DATA_ROOT / "cases-cds" / "timeseries-common.csv"
    write_df_as_csv(common_df, output_path, logger)