Python examples of libs.datasets.combined_datasets._build_dataframe

Example #1

0

Show file

def test_build_timeseries():
    """Verify which source supplies a timeseries value when two sources overlap.

    Both sources report `cases` for the same (fips, date) row; the assertions
    expect the value of whichever source is listed last for the field.
    """
    source_a_rows = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,1\n"
    )
    source_b_rows = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,2\n"
    )
    sources = {"source_a": source_a_rows, "source_b": source_b_rows}
    row_key = ("97123", "2020-04-01")

    # source_b is listed last, so its value (2) is expected.
    merged = _build_dataframe({"cases": ["source_a", "source_b"]}, sources)
    assert merged.at[row_key, "cases"] == 2

    # With the order flipped, source_a's value (1) is expected.
    merged = _build_dataframe({"cases": ["source_b", "source_a"]}, sources)
    assert merged.at[row_key, "cases"] == 1

Example #2

0

Show file

def test_fill_fields_with_data_source_add_column():
    """Combine a field that one of the datasets does not have at all.

    The `existing` dataset has no `current_icu` column; building the combined
    frame must not crash, and rows lacking a value for a field are left blank.
    """
    without_icu = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,preserved\n"
        "55005,ZZ,county,North County,ab\n"
        "55,ZZ,state,Grand State,cd\n"
    )
    with_icu = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu\n"
        "55007,ZZ,county,West County,28\n"
        "55,ZZ,state,Grand State,64\n"
    )
    sources = {"existing": without_icu, "new": with_icu}

    # Each field is drawn from exactly one source.
    field_sources = {"current_icu": ["new"], "preserved": ["existing"]}
    combined = _build_dataframe(field_sources, sources, Override.BY_ROW)

    wanted = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55005,ZZ,county,North County,,ab\n"
        "55007,ZZ,county,West County,28,\n"
        "55,ZZ,state,Grand State,64,cd\n"
    )
    actual_records = to_dict(["fips"], combined)
    wanted_records = to_dict(["fips"], wanted)
    assert actual_records == wanted_records

Example #3

0

Show file

def test_fill_fields_with_data_source():
    """Merge `current_icu` from two sources by row while keeping `preserved`.

    Expected outcome: values from `new` (listed last) replace or fill the ones
    in `existing`, rows present in only one source are kept, and `preserved`
    comes solely from `existing`.
    """
    base_rows = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55005,ZZ,county,North County,43,ab\n"
        "55006,ZZ,county,South County,,cd\n"
        "55,ZZ,state,Grand State,46,ef\n"
    )
    update_rows = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu\n"
        "55006,ZZ,county,South County,27\n"
        "55007,ZZ,county,West County,28\n"
        "55,ZZ,state,Grand State,64\n"
    )
    sources = {"existing": base_rows, "new": update_rows}

    field_sources = {
        "current_icu": ["existing", "new"],
        "preserved": ["existing"],
    }
    combined = _build_dataframe(field_sources, sources, Override.BY_ROW)

    wanted = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55005,ZZ,county,North County,43,ab\n"
        "55006,ZZ,county,South County,27,cd\n"
        "55007,ZZ,county,West County,28,\n"
        "55,ZZ,state,Grand State,64,ef\n"
    )
    actual_records = to_dict(["fips"], combined)
    wanted_records = to_dict(["fips"], wanted)
    assert actual_records == wanted_records

Example #4

0

Show file

def test_build_latest():
    """Verify source priority for frames indexed by fips only.

    County 97123 appears in both sources and takes the value of the source
    listed last; county 97333 appears only in source_a and keeps its value in
    either ordering.
    """
    source_a_rows = read_csv_and_index_fips(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,1\n"
        "Three County,XY,97333,USA,county,2020-04-01,3\n"
    )
    source_b_rows = read_csv_and_index_fips(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,2\n"
    )
    sources = {"source_a": source_a_rows, "source_b": source_b_rows}

    # source_b listed last: its value is expected for the shared county.
    merged = _build_dataframe({"cases": ["source_a", "source_b"]}, sources)
    assert merged.at["97123", "cases"] == 2
    assert merged.at["97333", "cases"] == 3

    # source_a listed last: its value is expected for the shared county.
    merged = _build_dataframe({"cases": ["source_b", "source_a"]}, sources)
    assert merged.at["97123", "cases"] == 1
    assert merged.at["97333", "cases"] == 3

Example #5

0

Show file

def test_fill_fields_with_data_source_nan_overwrite():
    """Check that a blank value in the later source overwrites a real one by row.

    `new` has a row for fips 55 with an empty `current_icu`; with
    Override.BY_ROW the expected combined value is empty rather than the 46
    found in `existing`.
    """
    base_rows = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55,ZZ,state,Grand State,46,ef\n"
    )
    update_rows = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu\n"
        "55,ZZ,state,Grand State,\n"
    )
    sources = {"existing": base_rows, "new": update_rows}

    field_sources = {
        "current_icu": ["existing", "new"],
        "preserved": ["existing"],
    }
    combined = _build_dataframe(field_sources, sources, Override.BY_ROW)

    wanted = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55,ZZ,state,Grand State,,ef\n"
    )
    actual_records = to_dict(["fips"], combined)
    wanted_records = to_dict(["fips"], wanted)
    assert actual_records == wanted_records

Example #6

0

Show file

def test_fill_fields_with_data_source_timeseries():
    """Merge two (fips, date)-indexed timeseries datasets row by row.

    `cnt` is taken from `existing` then `new`, `foo` from `existing` only.
    The expected frame holds every (fips, date) row found in either source,
    with `new` supplying `cnt` wherever it has a row.
    """
    base_rows = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,ab\n"
        "55005,ZZ,county,North County,2,2020-05-02,cd\n"
        "55005,ZZ,county,North County,,2020-05-03,ef\n"
        "55006,ZZ,county,South County,4,2020-05-04,gh\n"
        "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
        "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
    )
    update_rows = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date\n"
        "55006,ZZ,county,South County,44,2020-05-04\n"
        "55007,ZZ,county,West County,28,2020-05-03\n"
        "55005,ZZ,county,North County,3,2020-05-03\n"
        "55,ZZ,state,Grand State,42,2020-05-02\n"
    )
    sources = {"existing": base_rows, "new": update_rows}

    field_sources = {"cnt": ["existing", "new"], "foo": ["existing"]}
    combined = _build_dataframe(field_sources, sources, Override.BY_ROW)

    wanted = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,ab\n"
        "55005,ZZ,county,North County,2,2020-05-02,cd\n"
        "55005,ZZ,county,North County,3,2020-05-03,ef\n"
        "55006,ZZ,county,South County,44,2020-05-04,gh\n"
        "55007,ZZ,county,West County,28,2020-05-03,\n"
        "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
        "55,ZZ,state,Grand State,42,2020-05-02,\n"
        "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
    )
    index_columns = ["fips", "date"]
    actual_records = to_dict(["fips", "date"], combined)
    wanted_records = to_dict(index_columns, wanted)
    assert actual_records == wanted_records

Example #7

0

Show file

def test_build_timeseries_override():
    """Compare the three Override modes on one county's `m1` timeseries.

    source_a has m1 values on 2020-04-01 and 2020-04-03 (blank on 04-02);
    source_b has a value only on 2020-04-02 (blank on 04-01, no 04-03 row).
    Each mode is checked with both priority orderings.
    """
    source_a_rows = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n"
        "97123,2020-04-01,1,\n"
        "97123,2020-04-02,,\n"
        "97123,2020-04-03,3,3"
    )
    source_b_rows = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n"
        "97123,2020-04-01,,\n"
        "97123,2020-04-02,2,\n"
    )
    sources = {"source_a": source_a_rows, "source_b": source_b_rows}

    def m1_values(priority, mode):
        # Build the combined frame and return m1 for fips 97123, NaN shown as None.
        merged = _build_dataframe({"m1": priority}, sources, override=mode)
        return merged.loc["97123", "m1"].replace({np.nan: None}).tolist()

    # Whole-timeseries override: m1 is copied from source_b alone, so none of
    # source_a's values appear.
    assert m1_values(["source_a", "source_b"], Override.BY_TIMESERIES) == [None, 2, None]

    # Per-point override: each date gets the highest priority real value,
    # blending source_a and source_b.
    assert m1_values(["source_a", "source_b"], Override.BY_TIMESERIES_POINT) == [1, 2, 3]

    # Per-row override: each date gets the highest priority row's value, blank
    # or not; source_b has rows for 2020-04-01 and 2020-04-02.
    assert m1_values(["source_a", "source_b"], Override.BY_ROW) == [None, 2, 3]

    # Same three modes with source_a listed last.
    assert m1_values(["source_b", "source_a"], Override.BY_TIMESERIES) == [1, None, 3]

    assert m1_values(["source_b", "source_a"], Override.BY_TIMESERIES_POINT) == [1, 2, 3]

    assert m1_values(["source_b", "source_a"], Override.BY_ROW) == [1, None, 3]