def test_build_timeseries():
    """For a shared fips/date row, the last source listed for a field wins."""
    datasets = {
        "source_a": read_csv_and_index_fips_date(
            "county,state,fips,country,aggregate_level,date,cases\n"
            "Jones County,ZZ,97123,USA,county,2020-04-01,1\n"
        ),
        "source_b": read_csv_and_index_fips_date(
            "county,state,fips,country,aggregate_level,date,cases\n"
            "Jones County,ZZ,97123,USA,county,2020-04-01,2\n"
        ),
    }
    row_key = ("97123", "2020-04-01")

    # Each case pairs a source priority list with the `cases` value expected
    # for the single overlapping row.
    scenarios = (
        (["source_a", "source_b"], 2),
        (["source_b", "source_a"], 1),
    )
    for priority, expected_cases in scenarios:
        combined = _build_dataframe({"cases": priority}, datasets)
        assert combined.at[row_key, "cases"] == expected_cases
def test_fill_fields_with_data_source_add_column():
    """Combining works when one source lacks a requested column entirely.

    The "existing" frame has no `current_icu` column; building the combined
    frame must not crash, and rows from both sources appear in the result.
    """
    datasets = {
        "existing": read_csv_and_index_fips(
            "fips,state,aggregate_level,county,preserved\n"
            "55005,ZZ,county,North County,ab\n"
            "55,ZZ,state,Grand State,cd\n",
        ),
        "new": read_csv_and_index_fips(
            "fips,state,aggregate_level,county,current_icu\n"
            "55007,ZZ,county,West County,28\n"
            "55,ZZ,state,Grand State,64\n",
        ),
    }
    field_sources = {"current_icu": ["new"], "preserved": ["existing"]}

    merged = _build_dataframe(field_sources, datasets, Override.BY_ROW)

    want = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55005,ZZ,county,North County,,ab\n"
        "55007,ZZ,county,West County,28,\n"
        "55,ZZ,state,Grand State,64,cd\n",
    )
    assert to_dict(["fips"], merged) == to_dict(["fips"], want)
def test_fill_fields_with_data_source():
    """With BY_ROW, `current_icu` rows from "new" replace or extend "existing".

    `preserved` is taken only from "existing", so the row that appears solely
    in "new" (55007) ends up with an empty `preserved` value.
    """
    datasets = {
        "existing": read_csv_and_index_fips(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55005,ZZ,county,North County,43,ab\n"
            "55006,ZZ,county,South County,,cd\n"
            "55,ZZ,state,Grand State,46,ef\n"
        ),
        "new": read_csv_and_index_fips(
            "fips,state,aggregate_level,county,current_icu\n"
            "55006,ZZ,county,South County,27\n"
            "55007,ZZ,county,West County,28\n"
            "55,ZZ,state,Grand State,64\n"
        ),
    }
    field_sources = {
        "current_icu": ["existing", "new"],
        "preserved": ["existing"],
    }

    merged = _build_dataframe(field_sources, datasets, Override.BY_ROW)

    want = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55005,ZZ,county,North County,43,ab\n"
        "55006,ZZ,county,South County,27,cd\n"
        "55007,ZZ,county,West County,28,\n"
        "55,ZZ,state,Grand State,64,ef\n"
    )
    assert to_dict(["fips"], merged) == to_dict(["fips"], want)
def test_build_latest():
    """For fips-indexed frames, the last listed source wins on shared rows.

    A row present in only one source (97333) keeps its value regardless of
    the priority order.
    """
    datasets = {
        "source_a": read_csv_and_index_fips(
            "county,state,fips,country,aggregate_level,date,cases\n"
            "Jones County,ZZ,97123,USA,county,2020-04-01,1\n"
            "Three County,XY,97333,USA,county,2020-04-01,3\n"
        ),
        "source_b": read_csv_and_index_fips(
            "county,state,fips,country,aggregate_level,date,cases\n"
            "Jones County,ZZ,97123,USA,county,2020-04-01,2\n"
        ),
    }

    # (priority order, expected cases for 97123); 97333 is always 3.
    scenarios = (
        (["source_a", "source_b"], 2),
        (["source_b", "source_a"], 1),
    )
    for priority, jones_cases in scenarios:
        combined = _build_dataframe({"cases": priority}, datasets)
        assert combined.at["97123", "cases"] == jones_cases
        assert combined.at["97333", "cases"] == 3
def test_fill_fields_with_data_source_nan_overwrite():
    """With BY_ROW, an empty value in the later source overwrites a real one.

    "existing" has current_icu=46 for fips 55 and "new" has it blank; the
    expected result has the blank value, while `preserved` is untouched.
    """
    datasets = {
        "existing": read_csv_and_index_fips(
            "fips,state,aggregate_level,county,current_icu,preserved\n"
            "55,ZZ,state,Grand State,46,ef\n"
        ),
        "new": read_csv_and_index_fips(
            "fips,state,aggregate_level,county,current_icu\n"
            "55,ZZ,state,Grand State,\n"
        ),
    }
    field_sources = {
        "current_icu": ["existing", "new"],
        "preserved": ["existing"],
    }

    merged = _build_dataframe(field_sources, datasets, Override.BY_ROW)

    want = read_csv_and_index_fips(
        "fips,state,aggregate_level,county,current_icu,preserved\n"
        "55,ZZ,state,Grand State,,ef\n"
    )
    assert to_dict(["fips"], merged) == to_dict(["fips"], want)
def test_fill_fields_with_data_source_timeseries():
    """Timeseries rows from "existing" and "new" are merged with BY_ROW.

    `cnt` is drawn from both sources ("new" listed last), `foo` only from
    "existing"; fips/date rows that exist only in "new" get an empty `foo`.
    """
    datasets = {
        "existing": read_csv_and_index_fips_date(
            "fips,state,aggregate_level,county,cnt,date,foo\n"
            "55005,ZZ,county,North County,1,2020-05-01,ab\n"
            "55005,ZZ,county,North County,2,2020-05-02,cd\n"
            "55005,ZZ,county,North County,,2020-05-03,ef\n"
            "55006,ZZ,county,South County,4,2020-05-04,gh\n"
            "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
            "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
        ),
        "new": read_csv_and_index_fips_date(
            "fips,state,aggregate_level,county,cnt,date\n"
            "55006,ZZ,county,South County,44,2020-05-04\n"
            "55007,ZZ,county,West County,28,2020-05-03\n"
            "55005,ZZ,county,North County,3,2020-05-03\n"
            "55,ZZ,state,Grand State,42,2020-05-02\n"
        ),
    }
    field_sources = {"cnt": ["existing", "new"], "foo": ["existing"]}

    merged = _build_dataframe(field_sources, datasets, Override.BY_ROW)

    want = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,cnt,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,ab\n"
        "55005,ZZ,county,North County,2,2020-05-02,cd\n"
        "55005,ZZ,county,North County,3,2020-05-03,ef\n"
        "55006,ZZ,county,South County,44,2020-05-04,gh\n"
        "55007,ZZ,county,West County,28,2020-05-03,\n"
        "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
        "55,ZZ,state,Grand State,42,2020-05-02,\n"
        "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
    )
    assert to_dict(["fips", "date"], merged) == to_dict(["fips", "date"], want)
def test_build_timeseries_override():
    """Compare the three Override modes on the m1 timeseries for fips 97123.

    Both priority orders of source_a / source_b are checked against
    BY_TIMESERIES, BY_TIMESERIES_POINT and BY_ROW.
    """
    datasets = {
        "source_a": read_csv_and_index_fips_date(
            "fips,date,m1,m2\n"
            "97123,2020-04-01,1,\n"
            "97123,2020-04-02,,\n"
            "97123,2020-04-03,3,3"
        ),
        "source_b": read_csv_and_index_fips_date(
            "fips,date,m1,m2\n"
            "97123,2020-04-01,,\n"
            "97123,2020-04-02,2,\n"
        ),
    }

    def m1_values(priority, mode):
        # Build the combined frame and return m1 for 97123 as a plain list,
        # with NaN mapped to None so it compares cleanly against literals.
        combined = _build_dataframe({"m1": priority}, datasets, override=mode)
        return combined.loc["97123", "m1"].replace({np.nan: None}).tolist()

    # The combined m1 timeseries is copied from the timeseries in source_b;
    # source_a is not used for m1.
    assert m1_values(["source_a", "source_b"], Override.BY_TIMESERIES) == [None, 2, None]

    # The combined m1 timeseries is the highest priority real value for each
    # date, a blend of source_a and source_b.
    assert m1_values(["source_a", "source_b"], Override.BY_TIMESERIES_POINT) == [1, 2, 3]

    # The combined m1 timeseries is the highest priority value for each date;
    # source_b is higher priority for both 2020-04-01 and 2020-04-02.
    assert m1_values(["source_a", "source_b"], Override.BY_ROW) == [None, 2, 3]

    # Same three modes with the priority order reversed (source_a listed last).
    assert m1_values(["source_b", "source_a"], Override.BY_TIMESERIES) == [1, None, 3]
    assert m1_values(["source_b", "source_a"], Override.BY_TIMESERIES_POINT) == [1, 2, 3]
    assert m1_values(["source_b", "source_a"], Override.BY_ROW) == [1, None, 3]