Beispiel #1
0
    def transform(self) -> pd.DataFrame:
        """Map every county named in the scraped TSA text to its FIPS code.

        The text at ``self.raw_tsa_scraped_path`` is processed line by line.
        Each non-blank line must contain ``Area <LETTER> - <county>, <county>, ...``
        followed by ``;`` or ``.``; the letter becomes the ``tsa_region`` of every
        county on that line.  County names get a " County" suffix before being
        looked up in the census data loaded from ``self.county_fips_csv``.

        Returns:
            A DataFrame with one row per county and the columns ``fips``,
            ``state`` (always "TX") and ``tsa_region``.

        Raises:
            ValueError: If a non-blank line does not match the expected layout.
            CountyNotFoundInCensusData: If the census lookup for a county
                returns a falsy result.
        """
        state = "TX"

        # Spellings in the scraped text that differ from the census spelling.
        # TODO(chris): Find better way match county to fips.  I believe there are some
        # python packages that do a lot of the heavy lifting.
        census_spelling = {"Raines": "Rains", "Dewitt": "DeWitt"}

        # Built once up front instead of on every line of the input.
        area_pattern = re.compile(r".+Area ([A-Z]) - (.*)[;.]")

        tsa_regions = self.raw_tsa_scraped_path.read_text()
        census_data = census_data_helpers.load_county_fips_data(self.county_fips_csv)

        data = []
        for line in tsa_regions.split("\n"):
            # Whitespace-only lines (e.g. a lone "\r" from CRLF input) carry no data.
            if not line.strip():
                continue

            match = area_pattern.match(line)
            if match is None:
                # Report the offending text instead of failing with an
                # AttributeError on ``None.groups()``.
                raise ValueError(f"Unrecognized TSA region line: {line!r}")

            area, county_names = match.groups()
            for county in county_names.split(", "):
                county = census_spelling.get(county, county) + " County"
                county_data = census_data.get_county_data(state, county)
                if not county_data:
                    raise CountyNotFoundInCensusData()

                data.append({"fips": county_data["fips"], "state": state, "tsa_region": area})

        return pd.DataFrame(data)
 def update(self):
     """Build county-level hospitalization data from the TSA-level inputs.

     Reads the hospitalizations-by-TSA CSV, the county census data and the
     TSA-to-FIPS CSV, hands them to
     ``build_hospitalizations_spread_by_population`` and tags the result
     with the constant aggregate level "county" and country "USA".
     """

     def _read_csv_with_str_fips(csv_path):
         # Read the FIPS column as text rather than letting pandas infer its type.
         return pd.read_csv(csv_path, dtype={Fields.FIPS: str})

     tsa_hospitalizations = _read_csv_with_str_fips(self.hospitalizations_by_tsa_csv)
     county_census = census_data_helpers.load_county_fips_data(self.county_fips_csv)
     fips_by_tsa = _read_csv_with_str_fips(self.tsa_to_fips_csv)

     result = build_hospitalizations_spread_by_population(
         tsa_hospitalizations, county_census.data, fips_by_tsa
     )

     # Every row produced here is a US county.
     constant_columns = (
         (CommonFields.AGGREGATE_LEVEL, "county"),
         (CommonFields.COUNTRY, "USA"),
     )
     for column, constant in constant_columns:
         result[column] = constant
     return result
def update(data_url: str):
    """Load HHS hospital bed data and reshape it to one row per location and date.

    The parquet file at ``data_url`` is reduced to provider "hhs" rows for a
    fixed set of bed variables (unit "beds", measurements "current" and
    "rolling_average_7_day", age/race/sex all "all"), pivoted so each variable
    becomes a column, renamed to common field names, annotated with county,
    state, aggregate level and country metadata, and finally passed through
    ``filter_early_data``.

    Returns:
        The value returned by ``filter_early_data`` for the combined county
        and state rows.
    """
    # TODO(tom): Switch to ccd_helpers. See
    #  https://github.com/covid-projections/covid-data-public/pull/196
    raw = pd.read_parquet(data_url)

    wanted_variables = [
        "adult_icu_beds_capacity",
        "adult_icu_beds_in_use",
        "hospital_beds_capacity",
        "hospital_beds_in_use",
        "adult_icu_beds_in_use_covid",
        "hospital_beds_in_use_covid",
    ]
    wanted_measurements = ["current", "rolling_average_7_day"]
    wanted_unit = "beds"

    # A row is kept only when every one of these conditions holds.
    row_filters = [
        raw[Fields.PROVIDER] == "hhs",
        raw[Fields.VARIABLE_NAME].isin(wanted_variables),
        raw[Fields.MEASUREMENT].isin(wanted_measurements),
        raw[Fields.UNIT] == wanted_unit,
        raw[Fields.AGE] == "all",
        raw[Fields.RACE] == "all",
        raw[Fields.SEX] == "all",
    ]
    keep = row_filters[0]
    for row_filter in row_filters[1:]:
        keep = keep & row_filter

    hospital_rows = raw.loc[keep].copy()

    # Derive the FIPS column from the integer location, then drop everything
    # that is not needed for the pivot.
    hospital_rows[Fields.FIPS] = helpers.fips_from_int(hospital_rows[Fields.LOCATION])
    kept_columns = [Fields.FIPS, Fields.DATE, Fields.VARIABLE_NAME, Fields.VALUE]
    long_df = hospital_rows[kept_columns]

    # Long -> wide: one column per variable name.  No aggfunc is given, so rows
    # sharing the same fips/date/variable are combined with pandas' default
    # aggregation (the mean).
    pivoted = long_df.pivot_table(
        index=[Fields.FIPS.value, Fields.DATE.value],
        columns=Fields.VARIABLE_NAME.value,
        values=Fields.VALUE.value,
    )
    wide = helpers.rename_fields(pivoted.reset_index(), Fields, set(), _logger)

    # Five-character FIPS codes are treated as counties, two-character as states.
    fips_length = wide[Fields.FIPS].str.len()
    county_rows = wide.loc[fips_length == 5].copy()
    state_rows = wide.loc[fips_length == 2].copy()

    # County metadata comes from the census table, looked up by FIPS.
    census_table = census_data_helpers.load_county_fips_data(COUNTY_DATA_PATH).data
    census_by_fips = census_table.set_index(CommonFields.FIPS)
    for field in (CommonFields.COUNTY, CommonFields.STATE):
        county_rows[field] = county_rows[Fields.FIPS].map(census_by_fips[field])
    county_rows[CommonFields.AGGREGATE_LEVEL] = "county"

    def _state_abbr(fips):
        # Resolve a state FIPS code to its abbreviation via the ``us`` package.
        return us.states.lookup(fips).abbr

    state_rows[CommonFields.STATE] = state_rows[Fields.FIPS].apply(_state_abbr)
    state_rows[CommonFields.AGGREGATE_LEVEL] = "state"

    # Stack counties and states again and stamp the country on every row.
    combined = pd.concat([county_rows, state_rows])
    combined[CommonFields.COUNTRY] = "USA"

    return filter_early_data(combined)