def normalize(self, data: pd.DataFrame) -> pd.DataFrame:
        """Normalize HHS facility-level hospital data into long county form.

        NOTE(review): the annotation previously read ``data: str``, but the
        body immediately calls ``data.copy()`` and reads ``data.columns``,
        so the input must already be a DataFrame; annotation corrected.

        Returns one row per (dt, location, category, measurement, unit, age,
        sex, race, ethnicity) with ``vintage``, ``location_type`` (always
        "county") and ``value`` columns.
        """
        # Normalize column names on a copy of the input
        df = data.copy()
        df.columns = [x.lower().strip() for x in df.columns]

        # Set date and fips code
        # NOTE: collection_week refers to the first day of the week, so add 6
        # days to get the last day.
        df.loc[:, "dt"] = pd.to_datetime(
            df["collection_week"]) + timedelta(days=6)

        # Filter out all of the columns without a fips code for now -- I
        # think that it is likely that we could reverse engineer these
        # either by looking them up or by mapping city to county
        df = df.loc[~df["fips_code"].isna(), :]
        # Remap retired/incorrect FIPS codes to their current county codes
        # :see_no_evil:
        df["location"] = (
            df["fips_code"].astype(int).replace({
                # 02120 corresponded to Kenai-Cook Inlet Division... It was
                # then the relevant piece became Kenai Peninsula Borough which
                # is 02122
                # https://data.nber.org/asg/ASG_release/County_City/FIPS/FIPS_Changes.pdf
                2120: 2122,
                # City associated with the hospital is Seward which is in the
                # Kenai Borough which is 02122 but I have no idea how this
                # ended up with fips code 02210???
                # https://en.wikipedia.org/wiki/Seward,_Alaska
                2210: 2122,
                # 02260 was fips code for Valdez-Chitina-Whittier Division... It
                # was then put into Valdez–Cordova Census Area which is
                # 02261, but 02261 was split in Jan 2019 and we'll need to change
                # this again if we update geographies
                # https://data.nber.org/asg/ASG_release/County_City/FIPS/FIPS_Changes.pdf
                2260: 2261,
                # 02280 corresponded to Wrangell-Petersburg but became the
                # Petersburg Borough 02195 in 2012
                # https://www.cdc.gov/nchs/nvss/bridged_race/county_geography-_changes2015.pdf
                2280: 2195,
                # City associated with the hospital is Cordova which is in the
                # Valdez-Cordova census area but I don't know which one this
                # ended up in after the split...
                # https://en.wikipedia.org/wiki/Cordova,_Alaska
                2080: 2261,
                # Source of change: https://www.cdc.gov/nchs/nvss/bridged_race/county_geography-_changes2015.pdf
                # page 6
                # Virginia, 2013: Bedford (independent) city (FIPS 51515) was changed to
                # town status and added to Bedford County (FIPS 51019) effective July 1st, 2013
                51515: 51019,
            }))

        # Set all missing values (-999999) to nan for all numeric columns
        # NOTE(review): the mask below keeps only strictly positive values,
        # so legitimate zero counts are converted to NaN as well -- confirm
        # that zeros are never meaningful in this dataset.
        numeric_cols = list(df.select_dtypes("number"))
        df.loc[:, numeric_cols] = df.loc[:, numeric_cols].where(
            lambda x: x > 0, np.nan)

        # Variables that can be determined with "simple average":
        # each has matching ``{var}_sum`` / ``{var}_coverage`` source columns
        # and the weekly average is simply sum / coverage.
        vars_to_compute_avg = [
            "inpatient_beds_7_day",
            "inpatient_beds_used_7_day",
            "total_staffed_adult_icu_beds_7_day",
            "staffed_adult_icu_bed_occupancy_7_day",
            "staffed_icu_adult_patients_confirmed_covid_7_day",
        ]
        for var in vars_to_compute_avg:
            df.loc[:, f"{var}_canavg"] = df.eval(f"{var}_sum / {var}_coverage")

        # Variables that require "more complicated average"
        aps = "total_adult_patients_hospitalized_confirmed_covid_7_day_sum"
        apc = "total_adult_patients_hospitalized_confirmed_covid_7_day_coverage"
        pps = "total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum"
        ppc = "total_pediatric_patients_hospitalized_confirmed_covid_7_day_coverage"
        temp = df.eval(f"{aps} / {apc}")
        # Do the pediatric sum second so that we keep adult values if they're available
        # (while filling pediatric missing data with 0s) but if adult is missing then
        # it will stay as missing
        temp = temp + df.eval(f"{pps} / {ppc}").fillna(0.0)
        df.loc[:, "inpatient_beds_used_covid_7_day_canavg"] = temp.values

        # Map the computed ``*_canavg`` columns onto CMU variable definitions
        crename = {
            "inpatient_beds_7_day_canavg":
            CMU(
                category="hospital_beds_capacity",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            "inpatient_beds_used_7_day_canavg":
            CMU(
                category="hospital_beds_in_use",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            # This column is generated by summing adult and pediatric
            # beds -- Should be missing if adult is missing (pediatric
            # missing data is filled with 0s above)
            "inpatient_beds_used_covid_7_day_canavg":
            CMU(
                category="hospital_beds_in_use_covid",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            "total_staffed_adult_icu_beds_7_day_canavg":
            CMU(
                category="adult_icu_beds_capacity",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            "staffed_adult_icu_bed_occupancy_7_day_canavg":
            CMU(
                category="adult_icu_beds_in_use",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            "staffed_icu_adult_patients_confirmed_covid_7_day_canavg":
            CMU(
                category="adult_icu_beds_in_use_covid",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
        }

        # Reshape by putting into long form
        df_long = df.melt(id_vars=["dt", "location"],
                          value_vars=crename.keys()).dropna()
        df_long.loc[:, "value"] = pd.to_numeric(
            df_long["value"].astype(str).str.replace(",", ""))

        # Add category, measurement, unit, age, sex, race
        df_long = self.extract_CMU(df_long, crename)

        # Group by relevant factors and sum (facility rows collapse to one
        # county-level row per variable)
        identifier = [
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "sex",
            "race",
            "ethnicity",
        ]

        # TODO: We could do a different groupby and put this into states
        # or hospital regions
        out_county = (df_long.groupby(identifier)["value"].agg(
            pd.Series.sum, skipna=False).reset_index())

        # TODO: Throwing out territories because I don't remember which weren't
        # included in the census data :(
        out_county = out_county.query("location < 60_000").copy()

        # Add vintage
        out_county.loc[:, "vintage"] = self._retrieve_vintage()
        out_county.loc[:, "location_type"] = "county"
        cols_2_keep = identifier + ["vintage", "location_type", "value"]

        return out_county.loc[:, cols_2_keep]
Example #2
0
    def normalize(self, data) -> pd.DataFrame:
        """Build long-form county vaccine supply/administration data.

        ``data`` is a list of parallel tableau-style columns:
            data[0]-data[2]: dose-1 county names, supply, administered
            data[3]-data[5]: dose-2 county names, supply, administered
        """

        def padded_values(entry, column_kind):
            # Pull a value list and insert 0 at every null index.  The
            # null-index list is sorted first so each insertion offset is
            # still valid as the list grows.
            values = entry[column_kind]["values"]
            null_indexes = entry["nullIndex"]
            null_indexes.sort()
            for idx in null_indexes:
                values.insert(idx, 0)
            return values

        # Per-dose frames, keyed by county name
        dose1 = pd.DataFrame({
            "location_name": padded_values(data[0], "stringColumn"),
            "supplyVac1": padded_values(data[1], "doubleColumn"),
            "administeredVac1": padded_values(data[2], "doubleColumn"),
        })
        dose2 = pd.DataFrame({
            "location_name": padded_values(data[3], "stringColumn"),
            "supplyVac2": padded_values(data[4], "doubleColumn"),
            "administeredVac2": padded_values(data[5], "doubleColumn"),
        })

        # One row per county with both doses' columns
        combined = dose1.merge(dose2, on="location_name", how="outer")
        combined["dt"] = self.execution_dt
        combined["totalSupply"] = (
            combined["supplyVac2"] + combined["supplyVac1"])

        crename = {
            "totalSupply":
            CMU(
                category="total_vaccine_allocated",
                measurement="cumulative",
                unit="doses",
            ),
            "administeredVac1":
            CMU(
                category="total_vaccine_initiated",
                measurement="cumulative",
                unit="people",
            ),
            "administeredVac2":
            CMU(
                category="total_vaccine_completed",
                measurement="cumulative",
                unit="people",
            ),
        }

        # Long form, one row per (county, variable)
        out = combined.melt(
            id_vars=["dt", "location_name"], value_vars=crename.keys()
        ).dropna()
        out["value"] = out["value"].astype(int)
        out["vintage"] = self._retrieve_vintage()
        out = self.extract_CMU(out, crename)
        return out.drop(["variable"], axis="columns")
Example #3
0
    def normalize(self, data) -> pd.DataFrame:
        """
        Cleans and normalizes the data we receive from the Google api.

        ``data`` holds parallel tableau-style columns:
            data[0]: dose-1 dates; data[1]/data[2]: dose-1 supply/administered
            data[3]: dose-2 dates; data[4]/data[5]: dose-2 supply/administered

        Returns long-form state-level rows with dt, location, vintage,
        CMU fields and value.
        """

        def _fill_nulls(values, null_indexes):
            # Insert 0 at every null index; sort first so each insertion
            # offset is still valid as the list grows.
            null_indexes.sort()
            for idx in null_indexes:
                values.insert(idx, 0)
            return values

        # creating dataframe for dose 1 state level data
        stateVaccineDataFrameVac1 = pd.DataFrame({
            "dt": data[0]["dateColumn"]["values"],
            "supplyVac1": _fill_nulls(
                data[1]["doubleColumn"]["values"], data[1]["nullIndex"]),
            "administeredVac1": _fill_nulls(
                data[2]["doubleColumn"]["values"], data[2]["nullIndex"]),
        })

        # creating data frame for second dose of vaccines
        stateVaccineDataFrameVac2 = pd.DataFrame({
            "dt": data[3]["dateColumn"]["values"],
            "supplyVac2": _fill_nulls(
                data[4]["doubleColumn"]["values"], data[4]["nullIndex"]),
            "administeredVac2": _fill_nulls(
                data[5]["doubleColumn"]["values"], data[5]["nullIndex"]),
        })

        # merges two data frames together on date
        stateVaccineDataFrame = stateVaccineDataFrameVac1.merge(
            stateVaccineDataFrameVac2, on="dt", how="outer")

        # sums the first dose allocation and the second dose allocations together
        stateVaccineDataFrame["supplyTotal"] = (
            stateVaccineDataFrame["supplyVac2"] +
            stateVaccineDataFrame["supplyVac1"])

        # Cumulative vaccine supply.  ``fillna(0).cumsum()`` reproduces the
        # NaN-skipping behavior of the previous per-row ``.loc[0:x].sum()``
        # loop while replacing its O(n^2) cost with a single O(n) pass.
        stateVaccineDataFrame["supplyCumulative"] = (
            stateVaccineDataFrame["supplyTotal"].fillna(0).cumsum())

        stateVaccineDataFrame["location"] = self.state_fips
        stateVaccineDataFrame["dt"] = pd.to_datetime(
            stateVaccineDataFrame["dt"])
        crename = {
            "supplyCumulative":
            CMU(
                category="total_vaccine_allocated",
                measurement="cumulative",
                unit="doses",
            ),
            "administeredVac1":
            CMU(category="total_vaccine_initiated",
                measurement="new",
                unit="people"),
            "administeredVac2":
            CMU(category="total_vaccine_completed",
                measurement="new",
                unit="people"),
        }
        out = stateVaccineDataFrame.melt(id_vars=["dt", "location"],
                                         value_vars=crename.keys()).dropna()
        out["value"] = out["value"].astype(int)
        out["vintage"] = self._retrieve_vintage()
        out = self.extract_CMU(out, crename)
        return out.drop(["variable"], axis="columns")
Example #4
0
class GeorgiaCountyVaccineAge(GeorgiaCountyVaccine):
    service = "Georgia_DPH_PUBLIC_Vaccination_Dashboard_V5_VIEW"
    sheet = 7
    column_names = ["AGE"]

    # Every age bucket reports the same metric (cumulative people who
    # initiated vaccination); only the age label differs, so the CMU map
    # is generated from (source column, age label) pairs.
    variables = {
        source_col: CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age=age_label,
        )
        for source_col, age_label in [
            # NOTE(review): "00-05" mapping to "0-9" overlaps the "5-9"
            # bucket below -- confirm the upstream label is really 0-9
            ("00-05", "0-9"),
            ("05_09", "5-9"),
            ("10_14", "10-14"),
            ("15_19", "15-19"),
            ("20_24", "20-24"),
            ("25_34", "25-34"),
            ("35_44", "35-44"),
            ("45_54", "45-54"),
            ("55_64", "55-64"),
            ("65_74", "65-74"),
            ("75_84", "75-84"),
            ("85PLUS", "85_plus"),
        ]
    }

    def fetch(self):
        # Sheet "6" holds the age-by-county counts for this service
        return self.get_all_jsons(self.service, self.sheet, "6")

    def normalize(self, data):
        df = self.arcgis_jsons_to_df(data)
        # One row per county, one column per age bucket
        df = (
            df.pivot_table(
                index="COUNTYFIPS", columns=self.column_names, values="COUNTS"
            )
            .reset_index()
            .rename_axis(None, axis=1)
        )
        df = self._rename_or_add_date_and_location(
            df, location_column="COUNTYFIPS", timezone="US/Eastern"
        )
        df = self._reshape_variables(df, self.variables)

        # Drop placeholder locations that don't map to a real county
        locs_to_drop = ["0", "00000", 0]
        return df.query("location not in @locs_to_drop")
Example #5
0
class LAVaccineCountyDemographics(LAVaccineCounty):
    # The dashboard reports each demographic slice as a percentage
    # ("Perc..." columns); normalize() converts them into people counts
    # (the "*_value" columns named here) before reshaping.
    variables = {
        # -- vaccinations initiated, by race --
        "PercInt_Black_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", race="black"),
        "PercInt_White_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", race="white"),
        "PercInt_Other_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", race="other"),
        "PercInt_RaceUnk_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", race="unknown"),
        # -- vaccinations completed, by race --
        "PercComp_Black_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", race="black"),
        "PercComp_Other_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", race="other"),
        "PercComp_RaceUnk_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", race="unknown"),
        "PercComp_White_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", race="white"),
        # -- vaccinations initiated, by age --
        "PercInt_5to17_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", age="5-17"),
        "PercInt_18to29_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", age="18-29"),
        "PercInt_30to39_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", age="30-39"),
        "PercInt_40to49_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", age="40-49"),
        "PercInt_50to59_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", age="50-59"),
        "PercInt_60to69_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", age="60-69"),
        "PercInt_70plus_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", age="70_plus"),
        "PercInt_AgeUnk_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", age="unknown"),
        # -- vaccinations completed, by age (no unknown-age column) --
        "PercComp_5to17_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", age="5-17"),
        "PercComp_18to29_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", age="18-29"),
        "PercComp_30to39_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", age="30-39"),
        "PercComp_40to49_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", age="40-49"),
        "PercComp_50to59_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", age="50-59"),
        "PercComp_60to69_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", age="60-69"),
        "PercComp_70plus_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", age="70_plus"),
        # -- vaccinations initiated, by sex --
        "PercInt_Female_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", sex="female"),
        "PercInt_Male_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", sex="male"),
        "PercInt_SexUnk_value": CMU(
            category="total_vaccine_initiated", measurement="cumulative",
            unit="people", sex="unknown"),
        # -- vaccinations completed, by sex --
        "PercComp_Female_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", sex="female"),
        "PercComp_Male_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", sex="male"),
        "PercComp_SexUnk_value": CMU(
            category="total_vaccine_completed", measurement="cumulative",
            unit="people", sex="unknown"),
    }

    def normalize(self, data):
        df = self.arcgis_jsons_to_df(data)

        # The source reports percentages of the county population; turn each
        # one into a people count using the 2018 population column.
        for value_col in self.variables:
            pct_col = value_col.replace("_value", "")
            df[value_col] = np.floor((df[pct_col] / 100) * df["Total_2018pop"])

        df = self._rename_or_add_date_and_location(
            df, location_column="PFIPS", timezone="US/Eastern"
        )
        return self._reshape_variables(df, self.variables)
Example #6
0
class WisconsinVaccineAge(WisconsinVaccineCounty):
    data_tableau_table = "Age vax/unvax County"
    # age does not report missing/unknown entries
    missing_tableau_table = ""
    location_name_col = "AGG(Geography TT)-alias"
    location_type = "state"

    # map wide form column names into CMUs
    cmus = {
        "SUM(Initiation or completed count for TT)-alias": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        )
    }

    def _get_demographic(
        self, df: pd.DataFrame, demo: str, demo_col_name: str
    ) -> pd.DataFrame:
        """Shared normalizer used by each demographic subclass.

        Args:
            df: raw tableau extract.
            demo: CMU demographic field to fill (age, sex, race, etc...).
            demo_col_name: column of ``df`` holding the demographic labels.

        Returns:
            Normalized data in long format.
        """
        # County names (converted to title case), with known misspellings
        # repaired afterwards.
        df["location_name"] = df[self.location_name_col].str.title()
        name_fixes = {"St Croix": "St. Croix", "Fond Du Lac": "Fond du Lac"}
        df = df.replace({"location_name": name_fixes})

        # Keep only the data columns we have CMU definitions for
        value_cols = list(set(df.columns) & set(self.cmus.keys()))
        assert len(value_cols) == len(self.cmus)

        long_df = df.melt(
            id_vars=[demo_col_name, "location_name"], value_vars=value_cols
        ).dropna()
        long_df["dt"] = self._retrieve_dt(self.timezone)
        long_df["vintage"] = self._retrieve_vintage()
        # Values arrive as strings with thousands separators
        long_df["value"] = pd.to_numeric(
            long_df["value"].astype(str).str.replace(",", "")
        )
        long_df = self.extract_CMU(long_df, cmu=self.cmus)
        long_df[demo] = long_df[demo_col_name]
        return long_df.drop(["variable", demo_col_name], axis=1)

    def fetch(self) -> pd.DataFrame:
        if not self.missing_tableau_table:
            return self.get_tableau_view()[self.data_tableau_table]
        # Pull the data table plus its missing-data companion and stack them
        tables = [self.data_tableau_table, self.missing_tableau_table]
        return pd.concat([self.get_tableau_view().get(t) for t in tables])

    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        out = self._get_demographic(df, "age", "Age-value")
        # Standardize the open-ended bucket label
        return out.replace({"age": {"65+": "65_plus"}})
Example #7
0
    def normalize(self, data) -> pd.DataFrame:
        """Normalize county vaccine-completion percentages by age group.

        ``data`` holds parallel tableau-style columns:
            data[0]/data[1]: county names / fraction complete, all ages
            data[2]/data[3]: county names / fraction complete, 18+
            data[4]/data[5]: county names / fraction complete, 65+

        Returns long-form rows with dt, location_name, age, vintage,
        CMU fields and value.
        """

        def _values_with_null_fill(entry, column_kind):
            # Pull a value list and insert 0 at every null index; the
            # null-index list is sorted first so each insertion offset is
            # still valid as the list grows.
            values = entry[column_kind]["values"]
            null_indexes = entry["nullIndex"]
            null_indexes.sort()
            for idx in null_indexes:
                values.insert(idx, 0)
            return values

        def _age_frame(name_entry, pct_entry, age_label):
            # Build a (location_name, TotalPer, age) frame for one age group
            frame = pd.DataFrame({
                "location_name": _values_with_null_fill(
                    name_entry, "stringColumn"),
                "TotalPer": _values_with_null_fill(
                    pct_entry, "doubleColumn"),
            })
            # Source reports fractions; convert to percentage points
            frame["TotalPer"] = frame["TotalPer"] * 100
            frame["age"] = age_label
            return frame

        # BUG FIX: the 65+ percentages previously had zeros inserted at the
        # *18+* null indexes (copy/paste error -- its own sorted null-index
        # list was never used); each series now pads with its own nulls.
        countyVaccineDataFrame = pd.concat([
            _age_frame(data[0], data[1], "all"),
            _age_frame(data[2], data[3], "18_plus"),
            _age_frame(data[4], data[5], "65_plus"),
        ])

        countyVaccineDataFrame["dt"] = self.execution_dt

        crename = {
            "TotalPer":
            CMU(
                category="total_vaccine_completed",
                measurement="current",
                unit="percentage",
            ),
        }
        out = countyVaccineDataFrame.melt(
            id_vars=["dt", "location_name", "age"],
            value_vars=crename.keys()).dropna()

        # NOTE(review): truncating percentages to int loses precision --
        # presumably intentional, confirm with downstream consumers.
        out["value"] = out["value"].astype(int)
        out["vintage"] = self._retrieve_vintage()
        df = self.extract_CMU(
            out,
            crename,
            ["category", "measurement", "unit", "sex", "race", "ethnicity"],
        )
        return df.drop(["variable"], axis="columns")
 def _make_cmu(cat):
     """Shorthand for a cumulative people-count CMU of the given category."""
     return CMU(category=cat, measurement="cumulative", unit="people")
Example #9
0
    def normalize(self, data) -> pd.DataFrame:
        """Normalize the county/age case-count workbook into long format."""
        # Load the workbook and standardize the column names we keep
        cases = pd.read_excel(data.content, parse_dates=["DATE"])
        cases = cases.rename(columns={
            "DATE": "dt",
            "COUNTY": "location_name",
            "AGE_GROUP": "age",
        })

        # Drop rows we don't track (pending ages, non-resident locations)
        age_not_keep = ["Pending"]
        loc_not_keep = ["Out of State", "Pending"]
        cases = cases.query(
            "(age not in @age_not_keep) & (location_name not in @loc_not_keep)"
        )

        # Fix incorrectly spelled county names, then map ages to CMU labels
        cases = cases.replace({"location_name": {"Dekalb": "DeKalb"}})
        cases = self.translate_age(cases)

        # Columns to reshape and their CMU definitions
        crename = {
            "CASE_COUNT":
            CMU(category="cases", measurement="cumulative", unit="people"),
        }

        # Long form plus the CMU identifier columns
        long_df = cases.melt(
            id_vars=["dt", "location_name", "age"], value_vars=crename.keys()
        ).dropna()
        long_df = self.extract_CMU(
            long_df,
            crename,
            ["category", "measurement", "unit", "race", "ethnicity", "sex"],
        )

        # Restrict to the output schema
        cols_to_keep = [
            "dt",
            "location_name",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]
        out = long_df.loc[:, cols_to_keep]
        out["value"] = out["value"].astype(int)
        out["vintage"] = self._retrieve_vintage()
        return out
Example #10
0
    def normalize(self, data: str) -> pd.DataFrame:
        """Normalize the HHS state-level hospital capacity table.

        Builds ``dt``/``location`` columns, melts the staffing-shortage and
        bed-capacity variables into long form, and attaches CMU metadata.
        """
        df = data.copy()
        # Read the dataframe from the string csv
        df.columns = [x.lower().strip() for x in df.columns]

        # Set date and fips code
        df.loc[:, "dt"] = pd.to_datetime(df["date"])
        df.loc[:, "location"] = df["state"].map(
            lambda x: int(us.states.lookup(x).fips))

        # NOTE: the previous mapping repeated the *_yes keys for the two
        # "noreport" variables; duplicate dict keys overwrite silently, so
        # those variables were never reported. The *_not_reported source
        # columns are used here instead.
        crename = {
            "critical_staffing_shortage_today_yes":
            CMU(
                category="critical_staff_shortage_yes",
                measurement="current",
                unit="hospitals",
            ),
            "critical_staffing_shortage_today_no":
            CMU(
                category="critical_staff_shortage_no",
                measurement="current",
                unit="hospitals",
            ),
            "critical_staffing_shortage_today_not_reported":
            CMU(
                category="critical_staff_shortage_noreport",
                measurement="current",
                unit="hospitals",
            ),
            "critical_staffing_shortage_anticipated_within_week_yes":
            CMU(
                category="critical_staff_shortage_yes",
                measurement="anticipated_within_7_day",
                unit="hospitals",
            ),
            "critical_staffing_shortage_anticipated_within_week_no":
            CMU(
                category="critical_staff_shortage_no",
                measurement="anticipated_within_7_day",
                unit="hospitals",
            ),
            "critical_staffing_shortage_anticipated_within_week_not_reported":
            CMU(
                category="critical_staff_shortage_noreport",
                measurement="anticipated_within_7_day",
                unit="hospitals",
            ),
            "inpatient_beds":
            CMU(category="hospital_beds_capacity",
                measurement="current",
                unit="beds"),
            "inpatient_beds_used":
            CMU(category="hospital_beds_in_use",
                measurement="current",
                unit="beds"),
            "inpatient_beds_used_covid":
            CMU(
                category="hospital_beds_in_use_covid",
                measurement="current",
                unit="beds",
            ),
            "inpatient_beds_utilization":
            CMU(
                category="hospital_beds_in_use",
                measurement="current",
                unit="percentage",
            ),
            "total_staffed_adult_icu_beds":
            CMU(category="adult_icu_beds_capacity",
                measurement="current",
                unit="beds"),
            "staffed_adult_icu_bed_occupancy":
            CMU(category="adult_icu_beds_in_use",
                measurement="current",
                unit="beds"),
            "staffed_icu_adult_patients_confirmed_covid":
            CMU(
                category="adult_icu_beds_in_use_covid",
                measurement="current",
                unit="beds",
            ),
            "adult_icu_bed_covid_utilization":
            CMU(
                category="adult_icu_beds_in_use",
                measurement="current",
                unit="percentage",
            ),
        }

        # Put into long form
        out = df.melt(id_vars=["dt", "location"], value_vars=crename.keys())
        # Strip thousands separators and coerce to numbers. A dict mapping is
        # required for the "nan" replacement: `.replace("nan", None)` takes
        # pandas' scalar-with-None-value path, which forward-fills instead of
        # inserting missing values.
        out.loc[:, "value"] = pd.to_numeric(
            out["value"].astype(str).str.replace(",", "").replace({"nan": None}))

        # Add category, measurement, unit, age, sex, race
        out = self.extract_CMU(out, crename)
        out["vintage"] = self._retrieve_vintage()
        cols_2_keep = [
            "vintage",
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "sex",
            "race",
            "ethnicity",
            "value",
        ]

        # Previously cols_2_keep was built but never applied; subset the
        # output as intended (extract_CMU adds the demographic columns).
        return out.loc[:, cols_2_keep]
    def normalize(self, data: pd.DataFrame) -> pd.DataFrame:
        """Reshape the raw vaccine table into cumulative long-form records.

        Builds a ``variable`` name from vaccine type + dose number, sums over
        facility types, folds Detroit's rows into Wayne county, reindexes to a
        complete date range, and returns cumulative counts per
        (dt, location_name, variable) with CMU metadata attached.
        """
        # date is written out in first column name
        data["variable"] = data["Vaccine Type"] + data["Dose Number"]
        data["variable"] = data["variable"].str.replace(" ", "")

        # Helper: all the per-manufacturer dose counts share
        # cumulative/people semantics
        def _make_cmu(cat):
            return CMU(
                category=cat,
                measurement="cumulative",
                unit="people",
            )

        # Raw column -> standard column names
        colnames = {
            "Person's Residence in County": "location_name",
            "Data as of": "dt",
            "Number of Doses": "value",
        }
        # Pivoted variable name -> CMU definition; also fixes the column
        # order selected by `.loc[:, cmus.keys()]` below
        cmus = {
            "J&JFirstDose":
            _make_cmu("janssen_vaccine_completed"),
            "ModernaFirstDose":
            _make_cmu("moderna_vaccine_initiated"),
            "ModernaSecondDose":
            _make_cmu("moderna_vaccine_completed"),
            "PfizerFirstDose":
            _make_cmu("pfizer_vaccine_initiated"),
            "PfizerSecondDose":
            _make_cmu("pfizer_vaccine_completed"),
            "total_initiated":
            _make_cmu("total_vaccine_initiated"),
            "total_completed":
            _make_cmu("total_vaccine_completed"),
            "total":
            CMU(
                category="total_vaccine_doses_administered",
                measurement="cumulative",
                unit="doses",
            ),
        }
        # Non-county rows dropped by the query below
        not_counties = ["No County", "Non-Michigan Resident"]  # noqa

        # need to sum over all the possible facility types for distribution
        df = (data.rename(
            columns=colnames
        ).loc[:, ["location_name", "dt", "variable", "value"]].query(
            "location_name not in @not_counties"
        ).assign(dt=lambda x: pd.to_datetime(x["dt"])).pivot_table(
            index=["dt", "location_name"],
            columns="variable",
            values="value",
            aggfunc="sum",
        ).fillna(0).astype(int).assign(
            # J&J is single-dose: it counts toward both initiated and completed
            total_initiated=lambda x: x.eval(
                "ModernaFirstDose + PfizerFirstDose") + x["J&JFirstDose"],
            total_completed=lambda x: x.eval(
                "ModernaSecondDose + PfizerSecondDose") + x["J&JFirstDose"],
        ).assign(total=lambda x: x.eval("total_initiated + total_completed"),
                 ).loc[:, cmus.keys()])

        # Detroit data is reported separately from Wayne county. As detroit is not a real
        # county, combine data with Wayne county.
        is_wayne_county = df.index.get_level_values("location_name") == "Wayne"
        is_detroit = df.index.get_level_values("location_name") == "Detroit"

        # Relabel Detroit rows as Wayne so the index-aligned `+=` below works
        renamed_detroit_data = df.loc[is_detroit, :].rename(
            index={"Detroit": "Wayne"}, level="location_name")

        # verify that indices are the same so that when adding data frames
        # no values are dropped
        assert renamed_detroit_data.index.equals(
            df.loc[is_wayne_county, :].index)
        df.loc[is_wayne_county, :] += renamed_detroit_data

        # Drop detroit data
        df = df.loc[~is_detroit, :]

        # now we need to reindex to fill in all dates -- fill missing with 0
        dates = pd.Series(df.index.get_level_values("dt")).agg(["min", "max"])
        new_index = pd.MultiIndex.from_product(
            [
                pd.date_range(*dates),
                df.index.get_level_values("location_name").unique(),
            ],
            names=["dt", "location_name"],
        )

        return (df.reindex(new_index, fill_value=0)  # fill in missing dates
                .sort_index()  # make sure we are sorted
                .unstack(level=[
                    "location_name"
                ])  # make index=dt, columns=[variable,loc_name]
                .cumsum()  # compute cumulative sum
                .stack(level=[0, 1])  # long form Series
                .rename("value")  # name the series
                .reset_index()  # convert to long form df
                .assign(  # fill fix value
                    value=lambda x: pd.to_numeric(x.loc[:, "value"]),
                    vintage=self._retrieve_vintage(),
                ).pipe(self.extract_CMU, cmu=cmus)  # extract CMUs
                .drop(["variable"], axis=1)  # drop variable
                )
Example #12
0
    def normalize(self, data: pd.DataFrame) -> pd.DataFrame:
        """Reshape the state-level testing/hospitalization table to long form.

        Renames fips/date to location/dt, melts the tracked variables,
        attaches CMU metadata, and tags rows with location_type and vintage.
        """
        # Source column -> CMU variable definition
        column_map = {
            "death": CMU(
                category="deaths",
                measurement="cumulative",
                unit="people",
            ),
            "hospitalizedCurrently": CMU(
                category="hospital_beds_in_use_covid",
                measurement="current",
                unit="beds",
            ),
            "inIcuCurrently": CMU(
                category="icu_beds_in_use_covid",
                measurement="current",
                unit="beds",
            ),
            "negative": CMU(
                category="pcr_tests_negative",
                measurement="cumulative",
                unit="unique_people",
            ),
            "negativeTestsAntibody": CMU(
                category="antibody_tests_negative",
                measurement="cumulative",
                unit="specimens",
            ),
            "negativeTestsPeopleAntibody": CMU(
                category="antibody_tests_negative",
                measurement="cumulative",
                unit="unique_people",
            ),
            "negativeTestsViral": CMU(
                category="pcr_tests_negative",
                measurement="cumulative",
                unit="specimens",
            ),
            "positive": CMU(
                category="cases",
                measurement="cumulative",
                unit="people",
            ),
            "positiveCasesViral": CMU(
                category="pcr_tests_positive",
                measurement="cumulative",
                unit="unique_people",
            ),
            "positiveTestsAntibody": CMU(
                category="antibody_tests_positive",
                measurement="cumulative",
                unit="specimens",
            ),
            "positiveTestsAntigen": CMU(
                category="antigen_tests_positive",
                measurement="cumulative",
                unit="specimens",
            ),
            "positiveTestsPeopleAntibody": CMU(
                category="antibody_tests_positive",
                measurement="cumulative",
                unit="unique_people",
            ),
            "positiveTestsPeopleAntigen": CMU(
                category="antigen_tests_positive",
                measurement="cumulative",
                unit="unique_people",
            ),
            "positiveTestsViral": CMU(
                category="pcr_tests_positive",
                measurement="cumulative",
                unit="specimens",
            ),
            "totalTestsAntigen": CMU(
                category="antigen_tests_total",
                measurement="cumulative",
                unit="specimens",
            ),
            "totalTestsAntibody": CMU(
                category="antibody_tests_total",
                measurement="cumulative",
                unit="specimens",
            ),
            "totalTestsPeopleAntibody": CMU(
                category="antibody_tests_total",
                measurement="cumulative",
                unit="unique_people",
            ),
            "totalTestsPeopleAntigen": CMU(
                category="antigen_tests_total",
                measurement="cumulative",
                unit="unique_people",
            ),
            "totalTestsPeopleViral": CMU(
                category="pcr_tests_total",
                measurement="cumulative",
                unit="unique_people",
            ),
            "totalTestsViral": CMU(
                category="pcr_tests_total",
                measurement="cumulative",
                unit="specimens",
            ),
        }

        # Standardize id columns, then melt to long form and drop missing rows
        renamed = data.rename(columns={"fips": "location", "date": "dt"})
        long_df = renamed.melt(
            id_vars=["dt", "location"],
            value_vars=column_map.keys(),
        ).dropna()

        # Attach CMU metadata and the constant columns
        long_df = self.extract_CMU(long_df, column_map)
        long_df = long_df.assign(
            location_type="state", vintage=self._retrieve_vintage()
        )

        return long_df
Example #13
0
    def pre_normalize(self, data) -> pd.DataFrame:
        """
        Get icu and hospital usage by covid patients from the OpenDataCali api

        Parameters
        ----------
        data : List
            A list of json elements

        Returns
        -------
        df: pd.DataFrame
            A pandas DataFrame containing icu+hospital usage for each county

        """
        # Rename columns and subset data
        crename = {
            "hospitalized_covid_patients":
            CMU(
                category="hospital_beds_in_use_covid",
                measurement="current",
                unit="beds",
            ),
            "all_hospital_beds":
            CMU(category="hospital_beds_capacity",
                measurement="current",
                unit="beds"),
            "icu_covid_patients":
            CMU(category="icu_beds_in_use_covid",
                measurement="current",
                unit="beds"),
        }

        # Read in data and convert to long format
        df = self.data_from_raw(data).rename(
            columns={"county": "location_name"})

        # Turn literal "None" strings into missing values. A dict mapping is
        # required here: `df.replace("None", None)` takes pandas'
        # scalar-with-None-value path, which forward-fills previous values
        # instead of inserting missing values.
        df = df.replace({"None": None})
        # Best-effort numeric coercion; errors="ignore" leaves non-numeric
        # columns (e.g. county names) untouched. NOTE: deprecated in newer
        # pandas -- revisit when upgrading.
        df = df.apply(lambda x: pd.to_numeric(x, errors="ignore"))
        df["dt"] = pd.to_datetime(df["todays_date"])

        # Create a total number of icu covid patients
        df["icu_covid_patients"] = df.eval(
            "icu_covid_confirmed_patients + icu_suspected_covid_patients")

        # Reshape
        out = df.melt(id_vars=["dt", "location_name"],
                      value_vars=crename.keys()).dropna()

        # Determine the category and demographics of each observation
        out = self.extract_CMU(out, crename)

        cols_to_keep = [
            "dt",
            "location_name",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]

        return out.loc[:, cols_to_keep]
Example #14
0
    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalize the ICU census spreadsheet into long-form CMU records."""
        # Scrub separators / percent signs / "nan" from every non-string column
        bad_tokens = [",", "%", "nan"]
        keep_as_str = ["County", "FileNumber", "ProviderName"]
        df = self._clean_cols(df, bad_tokens, keep_as_str)
        df["location_name"] = df["County"].str.title()

        # Derive combined adult + pediatric ICU columns
        df["ICU Census"] = df["Adult ICU Census"] + df["Pediatric ICU Census"]
        df["ICU Capacity"] = (
            df["Total AdultICU Capacity"] + df["Total PediatricICU Capacity"]
        )
        df["Available ICU"] = (
            df["Available Adult ICU"] + df["Available Pediatric ICU"]
        )

        # Source column -> CMU variable definition
        cmu_map = {
            "Adult ICU Census": CMU(
                category="adult_icu_beds_in_use", measurement="current", unit="beds"
            ),
            "Available Adult ICU": CMU(
                category="adult_icu_beds_available", measurement="current", unit="beds"
            ),
            "Total AdultICU Capacity": CMU(
                category="adult_icu_beds_capacity", measurement="current", unit="beds"
            ),
            "Pediatric ICU Census": CMU(
                category="pediatric_icu_beds_in_use", measurement="current", unit="beds"
            ),
            "Available Pediatric ICU": CMU(
                category="pediatric_icu_beds_available",
                measurement="current",
                unit="beds",
            ),
            "Total PediatricICU Capacity": CMU(
                category="pediatric_icu_beds_capacity",
                measurement="current",
                unit="beds",
            ),
            "ICU Census": CMU(
                category="icu_beds_in_use", measurement="current", unit="beds"
            ),
            "ICU Capacity": CMU(
                category="icu_beds_capacity", measurement="current", unit="beds"
            ),
            "Available ICU": CMU(
                category="icu_beds_available", measurement="current", unit="beds"
            ),
        }

        # Drop the statewide total row, melt to long form, sum per county
        melted = (
            df.query("location_name != 'Grand Total'")
            .melt(id_vars=["location_name"], value_vars=cmu_map.keys())
            .dropna()
        )
        melted["value"] = pd.to_numeric(melted["value"])
        melted = melted.groupby(["location_name", "variable"]).sum().reset_index()

        # The sheet spells DeSoto county as "Desoto"
        melted.loc[melted["location_name"] == "Desoto", "location_name"] = "DeSoto"

        # Attach CMU metadata and the constant columns
        melted = self.extract_CMU(melted, cmu_map)
        melted["dt"] = self._retrieve_dt("US/Eastern")
        melted["vintage"] = self._retrieve_vintage()
        self.clean_desoto(melted)
        return melted.loc[:, self.out_cols]
Example #15
0
    def normalize(self, data) -> pd.DataFrame:
        """Normalize the vaccine shipment/administration JSON into long form.

        Parameters
        ----------
        data : dict
            Parsed JSON response; must contain a "data" key with records.

        Returns
        -------
        pd.DataFrame
            Long-form rows with CMU metadata, one per (dt, county, variable).

        Raises
        ------
        ValueError
            If the expected "data" key is missing from the response.
        """
        # Read data into data frame
        key = "data"
        if key not in data:
            raise ValueError(f"Expected to find {key} in JSON response")
        df = pd.DataFrame(data[key])

        # Determine what columns to keep
        cols_to_keep = [
            "county",
            "date",
            "modernaShipped",
            "pfizerShipped",
            "dosesAdministered",
            "totalShipped",
            "partiallyVaccinated",
            "fullyVaccinated",
            "percentPartiallyVaccinated",
            "percentFullyVaccinated",
        ]

        # Drop extraneous columns
        df = df.loc[:, cols_to_keep]

        # Rename columns
        df = df.rename(columns={"date": "dt", "county": "location_name"})

        # Convert date time column to a datetime
        df = df.assign(dt=lambda x: pd.to_datetime(x["dt"]))

        # Create dictionary for columns to map
        crename = {
            "modernaShipped": CMU(
                category="moderna_vaccine_distributed",
                measurement="cumulative",
                unit="doses",
            ),
            "pfizerShipped": CMU(
                category="pfizer_vaccine_distributed",
                measurement="cumulative",
                unit="doses",
            ),
            "dosesAdministered": CMU(
                category="total_vaccine_doses_administered",
                measurement="cumulative",
                unit="doses",
            ),
            "totalShipped": CMU(
                category="total_vaccine_distributed",
                measurement="cumulative",
                unit="doses",
            ),
            "partiallyVaccinated": variables.INITIATING_VACCINATIONS_ALL,
            "fullyVaccinated": variables.FULLY_VACCINATED_ALL,
            "percentPartiallyVaccinated": variables.PERCENTAGE_PEOPLE_INITIATING_VACCINE,
            "percentFullyVaccinated": variables.PERCENTAGE_PEOPLE_COMPLETING_VACCINE,
        }

        # Move things into long format
        df = df.melt(
            id_vars=["dt", "location_name"], value_vars=crename.keys()
        ).dropna()

        # Determine the category of each observation
        df = self.extract_CMU(df, crename)

        # Determine what columns to keep
        cols_to_keep = [
            "dt",
            "location_name",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]

        # Drop extraneous columns
        out = df.loc[:, cols_to_keep]

        # Coerce values numerically. The two percent* variables are
        # fractional, so casting the whole column with astype(int) would
        # silently truncate them; to_numeric preserves them.
        out["value"] = pd.to_numeric(out["value"])

        # Add rows that don't change
        out["vintage"] = self._retrieve_vintage()

        return out
Example #16
0
    def normalize(self, data) -> pd.DataFrame:
        """Normalize the demographic case/death workbook into long form."""
        # Parse the workbook, treating Date as a datetime column
        df = pd.read_excel(data.content, parse_dates=["Date"])

        # Standardize column names
        df = df.rename(
            columns={
                "Date": "dt",
                "CAT_DETAIL": "category_detail",
                "Category": "category_name",
            }
        )

        # Remove details we do not track. NOTE: the local name is referenced
        # by name inside the query string via `@`.
        cat_detail_not_keep = ["Pending"]
        df = df.query("category_detail not in @cat_detail_not_keep")

        # Standard demographic labels for the raw category details
        cat_detail_replace = {
            "American Indian or Alaska Native": "ai_an",
            "Asian": "asian",
            "Black or African American": "black",
            "White": "white",
            "Native Hawaiian or Other Pacific Islander": "pacific_islander",
            "Other/ Multiracial": "multiple_other",
            "Other/Multiracial": "multiple_other",
            "Hispanic": "hispanic",
            "Not Hispanic or Latino": "non-hispanic",
            "Female": "female",
            "Male": "male",
        }
        df = df.replace({"category_detail": cat_detail_replace})

        # Unpack category_name/category_detail into race, ethnicity, and sex
        # columns: rows in the matching group get their detail value, every
        # other row gets "all"
        for out_col, group in (("race", "RACE"),
                               ("ethnicity", "ETHNICITY"),
                               ("sex", "SEX")):
            df.loc[df["category_name"] == group, out_col] = df["category_detail"]
            df.loc[df["category_name"] != group, out_col] = "all"

        # Source column -> CMU variable definition
        cmu_map = {
            "Cat_CaseCount":
            CMU(category="cases", measurement="cumulative", unit="people"),
            "CAT_DEATHCOUNT":
            CMU(category="deaths", measurement="cumulative", unit="people"),
        }

        # Long format
        df = df.melt(
            id_vars=[
                "dt",
                "category_detail",
                "category_name",
                "race",
                "ethnicity",
                "sex",
            ],
            value_vars=cmu_map.keys(),
        ).dropna()

        # Attach category/measurement/unit/age metadata
        df = self.extract_CMU(df, cmu_map,
                              ["category", "measurement", "unit", "age"])

        # Subset to the columns the pipeline expects
        keep = [
            "dt",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]
        out = df.loc[:, keep]

        # Coerce counts to integers
        out["value"] = out["value"].astype(int)

        # Constant columns
        out["location"] = self.state_fips
        out["vintage"] = self._retrieve_vintage()

        return out
Example #17
0
class DelawareCountyVaccine(StateDashboard):
    """Scrape county-level COVID vaccine dose series for Delaware.

    Daily first/second dose counts come from the "fully vaccinated" chart on
    each county page; total administered doses come from the administrations
    chart. Both charts embed their series as JSON in a data attribute of the
    page markup.
    """

    # Per-county "fully vaccinated" chart pages
    kent_fully_vaccinated_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-kent/covid19_vaccine_fully_vaccinated"
    new_castle_fully_vaccinated_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-new-castle/covid19_vaccine_fully_vaccinated"
    sussex_fully_vaccinated_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-sussex/covid19_vaccine_fully_vaccinated"
    # Per-county total-administrations chart pages
    kent_total_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-kent/covid19_vaccine_administrations"
    new_castle_total_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-new-castle/covid19_vaccine_administrations"
    sussex_total_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-sussex/covid19_vaccine_administrations"

    has_location = False
    location_type = "county"

    # Initialize
    source = "https://myhealthycommunity.dhss.delaware.gov"
    source_name = "Delaware Health and Social Services"
    state_fips = int(us.states.lookup("Delaware").fips)

    # Chart series name -> CMU variable definition
    variables = {
        "FirstDose": CMU(
            category="total_vaccine_initiated", measurement="new", unit="doses"
        ),
        "SecondDose": CMU(
            category="total_vaccine_completed", measurement="new", unit="doses"
        ),
        "TotalDoses": CMU(
            category="total_vaccine_doses_administered", measurement="new", unit="doses"
        ),
    }

    def _fetch_fully_vaccinated(self):
        """Scrape daily first/second dose counts for each county.

        Returns a DataFrame indexed by date with FirstDose, SecondDose, and
        location_name columns, concatenated over the three counties.
        """
        dfs = []
        fully_vaccinated_urls = [
            {"county": "Kent", "url": self.kent_fully_vaccinated_url},
            {"county": "New Castle", "url": self.new_castle_fully_vaccinated_url},
            {"county": "Sussex", "url": self.sussex_fully_vaccinated_url},
        ]
        for curl in fully_vaccinated_urls:
            county = curl["county"]
            url = curl["url"]
            r = requests.get(url)
            soup = BeautifulSoup(r.text, features="lxml")
            # Chart config is embedded as JSON in a data attribute
            tdata = json.loads(
                soup.find(
                    "div",
                    {"aria-labelledby": "chart-covid-vaccine-fully-vaccinated-label"},
                )["data-charts--covid-vaccine-fully-vaccinated-config-value"]
            )
            sd = tdata["startDate"]
            # Parse start date ([year, month, day] triple)
            startDate = dt.datetime(sd[0], sd[1], sd[2])
            # Get first dose data; the series is daily starting at startDate
            first_dose_data = tdata["series"][0]["data"]
            idx = pd.date_range(startDate, periods=len(first_dose_data), freq="d")
            first_dose_df = pd.DataFrame(
                data=first_dose_data, columns=["FirstDose"], index=idx
            )
            # Get second dose data
            second_dose_data = tdata["series"][1]["data"]
            idx = pd.date_range(startDate, periods=len(second_dose_data), freq="d")
            second_dose_df = pd.DataFrame(
                data=second_dose_data, columns=["SecondDose"], index=idx
            )
            df = first_dose_df.join(second_dose_df)
            df["location_name"] = county
            dfs.append(df)
        return pd.concat(dfs)

    def _fetch_total_administered(self):
        """Scrape the daily total administered dose counts for each county.

        Returns a DataFrame indexed by date with TotalDoses and
        location_name columns, concatenated over the three counties.

        Raises
        ------
        ValueError
            If the "Daily Count" series is missing from a county's chart.
        """
        dfs = []
        total_urls = [
            {"county": "Kent", "url": self.kent_total_url},
            {"county": "New Castle", "url": self.new_castle_total_url},
            {"county": "Sussex", "url": self.sussex_total_url},
        ]
        for curl in total_urls:
            county = curl["county"]
            url = curl["url"]
            r = requests.get(url)
            soup = BeautifulSoup(r.text, features="lxml")
            # Chart config is embedded as JSON in a data attribute
            tdata = json.loads(
                soup.find(
                    "div",
                    {
                        "aria-labelledby": "chart-covid-vaccine-administrations-daily-label"
                    },
                )["data-charts--covid-vaccine-administrations-daily-config-value"]
            )
            sd = tdata["startDate"]
            # Parse start date ([year, month, day] triple)
            startDate = dt.datetime(sd[0], sd[1], sd[2])
            # Find the daily-count series among the chart's series
            total_df = None
            for srs in tdata["series"]:
                if srs["name"] == "Daily Count":
                    total_data = srs["data"]
                    idx = pd.date_range(startDate, periods=len(total_data), freq="d")
                    total_df = pd.DataFrame(
                        data=total_data, columns=["TotalDoses"], index=idx
                    )
            if total_df is None:
                # Previously this raised a bare string, which is a TypeError
                # in Python 3; raise a proper exception instead.
                raise ValueError("Couldn't get county total data")
            df = total_df
            df["location_name"] = county
            dfs.append(df)
        return pd.concat(dfs)

    def fetch(self):
        """Fetch both raw series; normalization happens in normalize()."""
        totals = self._fetch_total_administered()
        doses = self._fetch_fully_vaccinated()
        return {"totals": totals, "doses": doses}

    def normalize(self, data):
        """Join the totals and dose series and reshape to long-form rows."""
        totals = data["totals"]
        totals = totals.reset_index().rename(columns={"index": "dt"})
        doses = data["doses"]
        doses = doses.reset_index().rename(columns={"index": "dt"})
        # Align the two series on (dt, location_name)
        df = (
            totals.set_index(["dt", "location_name"])
            .join(doses.set_index(["dt", "location_name"]))
            .reset_index()
        )

        # Dates present in only one series get zero counts in the other
        df = df.fillna(0)
        out = self._reshape_variables(df, self.variables)

        return out
Example #18
0
class GeorgiaCountyVaccineRace(GeorgiaCountyVaccineAge):
    """County vaccine-initiation counts broken down by race.

    Reuses the fetch machinery of the age-based scraper, pointed at the
    sheet whose breakdown column is the race code ``RACE_ID``.
    """

    # Dashboard sheet holding the race breakdown
    sheet = 6
    # Column that identifies the race code for each record
    column_names = ["RACE_ID"]

    # Race code -> CMU variable definition. The numeric keys look like
    # CDC/HL7 race codes (e.g. 2054-5 = Black, 1002-5 = American Indian).
    variables = {
        "2054-5":
        CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="black",
        ),
        # NOTE(review): 2076-8 is the CDC code for Native Hawaiian or Other
        # Pacific Islander but is mapped to white here, duplicating 2106-3
        # below -- confirm against the GA dashboard's code list.
        "2076-8":
        CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="white",
        ),
        "2106-3":
        CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="white",
        ),
        "1002-5":
        CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="ai_an",
        ),
        # NOTE(review): 2028-9 is the CDC code for Asian but is mapped to
        # multiple here (Asian appears to be covered by "ANHOPI" below) --
        # confirm against the source dashboard.
        "2028-9":
        CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="multiple",
        ),
        "ANHOPI":
        CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="asian",
        ),
        "UNK":
        CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="unknown",
        ),
        "2131-1":
        CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="other",
        ),
    }

    def fetch(self):
        # Pull every page of json for this sheet from the dashboard service
        return self.get_all_jsons(self.service, self.sheet, "6")
class CDCCountyVaccine(FederalDashboard):
    """Scrape county-level vaccination data from the CDC COVID data tracker."""

    has_location = True
    location_type = "county"
    source = "https://covid.cdc.gov/covid-data-tracker/#county-view"
    url = ("https://covid.cdc.gov/covid-data-tracker/COVIDData/"
           "getAjaxData?id=vaccination_county_condensed_data")
    source_name = "Centers for Disease Control and Prevention"
    provider = "cdc"

    # Maps CDC column names onto our variable definitions. ``variables``
    # on the right-hand side is the module, not this class attribute.
    variables = {
        "Series_Complete_Yes":
        variables.FULLY_VACCINATED_ALL,
        "Series_Complete_18Plus":
        CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="18_plus",
        ),
        "Series_Complete_65Plus":
        CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="65_plus",
        ),
        "Series_Complete_Pop_Pct":
        variables.PERCENTAGE_PEOPLE_COMPLETING_VACCINE,
        "Series_Complete_18PlusPop_Pct":
        CMU(
            category="total_vaccine_completed",
            measurement="current",
            unit="percentage",
            age="18_plus",
        ),
        "Series_Complete_65PlusPop_Pct":
        CMU(
            category="total_vaccine_completed",
            measurement="current",
            unit="percentage",
            age="65_plus",
        ),
    }

    def fetch(self):
        """GET the condensed county vaccination JSON.

        Returns
        -------
        dict
            The parsed JSON payload from the CDC endpoint.

        Raises
        ------
        RequestError
            If the HTTP request fails; the decoded response body is
            included (indented) for debugging.
        """
        response = requests.get(self.url)
        if not response.ok:
            msg = f"Failed to make request to {self.url}\n"
            # BUG FIX: colon was misplaced after the newline.
            msg += "Response from request was:\n"
            # BUG FIX: ``response.content`` is bytes, which
            # ``textwrap.indent`` rejects with a TypeError -- use the
            # decoded ``response.text`` instead.
            msg += textwrap.indent(response.text, "\t")
            raise RequestError(msg)
        return response.json()

    def normalize(self, data):
        """Reshape the raw JSON records into the long-form output frame.

        Parameters
        ----------
        data : dict
            Payload returned by :meth:`fetch`; must contain the key
            ``"vaccination_county_condensed_data"``.
        """
        df = pd.DataFrame.from_records(
            data["vaccination_county_condensed_data"])
        # "UNK" rows have no usable FIPS code, so drop them.
        out = self._rename_or_add_date_and_location(df,
                                                    location_column="FIPS",
                                                    date_column="Date",
                                                    locations_to_drop=["UNK"])
        return self._reshape_variables(out, self.variables)
# Example #20
# 0
    def normalize(self, data: str) -> pd.DataFrame:
        """Normalize the HHS facility CSV into long-form county data.

        Parameters
        ----------
        data : str
            The raw CSV file contents as a single string.

        Returns
        -------
        pd.DataFrame
            Long-form county-level data with vintage, location, and CMU
            (category/measurement/unit/age/sex/race) columns.
        """
        # Read the dataframe from the string csv
        df = pd.read_csv(StringIO(data))
        df.columns = [x.lower().strip() for x in df.columns]

        # Set date and fips code
        # BUG FIX: collection_week refers to the first day of the week, so
        # add 6 days to get the last day of the reporting window.  This
        # matches the other normalizer for this dataset in this file.
        df.loc[:, "dt"] = pd.to_datetime(
            df["collection_week"]) + timedelta(days=6)

        # Filter out all of the columns without a fips code for now -- I
        # think that it is likely that we could reverse engineer these
        # either by looking them up or by mapping city to county
        df = df.loc[~df["fips_code"].isna(), :]
        # :see_no_evil:
        df["location"] = (
            df["fips_code"]
            .astype(int)
            .replace(
                {
                    # 02120 corresponded to Kenai-Cook Inlet Division... It was
                    # then the relevant piece became Kenai Peninsula Borough which
                    # is 02122
                    # https://data.nber.org/asg/ASG_release/County_City/FIPS/FIPS_Changes.pdf
                    2120: 2122,
                    # City associated with the hospital is Seward which is in the
                    # Kenai Borough which is 02122 but I have no idea how this
                    # ended up with fips code 02210???
                    # https://en.wikipedia.org/wiki/Seward,_Alaska
                    2210: 2122,
                    # 02260 was fips code for Valdez-Chitina-Whittier Division... It
                    # was then put into Valdez–Cordova Census Area which is
                    # 02261, but 02261 was split in Jan 2019 and we'll need to change
                    # this again if we update geographies
                    # https://data.nber.org/asg/ASG_release/County_City/FIPS/FIPS_Changes.pdf
                    2260: 2261,
                    # 02280 corresponded to Wrangell-Petersburg but became the
                    # Petersburg Borough 02195 in 2012
                    # https://www.cdc.gov/nchs/nvss/bridged_race/county_geography-_changes2015.pdf
                    2280: 2195,
                    # City associated with the hospital is Cordova which is in the
                    # Valdez-Cordova census area but I don't know which one this
                    # ended up in after the split...
                    # https://en.wikipedia.org/wiki/Cordova,_Alaska
                    2080: 2261,
                }
            )
        )

        # Set all missing values (-999999) to nan for all numeric columns
        # NOTE(review): ``x > 0`` also nulls legitimate zero values, not just
        # the -999999 sentinel -- confirm zeros are meant to be treated as
        # missing before tightening this to ``x >= 0``.
        numeric_cols = list(df.select_dtypes("number"))
        df.loc[:, numeric_cols] = df.loc[:, numeric_cols].where(lambda x: x > 0, np.nan)

        # Create new columns that we need: total COVID inpatients is the sum
        # of the adult and pediatric averages (NaN if either is missing).
        df["inpatient_beds_used_covid_7_day_avg"] = df.eval(
            "total_adult_patients_hospitalized_confirmed_covid_7_day_avg + "
            "total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg"
        )

        # Source columns mapped onto our variable definitions.
        crename = {
            "inpatient_beds_7_day_avg": CMU(
                category="hospital_beds_capacity",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            "inpatient_beds_used_7_day_avg": CMU(
                category="hospital_beds_in_use",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            # This column is generated by summing adult and pediatric
            # beds -- Should be missing if either is missing
            "inpatient_beds_used_covid_7_day_avg": CMU(
                category="hospital_beds_in_use_covid",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            "total_staffed_adult_icu_beds_7_day_avg": CMU(
                category="adult_icu_beds_capacity",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            "staffed_adult_icu_bed_occupancy_7_day_avg": CMU(
                category="adult_icu_beds_in_use",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
            "staffed_icu_adult_patients_confirmed_covid_7_day_avg": CMU(
                category="adult_icu_beds_in_use_covid",
                measurement="rolling_average_7_day",
                unit="beds",
            ),
        }

        # Reshape by putting into long form
        df_long = df.melt(
            id_vars=["dt", "location"], value_vars=crename.keys()
        ).dropna()
        # Values may contain thousands separators; strip them before parsing.
        df_long.loc[:, "value"] = pd.to_numeric(
            df_long["value"].astype(str).str.replace(",", "")
        )

        # Add category, measurement, unit, age, sex, race
        df_long = self.extract_CMU(df_long, crename)

        # Group by relevant factors and sum (facility rows -> county totals)
        identifier = [
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "sex",
            "race",
        ]

        # TODO: We could do a different groupby and put this into states
        # or hospital regions
        out_county = df_long.groupby(identifier)["value"].sum().reset_index()

        # TODO: Throwing out territories because I don't remember which weren't
        # included in the census data :(
        out_county = out_county.query("location < 60_000")

        # Add vintage
        out_county["vintage"] = self._retrieve_vintage()
        out_county["location_type"] = "county"
        cols_2_keep = identifier + ["vintage", "location_type", "value"]

        return out_county.loc[:, cols_2_keep]