Exemple #1
0
class ALCountyVaccineAge(ALCountyVaccineSex):
    """Age-group breakdown of the ``total_vaccine_initiated`` counts.

    Only the pivot column, the sheet number and the label -> CMU mapping are
    overridden here; everything else comes from ``ALCountyVaccineSex``.
    """

    variable_columns = ["AGECAT"]
    sheet_num = 5

    # source age label -> CMU; the entries differ only in their ``age`` field
    variables = {
        source_label: CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age=age_group,
        )
        for source_label, age_group in (
            ("16-54", "16-54"),
            ("55-64", "55-64"),
            ("65-74", "65-74"),
            ("75+", "75_plus"),
        )
    }
Exemple #2
0
    def pre_normalize(self, data) -> pd.DataFrame:
        """Reshape the fetched ArcGIS payload into long form with CMU columns.

        Column names are lower-cased, ``county`` becomes ``location_name`` and
        ``date`` is mapped through ``self._esri_ts_to_dt`` into ``dt``. The
        four bed-count columns listed in ``crename`` are melted, rows for the
        non-county regions are removed, and values are converted to numbers.

        Parameters
        ----------
        data
            Whatever ``self.arcgis_jsons_to_df`` accepts.

        Returns
        -------
        pd.DataFrame
            The columns named in ``self.cols_to_keep``.
        """
        df = self.arcgis_jsons_to_df(data)

        # Make column names all-lowercase
        df.columns = [x.lower() for x in df.columns]
        df = df.rename(columns={"county": "location_name"})

        crename = {
            "med_total": CMU(
                category="hospital_beds_capacity",
                measurement="current",
                unit="beds",
            ),
            "covid_patients": CMU(
                category="hospital_beds_in_use_covid",
                measurement="current",
                unit="beds",
            ),
            "icu_total": CMU(
                category="icu_beds_capacity",
                measurement="current",
                unit="beds",
            ),
            "icu_avail": CMU(
                category="icu_beds_available",
                measurement="current",
                unit="beds",
            ),
        }

        df["dt"] = df["date"].map(self._esri_ts_to_dt)

        out = df.melt(
            id_vars=["location_name", "dt"], value_vars=list(crename)
        ).dropna()

        non_county_regions = [
            "Pennsylvania",
            "East Central HCC",
            "HCC of Southwest PA",
            "Keystone HCC",
            "North Central HCC",
            "Northeast",
            "Northcentral",
            "Northeast HCC",
            "Northern Tier HCC",
            "Northwest",
            "Southcentral",
            "Southeast HCC",
            "Southeast",
            "Southwest",
        ]

        # Boolean filtering hands back a slice of the melted frame. Use
        # ``assign`` (which works on a fresh copy) rather than
        # ``out.loc[:, "value"] = ...``: the latter writes into the existing
        # column in newer pandas and would leave the dtype unchanged.
        out = out[~out["location_name"].isin(non_county_regions)].assign(
            value=lambda x: pd.to_numeric(x["value"])
        )

        out = self.extract_CMU(out, crename)

        return out.loc[:, self.cols_to_keep]
Exemple #3
0
    def normalize(self, data):
        """Turn the fetched ArcGIS payload into long-form county observations.

        ``location`` is built as ``state_fips * 1000 + county``; the melted
        values are stamped with the current date (US/Eastern) and vintage,
        locations 12998 and 12999 are excluded, and values are made numeric.

        Parameters
        ----------
        data
            Whatever ``self.arcgis_jsons_to_df`` accepts.

        Returns
        -------
        pd.DataFrame
            One row per (location, variable) with the columns listed in
            ``cols_to_keep`` below.
        """
        df = self.arcgis_jsons_to_df(data)
        df.columns = [x.lower() for x in df.columns]
        df["location"] = (self.state_fips * 1000) + df["county"].astype(int)

        # 12025 is the OLD (retired in 1997) fips code for Dade county. It is now
        # known as Miami-Dade county with fips code 12086
        df["location"] = df["location"].replace(12025, 12086)

        crename = {
            "casesall": CMU(category="cases", measurement="cumulative", unit="people"),
            "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
            "newpos": CMU(
                category="unspecified_tests_positive",
                measurement="new",
                unit="test_encounters",
            ),
            "newneg": CMU(
                category="unspecified_tests_negative",
                measurement="new",
                unit="test_encounters",
            ),
            "newtested": CMU(
                category="unspecified_tests_total",
                measurement="new",
                unit="test_encounters",
            ),
        }
        out = (
            df.melt(id_vars=["location"], value_vars=list(crename))
            .assign(
                dt=self._retrieve_dt("US/Eastern"), vintage=self._retrieve_vintage()
            )
            .query("location not in (12998, 12999)")
            .dropna()
            # convert via assign instead of ``out.loc[:, "value"] = ...``, which
            # writes in place in newer pandas and would keep the old dtype
            .assign(value=lambda x: pd.to_numeric(x["value"]))
        )

        # Extract category information and add other variable context
        out = self.extract_CMU(out, crename)

        cols_to_keep = [
            "vintage",
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]

        return out.loc[:, cols_to_keep]
Exemple #4
0
    def normalize(self, data: pd.DataFrame) -> pd.DataFrame:
        """Reshape the wide testing table into long-form CMU observations.

        ``Test Date`` and ``County`` are renamed to ``dt`` and
        ``location_name``; the four test-count columns in ``crename`` are
        melted, converted to numbers and annotated through
        ``self.extract_CMU``. The vintage is added last.

        Parameters
        ----------
        data : pd.DataFrame
            Wide table containing the columns named above and in ``crename``.

        Returns
        -------
        pd.DataFrame
            The columns listed in ``cols_to_keep`` below.
        """
        df = data.rename(columns={"Test Date": "dt", "County": "location_name"})

        crename = {
            "New Positives": CMU(
                category="unspecified_tests_positive",
                measurement="new",
                unit="test_encounters",
            ),
            "Total Number of Tests Performed": CMU(
                category="unspecified_tests_total",
                measurement="new",
                unit="test_encounters",
            ),
            "Cumulative Number of Positives": CMU(
                category="unspecified_tests_positive",
                measurement="cumulative",
                unit="test_encounters",
            ),
            "Cumulative Number of Tests Performed": CMU(
                category="unspecified_tests_total",
                measurement="cumulative",
                unit="test_encounters",
            ),
        }
        out = (
            df.melt(id_vars=["dt", "location_name"], value_vars=list(crename))
            .dropna()
            # convert via assign instead of ``out.loc[:, "value"] = ...``, which
            # writes in place in newer pandas and would keep the old dtype
            .assign(value=lambda x: pd.to_numeric(x["value"]))
        )

        # Determine the category and demographics of each observation
        out = self.extract_CMU(out, crename)
        out["vintage"] = self._retrieve_vintage()

        cols_to_keep = [
            "vintage",
            "dt",
            "location_name",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]

        return out.loc[:, cols_to_keep]
Exemple #5
0
    def normalize(self, data):
        """Turn the fetched ArcGIS payload into long-form county observations.

        ``location`` is built as ``state_fips * 1000 + county_fip``. The
        case, death and testing columns in ``crename`` are melted, stamped
        with the current date (US/Eastern) and vintage, and made numeric.

        Parameters
        ----------
        data
            Whatever ``self.arcgis_jsons_to_df`` accepts.

        Returns
        -------
        pd.DataFrame
            The columns listed in ``cols_to_keep`` below.
        """
        df = self.arcgis_jsons_to_df(data)
        df.columns = [x.lower() for x in df.columns]
        df["location"] = (self.state_fips * 1000) + df["county_fip"].astype(int)

        crename = {
            "totalcasecount": CMU(
                category="cases", measurement="cumulative", unit="people"
            ),
            "totaldeathcount": CMU(
                category="deaths", measurement="cumulative", unit="people"
            ),
            "total_pop_tested": CMU(
                category="pcr_tests_total",
                measurement="cumulative",
                unit="unique_people",
            ),
            "total_testing_vol": CMU(
                category="pcr_tests_total",
                measurement="cumulative",
                unit="specimens",
            ),
            "daily_testing_vol": CMU(
                category="pcr_tests_total",
                measurement="new",
                unit="specimens",
            ),
        }
        out = (
            df.melt(id_vars=["location"], value_vars=list(crename))
            .assign(
                dt=self._retrieve_dt("US/Eastern"), vintage=self._retrieve_vintage()
            )
            .dropna()
            # convert via assign instead of ``out.loc[:, "value"] = ...``, which
            # writes in place in newer pandas and would keep the old dtype
            .assign(value=lambda x: pd.to_numeric(x["value"]))
        )

        # Extract category information and add other variable context
        out = self.extract_CMU(out, crename)

        cols_to_keep = [
            "vintage",
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]
        return out.loc[:, cols_to_keep]
Exemple #6
0
    def normalize(self, data):
        """Turn the fetched ArcGIS payload into long-form observations.

        ``location`` is the integer form of the ``geoid`` column. The case,
        death and testing columns in ``crename`` are melted, stamped with the
        current date (US/Central) and vintage, and made numeric.

        Parameters
        ----------
        data
            Whatever ``self.arcgis_jsons_to_df`` accepts.

        Returns
        -------
        pd.DataFrame
            The columns listed in ``cols_to_keep`` below.
        """
        df = self.arcgis_jsons_to_df(data)
        df.columns = [x.lower() for x in df.columns]
        df["location"] = df["geoid"].astype(int)

        crename = {
            "positive": CMU(category="cases", measurement="cumulative", unit="people"),
            "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
            "neg_new": CMU(
                category="pcr_tests_negative",
                measurement="new",
                unit="unique_people",
            ),
            "pos_new": CMU(
                category="pcr_tests_positive",
                measurement="new",
                unit="unique_people",
            ),
            "test_new": CMU(
                category="pcr_tests_total",
                measurement="new",
                unit="unique_people",
            ),
        }
        out = (
            df.melt(id_vars=["location"], value_vars=list(crename))
            .assign(
                dt=self._retrieve_dt("US/Central"), vintage=self._retrieve_vintage()
            )
            .dropna()
            # convert via assign instead of ``out.loc[:, "value"] = ...``, which
            # writes in place in newer pandas and would keep the old dtype
            .assign(value=lambda x: pd.to_numeric(x["value"]))
        )

        # Extract category information and add other variable context
        out = self.extract_CMU(out, crename)

        # NOTE(review): "ethnicity" is not in this list -- confirm that is
        # intentional before relying on the output schema.
        cols_to_keep = [
            "vintage",
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "sex",
            "value",
        ]

        return out.loc[:, cols_to_keep]
Exemple #7
0
class MSCountyVaccine(StateDashboard):
    """County vaccination counts read from page 2 of the PDF at ``fetch_url``."""

    has_location = False
    location_type = "county"
    state_fips = int(us.states.lookup("Mississippi").fips)
    source_name = "Mississippi State Department of Health"
    source = "https://msdh.ms.gov/msdhsite/_static/resources/12130.pdf"
    fetch_url = "https://msdh.ms.gov/msdhsite/_static/resources/12130.pdf"

    # PDF column title -> CMU
    variable_map = {
        "People Receiving at least One Dose**": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "People Fully Vaccinated***": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
        "Total Doses Administered": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
    }

    def fetch(self):
        """Parse page 2 of the PDF with camelot's ``stream`` flavor."""
        return camelot.read_pdf(self.fetch_url, pages="2", flavor="stream")

    def normalize(self, data):
        """Convert the first parsed table into long-form observations.

        Row 1 of the parsed table supplies the column titles and the records
        start at row 2. The "Total" row is dropped, "Desoto" is respelled
        "DeSoto", and rows without a value are discarded.
        """
        raw = data[0].df
        column_titles = raw.iloc[1, :].to_list()
        table = raw.iloc[2:].reset_index(drop=True)
        table.columns = column_titles

        table = self._rename_or_add_date_and_location(
            table,
            location_name_column="County of Residence",
            location_names_to_drop=["Total"],
            timezone="US/Central",
        )
        long_form = self._reshape_variables(table, self.variable_map)
        long_form = long_form.replace({"location_name": {"Desoto": "DeSoto"}})
        return long_form.dropna(subset=["value"])
Exemple #8
0
    def normalize(self, data):
        """Map the "Overal Stats" sheet of the fetched workbook onto CMUs.

        ``data.content`` is opened as an Excel file, the sheet is passed
        through ``self._wrangle``, and the result is handed to
        ``self._reshape`` together with the column -> CMU mapping.
        """

        def current(category, unit):
            # every non-testing variable on this sheet is a "current" reading
            return CMU(category=category, measurement="current", unit=unit)

        workbook = pd.ExcelFile(data.content)
        stats = self._wrangle(workbook.parse("Overal Stats"))

        column_to_cmu = {
            "Total Overall Number of Tests": CMU(
                category="unspecified_tests_total",
                measurement="cumulative",
                unit="test_encounters",
            ),
            "Total Number of DC Residents Tested": CMU(
                category="unspecified_tests_total",
                measurement="cumulative",
                unit="unique_people",
            ),
            "Total ICU Beds in Hospitals": current("icu_beds_capacity", "beds"),
            "ICU Beds Available": current("icu_beds_available", "beds"),
            "Total Reported Ventilators in Hospitals": current(
                "ventilators_capacity", "people"
            ),
            "In-Use Ventilators in Hospitals": current("ventilators_in_use", "people"),
            "Available Ventilators in Hospitals": current(
                "ventilators_available", "people"
            ),
            # TODO: check this mapping
            "Total COVID-19 Patients in ICU": current("icu_beds_in_use_covid", "beds"),
        }

        # hand back the frame in the format expected by put()
        return self._reshape(stats, column_to_cmu)
Exemple #9
0
    def normalize(self, data):
        """Map the "Total Cases by Race" sheet of the fetched workbook onto CMUs.

        ``data.content`` is opened as an Excel file, the sheet is passed
        through ``self._wrangle``, and the result is handed to
        ``self._reshape`` together with the column -> CMU mapping.
        """

        def cases(**demographics):
            # all entries are cumulative people counts; only the race differs
            return CMU(
                category="cases",
                measurement="cumulative",
                unit="people",
                **demographics,
            )

        workbook = pd.ExcelFile(data.content)
        by_race = self._wrangle(workbook.parse("Total Cases by Race"))

        column_to_cmu = {
            "All": cases(),
            "Unknown": cases(race="unknown"),
            "White": cases(race="white"),
            "Black/African American": cases(race="black"),
            "Asian": cases(race="asian"),
            # the three mappings below were flagged as open questions
            "American Indian/Alaska Native": cases(race="native_american"),
            "Native Hawaiin Pacific Islander": cases(race="pacific_islander"),
            "Other/Multi-Racial": cases(race="multiple_other"),
        }
        return self._reshape(by_race, column_to_cmu)
class ALCountyVaccineSex(ALCountyVaccine):
    """County-level ``total_vaccine_initiated`` counts split by sex.

    Subclasses can reuse ``fetch``/``normalize`` by overriding
    ``variable_columns``, ``sheet_num``, ``variables`` and, if needed,
    ``service``.
    """

    # value found in the pivot column -> CMU
    variables = {
        "F": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            sex="female",
        ),
        "M": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            sex="male",
        ),
        "U": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            sex="unknown",
        ),
    }

    # column whose distinct values become the wide-form columns in normalize
    variable_columns = ["RECIP_SEX"]
    sheet_num = 6
    service = "Vaccination_Dashboard_AGOL_v4_PUBLIC_VIEW"

    def fetch(self):
        """Download every JSON page of sheet ``sheet_num`` of ``service``."""
        # Use the class attribute rather than a duplicated local literal so
        # that a subclass overriding ``service`` is actually honoured.
        return self.get_all_jsons(self.service, self.sheet_num, "7")

    def normalize(self, data):
        """Pivot the fetched records to one column per label, then reshape.

        ``COUNTS`` is pivoted by ``variable_columns`` per ``CNTYFIPS``; date
        and location columns are added (US/Central), the labels are mapped
        through ``variables``, and locations 0 and 99999 are dropped.
        """
        df = self.arcgis_jsons_to_df(data)
        df = df.pivot_table(
            index="CNTYFIPS", columns=self.variable_columns, values="COUNTS"
        ).reset_index()
        # drop the name pivot_table leaves on the columns axis
        df = df.rename_axis(None, axis=1)
        df = self._rename_or_add_date_and_location(
            df, location_column="CNTYFIPS", timezone="US/Central"
        )
        df = self._reshape_variables(df, self.variables)
        locations_to_drop = [0, 99999]
        df = df.query("location != @locations_to_drop")
        return df
class MontanaStateVaccine(MontanaCountyVaccine):
    """State-level variant of ``MontanaCountyVaccine``.

    Every row is assigned ``self.state_fips`` as its location.
    """

    location_type = "state"
    has_location = True

    # source column -> CMU
    crename = {
        "Total_Montanans_Immunized": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
        "Total_Doses_Administered": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
    }

    def fetch(self):
        """Download every JSON page of sheet 1 of ``COVID_Vaccination_PRD_View``."""
        return self.get_all_jsons("COVID_Vaccination_PRD_View", 1, "")

    def normalize(self, data):
        """Build the state-level frame and drop duplicated observations.

        Missing values are filled with 0, ``Report_Date`` is converted to
        ``dt`` via ``self._esri_ts_to_dt``, and the frame is passed through
        ``self._transform_df``. For rows sharing every identifying column the
        last one is kept.
        """
        frame = self.arcgis_jsons_to_df(data)
        frame = frame.fillna(0)
        frame = frame.rename(columns={"Report_Date": "dt"})
        frame["dt"] = frame["dt"].map(self._esri_ts_to_dt)
        frame["location"] = self.state_fips

        transformed = self._transform_df(frame)

        # this scraper has some duplicates. Drop them here
        identifying_columns = [
            "vintage",
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
        ]
        return transformed.drop_duplicates(subset=identifying_columns, keep="last")
Exemple #12
0
class ALCountyVaccineRace(ALCountyVaccineSex):
    """Race breakdown of the ``total_vaccine_initiated`` counts.

    Only the pivot column, the sheet number and the label -> CMU mapping are
    overridden here; everything else comes from ``ALCountyVaccineSex``.
    """

    variable_columns = ["RACE_LBL"]
    sheet_num = 4

    # source race label -> CMU; the entries differ only in their ``race`` field
    variables = {
        source_label: CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race=race_code,
        )
        for source_label, race_code in (
            ("Native Hawaiian or other Pacific Islander", "pacific_islander"),
            ("Two or More Races", "multiple"),
            ("Other Race", "other"),
            ("American Indian or Alaskan Native", "ai_an"),
            ("White", "white"),
            ("Unknown", "unknown"),
            ("Black or African American", "black"),
            ("Asian", "asian"),
        )
    }
Exemple #13
0
    def pre_normalize(self, data) -> pd.DataFrame:
        """Reshape the fetched ArcGIS payload into long form with CMU columns.

        Column names are lower-cased and ``county`` becomes ``location_name``.
        The columns in ``crename`` are melted, stamped with the current date
        (US/Eastern), made numeric, and annotated via ``self.extract_CMU``.
        "Mckean" is respelled "McKean" and the "Pennsylvania" row is removed.

        Parameters
        ----------
        data
            Whatever ``self.arcgis_jsons_to_df`` accepts.

        Returns
        -------
        pd.DataFrame
            The columns named in ``self.cols_to_keep``.
        """
        df = self.arcgis_jsons_to_df(data)

        # Make column names all-lowercase
        df.columns = [x.lower() for x in df.columns]
        df = df.rename(columns={"county": "location_name"})

        crename = {
            "cases": CMU(category="cases", measurement="cumulative", unit="people"),
            "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
            # "probable": CMU(
            #     category="cases_probable",
            #     measurement="cumulative",
            #     unit="people",
            # ),
            "negative": CMU(
                category="pcr_tests_negative",
                measurement="cumulative",
                unit="unique_people",
            ),
            "confirmed": CMU(
                category="pcr_tests_positive",
                measurement="cumulative",
                unit="unique_people",
            ),
        }
        out = (
            df.melt(id_vars=["location_name"], value_vars=list(crename))
            .assign(dt=self._retrieve_dt("US/Eastern"))
            .dropna()
            .replace(dict(location_name=dict(Mckean="McKean")))
            # convert via assign instead of ``out.loc[:, "value"] = ...``, which
            # writes in place in newer pandas and would keep the old dtype
            .assign(value=lambda x: pd.to_numeric(x["value"]))
        )

        # Extract category information and add other variable context
        out = self.extract_CMU(out, crename)

        return out.loc[:, self.cols_to_keep].query("location_name != 'Pennsylvania'")
Exemple #14
0
class TexasTests(TexasCasesDeaths):
    """
    Testing counterpart of ``TexasCasesDeaths``: same scraper, pointed at the
    test-data service with its own column -> CMU mapping.
    """

    service: str = "DSHS_COVID19_TestData_Service"

    # source column -> CMU; all three are cumulative totals
    crename: Dict[str, CMU] = {
        "ViralTest": CMU(
            category="pcr_tests_total",
            measurement="cumulative",
            unit="specimens",
        ),
        "AntibodyTe": CMU(
            category="antibody_tests_total",
            measurement="cumulative",
            unit="specimens",
        ),
        "Cumulative": CMU(
            category="unspecified_tests_total",
            measurement="cumulative",
            unit="unknown",
        ),
    }
class WisconsinVaccineStateAge(TableauDashboard):
    """Vaccine-initiated counts by age group, read from a Tableau view.

    ``_get_demographic`` holds the demographic-agnostic reshaping so that
    ``normalize`` only names the demographic and its source column.
    """

    has_location = False
    source = "https://www.dhs.wisconsin.gov/covid-19/vaccine-data.htm#summary"
    source_name = "Wisconsin Department of Health Services"
    state_fips = int(us.states.lookup("Wisconsin").fips)
    baseurl = "https://bi.wisconsin.gov/t/DHS"
    viewPath = (
        "VaccinesAdministeredtoWIResidents_16212677845310/VaccinatedWisconsin-County"
    )

    timezone = "US/Central"
    data_tableau_table = "Age vax/unvax County"
    # age does not report missing/unknown entries
    missing_tableau_table = ""
    location_name_col = "AGG(Geography TT)-alias"
    location_type = "state"

    # map wide form column names into CMUs
    cmus = {
        "SUM(Initiation or completed count for TT)-alias": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        )
    }

    def _get_demographic(
        self, df: pd.DataFrame, demo: str, demo_col_name: str
    ) -> pd.DataFrame:
        """
        description: a general "normalize" function to avoid extra/copied code
                     each demographic uses this in its respective normalize

        params:
            df: the fetched table; a ``location_name`` column is added to it
            demo: the demographic as labeled according to CMU (age,sex,race, etc...)
            demo_col_name: the name of the demographic column from the fetched data

        returns: normalized data in long format
        """

        # location names (converted to title case)
        df["location_name"] = df[self.location_name_col].str.title()
        # fix names that title-casing gets wrong
        df = df.replace(
            {
                "location_name": {
                    "St Croix": "St. Croix",
                    "Fond Du Lac": "Fond du Lac",
                }
            }
        )

        # parse out data columns; walk ``cmus`` instead of intersecting sets so
        # the column (and therefore melted row) order is the same on every run
        available = set(df.columns)
        value_cols = [col for col in self.cmus if col in available]
        assert len(value_cols) == len(self.cmus), (
            "fetched table is missing expected columns: "
            f"{[col for col in self.cmus if col not in available]}"
        )

        df = (
            df.melt(id_vars=[demo_col_name, "location_name"], value_vars=value_cols)
            .dropna()
            .assign(
                dt=self._retrieve_dt(self.timezone),
                vintage=self._retrieve_vintage(),
                # values may arrive as strings with thousands separators
                value=lambda x: pd.to_numeric(
                    x["value"].astype(str).str.replace(",", "")
                ),
            )
            .pipe(self.extract_CMU, cmu=self.cmus)
        )
        df[demo] = df[demo_col_name]
        return df.drop(["variable", demo_col_name], axis=1)

    def fetch(self) -> pd.DataFrame:
        """Return the data table, plus the missing-data table when one is set."""
        # retrieve the view once; the original re-requested it for every table
        view = self.get_tableau_view()
        if self.missing_tableau_table:
            # extract both data table and missing data table
            tables = [self.data_tableau_table, self.missing_tableau_table]
            return pd.concat([view.get(table) for table in tables])
        return view[self.data_tableau_table]

    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        """Reshape by age and rewrite the "65+" label as "65_plus"."""
        df = self._get_demographic(df, "age", "Age-value")
        return df.replace({"age": {"65+": "65_plus"}})
Exemple #16
0
class WisconsinArcGIS(ArcGIS, ABC):
    """
    ArcGIS scraper that retrieves dashboard information for the
    state of Wisconsin (which has their own self-hosted ArcGIS
    instance)

    Concrete subclasses provide ``location_type`` and ``SHEET`` and
    implement ``get_location``.
    """

    has_location = True
    state_fips = int(us.states.lookup("Wisconsin").fips)
    source = "https://www.dhs.wisconsin.gov/covid-19/data.htm"

    location_type: str
    SERVICE: str = "DHS_COVID19/COVID19_WI"
    SHEET: int

    # lower-cased source column -> CMU; only columns actually present in the
    # fetched data are used (see ``normalize``)
    crename = {
        "positive": CMU(
            category="cases",
            measurement="cumulative",
            unit="people",
        ),
        "negative": CMU(
            category="pcr_tests_negative",
            measurement="cumulative",
            unit="unique_people",
        ),
        "pos_new": CMU(
            category="cases",
            measurement="new",
            unit="people",
        ),
        "neg_new": CMU(
            category="pcr_tests_negative",
            measurement="new",
            unit="unique_people",
        ),
        "test_new": CMU(
            category="pcr_tests_total",
            measurement="new",
            unit="unique_people",
        ),
        "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
        "dth_new": CMU(category="deaths", measurement="new", unit="people"),
        "hosp_yes": CMU(
            category="hospital_beds_in_use_covid",
            measurement="cumulative",
            unit="people",
        ),
        # sex
        "pos_fem": case_cmu(sex="female"),
        "pos_male": case_cmu(sex="male"),
        "dths_fem": deaths_cmu(sex="female"),
        "dths_male": deaths_cmu(sex="male"),
        # age
        "pos_0_9": case_cmu(age="0-9"),
        "pos_10_19": case_cmu(age="10-19"),
        "pos_20_29": case_cmu(age="20-29"),
        "pos_30_39": case_cmu(age="30-39"),
        "pos_40_49": case_cmu(age="40-49"),
        "pos_50_59": case_cmu(age="50-59"),
        "pos_60_69": case_cmu(age="60-69"),
        "pos_70_79": case_cmu(age="70-79"),
        "pos_80_89": case_cmu(age="80-89"),
        "pos_90": case_cmu(age="90_plus"),
        "dths_0_9": deaths_cmu(age="0-9"),
        "dths_10_19": deaths_cmu(age="10-19"),
        "dths_20_29": deaths_cmu(age="20-29"),
        "dths_30_39": deaths_cmu(age="30-39"),
        "dths_40_49": deaths_cmu(age="40-49"),
        "dths_50_59": deaths_cmu(age="50-59"),
        "dths_60_69": deaths_cmu(age="60-69"),
        "dths_70_79": deaths_cmu(age="70-79"),
        "dths_80_89": deaths_cmu(age="80-89"),
        "dths_90": deaths_cmu(age="90_plus"),
        # race and ethnicity
        "pos_aian": case_cmu(race="ai_an"),
        "pos_asn": case_cmu(race="asian"),
        "pos_blk": case_cmu(race="black"),
        "pos_wht": case_cmu(race="white"),
        "pos_mltoth": case_cmu(race="multiple_other"),
        "pos_unk": case_cmu(race="unknown"),
        "pos_e_hsp": case_cmu(ethnicity="hispanic"),
        "pos_e_nhsp": case_cmu(ethnicity="non-hispanic"),
        "pos_e_unk": case_cmu(ethnicity="unknown"),
        "dths_aian": deaths_cmu(race="ai_an"),
        "dths_asn": deaths_cmu(race="asian"),
        "dths_blk": deaths_cmu(race="black"),
        "dths_wht": deaths_cmu(race="white"),
        "dths_mltoth": deaths_cmu(race="multiple_other"),
        "dths_unk": deaths_cmu(race="unknown"),
        "dths_e_hsp": deaths_cmu(ethnicity="hispanic"),
        "dths_e_nhsp": deaths_cmu(ethnicity="non-hispanic"),
        "dths_e_unk": deaths_cmu(ethnicity="unknown"),
    }

    @abstractmethod
    def get_location(self, df: pd.DataFrame):
        """Return the values to store in ``df["location"]`` (see ``normalize``)."""
        pass

    def fetch(self):
        """Download every JSON page of sheet ``SHEET`` of ``SERVICE``."""
        return self.get_all_jsons(self.SERVICE, self.SHEET, "server")

    def arcgis_query_url(
        self,
        service="DHS_COVID19/COVID19_WI",
        sheet=1,
        srvid="server",
    ):
        """Build the MapServer query URL on the dhsgis.wi.gov host."""
        return (
            f"https://dhsgis.wi.gov/{srvid}/rest/services/{service}"
            f"/MapServer/{sheet}/query"
        )

    def normalize(self, data):
        """Turn the fetched ArcGIS payload into long-form observations.

        Column names are lower-cased, ``location`` comes from
        ``get_location``, and every ``crename`` column present in the data is
        melted, stamped with the current date (US/Central) and vintage, and
        made numeric.

        Returns
        -------
        pd.DataFrame
            The columns listed in ``cols_to_keep`` below.
        """
        df = self.arcgis_jsons_to_df(data)
        df.columns = [x.lower() for x in df.columns]
        df["location"] = self.get_location(df)

        # walk ``crename`` instead of intersecting sets so the column (and
        # therefore melted row) order is the same on every run
        available = set(df.columns)
        value_cols = [col for col in self.crename if col in available]

        out = (
            df.melt(id_vars=["location"], value_vars=value_cols)
            .assign(
                dt=self._retrieve_dt("US/Central"), vintage=self._retrieve_vintage()
            )
            .dropna()
            # convert via assign instead of ``out.loc[:, "value"] = ...``, which
            # writes in place in newer pandas and would keep the old dtype
            .assign(value=lambda x: pd.to_numeric(x["value"]))
        )

        # Extract category information and add other variable context
        out = self.extract_CMU(out, self.crename)

        cols_to_keep = [
            "vintage",
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]

        return out.loc[:, cols_to_keep]
Exemple #17
0
def case_cmu(**kw):
    """Return a CMU for cumulative case counts of people.

    Keyword arguments are forwarded to ``CMU`` and take precedence over the
    three defaults (``category``, ``measurement``, ``unit``).
    """
    defaults = {"category": "cases", "measurement": "cumulative", "unit": "people"}
    return CMU(**{**defaults, **kw})
Exemple #18
0
    def normalize(self, data):
        """Combine the age, sex and race "Lives Lost" sheets into one frame.

        ``data.content`` is opened as an Excel file; each of the three sheets
        is passed through ``self._wrangle`` and ``self._reshape`` with its own
        column -> CMU mapping, and the results are stacked. Rows stamped
        2020-06-07T00:00:00.1 are dropped because they duplicate the
        2020-06-07T00:00:00.0 rows.

        Returns
        -------
        pd.DataFrame
            The stacked, de-duplicated observations.
        """

        def deaths(**demographics):
            # every entry is a cumulative people count of deaths
            return CMU(
                category="deaths",
                measurement="cumulative",
                unit="people",
                **demographics,
            )

        # retrieve data and convert dataframe structure
        data = pd.ExcelFile(data.content)
        df_age = self._wrangle(data.parse("Lives Lost by Age"))
        df_sex = self._wrangle(data.parse("Lives Lost by Sex"))
        df_race = self._wrangle(data.parse("Lives Lost by Race"))

        # maps for each df
        crename_age = {
            "<19": deaths(age="0-19"),
            "20-29": deaths(age="20-29"),
            "30-39": deaths(age="30-39"),
            "40-49": deaths(age="40-49"),
            "50-59": deaths(age="50-59"),
            "60-69": deaths(age="60-69"),
            "70-79": deaths(age="70-79"),
            "80+": deaths(age="80_plus"),
        }
        crename_sex = {
            "Female": deaths(sex="female"),
            "Male": deaths(sex="male"),
        }
        crename_race = {
            "Asian": deaths(race="asian"),
            "Black/African American": deaths(race="black"),
            "Hispanic/Latinx": deaths(race="all", ethnicity="hispanic"),
            "Non-Hispanic White": deaths(race="white", ethnicity="non-hispanic"),
            "Unknown": deaths(race="unknown"),
            "All": deaths(),
        }

        # rename and add columns according to map
        df_age = self._reshape(df_age, crename_age)
        df_sex = self._reshape(df_sex, crename_sex)
        df_race = self._reshape(df_race, crename_race)

        # combine all into one df; DataFrame.append no longer exists in
        # pandas 2.x, and pd.concat is the supported way to stack frames
        df = pd.concat([df_age, df_sex, df_race], ignore_index=True)

        # we have two dt that are pretty much identical
        # they are:
        #   numpy.datetime64('2020-06-07T00:00:00.100000000')
        #   numpy.datetime64('2020-06-07T00:00:00.000000000')
        # We drop one of them
        bad = df["dt"] == np.datetime64("2020-06-07T00:00:00.100000000")

        return df.loc[~bad, :]
Exemple #19
0
"""Commonly used variables"""

from can_tools.scrapers import CMU

# Cumulative number of people who have initiated vaccination.
# No demographic fields are passed, so whatever defaults CMU defines apply.
INITIATING_VACCINATIONS_ALL = CMU(
    category="total_vaccine_initiated",
    measurement="cumulative",
    unit="people",
)

# Cumulative number of people who have completed vaccination.
FULLY_VACCINATED_ALL = CMU(
    category="total_vaccine_completed",
    measurement="cumulative",
    unit="people",
)

# Cumulative number of vaccine doses administered (unit is doses, not people).
TOTAL_DOSES_ADMINISTERED_ALL = CMU(
    category="total_vaccine_doses_administered",
    measurement="cumulative",
    unit="doses",
)

# Current percentage of people who have initiated vaccination.
PERCENTAGE_PEOPLE_INITIATING_VACCINE = CMU(
    category="total_vaccine_initiated",
    measurement="current",
    unit="percentage",
)

PERCENTAGE_PEOPLE_COMPLETING_VACCINE = CMU(
    category="total_vaccine_completed",
    measurement="current",
Exemple #20
0
    def normalize(self, resjson: dict) -> pd.DataFrame:
        """
        Convert the dashboard query response into a long-form DataFrame.

        Parameters
        ----------
        resjson : dict
            Parsed JSON response. Rows are read from
            ``results[0].result.data.dsr.DS[0].PH[1].DM1`` and the column
            descriptors from ``results[0].result.data.descriptor.Select``.

        Returns
        -------
        pd.DataFrame
            One row per location and vaccine variable, restricted to the
            columns vintage, dt, location_name, category, measurement,
            unit, age, race, ethnicity, sex and value.
        """
        # Locate the pieces of the response that hold the table
        payload = resjson["results"][0]["result"]["data"]
        select_entries = payload["descriptor"]["Select"]
        records = payload["dsr"]["DS"][0]["PH"][1]["DM1"]

        # The first record lists the raw name of every column under "S"
        raw_names = [field["N"] for field in records[0]["S"]]
        # Descriptor entries translate raw column names into readable ones
        readable_names = {
            entry["Value"]: entry["Name"] for entry in select_entries
        }

        # Each record stores its cell values under "C"
        cells = [record["C"] for record in records]

        # Build the wide table, then apply both renames in sequence
        wide = pd.DataFrame.from_records(cells, columns=raw_names)
        wide = wide.rename(columns=readable_names)
        wide = wide.rename(columns={"county": "location_name"})

        # Source column -> variable description (all cumulative counts)
        crename = {
            column: CMU(category=category, measurement="cumulative", unit=unit)
            for column, category, unit in (
                ("doses_initiated", "total_vaccine_initiated", "people"),
                ("doses_completed", "total_vaccine_completed", "people"),
                (
                    "doses_administered",
                    "total_vaccine_doses_administered",
                    "doses",
                ),
            )
        }

        # Wide -> long, one row per (location, variable)
        tidy = wide.melt(id_vars=["location_name"], value_vars=list(crename))

        # Attach the CMU columns, then the observation date and vintage
        tidy = self.extract_CMU(tidy, crename)
        tidy["dt"] = self._retrieve_dt("US/Pacific")
        tidy["vintage"] = self._retrieve_vintage()

        return tidy.loc[
            :,
            [
                "vintage",
                "dt",
                "location_name",
                "category",
                "measurement",
                "unit",
                "age",
                "race",
                "ethnicity",
                "sex",
                "value",
            ],
        ]
Exemple #21
0
    def normalize(self, resjson: dict) -> pd.DataFrame:
        """
        Convert the dashboard query response into long-form county data.

        Parameters
        ----------
        resjson : dict
            Parsed JSON response. Rows are read from
            ``results[0].result.data.dsr.DS[0].PH[1].DM1`` and the column
            descriptors from ``results[0].result.data.descriptor.Select``.

        Returns
        -------
        pd.DataFrame
            One row per county and vaccine variable (rows with missing
            values dropped), restricted to the columns vintage, dt,
            location_name, category, measurement, unit, age, race,
            ethnicity, sex and value.
        """
        # Extract components we care about from json
        payload = resjson["results"][0]["result"]["data"]
        descriptor = payload["descriptor"]["Select"]
        data = payload["dsr"]["DS"][0]["PH"][1]["DM1"]

        # Map raw column identifiers to readable names; the first record
        # lists the raw identifier of every column under "S"
        col_mapping = {x["Value"]: x["Name"] for x in descriptor}
        row_names = [col_mapping[desc["N"]] for desc in data[0]["S"]]

        # Keep only records whose first cell mentions "County"
        data_rows = [
            record["C"] for record in data if "County" in str(record["C"][0])
        ]

        # Dump records into a DataFrame
        df = pd.DataFrame.from_records(data_rows, columns=row_names)

        # Remove the "County, ME" suffix and surrounding whitespace
        df["location_name"] = (
            df["county"].str.replace("County, ME", "").str.strip()
        )

        # Scale by 100 so the values are stored as percentages
        # (source values are presumably fractions -- TODO confirm).
        # Plain column assignment lets the dtype follow the computed values;
        # `.loc[:, col] = ...` writes in place on pandas >= 2.0.
        for col in (
            "total_vaccine_initiated_percent",
            "total_vaccine_completed_percent",
        ):
            df[col] = 100 * df[col]

        # Reshape
        crename = {
            "total_vaccine_administered": CMU(
                category="total_vaccine_doses_administered",
                measurement="cumulative",
                unit="doses",
            ),
            "total_vaccine_initiated": CMU(
                category="total_vaccine_initiated",
                measurement="cumulative",
                unit="people",
            ),
            "total_vaccine_completed": CMU(
                category="total_vaccine_completed",
                measurement="cumulative",
                unit="people",
            ),
            "total_vaccine_initiated_percent": CMU(
                category="total_vaccine_initiated",
                measurement="current",
                unit="percentage",
            ),
            "total_vaccine_completed_percent": CMU(
                category="total_vaccine_completed",
                measurement="current",
                unit="percentage",
            ),
        }
        out = df.melt(
            id_vars=["location_name"], value_vars=list(crename)
        ).dropna()

        # Add CMU, dt, vintage
        out = self.extract_CMU(out, crename)
        out["dt"] = self._retrieve_dt("US/Eastern")
        out["vintage"] = self._retrieve_vintage()

        cols_to_keep = [
            "vintage",
            "dt",
            "location_name",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]

        return out.loc[:, cols_to_keep]
Exemple #22
0
    def normalize(self, data):
        """
        Reshape the ArcGIS response into long-form state-level data.

        Only the row with the latest ``reportdate`` is kept; it is tagged
        with ``self.state_fips`` as its location and melted into one row
        per variable.

        Parameters
        ----------
        data
            Raw ArcGIS JSON payload(s), passed to ``arcgis_jsons_to_df``.

        Returns
        -------
        pd.DataFrame
            Columns vintage, dt, location, category, measurement, unit,
            age, race, ethnicity, sex and value.
        """
        # Source column (lower-cased) -> variable description. This dict is
        # the single source of truth for which columns are kept and melted.
        crename = {
            "totalcases": CMU(
                category="cases", measurement="cumulative", unit="people"
            ),
            "casedelta": CMU(category="cases", measurement="new", unit="people"),
            "negativetests": CMU(
                category="pcr_tests_negative",
                measurement="cumulative",
                unit="specimens",
            ),
            "negdelta": CMU(
                category="pcr_tests_negative", measurement="new", unit="specimens"
            ),
            "bedstotal": CMU(
                category="hospital_beds_in_use_covid",
                measurement="current",
                unit="beds",
            ),
            "bedsicu": CMU(
                category="icu_beds_in_use_covid", measurement="current", unit="beds"
            ),
            "bedsdelta": CMU(
                category="hospital_beds_in_use_covid", measurement="new", unit="beds"
            ),
            "deaths": CMU(category="deaths", measurement="cumulative", unit="people"),
            "deathsdelta": CMU(category="deaths", measurement="new", unit="people"),
            "totaltests": CMU(
                category="pcr_tests_total",
                measurement="cumulative",
                unit="specimens",
            ),
            "testsdelta": CMU(
                category="pcr_tests_total",
                measurement="new",
                unit="specimens",
            ),
            "postestpercent": CMU(
                category="pcr_tests_positive",
                measurement="rolling_average_7_day",
                unit="percentage",
            ),
        }

        df = self.arcgis_jsons_to_df(data)
        df.columns = [x.lower() for x in list(df)]
        df["location"] = self.state_fips

        # Keep only the most recent report
        df = (
            df.sort_values(by=["reportdate"], ascending=True)
            .tail(1)
            .reset_index(drop=True)
        )
        # Raises KeyError early if the source dropped an expected column
        df = df[["location", *crename]]

        out = (
            df.melt(id_vars=["location"], value_vars=list(crename))
            .assign(
                dt=self._retrieve_dt("US/Eastern"), vintage=self._retrieve_vintage()
            )
            .dropna()
        )
        # Replace the column rather than writing through `.loc[:, "value"]`:
        # on pandas >= 2.0 the latter sets in place and can leave the column
        # with its previous (object) dtype.
        out = out.assign(value=pd.to_numeric(out["value"]))

        # Extract category information and add other variable context
        out = self.extract_CMU(out, crename)
        cols_to_keep = [
            "vintage",
            "dt",
            "location",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]
        return out.loc[:, cols_to_keep]
Exemple #23
0
    def normalize(self, resjson: dict) -> pd.DataFrame:
        """
        Convert the dashboard query response into long-form county data.

        Parameters
        ----------
        resjson : dict
            Parsed JSON response. Rows are read from
            ``results[0].result.data.dsr.DS[0].PH[1].DM1`` and the column
            descriptors from ``results[0].result.data.descriptor.Select``.

        Returns
        -------
        pd.DataFrame
            One row per county and vaccine variable with missing values
            dropped, restricted to the columns vintage, dt, location_name,
            category, measurement, unit, age, race, ethnicity, sex and
            value.
        """
        # Locate the table and its column descriptors in the response
        payload = resjson["results"][0]["result"]["data"]
        select_entries = payload["descriptor"]["Select"]
        records = payload["dsr"]["DS"][0]["PH"][1]["DM1"]

        # Raw column identifier -> readable column name
        readable = {entry["Value"]: entry["Name"] for entry in select_entries}

        # The first record lists the identifier of every column under "S";
        # every record stores its cell values under "C"
        header = [readable[field["N"]] for field in records[0]["S"]]
        cells = [record["C"] for record in records]

        table = pd.DataFrame.from_records(cells, columns=header)

        # Title-case the county, drop the word "County", trim whitespace
        counties = table["county"].str.title()
        counties = counties.str.replace("County", "")
        table["location_name"] = counties.str.strip()

        # Discard rows whose location mentions "Unknown"
        table = table.query("~location_name.str.contains('Unknown')")

        # Title-casing breaks a few names; restore the expected spelling
        spelling_fixes = {
            "Lac Qui Parle": "Lac qui Parle",
            "Mcleod": "McLeod",
            "Lake Of The Woods": "Lake of the Woods",
        }
        table = table.replace({"location_name": spelling_fixes})

        # The "<name>_display" columns hold strings; strip the "L"
        # characters and parse them as numbers
        variables = ("total_vaccine_initiated", "total_vaccine_completed")
        for name in variables:
            display_text = table[name + "_display"].str.replace("L", "")
            table[name] = pd.to_numeric(display_text)

        # Both variables are cumulative counts of people
        crename = {
            name: CMU(category=name, measurement="cumulative", unit="people")
            for name in variables
        }

        # Wide -> long
        tidy = table.melt(id_vars=["location_name"], value_vars=list(crename))

        # Attach the CMU columns, then the observation date and vintage
        tidy = self.extract_CMU(tidy, crename)
        tidy["dt"] = self._retrieve_dt("US/Central")
        tidy["vintage"] = self._retrieve_vintage()

        selected = tidy.loc[
            :,
            [
                "vintage",
                "dt",
                "location_name",
                "category",
                "measurement",
                "unit",
                "age",
                "race",
                "ethnicity",
                "sex",
                "value",
            ],
        ]
        return selected.dropna()
Exemple #24
0
    def normalize(self, resjson):
        """
        Reshape the response into long-form vaccine data by demographic.

        The frame returned by ``normalize_preprocess`` stores county,
        coverage and demographic as positional indexes; they are replaced
        by the labels found in the accompanying value dictionaries before
        the data is pivoted, adjusted and melted.

        Parameters
        ----------
        resjson
            Raw response, passed to ``normalize_preprocess``.

        Returns
        -------
        Whatever ``normalize_postprocess`` returns for the melted frame,
        the CMU column names (minus ``self.demographic``) and the
        variable map.
        """
        df, vds = self.normalize_preprocess(resjson)

        # Position -> label lookups for each encoded column
        index_to_label = {
            "county": dict(enumerate(vds["D0"])),
            "coverage": dict(enumerate(vds["D1"])),
            self.demographic: dict(enumerate(vds["D2"])),
        }

        # Only coverage codes below 2 are kept
        covered = df.query("coverage < 2")
        labelled = covered.replace(index_to_label)
        renamed = labelled.rename(
            columns={
                "county": "location_name",
                "coverage": "variable",
                "count": "value",
            }
        )

        # Translate coverage labels to variable names and normalize the
        # demographic labels
        label_cleanup = {
            "variable": {
                "Partially Covered": "total_vaccine_initiated",
                "Fully Covered": "total_vaccine_completed",
            },
            self.demographic: self.value_renamer,
        }
        cleaned = renamed.replace(label_cleanup)

        # One row per (location, demographic), one column per variable
        wide = cleaned.pivot_table(
            index=["location_name", self.demographic],
            columns="variable",
            values="value",
        )
        wide = wide.reset_index()
        wide = self.clean_pa_location_names(wide)

        # The source's "initiated" figure is the partially covered count,
        # not "at least one dose"; add the fully covered count to get that
        wide["total_vaccine_initiated"] = wide.eval(
            "total_vaccine_initiated + total_vaccine_completed"
        )
        tidy = wide.melt(id_vars=["location_name", self.demographic])

        # Both variables are cumulative counts of people
        crename = {
            name: CMU(category=name, measurement="cumulative", unit="people")
            for name in ("total_vaccine_initiated", "total_vaccine_completed")
        }

        # Every CMU column except the demographic this scraper breaks out
        categories = [
            "category",
            "measurement",
            "unit",
            "age",
            "sex",
            "race",
            "ethnicity",
        ]
        categories.remove(self.demographic)

        return self.normalize_postprocess(tidy, categories, crename)
Exemple #25
0
    def normalize(self, resjson):
        """
        Convert the dashboard query response into long-form county data.

        Parameters
        ----------
        resjson : dict
            Parsed JSON response. Records are read from
            ``results[0].result.data.dsr.DS[0].PH[0].DM0`` and the column
            descriptors from ``results[0].result.data.descriptor.Select``.

        Returns
        -------
        pd.DataFrame
            One row per county and vaccine variable, restricted to the
            columns vintage, dt, location_name, category, measurement,
            unit, age, race, ethnicity, sex and value.
        """
        # Extract components we care about from json
        payload = resjson["results"][0]["result"]["data"]
        descriptor = payload["descriptor"]["Select"]
        data = payload["dsr"]["DS"][0]["PH"][0]["DM0"]

        # Raw column identifier -> readable column name
        col_mapping = {x["Value"]: x["Name"] for x in descriptor}

        # For every record, pick the first flattened key that contains each
        # raw identifier and store its value under the readable name
        data_rows = []
        for record in data:
            flat_record = flatten_dict(record)

            row = {}
            for raw_key, name in col_mapping.items():
                match = next(
                    (frk for frk in flat_record.keys() if raw_key in frk), None
                )
                if match is not None:
                    row[name] = flat_record[match]

            data_rows.append(row)

        # Dump records into a DataFrame; incomplete records are dropped
        df = pd.DataFrame.from_records(data_rows).dropna()
        # Copy so the column assignment below works on an independent frame
        df = df.query(
            "location_name != '' & location_name != 'Out-of-State'"
        ).copy()

        # Initiated is not at least one dose for PA -- it is a count of
        # individuals that are currently partially covered by a vaccine
        df["total_vaccine_initiated"] = df.eval(
            "total_vaccine_initiated + total_vaccine_completed"
        )

        # Make sure McKean follows capitalization in db
        df = df.replace({"location_name": {"Mckean": "McKean"}})

        # Reshape
        crename = {
            "total_vaccine_initiated": CMU(
                category="total_vaccine_initiated",
                measurement="cumulative",
                unit="people",
            ),
            "total_vaccine_completed": CMU(
                category="total_vaccine_completed",
                measurement="cumulative",
                unit="people",
            ),
        }
        # Melt only the columns that have a CMU definition, so any extra
        # column delivered by the source is not turned into rows
        out = df.melt(id_vars=["location_name"], value_vars=list(crename))

        # Add CMU, dt, vintage
        out = self.extract_CMU(out, crename)
        out["dt"] = self._retrieve_dt("US/Eastern")
        out["vintage"] = self._retrieve_vintage()

        cols_to_keep = [
            "vintage",
            "dt",
            "location_name",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]

        return out.loc[:, cols_to_keep]
Exemple #26
0
class TXVaccineCountyAge(TexasVaccineParent):
    """
    Texas county-level vaccine data broken out by age group.

    Reads the "By County, Age" sheet and reports doses administered,
    people with at least one dose and people fully vaccinated.
    """

    location_type = "county"
    has_location = False
    # Spreadsheet column -> variable description
    cmus = {
        "Doses Administered": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
        "People Vaccinated with at least One Dose": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "People Fully Vaccinated": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
    }
    # Demographic columns read from the sheet instead of from the CMU
    cmu_id_vars = ["age"]
    sheet_name = "By County, Age"
    # Per-column translation of spreadsheet labels to normalized values
    replacers = {
        "age": {
            "16-49 years": "16-49",
            "50-64 years": "50-64",
            "65-79 years": "65-79",
            "80+ years": "80_plus",
            "Unknown": "unknown",
            "Total": "all",
        },
        "race": {
            "American Indian/Alaskan Native": "ai_an",
            "Asian": "asian",
            "Black": "black",
            "Multiple Races": "multiple",
            "Native Hawaiian/Other Pacific Islander": "pacific_islander",
            "Other": "other",
            "Unknown Race": "unknown",
            "Unknown": "unknown",
            "White": "white",
        },
        "ethnicity": {
            "Hispanic": "hispanic",
            "Not Hispanic": "non-hispanic",
            "Unknown": "unknown",
        },
        "sex": {
            "Female": "female",
            "Male": "male",
            "Unknown": "unknown",
        },
    }

    @property
    def cmu_columns(self):
        """
        CMU columns to fill from ``cmus``: every CMU column that is not
        already supplied by ``cmu_id_vars``.

        The result is built from an ordered list, so the column order is
        the same on every run (a set difference has arbitrary order).
        """
        all_columns = [
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "sex",
            "ethnicity",
        ]
        return [col for col in all_columns if col not in self.cmu_id_vars]

    def normalize(self, data) -> pd.DataFrame:
        """
        Reshape the spreadsheet into long-form county data.

        Parameters
        ----------
        data
            Raw spreadsheet payload, passed to ``excel_to_dataframe``
            together with ``sheet_name``.

        Returns
        -------
        pd.DataFrame
            One row per date, county, demographic value and variable, with
            the CMU columns and vintage attached. Aggregate rows and rows
            without a value are removed.
        """
        # Aggregate / non-county labels that must not be reported as counties
        non_counties = ["*Other", "Total", "Other", "Grand Total"]

        df = (
            self.excel_to_dataframe(data, self.sheet_name)
            # Strip whitespace first so the explicit renames below match
            .rename(columns=str.strip)
            .rename(
                columns={
                    "Age Group": "age",
                    "Race/Ethnicity": "race",
                    "County Name": "location_name",
                }
            )
            .melt(
                id_vars=["dt", "location_name"] + self.cmu_id_vars,
                value_vars=list(self.cmus.keys()),
            )
            .replace(self.replacers)
            .pipe(self.extract_CMU, cmu=self.cmus, columns=self.cmu_columns)
            .assign(vintage=self._retrieve_vintage())
            .loc[lambda x: ~x["location_name"].isin(non_counties), :]
            .dropna(subset=["value"])
        )

        return df
Exemple #27
0
class TexasCasesDeaths(ArcGIS):
    """
    Scraper for cumulative case and death counts of every TX county,
    read from the TX ArcGIS dashboard
    """

    ARCGIS_ID = "ACaLB9ifngzawspq"
    source = (
        "https://txdshs.maps.arcgis.com/apps/opsdashboard/index.html"
        "#/ed483ecd702b4298ab01e8b9cafc8b83"
    )
    state_fips = int(us.states.lookup("Texas").fips)
    has_location = False
    service: str = "DSHS_COVID19_Cases_Service"
    # Dashboard column -> variable description
    crename: Dict[str, CMU] = {
        "Positive": CMU(
            category="cases",
            measurement="cumulative",
            unit="people",
        ),
        "Fatalities": CMU(
            category="deaths",
            measurement="cumulative",
            unit="people",
        ),
    }
    location_type = "county"

    def fetch(self) -> Any:
        """Download every JSON page of the configured ArcGIS service."""
        return self.get_all_jsons(self.service, 0, 5)

    def normalize(self, data: Any) -> pd.DataFrame:
        """
        Reshape the fetched ArcGIS payload into long-form county data

        Parameters
        ----------
        data: Any
            The payload returned by ``fetch``

        Returns
        -------
        df: pd.DataFrame
            pandas DataFrame with one row per county and variable,
            holding cumulative cases and deaths for counties in TX

        """
        # Load the table and expose the county under the standard name
        wide = self.arcgis_jsons_to_df(data)
        wide = wide.rename(columns={"County": "location_name"})
        wide["dt"] = self._retrieve_dt("US/Central")

        # Wide -> long; rows without a value are discarded
        tidy = wide.melt(
            id_vars=["location_name", "dt"],
            value_vars=list(self.crename),
        ).dropna()
        tidy["value"] = tidy["value"].astype(int)
        tidy["vintage"] = self._retrieve_vintage()

        # Attach the CMU columns that describe each variable
        tidy = self.extract_CMU(tidy, self.crename)

        return tidy.loc[
            :,
            [
                "vintage",
                "dt",
                "location_name",
                "category",
                "measurement",
                "unit",
                "age",
                "race",
                "ethnicity",
                "sex",
                "value",
            ],
        ]
Exemple #28
0
    def normalize(self, data):
        """
        Build cumulative totals and weekly snapshots of vaccine data.

        Two frames are produced from the cleaned ArcGIS table and stacked:
        per-county totals of first and second doses (completed through
        ``_populate_cols`` using the newest date in the data), and weekly
        ``new_7_day`` values overall and by sex.

        Parameters
        ----------
        data
            Raw ArcGIS JSON payload(s), passed to ``arcgis_jsons_to_df``.

        Returns
        -------
        pd.DataFrame
            Cumulative rows followed by weekly rows, with a fresh index.
        """
        raw = self.arcgis_jsons_to_df(data)
        raw = raw.fillna(0)
        raw = raw.rename(columns={"County": "location_name"})
        cleaned = self._get_clean_data(raw)
        newest_date = cleaned["dt"].max()

        # ---- cumulative data ----
        melted = cleaned.melt(
            id_vars=["dt", "location_name"], value_vars=self.var_columns
        ).dropna()

        # Sum values by county for first and second dose
        # NOTE: The total number of vaccines administered is obtained with
        #       a plain sum -- this is only valid because the data is
        #       broken out by sex alone for now
        dose_selectors = (
            ('variable.str.contains("dose_1")', "total_vaccine_initiated"),
            ('variable.str.contains("dose_2")', "total_vaccine_completed"),
        )
        per_dose = []
        for selector, category in dose_selectors:
            totals = (
                melted.query(selector)
                .groupby("location_name", as_index=False)["value"]
                .sum()
            )
            totals["category"] = category
            per_dose.append(totals)

        # Stack both doses and fill in the remaining columns
        stacked = pd.concat(per_dose, axis=0, ignore_index=True)
        cumulative_df = self._populate_cols(stacked, newest_date)

        # ---- weekly snapshots ----
        weekly_df = self._get_clean_data(cleaned)
        dose_categories = {
            "dose_1": "total_vaccine_initiated",
            "dose_2": "total_vaccine_completed",
        }

        # Weekly "total" column per dose: male + female + unknown sex
        for dose in dose_categories:
            weekly_df[dose] = (
                weekly_df[dose + "_male"]
                + weekly_df[dose + "_female"]
                + weekly_df[dose + "_sex_unknown"]
            )

        # Totals first, then one entry per dose for each sex breakdown
        crename = {
            dose: CMU(category=category, measurement="new_7_day", unit="people")
            for dose, category in dose_categories.items()
        }
        sex_suffixes = (
            ("male", "male"),
            ("female", "female"),
            ("sex_unknown", "unknown"),
        )
        for suffix, sex in sex_suffixes:
            for dose, category in dose_categories.items():
                crename[dose + "_" + suffix] = CMU(
                    category=category,
                    measurement="new_7_day",
                    unit="people",
                    sex=sex,
                )

        weekly_df = weekly_df.melt(
            id_vars=["location_name", "dt"], value_vars=list(crename)
        ).dropna()
        weekly_df["value"] = weekly_df["value"].astype(int)
        weekly_df["vintage"] = self._retrieve_vintage()

        # Attach the CMU columns; the raw variable name is then redundant
        weekly_df = self.extract_CMU(weekly_df, crename)
        weekly_df = weekly_df.drop(columns=["variable"])

        return pd.concat([cumulative_df, weekly_df], ignore_index=True)
Exemple #29
0
    def normalize(self, data):
        """
        Reshape the extracted tables into long-form county vaccine data.

        Parameters
        ----------
        data
            Iterable of extracted tables; each element exposes its content
            as a DataFrame under ``.df`` and is passed through
            ``_truncate_data``.

        Returns
        -------
        pd.DataFrame
            One row per county and variable, restricted to the columns
            vintage, dt, location_name, category, measurement, unit, age,
            race, ethnicity, sex and value.
        """
        # Read in data, remove extra header cols, rename column names
        df = pd.concat([self._truncate_data(el.df) for el in data])

        # Ignore data from unknown region (no fips code) and the state
        # total, and fix the naming convention for problem counties
        df = df.query(
            "location_name != 'Unknown' &"
            "location_name != 'Out-Of-State' &"
            "location_name != 'Total'"
        )
        df = df.replace({"location_name": {"Desoto": "DeSoto", "Dade": "Miami-Dade"}})

        # Make all columns (except location) numeric. Plain column
        # assignment replaces the column so it takes the numeric dtype;
        # `.loc[:, col] = ...` sets in place on pandas >= 2.0 and can leave
        # the column as object dtype.
        value_columns = [col for col in df.columns if col != "location_name"]
        for col in value_columns:
            df[col] = pd.to_numeric(df[col].str.replace(",", ""))

        # First dose and series complete need to be added together to get
        # the number of people with at least one dose
        df["first_dose_total"] = df.eval("first_dose_total + series_complete_total")

        crename = {
            "first_dose_new": CMU(
                category="total_vaccine_initiated",
                measurement="new",
                unit="people",
            ),
            "series_complete_new": CMU(
                category="total_vaccine_completed",
                measurement="new",
                unit="people",
            ),
            "total_people_vaccinated_new": CMU(
                category="total_vaccine_doses_administered",
                measurement="new",
                unit="doses",
            ),
            "first_dose_total": CMU(
                category="total_vaccine_initiated",
                measurement="cumulative",
                unit="people",
            ),
            "series_complete_total": CMU(
                category="total_vaccine_completed",
                measurement="cumulative",
                unit="people",
            ),
            "total_people_vaccinated_total": CMU(
                category="total_vaccine_doses_administered",
                measurement="cumulative",
                unit="doses",
            ),
        }

        out = df.melt(id_vars=["location_name"], value_vars=list(crename)).dropna()
        out = self.extract_CMU(out, crename)
        out["vintage"] = self._retrieve_vintage()
        out["dt"] = self._get_date()

        cols_to_keep = [
            "vintage",
            "dt",
            "location_name",
            "category",
            "measurement",
            "unit",
            "age",
            "race",
            "ethnicity",
            "sex",
            "value",
        ]
        return out.loc[:, cols_to_keep]
Exemple #30
0
    def normalize(self, data):
        """
        Reshape the "Lives Lost" workbook into one long-form DataFrame.

        The age, sex and race sheets are each wrangled, reshaped with their
        own variable map and then stacked.

        Parameters
        ----------
        data
            Object whose ``.content`` holds the Excel workbook
            (presumably an HTTP response -- confirm against ``fetch``).

        Returns
        -------
        pd.DataFrame
            Age rows, then sex rows, then race rows, with a fresh index.
        """

        def deaths(**demographic):
            # Every variable here is a cumulative count of deaths; only the
            # demographic field differs
            return CMU(
                category="deaths",
                measurement="cumulative",
                unit="people",
                **demographic,
            )

        # Retrieve data and convert dataframe structure; the context manager
        # closes the workbook once all three sheets are parsed
        with pd.ExcelFile(data.content) as workbook:
            df_age = self._wrangle(workbook.parse("Lives Lost by Age"))
            df_sex = self._wrangle(workbook.parse("Lives Lost by Sex"))
            df_race = self._wrangle(workbook.parse("Lives Lost by Race"))

        # Maps for each df: sheet label -> variable description
        age_labels = {
            "<19": "0-19",
            "20-29": "20-29",
            "30-39": "30-39",
            "40-49": "40-49",
            "50-59": "50-59",
            "60-69": "60-69",
            "70-79": "70-79",
            "80+": "80_plus",
        }
        crename_age = {label: deaths(age=age) for label, age in age_labels.items()}

        crename_sex = {
            "Female": deaths(sex="female"),
            "Male": deaths(sex="male"),
        }

        # NOTE(review): "Hispanic/Latinx" is recorded as a race and
        # "Non-Hispanic White" carries no ethnicity -- confirm whether these
        # should be expressed through the ethnicity field instead
        race_labels = {
            "Asian": "asian",
            "Black/African American": "black",
            "Hispanic/Latinx": "hispanic",
            "Non-Hispanic White": "white",
            "Unknown": "unknown",
        }
        crename_race = {
            label: deaths(race=race) for label, race in race_labels.items()
        }
        # The overall total carries no race value
        crename_race["All"] = deaths()

        # Rename and add columns according to map
        df_age = self._reshape(df_age, crename_age)
        df_sex = self._reshape(df_sex, crename_sex)
        df_race = self._reshape(df_race, crename_race)

        # Combine all into one df. pd.concat replaces DataFrame.append,
        # which is deprecated since pandas 1.4 and removed in pandas 2.0
        return pd.concat([df_age, df_sex, df_race], ignore_index=True)