def geo_reindex(self, data):
        """
        Reindex dataframe based on desired output geography.

        Args:
            data: dataframe, the output of load_data::load_data()

        Returns:
            reindexed dataframe

        """
        geo_map = GeoMapper()
        if self.geo == "county":
            data_frame = geo_map.fips_to_megacounty(
                data,
                Config.MIN_DEN,
                Config.MAX_BACKWARDS_PAD_LENGTH,
                thr_col="den",
                mega_col=self.geo)
        elif self.geo == "state":
            data_frame = geo_map.replace_geocode(data,
                                                 from_code="fips",
                                                 new_col=self.geo,
                                                 new_code="state_id")
            data_frame[self.geo] = data_frame[self.geo]
        elif self.geo in ["msa", "hhs", "nation"]:
            data_frame = geo_map.replace_geocode(data,
                                                 from_code="fips",
                                                 new_code=self.geo)
        elif self.geo == "hrr":
            data_frame = data  # data is already adjusted in aggregation step above
        else:
            logging.error(
                "%s is invalid, pick one of 'county', 'state', 'msa', 'hrr', 'hhs', nation'",
                self.geo)
            return False

        unique_geo_ids = pd.unique(data_frame[self.geo])
        data_frame.set_index([self.geo, 'date'], inplace=True)

        # for each location, fill in all missing dates with 0 values
        multiindex = pd.MultiIndex.from_product(
            (unique_geo_ids, self.fit_dates), names=[self.geo, "date"])
        assert (
            len(multiindex) <=
            (GeoConstants.MAX_GEO[self.geo] * len(self.fit_dates))
        ), "more loc-date pairs than maximum number of geographies x number of dates"
        # fill dataframe with missing dates using 0
        data_frame = data_frame.reindex(multiindex, fill_value=0)
        data_frame.fillna(0, inplace=True)
        return data_frame
Beispiel #2
0
    def geo_reindex(self, data):
        """Reindex based on geography, include all date, geo pairs.

        Args:
            data: dataframe, the output of loadcombineddata
        Returns:
            dataframe
        """
        # get right geography
        geo = self.geo
        gmpr = GeoMapper()
        if geo not in {"county", "state", "msa", "hrr", "nation", "hhs"}:
            logging.error("{0} is invalid, pick one of 'county', "
                          "'state', 'msa', 'hrr', 'hss','nation'".format(geo))
            return False
        if geo == "county":
            data_frame = gmpr.fips_to_megacounty(data,
                                                 Config.MIN_DEN,
                                                 Config.MAX_BACKFILL_WINDOW,
                                                 thr_col="den",
                                                 mega_col=geo)
        elif geo == "state":
            data_frame = gmpr.replace_geocode(data,
                                              "fips",
                                              "state_id",
                                              new_col="state")
        else:
            data_frame = gmpr.replace_geocode(data, "fips", geo)

        unique_geo_ids = pd.unique(data_frame[geo])
        data_frame.set_index([geo, Config.DATE_COL], inplace=True)
        # for each location, fill in all missing dates with 0 values
        multiindex = pd.MultiIndex.from_product(
            (unique_geo_ids, self.fit_dates), names=[geo, Config.DATE_COL])
        assert (
            len(multiindex) <=
            (len(gmpr.get_geo_values(gmpr.as_mapper_name(geo))) *
             len(self.fit_dates))
        ), "more loc-date pairs than maximum number of geographies x number of dates"
        # fill dataframe with missing dates using 0
        data_frame = data_frame.reindex(multiindex, fill_value=0)
        data_frame.fillna(0, inplace=True)
        return data_frame