Example #1
    def normalize_named_counties(self, df):
        df = df.copy()
        # These columns are superseded by the normalized county_fips computed below.
        del df["fipscode"], df["fipscode2"], df["total.votes"]
        county_normalizer = e.usa_county_to_fips(
            "state", alaska_handler=self.alaska_handler)
        # Manual rewrites for names the normalizer cannot resolve on its own;
        # "ERROR" flags rows that e.remove_errors drops below.
        county_normalizer.rewrite["alaska"] = "ERROR"
        county_normalizer.rewrite["baltimore"] = "baltimore county"
        county_normalizer.rewrite["franklin"] = "franklin county"
        county_normalizer.rewrite["richmond"] = "richmond county"
        county_normalizer.rewrite["jodaviess"] = "jo daviess"
        county_normalizer.rewrite["bedford"] = "bedford county"
        county_normalizer.rewrite["fairfax"] = "fairfax county"
        county_normalizer.rewrite["roanoke"] = "roanoke county"
        county_normalizer.rewrite["jeff davis"] = "jefferson davis"
        county_normalizer.rewrite["leflore"] = "le flore"
        county_normalizer.rewrite[
            "oglala lakota (formerly shannon)"] = "oglala lakota"
        county_normalizer.rewrite["somerset cty townships"] = "somerset county"
        county_normalizer.rewrite["cook suburbs"] = "cook"
        county_normalizer.rewrite["oxford cty townships"] = "oxford county"
        county_normalizer.rewrite["state uocava"] = "ERROR"
        county_normalizer.rewrite["aroostook cty townships"] = "aroostook"
        county_normalizer.rewrite["franklin cty townships"] = "franklin county"
        county_normalizer.rewrite["hancock cty townships"] = "hancock county"
        county_normalizer.apply_to_df(df,
                                      col_in="county",
                                      col_out="county_fips",
                                      var_name="county_normalizer")
        df = e.remove_errors(df, "county_fips")

        return df
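The normalize-then-apply pattern above recurs in every example on this page, so here is a minimal standalone sketch of it, assuming `e` is the electiondata package imported as `e`; the rows and the "de kalb" rewrite are made up for illustration.

import pandas as pd
import electiondata as e

df = pd.DataFrame({"state": ["GA", "GA"], "county": ["fulton", "de kalb"]})
norm = e.usa_county_to_fips("state")  # resolves county names within each state
norm.rewrite["de kalb"] = "dekalb"    # hypothetical manual fix, as above
norm.apply_to_df(df, col_in="county", col_out="county_fips", var_name="norm")
print(df)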
Example #2
    def puerto_rico(self):
        df = pd.read_excel(
            "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/municipios/totals/prm-est2019-annres.xlsx"
        )
        # Trim the workbook's header and footer rows, then separate the count
        # columns from the geography column.
        df = np.array(df)
        df = df[4:-5]
        counts = df[:, 1:]
        countystate = [x[1:].split(", ") for x in df[:, 0]]
        cols = [
            "CTYNAME",
            "STNAME",
            "CENSUS2010POP",
            "ESTIMATESBASE2010",
            "POPESTIMATE2010",
            "POPESTIMATE2011",
            "POPESTIMATE2012",
            "POPESTIMATE2013",
            "POPESTIMATE2014",
            "POPESTIMATE2015",
            "POPESTIMATE2016",
            "POPESTIMATE2017",
            "POPESTIMATE2018",
            "POPESTIMATE2019",
        ]
        df = pd.DataFrame(np.concatenate([countystate, counts], axis=1),
                          columns=cols)
        normalizer = e.usa_county_to_fips("STNAME")
        normalizer.apply_to_df(df, "CTYNAME", "FIPS", var_name="normalizer")
        return df
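For context on the comprehension above: in this census workbook the geography column entries carry a leading period, e.g. ".Adjuntas Municipio, Puerto Rico", which is why the code strips the first character and splits on ", ". A toy illustration of that split (the sample string mirrors the workbook's format, nothing is fetched here):

row = ".Adjuntas Municipio, Puerto Rico"
ctyname, stname = row[1:].split(", ")
# ctyname == "Adjuntas Municipio", stname == "Puerto Rico"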
Example #3
    def get_direct(self):
        colleges = pd.read_csv(
            "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"
        )
        norm = e.usa_county_to_fips("state")
        # Manual rewrites for ambiguous names and non-county rows; "ERROR"
        # entries are dropped by e.remove_errors below.
        norm.rewrite["baltimore"] = "baltimore city"
        norm.rewrite["st. louis"] = "st. louis city"
        norm.rewrite["new york city"] = "new york"
        norm.rewrite["franklin"] = "franklin city"
        norm.rewrite["richmond"] = "richmond city"
        norm.rewrite["fairfax"] = "fairfax city"
        norm.rewrite["roanoke"] = "roanoke city"
        norm.rewrite["st. thomas"] = "st. thomas island"
        norm.rewrite["doña ana"] = "dona ana"
        norm.rewrite["bayam_n"] = "bayamon"
        norm.rewrite["maoputasi"] = "ERROR"
        norm.rewrite["mangilao village"] = "ERROR"
        norm.rewrite["nan"] = "ERROR"
        norm.rewrite["joplin"] = "jasper"
        norm.rewrite["kansas city"] = "jackson"
        norm.rewrite["washington, d.c."] = "ERROR"
        norm.apply_to_df(colleges, "county", "county_fips", var_name="norm")

        colleges = e.remove_errors(colleges, "county_fips")
        # Count the number of distinct colleges per county.
        agg = e.Aggregator(
            grouped_columns=["county_fips"],
            aggregation_functions={"college": lambda x: len(set(x))},
        )

        agg.removed_columns.append("cases")
        agg.removed_columns.append("cases_2021")
        agg.removed_columns.append("city")
        agg.removed_columns.append("college")
        agg.removed_columns.append("county")
        agg.removed_columns.append("ipeds_id")
        agg.removed_columns.append("notes")
        agg.removed_columns.append("state")
        agg.removed_columns.append("date")

        return agg(colleges, var_name="agg")
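A minimal sketch of the Aggregator call above, assuming the same electiondata API; the rows are made up. Presumably every column must be accounted for by grouped_columns, aggregation_functions, or removed_columns, which is what the long removed_columns list above handles.

import pandas as pd
import electiondata as e

colleges = pd.DataFrame({
    "county_fips": ["12057", "12057", "12086"],
    "college": ["USF", "UT", "FIU"],
})
agg = e.Aggregator(
    grouped_columns=["county_fips"],
    aggregation_functions={"college": lambda x: len(set(x))},
)
per_county = agg(colleges, var_name="agg")  # 12057 -> 2, 12086 -> 1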
Example #4
    def get_direct(self):
        df = e.to_csv(
            e.download(
                "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
            ))
        # COUNTY == 0 rows are state-level totals; keep only actual counties.
        df = df[df.COUNTY != 0].copy()
        counts = [
            "CENSUS2010POP",
            "ESTIMATESBASE2010",
            "POPESTIMATE2010",
            "POPESTIMATE2011",
            "POPESTIMATE2012",
            "POPESTIMATE2013",
            "POPESTIMATE2014",
            "POPESTIMATE2015",
            "POPESTIMATE2016",
            "POPESTIMATE2017",
            "POPESTIMATE2018",
            "POPESTIMATE2019",
        ]
        df = df[["STNAME", "CTYNAME", *counts]]
        normalizer = e.usa_county_to_fips("STNAME",
                                          alaska_handler=self.alaska_handler)
        normalizer.rewrite["doña ana county"] = "dona ana county"
        normalizer.apply_to_df(df, "CTYNAME", "FIPS")
        # "02AL" is the statewide pseudo-FIPS assigned by the alaska_handler.
        df.loc[df.FIPS == "02AL", "CTYNAME"] = "Alaska"
        df = e.Aggregator(grouped_columns=["FIPS"],
                          aggregation_functions={c: np.sum
                                                 for c in counts})(df)
        df = e.merge(
            by_source=dict(mainland=df, pr=self.puerto_rico()),
            join_columns=["FIPS"],
            ignore_duplication={},
            resolvers=[],
        )
        return df
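A sketch of the by-source merge above, under the assumption (consistent with how it is used here) that sources with disjoint FIPS keys are simply combined into one table; the population figures are toy values, not real estimates.

import pandas as pd
import electiondata as e

mainland = pd.DataFrame({"FIPS": ["01001"], "POPESTIMATE2019": [100]})
pr = pd.DataFrame({"FIPS": ["72001"], "POPESTIMATE2019": [200]})
combined = e.merge(
    by_source=dict(mainland=mainland, pr=pr),
    join_columns=["FIPS"],
    ignore_duplication={},
    resolvers=[],
)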
Example #5
    def get_direct(self):
        df_harvard = HarvardDataverse2018General(self.alaska_handler).get()

        df_mit = MITElectionLab2018General().get()
        df_mit = df_mit.copy()

        df_mit["state"] = df_mit["state_po"]

        df_mit = df_mit[[
            "state",
            "office",
            "district",
            "votes_DEM",
            "votes_GOP",
            "votes_other",
            "county_fips",
            "special",
        ]]

        # Keep only the offices covered by this source.
        df_mit = df_mit[df_mit.office.isin(
            {"us house", "us senate", "us state governor"})]
        df_mit = df_mit[df_mit.district != "District 0"]

        e.district_normalizer().apply_to_df(df_mit, "district", "district")

        df = e.merge(
            by_source={
                "harvard": df_harvard,
                "mit": df_mit
            },
            join_columns=[
                "county_fips", "office", "district", "state", "special"
            ],
            ignore_duplication={"votes_other": np.mean},
            resolvers=self.resolvers(),
            checksum=e.Aggregator(
                grouped_columns=["district", "office", "state", "special"],
                aggregation_functions={
                    "votes_DEM": sum,
                    "votes_GOP": sum,
                    "votes_other": sum,
                },
                removed_columns=["county_fips"],
            ),
        )
        by_district = e.Aggregator(
            grouped_columns=["district", "office", "state", "special"],
            removed_columns=["county_fips"],
            aggregation_functions=dict(votes_other=sum,
                                       votes_DEM=sum,
                                       votes_GOP=sum),
        )(df)
        summary = HarvardDataverseCongressDistrict().get_direct()
        # Cross-check the county-level merge against the district-level
        # summary, tolerating the explicitly listed missing districts and any
        # Maine discrepancies.
        e.validate_same(
            by_district[by_district.office == "us house"],
            summary[(summary.year == 2018) & (summary.office == "us house")],
            key_cols=["state", "district", "special"],
            check_cols=["votes_DEM", "votes_GOP"],
            ignore_missing=(
                [
                    ("FL", 10, False),
                    ("FL", 14, False),
                    ("FL", 21, False),
                    ("FL", 24, False),
                ],
                [("NY", 25, True)],
            ),
            ignore_discrepancies=lambda k: k[0] == "ME",
        )
        e.validate_same(
            by_district[by_district.office == "us senate"],
            summary[(summary.year == 2018) & (summary.office == "us senate")],
            key_cols=["state", "district", "special"],
            check_cols=["votes_DEM", "votes_GOP"],
            ignore_discrepancies=lambda k: k[0] == "ME",
        )

        # Impute results for uncontested races from the listed replacement
        # offices, including a handful of Florida counties missing entirely.
        df = e.handle_uncontested(
            df,
            missing_counties=[(e.usa_county_to_fips("state")(county,
                                                             dict(state="FL")),
                               party) for county, party in [
                                   ("Hillsborough", "DEM"),
                                   ("Miami-Dade", "DEM"),
                                   ("Broward", "DEM"),
                                   ("Orange", "DEM"),
                               ]],
            missing_office="us house",
            replacement_offices=self.uncontested_replacements,
            fix_cols=["votes_DEM", "votes_GOP", "votes_other"],
            replacement_mode=self.uncontested_replacement_mode,
        )

        return df
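One detail worth pulling out of the handle_uncontested call above: the normalizer object is itself callable on a single name, given a context dict supplying the state column it was constructed around. A sketch, assuming the same electiondata API:

import electiondata as e

to_fips = e.usa_county_to_fips("state")
fips = to_fips("Hillsborough", dict(state="FL"))  # presumably "12057"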