def normalize_named_counties(self, df):
    """Attach a ``county_fips`` column derived from the ``county`` names.

    Works on a copy; rows whose county cannot be resolved (mapped to
    "ERROR" below) are dropped via ``e.remove_errors``.
    """
    df = df.copy()
    # These raw columns are superseded by the normalized FIPS output.
    del df["fipscode"], df["fipscode2"], df["total.votes"]
    county_normalizer = e.usa_county_to_fips(
        "state", alaska_handler=self.alaska_handler)
    # Hand-curated spelling/alias fixes for this dataset; a value of
    # "ERROR" flags rows that should be discarded rather than mapped.
    county_normalizer.rewrite.update({
        "alaska": "ERROR",
        "baltimore": "baltimore county",
        "franklin": "franklin county",
        "richmond": "richmond county",
        "jodaviess": "jo daviess",
        "bedford": "bedford county",
        "fairfax": "fairfax county",
        "roanoke": "roanoke county",
        "jeff davis": "jefferson davis",
        "leflore": "le flore",
        "oglala lakota (formerly shannon)": "oglala lakota",
        "somerset cty townships": "somerset county",
        "cook suburbs": "cook",
        "oxford cty townships": "oxford county",
        "state uocava": "ERROR",
        "aroostook cty townships": "aroostook",
        "franklin cty townships": "franklin county",
        "hancock cty townships": "hancock county",
    })
    county_normalizer.apply_to_df(df,
                                  col_in="county",
                                  col_out="county_fips",
                                  var_name="county_normalizer")
    return e.remove_errors(df, "county_fips")
def puerto_rico(self):
    """Download Census 2010-2019 population estimates for Puerto Rico
    municipios and return them as a DataFrame with a ``FIPS`` column.
    """
    raw = pd.read_excel(
        "https://www2.census.gov/programs-surveys/popest/tables/2010-2019/municipios/totals/prm-est2019-annres.xlsx"
    )
    # Strip the sheet's header (first 4) and footer (last 5) rows.
    raw = np.array(raw)[4:-5]
    counts = raw[:, 1:]
    # First column labels start with a leading "." before
    # "<Municipio>, Puerto Rico"; drop it and split county from state.
    countystate = [label[1:].split(", ") for label in raw[:, 0]]
    cols = ["CTYNAME", "STNAME", "CENSUS2010POP", "ESTIMATESBASE2010"] + [
        f"POPESTIMATE{year}" for year in range(2010, 2020)
    ]
    table = pd.DataFrame(np.concatenate([countystate, counts], axis=1),
                         columns=cols)
    normalizer = e.usa_county_to_fips("STNAME")
    normalizer.apply_to_df(table, "CTYNAME", "FIPS", var_name="normalizer")
    return table
def get_direct(self):
    """Count NYT-tracked colleges per county, keyed by ``county_fips``.

    Downloads the NYT covid colleges CSV, normalizes county names to
    FIPS (dropping unresolvable rows), and aggregates to a per-county
    count of distinct colleges.
    """
    colleges = pd.read_csv(
        "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"
    )
    norm = e.usa_county_to_fips("state")
    # Dataset-specific aliases; "ERROR" flags rows to discard.
    norm.rewrite.update({
        "baltimore": "baltimore city",
        "st. louis": "st. louis city",
        "new york city": "new york",
        "franklin": "franklin city",
        "richmond": "richmond city",
        "fairfax": "fairfax city",
        "roanoke": "roanoke city",
        "st. thomas": "st. thomas island",
        "doña ana": "dona ana",
        "bayam_n": "bayamon",
        "maoputasi": "ERROR",
        "mangilao village": "ERROR",
        "nan": "ERROR",
        "joplin": "jasper",
        "kansas city": "jackson",
        "washington, d.c.": "ERROR",
    })
    norm.apply_to_df(colleges, "county", "county_fips", var_name="norm")
    colleges = e.remove_errors(colleges, "county_fips")
    agg = e.Aggregator(
        grouped_columns=["county_fips"],
        aggregation_functions={"college": lambda x: len(set(x))},
    )
    # Everything except the grouped key and the count is dropped.
    for dropped in ("cases", "cases_2021", "city", "college", "county",
                    "ipeds_id", "notes", "state", "date"):
        agg.removed_columns.append(dropped)
    return agg(colleges, var_name="agg")
def get_direct(self):
    """County-level Census population estimates (2010-2019), with
    Puerto Rico municipios merged in, keyed by ``FIPS``.
    """
    df = e.to_csv(
        e.download(
            "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
        ))
    # COUNTY == 0 rows are state-level totals; keep counties only.
    df = df[df.COUNTY != 0].copy()
    counts = ["CENSUS2010POP", "ESTIMATESBASE2010"] + [
        f"POPESTIMATE{year}" for year in range(2010, 2020)
    ]
    df = df[["STNAME", "CTYNAME", *counts]]
    normalizer = e.usa_county_to_fips("STNAME",
                                      alaska_handler=self.alaska_handler)
    normalizer.rewrite["doña ana county"] = "dona ana county"
    normalizer.apply_to_df(df, "CTYNAME", "FIPS")
    # Rows collapsed to the statewide Alaska pseudo-FIPS get one name,
    # so the aggregation below merges them cleanly.
    df.loc[df.FIPS == "02AL", "CTYNAME"] = "Alaska"
    aggregate = e.Aggregator(
        grouped_columns=["FIPS"],
        aggregation_functions={column: np.sum for column in counts})
    df = aggregate(df)
    return e.merge(
        by_source=dict(mainland=df, pr=self.puerto_rico()),
        join_columns=["FIPS"],
        ignore_duplication={},
        resolvers=[],
    )
def get_direct(self):
    """Merge Harvard Dataverse and MIT Election Lab 2018 general-election
    returns into one county-level frame, cross-validate district totals
    against the congress-district summary, and patch known uncontested
    races.

    Returns the merged county-level DataFrame keyed by
    ``county_fips``/``office``/``district``/``state``/``special``.
    """
    df_harvard = HarvardDataverse2018General(self.alaska_handler).get()
    df_mit = MITElectionLab2018General().get()
    df_mit = df_mit.copy()
    # MIT uses postal codes in "state_po"; align with Harvard's "state".
    df_mit["state"] = df_mit["state_po"]
    df_mit = df_mit[[
        "state",
        "office",
        "district",
        "votes_DEM",
        "votes_GOP",
        "votes_other",
        "county_fips",
        "special",
    ]]
    # Keep only the three office types present in both sources.
    df_mit = df_mit[df_mit.office.apply(
        lambda x: x in {"us house", "us senate", "us state governor"})]
    # "District 0" rows are excluded — presumably at-large placeholders
    # that would double-count; TODO confirm against MIT's codebook.
    df_mit = df_mit[df_mit.district != "District 0"]
    e.district_normalizer().apply_to_df(df_mit, "district", "district")
    df = e.merge(
        by_source={
            "harvard": df_harvard,
            "mit": df_mit
        },
        join_columns=[
            "county_fips", "office", "district", "state", "special"
        ],
        # Third-party totals disagree between sources; average them.
        ignore_duplication={"votes_other": np.mean},
        resolvers=self.resolvers(),
        # Checksum aggregator lets e.merge verify district-level vote
        # sums survive the merge.
        checksum=e.Aggregator(
            grouped_columns=["district", "office", "state", "special"],
            aggregation_functions={
                "votes_DEM": sum,
                "votes_GOP": sum,
                "votes_other": sum,
            },
            removed_columns=["county_fips"],
        ),
    )
    # Roll counties up to districts for validation against the
    # congress-district summary dataset.
    by_district = e.Aggregator(
        grouped_columns=["district", "office", "state", "special"],
        removed_columns=["county_fips"],
        aggregation_functions=dict(votes_other=sum,
                                   votes_DEM=sum,
                                   votes_GOP=sum),
    )(df)
    summary = HarvardDataverseCongressDistrict().get_direct()
    e.validate_same(
        by_district[by_district.office == "us house"],
        summary[(summary.year == 2018) & (summary.office == "us house")],
        key_cols=["state", "district", "special"],
        check_cols=["votes_DEM", "votes_GOP"],
        # Districts known to be absent from one side or the other;
        # first list = missing on the left, second = on the right
        # (presumably — verify against e.validate_same's signature).
        ignore_missing=(
            [
                ("FL", 10, False),
                ("FL", 14, False),
                ("FL", 21, False),
                ("FL", 24, False),
            ],
            [("NY", 25, True)],
        ),
        # Maine uses ranked choice; totals legitimately differ.
        ignore_discrepancies=lambda k: k[0] == "ME",
    )
    e.validate_same(
        by_district[by_district.office == "us senate"],
        summary[(summary.year == 2018) & (summary.office == "us senate")],
        key_cols=["state", "district", "special"],
        check_cols=["votes_DEM", "votes_GOP"],
        ignore_discrepancies=lambda k: k[0] == "ME",
    )
    # Impute results for FL counties with uncontested house races using
    # other offices' returns, per the configured replacement mode.
    df = e.handle_uncontested(
        df,
        missing_counties=[(e.usa_county_to_fips("state")(
            county, dict(state="FL")), party) for county, party in [
                ("Hillsborough", "DEM"),
                ("Miami-Dade", "DEM"),
                ("Broward", "DEM"),
                ("Orange", "DEM"),
            ]],
        missing_office="us house",
        replacement_offices=self.uncontested_replacements,
        fix_cols=["votes_DEM", "votes_GOP", "votes_other"],
        replacement_mode=self.uncontested_replacement_mode,
    )
    return df