def test_int_wlog():
    """Check that int_wlog parses integer strings and rejects malformed ones."""
    # caplog.set_level(logging.INFO)
    # (input text, expected integer) pairs; the second has trailing whitespace.
    good_cases = (("2", 2), ("2003 ", 2003))
    for text, expected in good_cases:
        assert das_utils.int_wlog(text, "var") == expected

    # A letter inside the digits must surface as a ValueError.
    with pytest.raises(ValueError) as err:
        das_utils.int_wlog("2c03 ", "var")
    assert "invalid literal" in str(err.value)
def elderly_recode(self, row: dict):
    """Return the elderly level of a row.

    The flag columns named by self.p75, self.p65 and self.p60 are examined in
    that order; the first one whose value parses to 1 decides the result
    (3, 2 and 1 respectively). If none of them is 1 the result is 0.
    """
    # Attribute names are resolved lazily so that later columns are only
    # touched when the earlier flags were not set.
    for level, attr_name in ((3, "p75"), (2, "p65"), (1, "p60")):
        column = getattr(self, attr_name)
        if int_wlog(row[column], column) == 1:
            return level
    return 0
def map_to_hhgq(gqtype):
    """
    Collapse a GQTYPE code into the single-digit HHGQ value.

    args:
        gqtype - 3 char str encoding gqtype; an all-blank string stands for a housing unit
    returns:
        hhgq for 2018 tab of P42: 0 for a blank code, the code floor-divided by 100
        when it is below 700, and 7 for every code of 700 or more.
    raises:
        AssertionError if gqtype is not a str of length 3.
    """
    assert isinstance(gqtype, str), "GQTYPE is not a str"
    assert len(gqtype) == 3, "GQTYPE is not a str of length 3"

    # Empty/NULL fields stand for Housing Unit, not GQ
    if not gqtype.strip():
        return 0

    code = int_wlog(gqtype, "GQTYPE")

    # For the 2018 End_to_end, not all GQ codes are used, and those that are used
    # are collapsed to 101, 201, 301 etc. and mapped to a single digit within the DAS:
    #   101-106 -> 1, 201-203 -> 2, 301 -> 3, 401-405 -> 4, 501-502 -> 5, 601-602 -> 6
    # 700s, 800s, 900s are all mapped to 7.
    return 7 if code >= 700 else code // 100
def grfc_map_to_oidtb_geocode(row):
    """
    Pair the block identifier of a GRFC row with its concatenated geocode.

    :param row: Spark Row
    :return: tuple (OIDTABBLK parsed by int_wlog, geocode), where geocode is
        TABBLKST + TABBLKCOU + TABTRACTCE + TABBLKGRPCE + TABBLK
    """
    # Build the geocode left to right, from the widest to the narrowest geography.
    state_county = row.TABBLKST + row.TABBLKCOU
    through_tract = state_county + row.TABTRACTCE
    through_group = through_tract + row.TABBLKGRPCE
    geocode = through_group + row.TABBLK

    oidtb = int_wlog(row.OIDTABBLK, "OIDTABBLK")
    return (oidtb, geocode)
    # new test data should work as expected.
def cvap_race_recode(self, row: dict):
    """Recode CENHISP/CENRACE of a row into the CVAPRace attribute level.

    Returns 0 for Hispanic rows. Otherwise CENRACE 1-9 map to themselves,
    CENRACE 12 maps to 10 and every other valid CENRACE maps to 11.

    Raises ValueError when CENHISP is not "1" or "2", or when CENRACE is
    outside 1-63.
    """
    not_hispanic, hispanic = "1", "2"
    allowed = [hispanic, not_hispanic]

    hisp_code = row[self.cenhisp]
    if hisp_code not in allowed:
        raise ValueError(f"CENHISP not in {allowed}")

    # Hispanic is level 0 of the attribute regardless of race.
    if hisp_code == hispanic:
        return 0

    race_code = int_wlog(row[self.cenrace], "CENRACE")
    if race_code < 1 or race_code > 63:
        raise ValueError("CENRACE not in 1-63")

    # 1-6 in MDF/CEF are the single-race categories; 7-9 are the three
    # combinations with white, in the same order as in the CVAPRace attribute.
    # Histogram indices normally start at 0 (code - 1), but the Hispanic level
    # occupying index 0 adds the 1 back, so the code is returned unchanged.
    if race_code <= 9:
        return race_code

    # 10-11 (white+NHOPI, white+SOR) have no level of their own in CVAPRace.
    # 12 is Black and AIAN; everything else is the remainder of 2+ races.
    return 10 if race_code == 12 else 11
def unit_keyed_by_mafid(cef_unit):
    """
    Turn a CEF unit record into a (MAFID, attributes) pair.

    args:
        cef_unit - CEF_UNIT obj, spec TBD but assuming MAFID, TEN, QGQTYP, OIDTB attr.
    returns:
        tuple (maf, (ten, gqtype, geo)) keyed by MAFID, where ten is 0 when TEN is 0
        and 1 otherwise, gqtype is map_to_hhgq(QGQTYP) and geo is OIDTB.
    raises:
        ValueError if TEN is outside [0,4]; the message is also logged as an error.
    """
    maf = int_wlog(cef_unit.MAFID, "MAFID")

    tenure = int_wlog(cef_unit.TEN, "TEN")
    if not 0 <= tenure <= 4:
        err_msg = f"TEN is outside legal values range [0,4] ({cef_unit.TEN})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    # Only "Unoccupied"(0)/"Occupied"(1) are used for the tenure variable,
    # so every non-zero TEN collapses to 1.
    occupied = 0 if tenure == 0 else 1

    hhgq = map_to_hhgq(cef_unit.QGQTYP)
    return (maf, (occupied, hhgq, cef_unit.OIDTB))
def multi_recode(self, row: dict):
    """Recode the column named by self.multi into a binary value.

    Values 0 and 1 become 0, value 2 becomes 1; anything else raises
    DASValueError carrying the offending value.
    """
    code = int_wlog(row[self.multi], self.multi)
    if code == 2:
        return 1
    if code in (0, 1):
        return 0
    raise DASValueError(
        f"MULTG variable values {code} is out of specification", code)
def recode(self, row):
    """
    Append the DAS race code to a CVAP row as the field race_cvap_das.

    Coding in the CVAP file is as follows
         1 = non-Hispanic AIAN
         2 = non-Hispanic Asian
         3 = non-Hispanic Black
         4 = non-Hispanic NHOPI
         5 = non-Hispanic White
         6 = non-Hispanic Some Other Race
         7 = non-Hispanic AIAN & White
         8 = non-Hispanic Asian & White
         9 = non-Hispanic Black & White
        10 = non-Hispanic AIAN & Black
        11 = non-Hispanic Remainder of Two or More Races
        12 = Hispanic or Latino

    Convert them to the ones in the DAS schema attribute
    programs.schema.attributes.race_cvap.CVAPRace:
        "Hispanic or Latino": [0],
        "White alone": [1],
        "Black or African American alone": [2],
        "American Indian and Alaska Native alone": [3],
        "Asian alone": [4],
        "Native Hawaiian and Other Pacific Islander alone": [5],
        "Some Other Race alone": [6],
        "Black or African American and White": [7],
        "American Indian and Alaska Native and White": [8],
        "Asian and White": [9],
        "American Indian and Alaska Native and Black or African American": [10],
        "Remainder of Two or More Race Responses": [11],

    Returns a new Row holding every field of the input row plus race_cvap_das.
    Raises ValueError when the file code is outside 1-12.
    """
    file_code = int_wlog(row[self.cvap_race], "cvap_race")
    if file_code < 1 or file_code > 12:
        raise ValueError("CVAP_RACE is not in 1-12")

    # CVAP file code -> DAS CVAPRace level
    file_to_das = {
        5: 1,    # White
        3: 2,    # Black
        1: 3,    # AIAN
        2: 4,    # Asian
        4: 5,    # NHOPI
        6: 6,    # SOR
        9: 7,    # Black + White
        7: 8,    # AIAN + White
        8: 9,    # Asian + White
        10: 10,  # AIAN + Black
        11: 11,  # Remainder of Two or More Races
        12: 0,   # Hispanic
    }
    das_code = file_to_das[file_code]
    return Row(**row.asDict(), race_cvap_das=das_code)
def unit_keyed_by_joinkey(cef_unit):
    """
    Turn a unit record into a (joinkey, attributes) pair.

    args:
        cef_unit - sql row that includes serial, statefip, county, enumdist, gqtype, gq
    returns:
        tuple (joinkey, (ten, gqtype, geo)) keyed by the parsed serial, where ten is 0
        when gq is 0 and 1 otherwise, gqtype is map_to_hhgq(gqtype) and geo is
        statefip + county + enumdist.
    raises:
        ValueError if gq is outside [0,6]; the message is also logged as an error.
    """
    # TODO: Change to 1940 spec
    # TODO: check if GQ could be vacant in 1940.
    # use numprec for vacant check? or use GQ.
    joinkey = int_wlog(cef_unit.serial, "serial")

    gq_code = int_wlog(cef_unit.gq, "gq")
    if not 0 <= gq_code <= 6:
        err_msg = f"TEN is outside legal values range [0,6] ({cef_unit.gq})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    # Only "Unoccupied"(0)/"Occupied"(1) are kept: any non-zero gq becomes 1.
    occupied = 0 if gq_code == 0 else 1

    hhgq = map_to_hhgq(cef_unit.gqtype)
    state_county = cef_unit.statefip + cef_unit.county
    geo = state_county + cef_unit.enumdist
    return (joinkey, (occupied, hhgq, geo))
def per_keyed_by_joinkey(cef_per):
    """
    Turn a person record into a (joinkey, attributes) pair.

    args:
        cef_per - sql row that includes serial, age, hispan, race.
    returns:
        tuple (joinkey, (age, hisp, race)) keyed by the parsed serial, where
        age is 1 when age >= 18 and 0 otherwise, hisp is 1 when hispan > 0 and
        0 otherwise, and race is the race code minus one.
    raises:
        ValueError if hispan is negative or race is outside [1,6]; the message
        is also logged as an error.
    """
    joinkey = int_wlog(cef_per.serial, "serial")
    age = int(int_wlog(cef_per.age, "age") >= 18)

    # Validate the raw code *before* collapsing it to a 0/1 flag: once it has
    # been reduced to a boolean a range check can never fail, so a negative
    # code would silently be treated as non-Hispanic.
    hispan_code = int_wlog(cef_per.hispan, "hispan")
    if hispan_code < 0:
        err_msg = f"hispan is invalid ({cef_per.hispan})"
        logging.error(err_msg)
        raise ValueError(err_msg)
    hisp = int(hispan_code > 0)

    # Race is shifted to a zero-based index.
    race = int_wlog(cef_per.race, "race") - 1
    if race > 5 or race < 0:
        err_msg = f"race is outside [1,6] range ({cef_per.race})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    return (joinkey, (age, hisp, race))
def per_keyed_by_mafid(cef_per):
    """
    Turn a CEF person record into a (MAFID, attributes) pair.

    args:
        cef_per - CEF_PER obj, spec TBD but assuming MAFID, QAGE, CENHISP, CENRACE attr.
    returns:
        tuple (maf, (age, hisp, race)) keyed by MAFID, where age is 1 when
        QAGE >= 18 and 0 otherwise, hisp is CENHISP - 1 and race is CENRACE - 1.
    raises:
        ValueError if CENHISP is neither 1 nor 2, or CENRACE is outside [1,63];
        the message is also logged as an error.
    """
    maf = int_wlog(cef_per.MAFID, "MAFID")
    age = int(int_wlog(cef_per.QAGE, "QAGE") >= 18)

    # Both codes are shifted to zero-based indices.
    hisp = int_wlog(cef_per.CENHISP, "CENHISP") - 1
    if hisp > 1 or hisp < 0:
        err_msg = f"CENHISP is neither 1, nor 2 ({cef_per.CENHISP})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    race = int_wlog(cef_per.CENRACE, "CENRACE") - 1
    if race > 62 or race < 0:
        # Report the offending CENRACE value (not CENHISP) in the message.
        err_msg = f"CENRACE is outside [1,63] range ({cef_per.CENRACE})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    return (maf, (age, hisp, race))
def map_to_hhgq(gqtype):
    """
    Renumber a gqtype code so that the codes in use become contiguous.

    args:
        gqtype - str holding the integer gqtype code
    returns:
        the parsed code, shifted down by one for 2-4 and by two for 6-9;
        any other code is returned unchanged.
    raises:
        AssertionError if gqtype is not a str.
    """
    assert isinstance(gqtype, str), "GQTYPE is not a str"
    code = int_wlog(gqtype, "gqtype")

    # For the 1940 data, gqtypes 1,5 do not appear. Only 0,2,3,4,6,7,8,9.
    # Values are shifted down to 0-7.
    if code in (2, 3, 4):
        return code - 1
    if code in (6, 7, 8, 9):
        return code - 2
    return code
def to_per_line(record):
    """
    Format a person record as one pipe-delimited "MPD1940" output line.

    args:
        record - 7-element sequence (geocode, euid, epnum, hhgq, va, hisp, race);
                 geocode is a 10-character str, hhgq and va are integers,
                 hisp is a zero-based index in [0,1] and race one in [0,5].
    returns:
        str with the schema fields joined by "|".
    raises:
        AssertionError if the record length, the geocode or the hhgq/va types are wrong.
        ValueError if hisp or race is out of range; the message is also logged.
    """
    # Check that the record has appropriate number of fields
    exp_len = 7
    act_len = len(record)
    assert act_len == exp_len, f"Record {record} has {act_len} instead of {exp_len} fields"

    geocode, euid, epnum, hhgq, va, hisp, race = record

    SCHEMA_TYPE_CODE = "MPD1940"
    SCHEMA_BUILD_ID = "3.1.4"

    # Check that geocode length is 10 - length for 1940.
    geocode_len = 10
    assert isinstance(geocode, str), f"Geocode ({geocode}) in record ({record}) is not in str format"
    assert len(geocode) == geocode_len, (
        f"Geocode of record {record} has length {len(geocode)} instead of {geocode_len}"
    )

    STATE = geocode[:2]
    COUNTY = geocode[2:6]
    ENUMDIST = geocode[6:10]

    EUID = int_wlog(euid, "EUID")
    # EPNUM is written out one higher than the incoming value.
    EPNUM = int_wlog(epnum, "EPNUM") + 1

    t = type(hhgq)
    assert np.issubdtype(t, int), \
        f"HHGQ ({hhgq}) in record ({record}) is not in integer format has type {t}"
    t = type(va)
    assert np.issubdtype(t, int), \
        f"VAGE ({va}) in record ({record}) is not in integer format has type {t}"

    RTYPE = "3" if hhgq == 0 else "5"
    QREL = "99"
    QSEX = "9"
    QAGE = "17" if va == 0 else "18"

    # Parse once, then validate the parsed integers so the range checks and
    # the emitted codes are guaranteed to refer to the same values.
    hisp_index = int_wlog(hisp, "CENHISP")
    race_index = int_wlog(race, "CENRACE")
    CENHISP = str(hisp_index + 1)
    CENRACE = str(race_index + 1)

    if hisp_index > 1 or hisp_index < 0:
        err_msg = f"CENHISP is neither 1, nor 2 ({CENHISP})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    if race_index > 5 or race_index < 0:
        # Report the offending CENRACE value (not CENHISP) in the message.
        err_msg = f"CENRACE is outside [1,6] range ({CENRACE})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    # Fields that are always emitted with fixed filler values.
    QSPANX = "9999"
    QRACES = ["9999"] * 8  # QRACE1 .. QRACE8
    CIT = "9"

    line = [
        SCHEMA_TYPE_CODE, SCHEMA_BUILD_ID, STATE, COUNTY, ENUMDIST, EUID,
        EPNUM, RTYPE, QREL, QSEX, QAGE, CENHISP, CENRACE, QSPANX,
        *QRACES, CIT,
    ]
    return "|".join(str(x) for x in line)
def hhgq_recode(self, row: dict):
    """Return the value of the column named by self.gqtype, floor-divided by 100."""
    column = self.gqtype
    code = int_wlog(row[column], column)
    return code // 100
def citizen_das_recode(self, row: dict):
    """Return 2 minus the integer value of the column named by self.citizen."""
    column = self.citizen
    citizen_code = int_wlog(row[column], column)
    return 2 - citizen_code
def subtract_one(row, name):
    """Return the integer value of row[name] decreased by one."""
    value = int_wlog(row[name], name)
    return value - 1
def to_per_line(record):
    """
    Format a person record as one pipe-delimited "MPD" output line.

    args:
        record - 7-element sequence (geocode, euid, epnum, hhgq, va, hisp, race);
                 geocode is a 16-character str, hhgq and va are integers,
                 hisp is a zero-based index in [0,1] and race one in [0,62].
    returns:
        str with the schema fields joined by "|".
    raises:
        AssertionError if the record length, the geocode or the hhgq/va types are wrong.
        ValueError if hisp or race is out of range; the message is also logged.
    """
    # TODO: confirm line order or stick in config.
    # Check that the record has appropriate number of fields
    exp_len = 7
    act_len = len(record)
    assert act_len == exp_len, f"Record {record} has {act_len} instead of {exp_len} fields"

    geocode, euid, epnum, hhgq, va, hisp, race = record

    SCHEMA_TYPE_CODE = "MPD"
    SCHEMA_BUILD_ID = "3.1.4"

    # Check that geocode length is 16
    assert isinstance(geocode, str), f"Geocode ({geocode}) in record ({record}) is not in str format"
    assert len(geocode) == 16, "Geocode of record {} has length {} instead of 16".format(
        record, len(geocode))

    # Fixed-width slices of the geocode: state(2) county(3) tract(6) block group(1) block(4)
    TABBLKST = geocode[:2]
    TABBLKCOU = geocode[2:5]
    TABTRACTCE = geocode[5:11]
    TABBLKGRPCE = geocode[11:12]
    TABBLK = geocode[12:16]

    EUID = int_wlog(euid, "EUID")
    # EPNUM is written out one higher than the incoming value.
    EPNUM = int_wlog(epnum, "EPNUM") + 1

    t = type(hhgq)
    assert np.issubdtype(t, int), \
        f"HHGQ ({hhgq}) in record ({record}) is not in integer format has type {t}"
    t = type(va)
    assert np.issubdtype(t, int), \
        f"VAGE ({va}) in record ({record}) is not in integer format has type {t}"

    RTYPE = "3" if hhgq == 0 else "5"
    QREL = "99"
    QSEX = "9"
    QAGE = "17" if va == 0 else "18"

    # Parse once, then validate the parsed integers so the range checks and
    # the emitted codes are guaranteed to refer to the same values.
    hisp_index = int_wlog(hisp, "CENHISP")
    race_index = int_wlog(race, "CENRACE")
    CENHISP = str(hisp_index + 1)
    CENRACE = str(race_index + 1).zfill(2)

    if hisp_index > 1 or hisp_index < 0:
        err_msg = f"CENHISP is neither 1, nor 2 ({CENHISP})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    if race_index > 62 or race_index < 0:
        # Report the offending CENRACE value (not CENHISP) in the message.
        err_msg = f"CENRACE is outside [1,63] range ({CENRACE})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    # Fields that are always emitted with fixed filler values.
    QSPANX = "9999"
    QRACES = ["9999"] * 8  # QRACE1 .. QRACE8
    CIT = "9"

    line = [
        SCHEMA_TYPE_CODE, SCHEMA_BUILD_ID, TABBLKST, TABBLKCOU, TABTRACTCE,
        TABBLKGRPCE, TABBLK, EUID, EPNUM, RTYPE, QREL, QSEX, QAGE, CENHISP,
        CENRACE, QSPANX, *QRACES, CIT,
    ]
    return "|".join(str(x) for x in line)