Ejemplo n.º 1
0
def test_int_wlog():
    """Check that ``int_wlog`` parses integer strings and rejects malformed ones."""
    # Well-formed inputs (including one with trailing whitespace) convert to int.
    for text, expected in (("2", 2), ("2003 ", 2003)):
        assert das_utils.int_wlog(text, "var") == expected

    # A stray letter inside the digits must surface as a ValueError.
    with pytest.raises(ValueError) as err:
        das_utils.int_wlog("2c03 ", "var")
    assert "invalid literal" in str(err.value)
Ejemplo n.º 2
0
 def elderly_recode(self, row: dict):
     """
     Collapse three indicator columns of ``row`` into a single level.

     The columns named by ``self.p75``, ``self.p65`` and ``self.p60`` are
     checked in that order; the first one whose value parses to 1 decides
     the result (3, 2 and 1 respectively). Returns 0 when none of them is 1.
     """
     # Highest level first; attribute and column are only read when reached.
     for level, attr in ((3, "p75"), (2, "p65"), (1, "p60")):
         column = getattr(self, attr)
         if int_wlog(row[column], column) == 1:
             return level
     return 0
Ejemplo n.º 3
0
def map_to_hhgq(gqtype):
    """
    Collapse a GQTYPE code into the single-digit HHGQ level (2018 tab of P42).

        args:
            gqtype - 3 char str encoding gqtype
        returns: 0 for a blank field; otherwise the code floor-divided by 100,
                 except that codes of 700 and above all map to 7.
    """
    assert isinstance(gqtype, str), "GQTYPE is not a str"
    assert len(gqtype) == 3, "GQTYPE is not a str of length 3"

    # A blank (empty/NULL) field stands for a Housing Unit, not a GQ
    if not gqtype.strip():
        return 0

    code = int_wlog(gqtype, "GQTYPE")

    # For the 2018 End_to_end, the GQ codes in use are collapsed to 101, 201, 301 etc.
    # and mapped to a single digit within the DAS:
    #   1xx -> 1, 2xx -> 2, 3xx -> 3, 4xx -> 4, 5xx -> 5, 6xx -> 6,
    #   and the 700s, 800s and 900s are all mapped to 7.
    return 7 if code >= 700 else code // 100
Ejemplo n.º 4
0
def grfc_map_to_oidtb_geocode(row):
    """
    Pair a GRFC row's OIDTABBLK with the geocode built from its TAB* fields.

    :param row: Spark Row
    :return: tuple (OIDTABBLK parsed to int, geocode), where geocode is
        TABBLKST + TABBLKCOU + TABTRACTCE + TABBLKGRPCE + TABBLK
    """
    # Build the geocode left to right: state, county, tract, block group, block.
    state_county = row.TABBLKST + row.TABBLKCOU
    through_block_group = state_county + row.TABTRACTCE + row.TABBLKGRPCE
    geocode = through_block_group + row.TABBLK

    oidtb = int_wlog(row.OIDTABBLK, "OIDTABBLK")
    return (oidtb, geocode)
    def cvap_race_recode(self, row: dict):
        """
        Recode the CENHISP / CENRACE columns of ``row`` into a CVAPRace level.

        Returns 0 for a Hispanic record (CENHISP "2"). Otherwise CENRACE 1-9
        is returned unchanged, CENRACE 12 maps to 10, and every other valid
        CENRACE value maps to 11.

        Raises ValueError if CENHISP is not "1" or "2", or if CENRACE is
        outside 1-63.
        """
        hispanic = "2"
        not_hispanic = "1"
        allowed = [hispanic, not_hispanic]

        cenhisp = row[self.cenhisp]
        if cenhisp not in allowed:
            raise ValueError(f"CENHISP not in {allowed}")
        if cenhisp == hispanic:
            return 0

        cenrace = int_wlog(row[self.cenrace], "CENRACE")
        if cenrace < 1 or cenrace > 63:
            raise ValueError("CENRACE not in 1-63")

        # 1-6 in MDF/CEF are the single-race categories; 7-9 are the three
        # combinations with white, in the same order as in the CVAPRace attribute.
        # Normally 1 is subtracted because histograms start at 0, but here it
        # would be added back to make room for the Hispanic level 0.
        if cenrace <= 9:
            return cenrace

        # 10-11 (white+NHOPI, white+SOR) are not separate CVAPRace levels;
        # 12 is Black and AIAN; everything else is the remainder of 2+ races.
        return 10 if cenrace == 12 else 11
Ejemplo n.º 6
0
def unit_keyed_by_mafid(cef_unit):
    """
    Build the (MAFID, unit attributes) pair for one CEF unit record.

        args:
            cef_unit - CEF_UNIT obj, spec TBD but assuming MAFID, TEN, QGQTYP, OIDTB attr.
        returns: tuple (maf, (ten, gqtype, geo)) keyed by MAFID, where ten is
                 collapsed to 0/1, gqtype is map_to_hhgq(QGQTYP) and geo is OIDTB.
        raises: ValueError if TEN is outside [0,4].
    """
    maf = int_wlog(cef_unit.MAFID, "MAFID")

    tenure = int_wlog(cef_unit.TEN, "TEN")
    if not 0 <= tenure <= 4:
        err_msg = f"TEN is outside legal values range [0,4] ({cef_unit.TEN})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    # Only "Unoccupied"(0)/"Occupied"(1) are used for the tenure variable TEN,
    # so every non-zero code collapses to 1.
    occupied = 1 if tenure else 0
    hhgq = map_to_hhgq(cef_unit.QGQTYP)
    return (maf, (occupied, hhgq, cef_unit.OIDTB))
Ejemplo n.º 7
0
 def multi_recode(self, row: dict):
     """
     Recode the column named by ``self.multi`` into a 0/1 level.

     Values 0 and 1 map to 0 and value 2 maps to 1. Any other value raises
     DASValueError.
     """
     multi = int_wlog(row[self.multi], self.multi)
     if multi == 2:
         return 1
     if multi in (0, 1):
         return 0
     raise DASValueError(
         f"MULTG variable values {multi} is out of specification",
         multi)
Ejemplo n.º 8
0
    def recode(self, row):
        """
        Append the DAS CVAP race level to ``row`` as field ``race_cvap_das``.

        The CVAP file codes race as:

            1 = non-Hispanic AIAN
            2 = non-Hispanic Asian
            3 = non-Hispanic Black
            4 = non-Hispanic NHOPI
            5 = non-Hispanic White
            6 = non-Hispanic Some Other Race
            7 = non-Hispanic AIAN & White
            8 = non-Hispanic Asian & White
            9 = non-Hispanic Black & White
            10 = non-Hispanic AIAN & Black
            11 = non-Hispanic Remainder of Two or More Races
            12 = Hispanic or Latino

        These are converted to the levels of the DAS schema attribute
        programs.schema.attributes.race_cvap.CVAPRace:

             "Hispanic or Latino": [0],
             "White alone": [1],
             "Black or African American alone": [2],
             "American Indian and Alaska Native alone": [3],
             "Asian alone": [4],
             "Native Hawaiian and Other Pacific Islander alone": [5],
             "Some Other Race alone": [6],
             "Black or African American and White": [7],
             "American Indian and Alaska Native and White": [8],
             "Asian and White": [9],
             "American Indian and Alaska Native and Black or African American": [10],
             "Remainder of Two or More Race Responses": [11],

        ``row`` must support lookup by column name and ``asDict()``
        (presumably a Spark Row - confirm against the caller). Returns a new
        Row with all original fields plus ``race_cvap_das``. Raises ValueError
        if the CVAP race code is outside 1-12.
        """
        file_code = int_wlog(row[self.cvap_race], "cvap_race")
        if file_code < 1 or file_code > 12:
            raise ValueError("CVAP_RACE is not in 1-12")

        # CVAP file code -> DAS CVAPRace level
        file_code_to_das_level = {
            12: 0,  # Hispanic
            5: 1,  # White
            3: 2,  # Black
            1: 3,  # AIAN
            2: 4,  # Asian
            4: 5,  # NHOPI
            6: 6,  # SOR
            9: 7,  # Black + White
            7: 8,  # AIAN + White
            8: 9,  # Asian + White
            10: 10,  # AIAN + Black
            11: 11,  # Remainder of Two or More Races
        }

        return Row(**row.asDict(), race_cvap_das=file_code_to_das_level[file_code])
Ejemplo n.º 9
0
def unit_keyed_by_joinkey(cef_unit):
    """
    Build the (serial, unit attributes) pair for one unit record.

        args:
            cef_unit - sql row that includes serial, statefip, county, enumdist, gqtype, gq
        returns: tuple (joinkey, (ten, gqtype, geo)) keyed by serial, where ten is
                 gq collapsed to 0/1, gqtype is map_to_hhgq(gqtype) and geo is
                 statefip + county + enumdist.
        raises: ValueError if gq is outside [0,6].
    """
    # TODO: Change to 1940 spec
    # TODO: check if GQ could be vacant in 1940.
    # use numprec for vacant check? or use GQ.

    joinkey = int_wlog(cef_unit.serial, "serial")

    gq_code = int_wlog(cef_unit.gq, "gq")
    if not 0 <= gq_code <= 6:
        err_msg = f"TEN is outside legal values range [0,6] ({cef_unit.gq})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    # Only "Unoccupied"(0)/"Occupied"(1) are used for the tenure variable,
    # so every non-zero gq code collapses to 1.
    occupied = 1 if gq_code else 0
    hhgq = map_to_hhgq(cef_unit.gqtype)
    geo = cef_unit.statefip + cef_unit.county + cef_unit.enumdist
    return (joinkey, (occupied, hhgq, geo))
Ejemplo n.º 10
0
def per_keyed_by_joinkey(cef_per):
    """
    Build the (serial, person attributes) pair for one person record.

        args:
            cef_per - sql row that includes serial, age, hispan, race.
        returns: tuple (joinkey, (age, hisp, race)) keyed by serial, where age is
                 1 for age >= 18 else 0, hisp is 1 for hispan > 0 else 0 and
                 race is the race code minus 1.
        raises: ValueError if race is outside [1,6].
    """
    joinkey = int_wlog(cef_per.serial, "serial")

    voting_age = 1 if int_wlog(cef_per.age, "age") >= 18 else 0

    hisp = 1 if int_wlog(cef_per.hispan, "hispan") > 0 else 0
    # NOTE(review): hisp is already collapsed to 0/1 above, so this check can
    # never fire; the raw hispan value is not range-checked.
    if hisp < 0 or hisp > 1:
        err_msg = f"hispan is invalid ({cef_per.hispan})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    # Shift the 1-based race code to a 0-based level.
    race = int_wlog(cef_per.race, "race") - 1
    if not 0 <= race <= 5:
        err_msg = f"race is outside [1,6] range ({cef_per.race})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    return (joinkey, (voting_age, hisp, race))
Ejemplo n.º 11
0
def per_keyed_by_mafid(cef_per):
    """
    Build the (MAFID, person attributes) pair for one CEF person record.

        args:
            cef_per - CEF_PER obj, spec TBD but assuming MAFID, QAGE, CENHISP, CENRACE attr.
        returns: tuple (maf, (age, hisp, race)) keyed by MAFID, where age is
                 1 for QAGE >= 18 else 0, hisp is CENHISP - 1 and race is CENRACE - 1.
        raises: ValueError if CENHISP is neither 1 nor 2, or CENRACE is outside [1,63].
    """

    maf = int_wlog(cef_per.MAFID, "MAFID")

    age = int(int_wlog(cef_per.QAGE, "QAGE") >= 18)

    # Shift the 1-based CENHISP code to a 0-based level.
    hisp = int_wlog(cef_per.CENHISP, "CENHISP") - 1
    if not 0 <= hisp <= 1:
        err_msg = f"CENHISP is neither 1, nor 2 ({cef_per.CENHISP})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    # Shift the 1-based CENRACE code to a 0-based level.
    race = int_wlog(cef_per.CENRACE, "CENRACE") - 1
    if not 0 <= race <= 62:
        # Report the offending CENRACE value (the message used to print CENHISP).
        err_msg = f"CENRACE is outside [1,63] range ({cef_per.CENRACE})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    return (maf, (age, hisp, race))
Ejemplo n.º 12
0
def map_to_hhgq(gqtype):
    """
    Shift a 1940 gqtype code down onto the contiguous range 0-7.

        args:
            gqtype - str encoding gqtype
        returns: the parsed code, minus 1 for codes 2-4 and minus 2 for codes
                 6-9; any other code is returned as parsed.
    """
    assert isinstance(gqtype, str), "GQTYPE is not a str"

    code = int_wlog(gqtype, "gqtype")

    # For the 1940 data, gqtypes 1 and 5 do not appear, only 0,2,3,4,6,7,8,9,
    # so the gaps are closed to give values 0-7.
    if 2 <= code <= 4:
        return code - 1
    if 6 <= code <= 9:
        return code - 2
    return code
def to_per_line(record):
    """
    Format one person record as a pipe-delimited "MPD1940" output line.

        args:
            record - 7-tuple (geocode, euid, epnum, hhgq, va, hisp, race); geocode
                     is a 10 char str, hhgq and va are integers, the remaining
                     fields are parsed with int_wlog.
        returns: str with the output fields joined by "|".
        raises:
            AssertionError - wrong number of fields, geocode not a 10 char str,
                             or hhgq / va not of an integer type.
            ValueError - hisp outside [0,1] or race outside [0,5].
    """
    # Check that the record has appropriate number of fields
    exp_len = 7
    act_len = len(record)
    assert act_len == exp_len, f"Record {record} has {act_len} instead of {exp_len} fields"

    geocode, euid, epnum, hhgq, va, hisp, race = record

    SCHEMA_TYPE_CODE = "MPD1940"
    SCHEMA_BUILD_ID = "3.1.4"

    # Check that geocode length is 10 - length for 1940.
    geocode_len = 10
    assert isinstance(
        geocode,
        str), f"Geocode ({geocode}) in record ({record}) is not in str format"
    assert len(geocode) == geocode_len, (
        f"Geocode of record {record} has length {len(geocode)} instead of {geocode_len}")
    STATE = geocode[:2]
    COUNTY = geocode[2:6]
    ENUMDIST = geocode[6:10]

    EUID = int_wlog(euid, "EUID")
    EPNUM = int_wlog(epnum, "EPNUM") + 1

    t = type(hhgq)
    assert np.issubdtype(
        t, int
    ), f"HHGQ ({hhgq}) in record ({record}) is not in integer format has type {t}"
    t = type(va)
    assert np.issubdtype(
        t, int
    ), f"VAGE ({va}) in record ({record}) is not in integer format has type {t}"

    RTYPE = "3" if hhgq == 0 else "5"
    QREL = "99"
    QSEX = "9"
    QAGE = "17" if va == 0 else "18"

    # Parse once and validate the parsed values, so str inputs are range-checked
    # instead of failing on a str/int comparison.
    hisp_level = int_wlog(hisp, "CENHISP")
    race_level = int_wlog(race, "CENRACE")

    # Output codes are 1-based.
    CENHISP = str(hisp_level + 1)
    CENRACE = str(race_level + 1)

    if not 0 <= hisp_level <= 1:
        err_msg = f"CENHISP is neither 1, nor 2 ({CENHISP})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    if not 0 <= race_level <= 5:
        # Report the offending CENRACE value (the message used to print CENHISP).
        err_msg = f"CENRACE is outside [1,6] range ({CENRACE})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    QSPANX = "9999"
    QRACE1 = "9999"
    QRACE2 = "9999"
    QRACE3 = "9999"
    QRACE4 = "9999"
    QRACE5 = "9999"
    QRACE6 = "9999"
    QRACE7 = "9999"
    QRACE8 = "9999"
    CIT = "9"
    line = [
        SCHEMA_TYPE_CODE, SCHEMA_BUILD_ID, STATE, COUNTY, ENUMDIST, EUID,
        EPNUM, RTYPE, QREL, QSEX, QAGE, CENHISP, CENRACE, QSPANX, QRACE1,
        QRACE2, QRACE3, QRACE4, QRACE5, QRACE6, QRACE7, QRACE8, CIT
    ]
    return "|".join(str(x) for x in line)
Ejemplo n.º 14
0
 def hhgq_recode(self, row: dict):
     """Return the integer in the ``self.gqtype`` column of ``row``, floor-divided by 100."""
     column = self.gqtype
     gq_code = int_wlog(row[column], column)
     return gq_code // 100
Ejemplo n.º 15
0
 def citizen_das_recode(self, row: dict):
     """Return 2 minus the integer in the ``self.citizen`` column of ``row``."""
     column = self.citizen
     citizen_code = int_wlog(row[column], column)
     return 2 - citizen_code
Ejemplo n.º 16
0
 def subtract_one(row, name):
     """Return the integer in column ``name`` of ``row``, minus one."""
     value = int_wlog(row[name], name)
     return value - 1
Ejemplo n.º 17
0
def to_per_line(record):
    """
    Format one person record as a pipe-delimited "MPD" output line.

        args:
            record - 7-tuple (geocode, euid, epnum, hhgq, va, hisp, race); geocode
                     is a 16 char str, hhgq and va are integers, the remaining
                     fields are parsed with int_wlog.
        returns: str with the output fields joined by "|".
        raises:
            AssertionError - wrong number of fields, geocode not a 16 char str,
                             or hhgq / va not of an integer type.
            ValueError - hisp outside [0,1] or race outside [0,62].
    """
    # TODO: confirm line order or stick in config.

    # Check that the record has appropriate number of fields
    exp_len = 7
    act_len = len(record)
    assert act_len == exp_len, f"Record {record} has {act_len} instead of {exp_len} fields"

    geocode, euid, epnum, hhgq, va, hisp, race = record

    SCHEMA_TYPE_CODE = "MPD"
    SCHEMA_BUILD_ID = "3.1.4"

    # Check that geocode length is 16
    geocode_len = 16
    assert isinstance(
        geocode,
        str), f"Geocode ({geocode}) in record ({record}) is not in str format"
    assert len(geocode) == geocode_len, (
        f"Geocode of record {record} has length {len(geocode)} instead of {geocode_len}")
    TABBLKST = geocode[:2]
    TABBLKCOU = geocode[2:5]
    TABTRACTCE = geocode[5:11]
    TABBLKGRPCE = geocode[11:12]
    TABBLK = geocode[12:16]

    EUID = int_wlog(euid, "EUID")
    EPNUM = int_wlog(epnum, "EPNUM") + 1

    t = type(hhgq)
    assert np.issubdtype(
        t, int
    ), f"HHGQ ({hhgq}) in record ({record}) is not in integer format has type {t}"
    t = type(va)
    assert np.issubdtype(
        t, int
    ), f"VAGE ({va}) in record ({record}) is not in integer format has type {t}"

    RTYPE = "3" if hhgq == 0 else "5"
    QREL = "99"
    QSEX = "9"
    QAGE = "17" if va == 0 else "18"

    # Parse once and validate the parsed values, so str inputs are range-checked
    # instead of failing on a str/int comparison.
    hisp_level = int_wlog(hisp, "CENHISP")
    race_level = int_wlog(race, "CENRACE")

    # Output codes are 1-based; CENRACE is zero-padded to two characters.
    CENHISP = str(hisp_level + 1)
    CENRACE = str(race_level + 1).zfill(2)

    if not 0 <= hisp_level <= 1:
        err_msg = f"CENHISP is neither 1, nor 2 ({CENHISP})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    if not 0 <= race_level <= 62:
        # Report the offending CENRACE value (the message used to print CENHISP).
        err_msg = f"CENRACE is outside [1,63] range ({CENRACE})"
        logging.error(err_msg)
        raise ValueError(err_msg)

    QSPANX = "9999"
    QRACE1 = "9999"
    QRACE2 = "9999"
    QRACE3 = "9999"
    QRACE4 = "9999"
    QRACE5 = "9999"
    QRACE6 = "9999"
    QRACE7 = "9999"
    QRACE8 = "9999"
    CIT = "9"
    line = [
        SCHEMA_TYPE_CODE, SCHEMA_BUILD_ID, TABBLKST, TABBLKCOU, TABTRACTCE,
        TABBLKGRPCE, TABBLK, EUID, EPNUM, RTYPE, QREL, QSEX, QAGE, CENHISP,
        CENRACE, QSPANX, QRACE1, QRACE2, QRACE3, QRACE4, QRACE5, QRACE6,
        QRACE7, QRACE8, CIT
    ]
    return "|".join(str(x) for x in line)