Example #1
 def __init__(self):
     self.privacy_framework = "zcdp"
     self.schema = "TEST"
     self.hist_shape = (2, )
     self.unit_hist_shape = (2, )
     self.hist_vars = ("sex", )
     self.schema_obj = SchemaMaker.fromAttlist("justsex", [SexAttr])
     self.unit_schema_obj = SchemaMaker.fromAttlist(
         "justsex", [SexAttr])
     empty_dict = {
         'invar_names': (),
         'cons_names': (),
     }
     self.inv_con_by_level = {
         'Block': empty_dict,
     }
     self.levels = (CC.GEOLEVEL_BLOCK, CC.GEOLEVEL_BLOCK_GROUP, CC.GEOLEVEL_TRACT, "Tract_Group",
                    CC.GEOLEVEL_COUNTY, CC.GEOLEVEL_STATE, CC.GEOLEVEL_US)
     self.geo_bottomlevel = 'Block'
     self.spine_type = 'non_aian_spine'
     self.plb_allocation = None
     self.geocode_dict = {1: '2', 3: '4'}
     self.dp_mechanism_name = CC.DISCRETE_GAUSSIAN_MECHANISM
     self.postprocess_only = False
     self.use_spark = False
     self.geolevel_prop_budgets = (Fraction(1, 5), Fraction(1, 5),
                                   Fraction(3, 25), Fraction(3, 25),
                                   Fraction(3, 25), Fraction(3, 25),
                                   Fraction(3, 25))
     self.only_dyadic_rationals = True  # Doesn't matter true or false because the budgets are assigned directly
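
A quick sanity check, not part of the original fixture: the seven geolevel_prop_budgets proportions above (one per level, Block through US) should sum to 1, which a short Fraction computation confirms:

from fractions import Fraction

# Geolevel proportions copied from the fixture above.
props = (Fraction(1, 5), Fraction(1, 5), Fraction(3, 25), Fraction(3, 25),
         Fraction(3, 25), Fraction(3, 25), Fraction(3, 25))
assert sum(props) == 1  # 2/5 + 5 * (3/25) = 2/5 + 3/5 = 1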
def getTableBuilder(testtables=None):
    """
    :param testtables: If truthy, run the testTableDefs function to check that recodes
     from the same dimension aren't crossed in any table. This is useful when that error
     occurs, because it identifies the offending table and the cell of the list where
     the error happens.

    :return: a TableBuilder for the schema and table dict, or None when testtables is set
    """
    schema = SchemaMaker.fromName(CC.SCHEMA_REDUCED_DHCP_HHGQ)
    tabledict = getTableDict()
    if testtables:
        tablebuilder.testTableDefs(schema, tabledict)
    else:
        # schema and tabledict were already built above; no need to recreate them
        builder = tablebuilder.TableBuilder(schema, tabledict)
        return builder
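
A minimal usage sketch for the builder above (assuming getTableDict and the das_decennial tablebuilder module are importable, as in the snippet): the default call returns a TableBuilder, while testtables=True only validates the table definitions and returns None.

builder = getTableBuilder()           # build and return a TableBuilder
getTableBuilder(testtables=True)      # only run tablebuilder.testTableDefs; returns None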

        ############################################################
        ## Consolidated tables
        ############################################################
        '''
Example #3
 def __init__(self):
     self.privacy_framework = "zcdp"
     self.schema = "PL94"
     self.hist_shape = (8, 2, 2, 63)
     self.unit_hist_shape = (2, )
     self.hist_vars = ("hhgq", "votingage", "hisp", "cenrace")
     self.schema_obj = SchemaMaker.fromAttlist(
         "PL94", [HHGQ, VOTING_AGE, HISPANIC, CENRACE])
     self.unit_schema_obj = SchemaMaker.fromAttlist("justsex", [SEX])
     self.validate_input_data_constraints = False
     self.spine_type = 'non_aian_spine'
     self.plb_allocation = None
     self.geocode_dict = {
         5: "Block",
         4: "Block_Group",
         3: "Tract",
         2: "County",
         1: "State",
         0: "US"
     }
     self.dp_mechanism_name = CC.GEOMETRIC_MECHANISM
     self.inv_con_by_level = {
         'Block': {
             'invar_names': (),
             'cons_names': (),
         },
         'Block_Group': {
             'invar_names': (),
             'cons_names': (),
         },
         'Tract': {
             'invar_names': (),
             'cons_names': (),
         },
         'County': {
             'invar_names': (),
             'cons_names': ()
         },
         'State': {
             'invar_names': (),
             'cons_names': ()
         },
         'US': {
             'invar_names': (),
             'cons_names': ()
         }
     }
     self.levels = list(self.inv_con_by_level.keys())
     self.geo_bottomlevel = 'Block'
     self.geolevel_prop_budgets = (Fraction(44, 1024), Fraction(44, 1024),
                                   Fraction(44, 1024), Fraction(44, 1024),
                                   Fraction(127, 1024), Fraction(721, 1024))
     self.postprocess_only = False
     self.only_dyadic_rationals = False
Example #4
 def __init__(self, hist_shape: Tuple[int, ...], invariants: __InvariantsDict__, constraint_names: Iterable[str]):
     self.invariants = invariants
     self.constraint_names = constraint_names
     self.hist_shape, self.unit_hist_shape = hist_shape
     # self.constraints_dict = {}
     self.constraints_dict = ConstraintDict()
     # self.constraint_funcs_dict = {}
     self.schema = SchemaMaker.fromName(self.schemaname)
     self.unit_schema = SchemaMaker.fromName(_unit_schema_dict[self.schemaname])
     assert self.hist_shape == self.schema.shape, f"Histogram shape in data {self.hist_shape} doesn't correspond to histogram shape in chosen " \
         f"schema {self.schema.shape}"
     assert self.unit_hist_shape == self.unit_schema.shape, f"Histogram shape in data {self.unit_hist_shape} doesn't correspond to histogram shape in chosen " \
         f"schema {self.unit_schema.shape}"
Example #5
    def __init__(self, data_path, schema_name, budget_group=None, run_id=None):
        """
        .../data-run8.0-epsilon4.0-BlockNodeDicts/

        """
        self.data_path = du.addslash(data_path)
        self.schema_name = schema_name
        self.schema = SchemaMaker.fromName(self.schema_name)

        # extract data from the data_path
        data_info = self.data_path.split("/")[-2]
        #assert data_info.startswith('data'), "The wrong data path has been provided... Cannot load DASrun"
        #TODO: Replace above assert with something more appropriate ('data' was overly narrow)

        print(f"data_info.split(-): {data_info.split('-')}")
        #_, self.run_id, self.budget_group, _ = data_info.split('-')
        if budget_group is None:
            assert run_id is None
            self.parseDataInfo(data_info)
        else:
            assert run_id is not None
            self.budget_group = budget_group
            self.run_id = run_id
        self.run_num = self.run_id[3:].split('.')[0]
        self.plb = self.budget_group
        print(f"Detected plb, run_id: {self.plb}, {self.run_id}")
Example #6
def makeNode(d,
             geocode,
             geocode_dict,
             addsyn=False,
             dpq_make=False,
             querydict=None,
             consn=consn,
             invn=invn):
    ph = convertPL94(d)
    syn = ph[0].toDense() + np.ones(np.prod(ph[0].shape)).reshape(
        ph[0].shape) if addsyn else None
    dpqdict = {
        C.DETAILED:
        DPquery(QueryFactory.makeTabularGroupQuery(array_dims=ph[0].shape),
                GeometricMechanism(Fraction(1, 10), 2, ph[0].toDense()))
    } if dpq_make else {}
    if querydict:
        dpqdict.update({
            name: DPquery(
                query,
                GeometricMechanism(Fraction(1, 10), 2,
                                   query.answer(ph[0].toDense())))
            for name, query in SchemaMaker.fromName(CC.SCHEMA_PL94).getQueries(
                querydict).items()
        })
    inv_dict = makeInvs(ph, invn)
    return GeounitNode(geocode,
                       raw=ph[0],
                       raw_housing=ph[1],
                       syn=syn,
                       cons=makeCons(ph, consn, inv_dict),
                       invar=inv_dict,
                       geocode_dict=geocode_dict,
                       dp_queries=dpqdict)
Example #7
def getAllImplementedInvariantNames(schemaname):
    schema = SchemaMaker.fromName(schemaname)
    # We add 'tot_hu' for housing invariants, and 'tot' for persons invariants
    inv_names = ['tot_hu'] if ((CC.ATTR_HHSEX in schema.dimnames) or
                               (schema.name == CC.SCHEMA_DHCH)) else ['tot']
    inv_names = inv_names + ['gqhh_tot', 'gq_vect', 'gqhh_vect']
    if CC.VOTING_TOTAL in schema.recodes:
        inv_names = inv_names + ['va']

    return inv_names
    def make(schema: str, raw: __HistData__, raw_housing: __HistData__,
             invariant_names: Iterable[str]) -> Dict[str, np.ndarray]:
        """
        Makes the invariants dict, corresponding to the person/household and unit schemas with raw data, containing the invariants listed
        :param schema: Person or household schema name for which to make invariants (the matching unit schema is looked up)
        :param raw: Raw person or household data
        :param raw: Raw person or household data
        :param raw_housing: Raw unit data
        :param invariant_names: which invariants to make
        :return: dict of invariants (presented as numpy arrays)
        """
        raw = InvariantsMaker.checkHistType(raw, "Person/Household")
        raw_housing = InvariantsMaker.checkHistType(raw_housing, "Unit")

        person_schema = SchemaMaker.fromName(schema)
        unit_schema = SchemaMaker.fromName(_unit_schema_dict[schema])

        schema_data_recodename = {
            # All schemas have these
            "gqhh_vect": (unit_schema, raw_housing, CC.HHGQ_UNIT_VECTOR),
            "gqhh_tot": (unit_schema, raw_housing, "total"),
            "gq_vect": (unit_schema, raw_housing, CC.HHGQ_UNIT_TOTALGQ),

            # Person schemas have these
            "tot": (person_schema, raw, "total"),
            "va": (person_schema, raw, CC.VOTING_TOTAL),

            # Household schemas have these
            "tot_hu": (unit_schema, raw_housing, CC.HHGQ_UNIT_HOUSING),

            # CVAP schema has this:
            "pl94counts": (unit_schema, raw_housing, 'detailed')
        }

        invariants_dict = {}
        for name in invariant_names:
            assert name in schema_data_recodename, f"Provided invariant name '{name}' is not implemented."
            schema, data, query_name = schema_data_recodename[name]
            query = schema.getQuery(query_name)
            invariants_dict[name] = np.array(
                query.answerWithShape(data)).astype(int)
        return invariants_dict
Example #9
def getConstraintsDataAnswersSchema(schemaname, data_answers_name, inv_names,
                                    cons_names):
    """ Return constraints, name of data set in data-with-answers and schema"""
    schema = SchemaMaker.fromName(schemaname)
    cc = dd.getConstraintsModule(schemaname).ConstraintsCreator
    data_answers = getattr(data[schemaname], data_answers_name)
    # Create histograms and from them invariants, and this constraint
    person, housing = data_answers["hist"]  # histData(data_answers["data"], schemaname)
    constraints = getConsDict((person, housing),
                              inv(schemaname, person, housing, inv_names),
                              cons_names, cc)
    return constraints, data_answers, schema
Example #10
 def __init__(self, engine):
     self.privacy_framework = "zcdp"
     self.schema = "TEST"
     self.hist_shape = (2, )
     self.unit_hist_shape = (2, )
     self.hist_vars = ("sex", )
     self.schema_obj = SchemaMaker.fromAttlist("justsex", [SexAttr])
     self.unit_schema_obj = SchemaMaker.fromAttlist(
         "justsex", [SexAttr])
     self.validate_input_data_constraints = False
     self.spine_type = 'non_aian_spine'
     self.plb_allocation = None
     self.geocode_dict = {4: "Block", 3: "County", 1: "State"}
     self.dp_mechanism_name = CC.GEOMETRIC_MECHANISM
     self.inv_con_by_level = {
         'Block': {
             'invar_names': ('tot', ) if engine == BottomUpEngine else (),
             'cons_names': ('total', ) if engine == BottomUpEngine else (),
         },
         'County': {
             'invar_names': (),
             'cons_names': ()
         },
         'State': {
             'invar_names': ('tot', ),
             'cons_names': ('total', )
         }
     }
     self.levels = list(self.inv_con_by_level.keys())
     self.geo_bottomlevel = 'Block'
     self.geolevel_prop_budgets = (Fraction(1, 5), Fraction(1, 5),
                                   Fraction(3, 5))
     self.postprocess_only = False
     self.only_dyadic_rationals = False
def main():
    spark = SparkSession.builder.appName('RI Redistricting Data - PL94_P12 - Extracting agecat totals').getOrCreate()
    experiments = [
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td10_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td1_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td3_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td025_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td05_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td001_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td01_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td2_1/"
    ]


    # add das_decennial zip file to the spark context (to be sent to the core nodes)
    spark.sparkContext.addPyFile("/mnt/users/moran331/das_decennial.zip")
    schema_name = "PL94_P12"
    schema = SchemaMaker.fromName(name=schema_name)
    for path in experiments:
        tree = treetools.RunTree(path)
        runs = tree.runs
        for r, run in enumerate(runs):
            if r == 0:
                df = sdftools.getSparseDF(spark, run, schema, run_columns=True).persist()
            else:
                df = df.union(sdftools.getSparseDF(spark, run, schema, run_columns=True)).persist()
        df = df.persist()
        plb = str(du.algset2plb(du.findallSciNotationNumbers(tree.algsets[0])[0]))
        df = df.withColumn("plb", sf.lit(plb))
        df.show()
        geodict = aggtools.getGeodict()
        geolevel = "State"
        mapping = geodict[geolevel]
        group = ["plb", "run_id", geolevel] + schema.dimnames
        geodf = df.withColumn(geolevel, df.geocode[0:mapping]).groupBy(group).sum().persist()
        geodf = sdftools.stripSQLFromColumns(geodf)
        geodf.show()
        queryname = "votingage"
        querydf = sdftools.getQueryDF(geodf, queryname, schema, basegroup=["run_id", geolevel, "plb"]).persist()
        querydf.show()
        savepath = f"/mnt/users/moran331/redistricting/agecat_redistricting_state_totals_2019_06_27/{tree.algsets[0]}/"
        du.makePath(savepath)
        querydf.toPandas().to_csv(savepath + "votingage_all_25_runs.csv", index=False)  # to_csv writes the file and returns None
        print(f"---Agecat--- | \n{querydf.toPandas().to_string()}")
Example #12
    def test_wconstraints():
        geocode_dict = {
            16: 'Block',
            12: 'Block_Group',
            11: 'Tract',
            5: 'County'
        }
        histogram, housing_hist = table2hists(
            np.array(
                [  # columns: 'hhgq', 'votingage', 'hispanic', 'cenrace', 'unique unitid' (shape 8,2,2,63 + unitUID)
                    # each row is a person
                    [0, 1, 1, 20, 0],
                    [1, 0, 0, 1, 1],
                    [3, 1, 0, 10, 2],
                    [3, 0, 0, 15, 2],
                    [1, 1, 0, 15, 3]
                ]),
            SchemaMaker.fromName(CC.SCHEMA_PL94),
            CC.ATTR_HHGQ)
        inv_dict = InvariantsMaker.make(schema=CC.SCHEMA_PL94,
                                        raw=histogram,
                                        raw_housing=housing_hist,
                                        invariant_names=("tot", "gqhh_vect",
                                                         "gqhh_tot", "va"))
        con_dict = cPL94.ConstraintsCreator(hist_shape=(histogram.shape, housing_hist.shape), invariants=inv_dict, constraint_names=("total",))\
                .calculateConstraints().constraints_dict

        n1 = nodes.GeounitNode(geocode='123456789abcdefg',
                               geocode_dict=geocode_dict,
                               raw=histogram,
                               raw_housing=housing_hist,
                               cons=con_dict,
                               invar=inv_dict)

        n2 = nodes.GeounitNode(geocode='123456789abcdefg',
                               geocode_dict=geocode_dict,
                               raw=histogram,
                               raw_housing=housing_hist,
                               cons=con_dict,
                               invar=inv_dict)
        assert n1 == n2
def test_impact_gaps():
    for sname, strat in StrategySelector.strategies.items():
        try:
            schema = SchemaMaker.fromName(strat.schema)
        except AttributeError:
            raise AttributeError(
                f"Strategy {sname} doesn't have a schema attribute. Needed to check for impact gaps (to create queries from the schema)"
            )

        s = strat.make(strat.levels)
        for level, qnames in s[CC.DPQUERIES].items():
            for qname in qnames:
                query = schema.getQuery(qname)
                # This is just the sum
                # impact = (np.ones(query.numAnswers()) @ np.abs(query.matrixRep()))  # factor of eps/sens doesn't matter here
                impact = np.abs(query.matrixRep()).sum(axis=0)
                # total_impact += impact * prop  # to do this, need to do composition, multiplying by proportion, like here,  only works for pure, epsilon-DP
                impmin, impmax = impact.min(), impact.max()

                if abs(impmin - impmax) > 1e-7:
                    print(
                        f"{qname} ~ Impact\n {'':50} Min: {impmin}, Max: {impmax}, All: {impact}"
                    )
                    raise ValueError(
                        f"There is an impact gap underutilizing parallel composition in query {qname}, geolevel {level}, in strategy {sname}"
                    )

                # Having both checks below is redundant, but both are kept for clarity and future flexibility
                if impmin != 1:
                    raise ValueError(
                        f"Some histogram cells are under-measured in query query {qname}, geolevel {level}, in strategy {sname}"
                    )
                if impmax != 1:
                    raise ValueError(
                        f"Some histogram cells are measured more than once in query {qname}, geolevel {level}, in strategy {sname}"
                    )
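
To illustrate the impact computation used above (a sketch with made-up query matrices, not taken from the actual strategies): each column sum of |Q| counts how many query cells measure that histogram cell, and parallel composition is fully used only when every cell is measured exactly once.

import numpy as np

# Hypothetical query matrix over a 4-cell histogram: every cell measured exactly once.
q_balanced = np.array([[1, 1, 0, 0],
                       [0, 0, 1, 1]])
print(np.abs(q_balanced).sum(axis=0))  # [1 1 1 1] -> no impact gap

# Hypothetical matrix that measures cell 0 twice and cell 3 not at all.
q_gapped = np.array([[1, 1, 0, 0],
                     [1, 0, 1, 0]])
print(np.abs(q_gapped).sum(axis=0))    # [2 1 1 0] -> impact gap (min 0, max 2)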
def getTableBuilder():
    schema = SchemaMaker.buildSchema()
    tabledict = getTableDict()
    builder = tablebuilder.TableBuilder(schema, tabledict)
    return builder
def getTableBuilder():
    schema = SchemaMaker.fromName(CC.SCHEMA_SF1)
    tabledict = getTableDict()
    builder = tablebuilder.TableBuilder(schema, tabledict)
    return builder
Example #16
                                spark_loglevel=spark_loglevel)

    # save the analysis script?
    # toggle to_linux=True|False to save|not save this analysis script locally
    # toggle to_s3=True|False to save|not save this analysis script to s3
    analysis.save_analysis_script(to_linux=False, to_s3=False)

    # save/copy the log file?
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark

    schema = SchemaMaker.fromName(CC.SCHEMA_REDUCED_DHCP_HHGQ)

    pp10_gq_edited = spark.read.csv(
        "s3://uscb-decennial-ite-das/2010/cef/pp10_gq_edited.csv")

    pp10_hu_edited = spark.read.csv(
        "s3://uscb-decennial-ite-das/2010/cef/pp10_hu_edited.csv")

    pp10_grf = spark.read.csv(
        "s3://uscb-decennial-ite-das/2010/cef/pp10_grf_tab.csv")

    # Clean up the gq file by removing extra characters like b and ' from the data, renaming columns, and dropping unneeded columns
    gq_revised = pp10_gq_edited.withColumn(
        "final_pop", sf.regexp_replace(sf.col("_c9"), "[b']", "")).withColumn(
            "FGQ", sf.regexp_replace(sf.col("_c6"), "[b']", "")).withColumn(
                "PEG",
import numpy as np

from programs.constraints.tests.UnitTVGTestdata import units
from programs.schema.schemas.schemamaker import SchemaMaker, _unit_schema_dict
from das_utils import table2hists

from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_HOUSEHOLD2010_TENVACS)
unit_schema = SchemaMaker.fromName(
    _unit_schema_dict[CC.SCHEMA_HOUSEHOLD2010_TENVACS])

# Need hhgq_cap to put correct answers into the data below
# hhgq_cap = ConstraintsCreator(invariants=(), constraint_names=()).hhgq_cap
# Testing data with answers (right hand sides of the constraints)
data1 = {
    'data': [
        # each row is a household
        # (shape (2, 9, 2, 7, 8, 24, 2, 4, 2) + unit UID)
        #columns: 'hhsex', 'hhage', 'hisp', 'race', 'size', 'hhtype', 'elderly', 'multi', 'ten2lev', 'unit UID'
        [1, 8, 1, 0, 1, 20, 0, 0, 0, 0],
        [0, 6, 0, 2, 1, 1, 2, 1, 1, 1],
        [1, 3, 0, 4, 1, 18, 1, 0, 1, 2],
        [0, 3, 0, 3, 7, 15, 3, 1, 1, 3],
        [1, 2, 0, 6, 0, 15, 1, 0, 0, 4],
    ],
    'units':
    units,

    # 5 total households
    'total':
import numpy as np

from programs.schema.schemas.schemamaker import SchemaMaker
from programs.constraints.tests.UnitSimpleRecodedTestdata import units
from das_utils import table2hists

from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_TEN_UNIT_2010)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10)

# Need hhgq_cap to put correct answers into the data below
# hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape), invariants=(), constraint_names=()).hhgq_cap
# Testing data with answers (right hand sides of the constraints) TODO: NOTE THAT ALL THREE DATA SETS ARE THE SAME FOR NOW
data1 = {
    'data': [
        # each row is a household
        # (shape (2, 9, 2, 7, 8, 24, 2, 4, 2, 4) + unit UID)
        #columns: 'hhsex', 'hhage', 'hisp', 'race', 'size', 'hhtype', 'elderly', 'multi', 'tenure', 'unit UID'
        [1, 8, 1, 0, 1, 20, 0, 0, 0, 0],
        [0, 6, 0, 2, 1, 1, 2, 1, 1, 1],
        [1, 3, 0, 4, 1, 18, 1, 0, 3, 2],
        [0, 3, 0, 3, 7, 15, 3, 1, 1, 2],
        [1, 2, 0, 6, 0, 15, 1, 0, 2, 3],
        [1, 5, 0, 4, 1, 18, 0, 0, 0, 2],
    ],
    'units':
    units,

    # 6 total households
    'total':
Example #19
class TestMDF2020PersonWriter():

    config = """
[setup]
spark.name: DAS
spark.loglevel: ERROR

[schema]
schema: DHCP_HHGQ

[geodict]
geolevel_names: Block,Block_Group,Tract,Tract_Group,County,State,US
geolevel_leng: 16,12,11,9,5,2,0

[budget]
geolevel_budget_prop: 0.5, 0.5

[writer]
produce_flag: 1
output_path:
"""
    schema = SchemaMaker.fromName(CC.DAS_DHCP_HHGQ)

    def makeNode(self, persons, geocode='0123456789abcdef'):
        person_hist, unit_hist = table2hists(np.array(persons),
                                             self.schema,
                                             housing_varname=CC.ATTR_HHGQ)

        invar = InvariantsMaker.make(schema=CC.DAS_DHCP_HHGQ,
                                     raw=person_hist,
                                     raw_housing=unit_hist,
                                     invariant_names=('tot', 'gqhh_tot',
                                                      'gqhh_vect'))
        cons = PConstraintsCreator(
            hist_shape=(person_hist.shape, unit_hist.shape),
            invariants=invar,
            constraint_names=(
                'hhgq_total_lb', 'hhgq_total_ub',
                'nurse_nva_0')).calculateConstraints().constraints_dict
        node = GeounitNode(raw=person_hist,
                           raw_housing=unit_hist,
                           invar=invar,
                           cons=cons,
                           geocode_dict={
                               16: 'Block',
                               12: 'Block_Group',
                               11: 'Tract',
                               5: 'County',
                               2: 'State',
                               1: 'US'
                           },
                           geocode=geocode)
        node.syn = node.raw

        return node

    @pytest.mark.parametrize("recode", [True, False])
    @pytest.mark.parametrize("as_dict", [True, False])
    @pytest.mark.parametrize("persons", [
        pdata['persons'],
    ])
    def test_makeHistRowsFromMultiSparse(self, persons, as_dict, recode):
        node = self.makeNode(persons)
        if as_dict:
            node = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))
        if recode:
            rows = makeHistRowsFromMultiSparse(
                node, self.schema, row_recoder=DHCPHHGQToMDFPersons2020Recoder)
            assert len(rows) == len(persons)
        else:
            rows = makeHistRowsFromMultiSparse(node,
                                               self.schema,
                                               add_schema_name=False)

        input_rows = ["|".join(map(str, row[:-1])) for row in persons]

        if not recode:
            match_cnt = 0
            for row in rows:
                row_str = "|".join([row[var] for var in self.schema.dimnames])
                for inp_row in input_rows:
                    if row_str == inp_row:
                        match_cnt += 1
                        input_rows.remove(inp_row)
                        break
            assert match_cnt == len(rows) == len(persons)

        else:
            assert len(rows) == len(persons)

    @pytest.mark.parametrize(
        "persons",
        [
            #(Household2010_testdata.data['data'], Household2010_testdata.data['units']),
            pdata['persons'],
        ])
    def test_makeHistRowsFromMultiSparseRecode(self, persons):
        node = self.makeNode(persons)
        rows = makeHistRowsFromMultiSparse(
            node.toDict(keep_attrs=(SYN, GEOCODE, INVAR)),
            self.schema,
            row_recoder=DHCPHHGQToMDFPersons2020Recoder)
        ## TODO: Some testing of the MDF spec output should probably be done here. Maybe on just one case. Maybe not, and just test it within the
        #   writer test below
        assert len(rows) == len(persons)

    def test_transformRDDForSaving(self, spark, dd_das_stub):
        dd_das_stub.reader = get_reader_stub()

        config = ConfigParser()
        config.read_string(self.config)
        import programs.das_setup as ds
        setup_instance = ds.DASDecennialSetup(config=config,
                                              name='setup',
                                              das=dd_das_stub)
        w = MDF2020PersonWriter(config=setup_instance.config,
                                setup=setup_instance,
                                name='writer',
                                das=dd_das_stub)

        persons = pdata['persons']
        node1 = self.makeNode(persons[:2], geocode='0123456789abcdef')
        node2 = self.makeNode(persons[2:], geocode='0123456789abcdeg')
        spark = SparkSession.builder.getOrCreate()
        node_rdd = spark.sparkContext.parallelize([node1, node2])
        df = w.transformRDDForSaving(node_rdd)
        df.show()

        assert df.count() == len(persons)

        for val in df.select('EPNUM').collect():
            assert val['EPNUM'] == 999999999

        for val in df.select('RELSHIP').collect():
            assert val['RELSHIP'] == '99'

        def len_cond(cond):
            return len(np.where(cond)[0])

        num_gq = len_cond(np.array(persons)[:, 0] > 0)

        rtype = np.array(df.select('RTYPE').collect())

        assert len_cond(rtype[:, 0] == '5') == num_gq
        assert len_cond(rtype[:, 0] == '3') == len(persons) - num_gq
    qps = qprops[strat_name]
    qns = queries[strat_name]
    dp_query_prop = dict((gl, qps[gl]) if gl in qps else (gl, qps['default'])
                         for gl in geolevels)
    query_names = dict((gl, qns[gl]) if gl in qns else (gl, qns['default'])
                       for gl in geolevels)
    levels = list(reversed(list(geolevel_prop_budgets_dict)))
    print(f"dp_query_prop: {dp_query_prop}")
    for k, vals in dp_query_prop.items():
        print(f"{k} : sum -> {sum(vals)}, len -> {len(vals)}")
    print(f"query_names: {query_names}")
    for k, qnames in query_names.items():
        print(f"{k} : len -> {len(qnames)}")

    schema = SchemaMaker.fromName(
        "PL94"
    )  # NOTE: units is actually H1, but irrelevant for geolevel calculations

    def query_iter(gl):
        for qname, qprop in zip(query_names[gl], dp_query_prop[gl]):
            yield schema.getQuery(qname), qprop

    attr_query_props = Budget.getAttrQueryProps(levels, schema.dimnames,
                                                query_iter)
    print(f"attr_query_props: {attr_query_props}")

    def getEpsFromGeoAlloc(geo_all_dict):
        return curve.zCDPEpsDeltaCurve(geo_all_dict,
                                       verbose=False).get_epsilon(float(delta),
                                                                  global_scale,
                                                                  bounded=True,
import analysis.tools.setuptools as setuptools
import analysis.tools.datatools as datatools
import analysis.tools.sdftools as sdftools
import analysis.tools.graphtools as graphtools
import analysis.tools.crosswalk as crosswalk

import matplotlib.pyplot as plt
import programs.cenrace as cenrace
from programs.schema.attributes.cenrace import CenraceAttr as CENRACE
import analysis.constants as AC
import constants as C
import seaborn as sns
from pyspark.sql import functions as sf
from pyspark.sql import Row
from programs.schema.schemas.schemamaker import SchemaMaker
schema = SchemaMaker.fromName("DHCP_SCHEMA")

from programs.schema.schemas.schemamaker import SchemaMaker

import programs.sparse as sp
import scipy.sparse as ss

if __name__ == "__main__":
    ################################################################
    # Set the save_location to your own JBID (and other folder(s))
    # it will automatically find your JBID
    # if something different is desired, just pass what is needed
    # into the setuptools.setup function.
    ################################################################
    jbid = os.environ.get('JBID', 'temp_jbid')
    save_folder = "plots/"
import numpy as np

from programs.constraints.tests.UnitSimpleRecodedTestdata import units
from programs.schema.schemas.schemamaker import SchemaMaker
from das_utils import table2hists

from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_HOUSEHOLD2010)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10)

# Need hhgq_cap to put correct answers into the data below
# hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape), invariants=(), constraint_names=()).hhgq_cap
# Testing data with answers (right hand sides of the constraints) TODO: NOTE THAT ALL THREE DATA SETS ARE THE SAME FOR NOW
data1 = {
    'data': [
        # each row is a household
        # (shape (2, 9, 2, 7, 8, 24, 2, 4, 2) + unit UID)
        #columns: 'hhsex', 'hhage', 'hisp', 'race', 'size', 'hhtype', 'elderly', 'multi', 'unit UID'
                    [1,     8,       1,      0,      1,      20,        0,        0,          0],
                    [0,     6,       0,      2,      1,      1,         2,        1,          1],
                    [1,     3,       0,      4,      1,      18,        1,        0,          2],
                    [0,     3,       0,      3,      7,      15,        3,        1,          3],
                    [1,     2,       0,      6,      0,      15,        1,        0,          4],
                    [1,     5,       0,      4,      1,      18,        0,        0,          2],
            ],
    'units' : units,


    # 6 total households
    'total': 6,
Example #23
spark_loglevel = "ERROR"
analysis = setuptools.setup(save_location=analysis_results_save_location, spark_loglevel=spark_loglevel)
spark = analysis.spark
S3_BASE="s3://uscb-decennial-ite-das/users"
save_location_linux = f"/mnt/users/rao00316/bias/"


path = [
    "s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP_reRun1/td16/"
]

runs = datatools.getDASRuns(path)

schema_name = "DHCP_HHGQ"

schema = SchemaMaker.fromName(name=schema_name)

experiment = analysis.make_experiment("DHCP", path)
df = experiment.getDF()
schema = experiment.schema

geolevels = [C.COUNTY] #, C.COUNTY, C.TRACT, C.BLOCK_GROUP, C.BLOCK, C.SLDL, C.SLDU, C.CD]

queries = ['total']


def NEbias(spark, df, geolevels, queries, schema):
    u = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    # 'priv' means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L.94-171 production
    u = u.withColumn('diff', sf.col('priv') - sf.col('orig'))
    z = u.groupby(['geolevel']).avg()
import numpy as np

from programs.constraints.Constraints_DHCP_HHGQ import ConstraintsCreator
from programs.schema.schemas.schemamaker import SchemaMaker
from programs.constraints.tests.UnitSimpleRecodedTestdata import units
from das_utils import table2hists

from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_REDUCED_DHCP_HHGQ)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10_SIMPLE_RECODED)
# Need hhgq_cap to put correct answers into the data below
hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape),
                              invariants=(),
                              constraint_names=()).hhgq_cap

# Testing data with answers (right hand sides of the constraints)
data1 = {
    'data':
    [  # columns: 'hhgq', 'sex', 'age', 'hispanic', 'cenrace', 'citizen', 'unit UID' (shape (8, 2, 116, 2, 63, 2) + unit UID)
        # each row is a person
        [0, 1, 10, 1, 20, 1, 0],
        [2, 0, 20, 0, 1, 0, 1],
        [5, 1, 45, 0, 10, 1, 2],
        [5, 0, 80, 0, 15, 0, 2],
        [0, 1, 90, 0, 15, 1, 3],
        [1, 1, 10, 1, 20, 1, 4],
        [1, 0, 20, 0, 1, 0, 5],
        [1, 1, 45, 0, 10, 1, 6],
        [1, 0, 80, 0, 15, 0, 7],
        [1, 1, 90, 0, 15, 1, 8]
import numpy as np

from programs.constraints.Constraints_DHCP import ConstraintsCreator
from programs.schema.schemas.schemamaker import SchemaMaker
from das_utils import table2hists

from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_DHCP)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10_DHCP)
# Need hhgq_cap to put correct answers into the data below
hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape),
                              invariants=(),
                              constraint_names=()).hhgq_cap

# Testing data with answers (right hand sides of the constraints)
data1 = {
    'units': [[3, 0], [0, 1], [18, 2], [0, 3], [5, 4], [23, 5], [5, 6], [9, 7],
              [0, 8]],
    'data':
    [  # columns: 'relgq', 'sex', 'age', 'hispanic', 'cenrace', 'unit UID' (shape (42, 2, 116, 2, 63) + unit UID)
        # each row is a person

        # there are 10 people
        # 9 units
        # 3 people in 3 different households (11,17,15)
        # 3 people in 2 different GQs of type 22
        # 1 person in each of the GQ types [20, 35, 40, 26]
        [3 + 17, 0, 67, 1, 29, 0],
        [11, 1, 46, 0, 1, 1],  # Grandchild
        [18 + 17, 1, 42, 0, 47, 2],
Example #26
def convertPL94(d):
    return table2hists(d, SchemaMaker.fromName(CC.SCHEMA_PL94), CC.ATTR_HHGQ)
Example #27
import numpy as np
import scipy.sparse as ss
from collections import Counter

from programs.constraints.Constraints_SF1 import ConstraintsCreator
from programs.schema.schemas.schemamaker import SchemaMaker
from das_utils import table2hists

from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_SF1)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10)
# Need hhgq_cap to put correct answers into the data below
hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape),
                              invariants=(),
                              constraint_names=()).hhgq_cap

units1 = [
    # each row is a unit
    # columns: 'hhgq','unit UID',
    [0, 0],
    [0, 1],
    [0, 2],
    [0, 3],
    [1, 4],
    [15, 5],
    [16, 6],
    [20, 7],
    [29, 8],
]
counts1 = Counter(np.array(units1)[:, 0])
Example #28
class TestMDF2020HouseholdWriter():

    config = """
[setup]
spark.name: DAS
spark.loglevel: ERROR

[schema]
schema: Household2010

[geodict]
geolevel_names: Stub
geolevel_leng: 1

[budget]
geolevel_budget_prop: 0.5, 0.5

[writer]
produce_flag: 1
output_path:
"""
    schema = SchemaMaker.fromName(CC.DAS_Household2010)
    unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10)

    def makeNode(self, hholds, units, geocode='0'):
        hhold_hist, unit_hist = (table2hists(np.array(hholds), self.schema),
                                 table2hists(np.array(units), self.unit_schema,
                                             CC.ATTR_HHGQ, units=True))

        invar = InvariantsMaker.make(schema=CC.SCHEMA_HOUSEHOLD2010,
                                     raw=hhold_hist,
                                     raw_housing=unit_hist,
                                     invariant_names=('tot', 'gqhh_vect'))
        cons = HHConstraintsCreator(
            hist_shape=(hhold_hist.shape, unit_hist.shape),
            invariants=invar,
            constraint_names=('no_vacant', 'living_alone',
                              'size2')).calculateConstraints().constraints_dict
        node = GeounitNode(raw=hhold_hist,
                           raw_housing=unit_hist,
                           invar=invar,
                           cons=cons,
                           geocode_dict={1: 'Stub'},
                           geocode=geocode)
        node.syn = node.raw

        return node

    @pytest.mark.parametrize("recode", [True, False])
    @pytest.mark.parametrize("as_dict", [True, False])
    @pytest.mark.parametrize("hholds, units", [
        (Household2010_testdata.data['data'],
         Household2010_testdata.data['units']),
        (hhdata['households'], hhdata['units']),
    ])
    def test_makeHistRowsFromMultiSparse(self, hholds, units, as_dict, recode):
        node = self.makeNode(hholds, units)
        if as_dict:
            node = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))
        if recode:
            rows = makeHistRowsFromMultiSparse(
                node,
                self.schema,
                row_recoder=Household2010ToMDFUnit2020Recoder)
            assert len(rows) == len(hholds)
            rows = addEmptyAndGQ(node,
                                 self.schema,
                                 rows,
                                 row_recoder=Household2010ToMDFUnit2020Recoder)
            assert len(rows) == len(units)
        else:
            rows = makeHistRowsFromMultiSparse(node,
                                               self.schema,
                                               add_schema_name=False)

        input_rows = ["|".join(map(str, row[:-1])) for row in hholds]

        if not recode:
            match_cnt = 0
            for row in rows:
                row_str = "|".join([row[var] for var in self.schema.dimnames])
                for inp_row in input_rows:
                    if row_str == inp_row:
                        match_cnt += 1
                        input_rows.remove(inp_row)
                        break
            assert match_cnt == len(rows) == len(hholds)

        else:
            assert len(rows) == len(units)

    @pytest.mark.parametrize("hholds, units", [
        (Household2010_testdata.data['data'],
         Household2010_testdata.data['units']),
        (hhdata['households'], hhdata['units']),
    ])
    def test_makeHistRowsFromMultiSparseRecode(self, hholds, units):
        node = self.makeNode(hholds, units)
        rows = makeHistRowsFromMultiSparse(
            node.toDict(keep_attrs=(SYN, GEOCODE, INVAR)),
            self.schema,
            row_recoder=Household2010ToMDFUnit2020Recoder)
        ## TODO: Some testing of the MDF spec output should probably be done here. Maybe on just one case. Maybe not, and just test it within the
        #   writer test below
        assert len(rows) == len(hholds)
        rows = addEmptyAndGQ(node,
                             self.schema,
                             rows,
                             row_recoder=Household2010ToMDFUnit2020Recoder)
        assert len(rows) == len(units)
        pass

    def test_transformRDDForSaving(self, spark, dd_das_stub):
        dd_das_stub.reader = get_reader_stub()

        config = ConfigParser()
        config.read_string(self.config)
        import programs.das_setup as ds
        setup_instance = ds.DASDecennialSetup(config=config,
                                              name='setup',
                                              das=dd_das_stub)
        w = MDF2020HouseholdWriter(config=setup_instance.config,
                                   setup=setup_instance,
                                   name='writer',
                                   das=dd_das_stub)

        hholds = hhdata['households']
        units = hhdata['units']
        node1 = self.makeNode(hholds[:4], units[:4], geocode='0')
        node2 = self.makeNode(hholds[4:], units[4:], geocode='1')
        spark = SparkSession.builder.getOrCreate()
        node_rdd = spark.sparkContext.parallelize([node1, node2])
        df = w.transformRDDForSaving(node_rdd)
        df.show()

        assert df.count() == len(units)

        for val in df.select('P18').collect():
            assert val['P18'] == 9

        for val in df.select('PAC').collect():
            assert val['PAC'] == '9'

        def len_cond(cond):
            return len(np.where(cond)[0])

        num_gq = len_cond(np.array(units)[:, 0] > 1)

        rtype = np.array(df.select('RTYPE').collect())

        assert len_cond(rtype[:, 0] == '4') == num_gq
        assert len_cond(rtype[:, 0] == '2') == len(units) - num_gq
Example #29
import numpy as np

from programs.schema.schemas.schemamaker import SchemaMaker
from programs.constraints.Constraints_PL94 import ConstraintsCreator
from das_utils import table2hists

from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_PL94)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10_SIMPLE_RECODED)
# Need hhgq_cap to put correct answers into the data below
hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape), invariants=(), constraint_names=()).hhgq_cap

# Testing data with answers (right hand sides of the constraints)
data1 = {
    'data': [   # columns: 'hhgq', 'votingage', 'hispanic', 'cenrace', 'unique unitid' (shape 8,2,2,63 + unitUID)
                # each row is a person
                [0, 1, 1, 20, 0],
                [1, 0, 0, 1, 1],
                [3, 1, 0, 10, 2],
                [3, 0, 0, 15, 2],
                [1, 1, 0, 15, 3]
            ],

    # 5 total people (5 rows)
    'total': 5,

    # 3 rows 1,3,5 have va=1
    'voting_age': 3,

    # 1 row (the 4th) has a nursing home (3 in the first column) and 0 (non-voting-age) in the second column
Example #30
    "run_17": "DA-Run B",
    "run_23": "DA-Run C"
}
run_ids.sort(key=lambda s: int(s.split("_")[1]))
for r, run_id in enumerate(run_ids):
    if r == 0:
        wide['State'] = df[df['run_id'] == run_id].reset_index()['State']
        wide['plb'] = df[df['run_id'] == run_id].reset_index()['plb']
        wide['agecat'] = df[df['run_id'] == run_id].reset_index()['agecat']
        wide['orig'] = df[df['run_id'] == run_id].reset_index()['orig']
    wide[run_id] = df[df['run_id'] == run_id].reset_index()['priv']
# 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production

widedf = pandas.DataFrame(wide)

schema = SchemaMaker.fromName(name="PL94_P12")
levels = schema.levels['agecat']

widedf['agecat'] = pandas.Categorical(widedf['agecat'], categories=levels)
widedf = widedf.sort_values('agecat')
widedf = widedf[['State', 'plb', 'agecat', 'orig'] +
                [f"run_{i}" for i in range(25)]]
widedf = widedf.reset_index()
widedf = widedf[widedf.columns[1:]]

widedf = widedf[["State", "plb", "agecat", "orig"] + list(keep_run_ids.keys())]
widedf = widedf.rename(columns=keep_run_ids)

va_ind = numpy.array(
    schema.getQuerySeed("voting").groupings['agecat']).flatten().tolist()