def __init__(self): self.privacy_framework = "zcdp" self.schema = "TEST" self.hist_shape = (2, ) self.unit_hist_shape = (2, ) self.hist_vars = ("sex", ) self.schema_obj = SchemaMaker.fromAttlist("justsex", [SexAttr]) self.unit_schema_obj = SchemaMaker.fromAttlist( "justsex", [SexAttr]) empty_dict = { 'invar_names': (), 'cons_names': (), } self.inv_con_by_level = { 'Block': empty_dict, } self.levels = CC.GEOLEVEL_BLOCK, CC.GEOLEVEL_BLOCK_GROUP, CC.GEOLEVEL_TRACT, "Tract_Group", CC.GEOLEVEL_COUNTY, CC.GEOLEVEL_STATE, CC.GEOLEVEL_US self.geo_bottomlevel = 'Block' self.spine_type = 'non_aian_spine' self.plb_allocation = None self.geocode_dict = {1: '2', 3: '4'} self.dp_mechanism_name = CC.DISCRETE_GAUSSIAN_MECHANISM self.postprocess_only = False self.use_spark = False self.geolevel_prop_budgets = (Fraction(1, 5), Fraction(1, 5), Fraction(3, 25), Fraction(3, 25), Fraction(3, 25), Fraction(3, 25), Fraction(3, 25)) self.only_dyadic_rationals = True # Doesn't matter true or false because the budgets are assigned directly
def getTableBuilder(testtables=None):
    """
    :param testtables: if True, run the testTableDefs function to ensure recodes from the same
        dimension aren't crossed in a table. This is useful for debugging that error, because
        it identifies the table and the cell of the list where the error occurs.
    :return: a TableBuilder for the reduced DHCP_HHGQ schema (None when testtables is True)
    """
    schema = SchemaMaker.fromName(CC.SCHEMA_REDUCED_DHCP_HHGQ)
    tabledict = getTableDict()
    if testtables:
        tablebuilder.testTableDefs(schema, tabledict)
    else:
        builder = tablebuilder.TableBuilder(schema, tabledict)
        return builder

############################################################
## Consolidated tables
############################################################
'''
def __init__(self): self.privacy_framework = "zcdp" self.schema = "PL94" self.hist_shape = (8, 2, 2, 63) self.unit_hist_shape = (2, ) self.hist_vars = ("hhgq", "votingage", "hisp", "cenrace") self.schema_obj = SchemaMaker.fromAttlist( "PL94", [HHGQ, VOTING_AGE, HISPANIC, CENRACE]) self.unit_schema_obj = SchemaMaker.fromAttlist("justsex", [SEX]) self.validate_input_data_constraints = False self.spine_type = 'non_aian_spine' self.plb_allocation = None self.geocode_dict = { 5: "Block", 4: "Block_Group", 3: "Tract", 2: "County", 1: "State", 0: "US" } self.dp_mechanism_name = CC.GEOMETRIC_MECHANISM self.inv_con_by_level = { 'Block': { 'invar_names': (), 'cons_names': (), }, 'Block_Group': { 'invar_names': (), 'cons_names': (), }, 'Tract': { 'invar_names': (), 'cons_names': (), }, 'County': { 'invar_names': (), 'cons_names': () }, 'State': { 'invar_names': (), 'cons_names': () }, 'US': { 'invar_names': (), 'cons_names': () } } self.levels = list(self.inv_con_by_level.keys()) self.geo_bottomlevel = 'Block' self.geolevel_prop_budgets = (Fraction(44, 1024), Fraction(44, 1024), Fraction(44 / 1024), Fraction(44, 1024), Fraction(127, 1024), Fraction(721, 1024)) self.postprocess_only = False self.only_dyadic_rationals = False
def __init__(self, hist_shape: Tuple[int, ...], invariants: __InvariantsDict__, constraint_names: Iterable[str]):
    self.invariants = invariants
    self.constraint_names = constraint_names
    self.hist_shape, self.unit_hist_shape = hist_shape
    # self.constraints_dict = {}
    self.constraints_dict = ConstraintDict()
    # self.constraint_funcs_dict = {}
    self.schema = SchemaMaker.fromName(self.schemaname)
    self.unit_schema = SchemaMaker.fromName(_unit_schema_dict[self.schemaname])
    assert self.hist_shape == self.schema.shape, \
        f"Histogram shape in data {self.hist_shape} doesn't correspond to histogram shape in chosen " \
        f"schema {self.schema.shape}"
    assert self.unit_hist_shape == self.unit_schema.shape, \
        f"Histogram shape in data {self.unit_hist_shape} doesn't correspond to histogram shape in chosen " \
        f"schema {self.unit_schema.shape}"
def __init__(self, data_path, schema_name, budget_group=None, run_id=None):
    """ .../data-run8.0-epsilon4.0-BlockNodeDicts/ """
    self.data_path = du.addslash(data_path)
    self.schema_name = schema_name
    self.schema = SchemaMaker.fromName(self.schema_name)

    # extract data from the data_path
    data_info = self.data_path.split("/")[-2]
    # assert data_info.startswith('data'), "The wrong data path has been provided... Cannot load DASrun"
    # TODO: Replace the assert above with something more appropriate ('data' was overly narrow)
    print(f"data_info.split(-): {data_info.split('-')}")
    # _, self.run_id, self.budget_group, _ = data_info.split('-')
    if budget_group is None:
        assert run_id is None
        self.parseDataInfo(data_info)
    else:
        assert run_id is not None
        self.budget_group = budget_group
        self.run_id = run_id
    self.run_num = self.run_id[3:].split('.')[0]
    self.plb = self.budget_group
    print(f"Detected plb, run_id: {self.plb}, {self.run_id}")
def makeNode(d, geocode, geocode_dict, addsyn=False, dpq_make=False, querydict=None, consn=consn, invn=invn):
    ph = convertPL94(d)
    syn = ph[0].toDense() + np.ones(np.prod(ph[0].shape)).reshape(ph[0].shape) if addsyn else None
    dpqdict = {
        C.DETAILED: DPquery(QueryFactory.makeTabularGroupQuery(array_dims=ph[0].shape),
                            GeometricMechanism(Fraction(1, 10), 2, ph[0].toDense()))
    } if dpq_make else {}
    if querydict:
        dpqdict.update({
            name: DPquery(query, GeometricMechanism(Fraction(1, 10), 2, query.answer(ph[0].toDense())))
            for name, query in SchemaMaker.fromName(CC.SCHEMA_PL94).getQueries(querydict).items()
        })
    inv_dict = makeInvs(ph, invn)
    return GeounitNode(geocode, raw=ph[0], raw_housing=ph[1], syn=syn,
                       cons=makeCons(ph, consn, inv_dict), invar=inv_dict,
                       geocode_dict=geocode_dict, dp_queries=dpqdict)
def getAllImplementedInvariantNames(schemaname):
    schema = SchemaMaker.fromName(schemaname)
    # We add 'tot_hu' for housing invariants, and 'tot' for persons invariants
    inv_names = ['tot_hu'] if ((CC.ATTR_HHSEX in schema.dimnames) or (schema.name == CC.SCHEMA_DHCH)) else ['tot']
    inv_names = inv_names + ['gqhh_tot', 'gq_vect', 'gqhh_vect']
    if CC.VOTING_TOTAL in schema.recodes:
        inv_names = inv_names + ['va']
    return inv_names
def make(schema: str, raw: __HistData__, raw_housing: __HistData__, invariant_names: Iterable[str]) -> Dict[str, np.ndarray]:
    """
    Makes the invariants dict, corresponding to person/household and unit schemas with raw data, containing the invariants listed
    :param schema: Person or household schema name, for which to make invariants
    :param raw: Raw person or household data
    :param raw_housing: Raw unit data
    :param invariant_names: which invariants to make
    :return: dict of invariants (presented as numpy arrays)
    """
    raw = InvariantsMaker.checkHistType(raw, "Person/Household")
    raw_housing = InvariantsMaker.checkHistType(raw_housing, "Unit")
    person_schema = SchemaMaker.fromName(schema)
    unit_schema = SchemaMaker.fromName(_unit_schema_dict[schema])
    schema_data_recodename = {
        # All schemas have these
        "gqhh_vect": (unit_schema, raw_housing, CC.HHGQ_UNIT_VECTOR),
        "gqhh_tot": (unit_schema, raw_housing, "total"),
        "gq_vect": (unit_schema, raw_housing, CC.HHGQ_UNIT_TOTALGQ),
        # Person schemas have these
        "tot": (person_schema, raw, "total"),
        "va": (person_schema, raw, CC.VOTING_TOTAL),
        # Household schemas have these
        "tot_hu": (unit_schema, raw_housing, CC.HHGQ_UNIT_HOUSING),
        # The CVAP schema has this:
        "pl94counts": (unit_schema, raw_housing, 'detailed'),
    }
    invariants_dict = {}
    for name in invariant_names:
        assert name in schema_data_recodename, f"Provided invariant name '{name}' is not implemented."
        schema, data, query_name = schema_data_recodename[name]
        query = schema.getQuery(query_name)
        invariants_dict[name] = np.array(query.answerWithShape(data)).astype(int)
    return invariants_dict
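# A minimal usage sketch of InvariantsMaker.make, with the data layout borrowed from the
# PL94 test further down in this section (the import paths here are assumptions, not taken
# from this file):
import numpy as np
from programs.schema.schemas.schemamaker import SchemaMaker
from das_utils import table2hists
from constants import CC

# Build person and unit histograms from a tiny person-level table
# (columns: 'hhgq', 'votingage', 'hispanic', 'cenrace', unit UID; one row per person),
# then compute the invariants from the raw histograms.
person_hist, housing_hist = table2hists(
    np.array([[0, 1, 1, 20, 0],
              [1, 0, 0, 1, 1]]),
    SchemaMaker.fromName(CC.SCHEMA_PL94), CC.ATTR_HHGQ)
inv_dict = InvariantsMaker.make(schema=CC.SCHEMA_PL94, raw=person_hist, raw_housing=housing_hist,
                                invariant_names=("tot", "gqhh_vect", "gqhh_tot", "va"))
print(inv_dict["tot"])  # total person count, as an integer numpy array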
def getConstraintsDataAnswersSchema(schemaname, data_answers_name, inv_names, cons_names):
    """ Return the constraints, the named data set from data-with-answers, and the schema """
    schema = SchemaMaker.fromName(schemaname)
    cc = dd.getConstraintsModule(schemaname).ConstraintsCreator
    data_answers = getattr(data[schemaname], data_answers_name)
    # Create the histograms, from them the invariants, and from those the constraints
    person, housing = data_answers["hist"]  # histData(data_answers["data"], schemaname)
    constraints = getConsDict((person, housing), inv(schemaname, person, housing, inv_names), cons_names, cc)
    return constraints, data_answers, schema
def __init__(self, engine):
    self.privacy_framework = "zcdp"
    self.schema = "TEST"
    self.hist_shape = (2,)
    self.unit_hist_shape = (2,)
    self.hist_vars = ("sex",)
    self.schema_obj = SchemaMaker.fromAttlist("justsex", [SexAttr])
    self.unit_schema_obj = SchemaMaker.fromAttlist("justsex", [SexAttr])
    self.validate_input_data_constraints = False
    self.spine_type = 'non_aian_spine'
    self.plb_allocation = None
    self.geocode_dict = {4: "Block", 3: "County", 1: "State"}
    self.dp_mechanism_name = CC.GEOMETRIC_MECHANISM
    self.inv_con_by_level = {
        'Block': {
            'invar_names': ('tot',) if engine == BottomUpEngine else (),
            'cons_names': ('total',) if engine == BottomUpEngine else (),
        },
        'County': {'invar_names': (), 'cons_names': ()},
        'State': {'invar_names': ('tot',), 'cons_names': ('total',)},
    }
    self.levels = list(self.inv_con_by_level.keys())
    self.geo_bottomlevel = 'Block'
    # Use the two-argument Fraction(1, 5) form; Fraction(1 / 5) would build the rational
    # from an inexact binary float (see the sketch below)
    self.geolevel_prop_budgets = (Fraction(1, 5), Fraction(1, 5), Fraction(3, 5))
    self.postprocess_only = False
    self.only_dyadic_rationals = False
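# Standalone sketch (standard library only) of why Fraction(1, 5) is used rather than
# Fraction(1 / 5) for the budget proportions above:
from fractions import Fraction

# Two-argument construction gives the exact rational:
print(Fraction(1, 5))                             # 1/5
# One-argument construction from a float converts the nearest binary double instead:
print(Fraction(1 / 5))                            # 3602879701896397/18014398509481984
print(Fraction(1, 5) == Fraction(1 / 5))          # False

# Dyadic rationals (power-of-two denominators) survive the float round-trip exactly,
# which is why Fraction(44 / 1024) in the PL94 config above was harmless, just inconsistent:
print(Fraction(44, 1024) == Fraction(44 / 1024))  # True

# The geolevel proportions are expected to sum to exactly 1:
print(Fraction(1, 5) + Fraction(1, 5) + Fraction(3, 5) == 1)  # True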
def main():
    spark = SparkSession.builder.appName('RI Redistricting Data - PL94_P12 - Extracting agecat totals').getOrCreate()
    experiments = [
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td10_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td1_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td3_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td025_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td05_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td001_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td01_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td2_1/",
    ]

    # add das_decennial zip file to the spark context (to be sent to the core nodes)
    spark.sparkContext.addPyFile("/mnt/users/moran331/das_decennial.zip")

    schema_name = "PL94_P12"
    schema = SchemaMaker.fromName(name=schema_name)

    for path in experiments:
        tree = treetools.RunTree(path)
        runs = tree.runs
        for r, run in enumerate(runs):
            if r == 0:
                df = sdftools.getSparseDF(spark, run, schema, run_columns=True).persist()
            else:
                df = df.union(sdftools.getSparseDF(spark, run, schema, run_columns=True)).persist()
        df = df.persist()

        plb = str(du.algset2plb(du.findallSciNotationNumbers(tree.algsets[0])[0]))
        df = df.withColumn("plb", sf.lit(plb))
        df.show()

        geodict = aggtools.getGeodict()
        geolevel = "State"
        mapping = geodict[geolevel]
        group = ["plb", "run_id", geolevel] + schema.dimnames
        # aggregate to the geolevel by truncating the geocode to its prefix
        geodf = df.withColumn(geolevel, df.geocode[0:mapping]).groupBy(group).sum().persist()
        geodf = sdftools.stripSQLFromColumns(geodf)
        geodf.show()

        queryname = "votingage"
        querydf = sdftools.getQueryDF(geodf, queryname, schema, basegroup=["run_id", geolevel, "plb"]).persist()
        querydf.show()

        savepath = f"/mnt/users/moran331/redistricting/agecat_redistricting_state_totals_2019_06_27/{tree.algsets[0]}/"
        du.makePath(savepath)
        querydf.toPandas().to_csv(savepath + "votingage_all_25_runs.csv", index=False)
        print(f"---Agecat--- | \n{querydf.toPandas().to_string()}")
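# The geolevel roll-up above hinges on one fact: a higher-level geocode is a fixed-length
# prefix of the block geocode, so df.geocode[0:mapping] truncates blocks to that level.
# A standalone pure-Python sketch of the same idea (the geocodes and counts are made up):
block_counts = {"4400700010111000": 5, "4400700010112000": 7, "2500100010111000": 2}
state_len = 2  # 'State' is the first 2 characters of the geocode
state_totals = {}
for code, cnt in block_counts.items():
    prefix = code[:state_len]
    state_totals[prefix] = state_totals.get(prefix, 0) + cnt
print(state_totals)  # {'44': 12, '25': 2}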
def test_wconstraints():
    geocode_dict = {16: 'Block', 12: 'Block_Group', 11: 'Tract', 5: 'County'}
    histogram, housing_hist = table2hists(
        np.array([
            # columns: 'hhgq', 'votingage', 'hispanic', 'cenrace', 'unique unitid' (shape 8,2,2,63 + unitUID)
            # each row is a person
            [0, 1, 1, 20, 0],
            [1, 0, 0, 1, 1],
            [3, 1, 0, 10, 2],
            [3, 0, 0, 15, 2],
            [1, 1, 0, 15, 3]
        ]),
        SchemaMaker.fromName(CC.SCHEMA_PL94), CC.ATTR_HHGQ)
    inv_dict = InvariantsMaker.make(schema=CC.SCHEMA_PL94, raw=histogram, raw_housing=housing_hist,
                                    invariant_names=("tot", "gqhh_vect", "gqhh_tot", "va"))
    con_dict = cPL94.ConstraintsCreator(hist_shape=(histogram.shape, housing_hist.shape),
                                        invariants=inv_dict,
                                        constraint_names=("total",))\
        .calculateConstraints().constraints_dict
    n1 = nodes.GeounitNode(geocode='123456789abcdefg', geocode_dict=geocode_dict,
                           raw=histogram, raw_housing=housing_hist, cons=con_dict, invar=inv_dict)
    n2 = nodes.GeounitNode(geocode='123456789abcdefg', geocode_dict=geocode_dict,
                           raw=histogram, raw_housing=housing_hist, cons=con_dict, invar=inv_dict)
    assert n1 == n2
def test_impact_gaps():
    for sname, strat in StrategySelector.strategies.items():
        try:
            schema = SchemaMaker.fromName(strat.schema)
        except AttributeError:
            raise AttributeError(f"Strategy {sname} doesn't have a schema attribute. Needed to check for impact gaps (to create queries from the schema)")
        s = strat.make(strat.levels)
        for level, qnames in s[CC.DPQUERIES].items():
            for qname in qnames:
                query = schema.getQuery(qname)
                # This is just the sum
                # impact = (np.ones(query.numAnswers()) @ np.abs(query.matrixRep()))
                # factor of eps/sens doesn't matter here
                impact = np.abs(query.matrixRep()).sum(axis=0)
                # total_impact += impact * prop  # to do this, need to do composition, multiplying
                # by proportion, like here; only works for pure, epsilon-DP
                impmin, impmax = impact.min(), impact.max()
                if abs(impmin - impmax) > 1e-7:
                    print(f"{qname} ~ Impact\n {'':50} Min: {impmin}, Max: {impmax}, All: {impact}")
                    raise ValueError(f"There is an impact gap underutilizing parallel composition in query {qname}, geolevel {level}, in strategy {sname}")
                # Having both checks below is redundant, but both are included for clarity and future flexibility
                if impmin != 1:
                    raise ValueError(f"Some histogram cells are under-measured in query {qname}, geolevel {level}, in strategy {sname}")
                if impmax != 1:
                    raise ValueError(f"Some histogram cells are measured more than once in query {qname}, geolevel {level}, in strategy {sname}")
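# What test_impact_gaps computes, as a standalone numpy sketch: a query's matrix
# representation maps histogram cells (columns) to query answers (rows), and the column
# sums of its absolute value count how many times each cell is measured. The toy matrices
# below are stand-ins for query.matrixRep(), not real strategy queries:
import numpy as np

total_query = np.ones((1, 4))         # one answer that measures every cell once
marginal = np.array([[1, 1, 0, 0],
                     [0, 0, 1, 1]])   # two answers, each cell still measured once
lopsided = np.array([[1, 1, 0, 0],
                     [1, 0, 1, 1]])   # first cell measured twice: an impact gap

for name, q in (("total", total_query), ("marginal", marginal), ("lopsided", lopsided)):
    impact = np.abs(q).sum(axis=0)
    ok = impact.min() == impact.max() == 1
    print(name, impact, "parallel composition OK" if ok else "impact gap!")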
def getTableBuilder():
    schema = SchemaMaker.buildSchema()
    tabledict = getTableDict()
    builder = tablebuilder.TableBuilder(schema, tabledict)
    return builder
def getTableBuilder():
    schema = SchemaMaker.fromName(CC.SCHEMA_SF1)
    tabledict = getTableDict()
    builder = tablebuilder.TableBuilder(schema, tabledict)
    return builder
                           spark_loglevel=spark_loglevel)

# save the analysis script?
# toggle to_linux=True|False to save|not save this analysis script locally
# toggle to_s3=True|False to save|not save this analysis script to s3
analysis.save_analysis_script(to_linux=False, to_s3=False)

# save/copy the log file?
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark
schema = SchemaMaker.fromName(CC.SCHEMA_REDUCED_DHCP_HHGQ)

pp10_gq_edited = spark.read.csv("s3://uscb-decennial-ite-das/2010/cef/pp10_gq_edited.csv")
pp10_hu_edited = spark.read.csv("s3://uscb-decennial-ite-das/2010/cef/pp10_hu_edited.csv")
pp10_grf = spark.read.csv("s3://uscb-decennial-ite-das/2010/cef/pp10_grf_tab.csv")

# Clean up the gq file by removing extra characters like b and ' from the data,
# renaming columns, and dropping unneeded columns
gq_revised = pp10_gq_edited.withColumn(
    "final_pop", sf.regexp_replace(sf.col("_c9"), "[b']", "")).withColumn(
    "FGQ", sf.regexp_replace(sf.col("_c6"), "[b']", "")).withColumn(
    "PEG",
import numpy as np

from programs.constraints.tests.UnitTVGTestdata import units
from programs.schema.schemas.schemamaker import SchemaMaker, _unit_schema_dict
from das_utils import table2hists
from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_HOUSEHOLD2010_TENVACS)
unit_schema = SchemaMaker.fromName(_unit_schema_dict[CC.SCHEMA_HOUSEHOLD2010_TENVACS])

# Need hhgq_cap to put correct answers into the data below
# hhgq_cap = ConstraintsCreator(invariants=(), constraint_names=()).hhgq_cap

# Testing data with answers (right hand sides of the constraints)
data1 = {
    'data': [
        # each row is a household (shape (2, 9, 2, 7, 8, 24, 2, 4, 2) + unit UID)
        # columns: 'hhsex', 'hhage', 'hisp', 'race', 'size', 'hhtype', 'elderly', 'multi', 'ten2lev', 'unit UID'
        [1, 8, 1, 0, 1, 20, 0, 0, 0, 0],
        [0, 6, 0, 2, 1, 1, 2, 1, 1, 1],
        [1, 3, 0, 4, 1, 18, 1, 0, 1, 2],
        [0, 3, 0, 3, 7, 15, 3, 1, 1, 3],
        [1, 2, 0, 6, 0, 15, 1, 0, 0, 4],
    ],
    'units': units,
    # 5 total households
    'total':
import numpy as np

from programs.schema.schemas.schemamaker import SchemaMaker
from programs.constraints.tests.UnitSimpleRecodedTestdata import units
from das_utils import table2hists
from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_TEN_UNIT_2010)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10)

# Need hhgq_cap to put correct answers into the data below
# hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape), invariants=(), constraint_names=()).hhgq_cap

# Testing data with answers (right hand sides of the constraints)
# TODO: NOTE THAT ALL THREE DATA SETS ARE THE SAME FOR NOW
data1 = {
    'data': [
        # each row is a household (shape (2, 9, 2, 7, 8, 24, 2, 4, 2, 4) + unit UID)
        # columns: 'hhsex', 'hhage', 'hisp', 'race', 'size', 'hhtype', 'elderly', 'multi', 'tenure', 'unit UID'
        [1, 8, 1, 0, 1, 20, 0, 0, 0, 0],
        [0, 6, 0, 2, 1, 1, 2, 1, 1, 1],
        [1, 3, 0, 4, 1, 18, 1, 0, 3, 2],
        [0, 3, 0, 3, 7, 15, 3, 1, 1, 2],
        [1, 2, 0, 6, 0, 15, 1, 0, 2, 3],
        [1, 5, 0, 4, 1, 18, 0, 0, 0, 2],
    ],
    'units': units,
    # 5 total households
    'total':
class TestMDF2020PersonWriter:

    config = """
[setup]
spark.name: DAS
spark.loglevel: ERROR

[schema]
schema: DHCP_HHGQ

[geodict]
geolevel_names: Block,Block_Group,Tract,Tract_Group,County,State,US
geolevel_leng: 16,12,11,9,5,2,0

[budget]
geolevel_budget_prop: 0.5, 0.5

[writer]
produce_flag: 1
output_path:
"""

    schema = SchemaMaker.fromName(CC.DAS_DHCP_HHGQ)

    def makeNode(self, persons, geocode='0123456789abcdef'):
        person_hist, unit_hist = table2hists(np.array(persons), self.schema, housing_varname=CC.ATTR_HHGQ)
        invar = InvariantsMaker.make(schema=CC.DAS_DHCP_HHGQ, raw=person_hist, raw_housing=unit_hist,
                                     invariant_names=('tot', 'gqhh_tot', 'gqhh_vect'))
        cons = PConstraintsCreator(
            hist_shape=(person_hist.shape, unit_hist.shape),
            invariants=invar,
            constraint_names=('hhgq_total_lb', 'hhgq_total_ub', 'nurse_nva_0')
        ).calculateConstraints().constraints_dict
        node = GeounitNode(raw=person_hist, raw_housing=unit_hist, invar=invar, cons=cons,
                           geocode_dict={16: 'Block', 12: 'Block_Group', 11: 'Tract',
                                         5: 'County', 2: 'State', 1: 'US'},
                           geocode=geocode)
        node.syn = node.raw
        return node

    @pytest.mark.parametrize("recode", [True, False])
    @pytest.mark.parametrize("as_dict", [True, False])
    @pytest.mark.parametrize("persons", [
        pdata['persons'],
    ])
    def test_makeHistRowsFromMultiSparse(self, persons, as_dict, recode):
        node = self.makeNode(persons)
        if as_dict:
            node = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))
        if recode:
            rows = makeHistRowsFromMultiSparse(node, self.schema, row_recoder=DHCPHHGQToMDFPersons2020Recoder)
            assert len(rows) == len(persons)
        else:
            rows = makeHistRowsFromMultiSparse(node, self.schema, add_schema_name=False)
        input_rows = ["|".join(map(str, row[:-1])) for row in persons]
        if not recode:
            match_cnt = 0
            for row in rows:
                row_str = "|".join([row[var] for var in self.schema.dimnames])
                for inp_row in input_rows:
                    if row_str == inp_row:
                        match_cnt += 1
                        input_rows.remove(inp_row)
                        break
            assert match_cnt == len(rows) == len(persons)
        else:
            assert len(rows) == len(persons)

    @pytest.mark.parametrize("persons", [
        # (Household2010_testdata.data['data'], Household2010_testdata.data['units']),
        pdata['persons'],
    ])
    def test_makeHistRowsFromMultiSparseRecode(self, persons):
        node = self.makeNode(persons)
        rows = makeHistRowsFromMultiSparse(node.toDict(keep_attrs=(SYN, GEOCODE, INVAR)),
                                           self.schema, row_recoder=DHCPHHGQToMDFPersons2020Recoder)
        # TODO: Some testing of the MDF spec output should probably be done here, maybe on just one case.
        # Maybe not, and just test it within the writer test below
        assert len(rows) == len(persons)

    def test_transformRDDForSaving(self, spark, dd_das_stub):
        dd_das_stub.reader = get_reader_stub()

        config = ConfigParser()
        config.read_string(self.config)

        import programs.das_setup as ds
        setup_instance = ds.DASDecennialSetup(config=config, name='setup', das=dd_das_stub)

        w = MDF2020PersonWriter(config=setup_instance.config, setup=setup_instance, name='writer', das=dd_das_stub)

        persons = pdata['persons']
        node1 = self.makeNode(persons[:2], geocode='0123456789abcdef')
        node2 = self.makeNode(persons[2:], geocode='0123456789abcdeg')

        spark = SparkSession.builder.getOrCreate()
        node_rdd = spark.sparkContext.parallelize([node1, node2])
        df = w.transformRDDForSaving(node_rdd)
        df.show()

        assert df.count() == len(persons)

        for val in df.select('EPNUM').collect():
            assert val['EPNUM'] == 999999999

        for val in df.select('RELSHIP').collect():
            assert val['RELSHIP'] == '99'

        def len_cond(cond):
            return len(np.where(cond)[0])

        num_gq = len_cond(np.array(persons)[:, 0] > 0)
        rtype = np.array(df.select('RTYPE').collect())
        assert len_cond(rtype[:, 0] == '5') == num_gq
        assert len_cond(rtype[:, 0] == '3') == len(persons) - num_gq
qps = qprops[strat_name]
qns = queries[strat_name]
dp_query_prop = dict((gl, qps[gl]) if gl in qps else (gl, qps['default']) for gl in geolevels)
query_names = dict((gl, qns[gl]) if gl in qns else (gl, qns['default']) for gl in geolevels)
levels = list(reversed(list(geolevel_prop_budgets_dict)))

print(f"dp_query_prop: {dp_query_prop}")
for k, vals in dp_query_prop.items():
    print(f"{k} : sum -> {sum(vals)}, len -> {len(vals)}")
print(f"query_names: {query_names}")
for k, qnames in query_names.items():
    print(f"{k} : len -> {len(qnames)}")

schema = SchemaMaker.fromName("PL94")  # NOTE: the units schema is actually H1, but that is irrelevant for geolevel calculations

def query_iter(gl):
    for qname, qprop in zip(query_names[gl], dp_query_prop[gl]):
        yield schema.getQuery(qname), qprop

attr_query_props = Budget.getAttrQueryProps(levels, schema.dimnames, query_iter)
print(f"attr_query_props: {attr_query_props}")

def getEpsFromGeoAlloc(geo_all_dict):
    return curve.zCDPEpsDeltaCurve(geo_all_dict, verbose=False).get_epsilon(float(delta), global_scale, bounded=True,
import os

import analysis.tools.setuptools as setuptools
import analysis.tools.datatools as datatools
import analysis.tools.sdftools as sdftools
import analysis.tools.graphtools as graphtools
import analysis.tools.crosswalk as crosswalk

import matplotlib.pyplot as plt

import programs.cenrace as cenrace
from programs.schema.attributes.cenrace import CenraceAttr as CENRACE

import analysis.constants as AC
import constants as C

import seaborn as sns

from pyspark.sql import functions as sf
from pyspark.sql import Row

from programs.schema.schemas.schemamaker import SchemaMaker
schema = SchemaMaker.fromName("DHCP_SCHEMA")

import programs.sparse as sp
import scipy.sparse as ss

if __name__ == "__main__":
    ################################################################
    # Set the save_location to your own JBID (and other folder(s));
    # it will automatically find your JBID.
    # If something different is desired, just pass what is needed
    # into the setuptools.setup function.
    ################################################################
    jbid = os.environ.get('JBID', 'temp_jbid')
    save_folder = "plots/"
import numpy as np

from programs.constraints.tests.UnitSimpleRecodedTestdata import units
from programs.schema.schemas.schemamaker import SchemaMaker
from das_utils import table2hists
from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_HOUSEHOLD2010)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10)

# Need hhgq_cap to put correct answers into the data below
# hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape), invariants=(), constraint_names=()).hhgq_cap

# Testing data with answers (right hand sides of the constraints)
# TODO: NOTE THAT ALL THREE DATA SETS ARE THE SAME FOR NOW
data1 = {
    'data': [
        # each row is a household (shape (2, 9, 2, 7, 8, 24, 2, 4, 2) + unit UID)
        # columns: 'hhsex', 'hhage', 'hisp', 'race', 'size', 'hhtype', 'elderly', 'multi', 'unit UID'
        [1, 8, 1, 0, 1, 20, 0, 0, 0],
        [0, 6, 0, 2, 1, 1, 2, 1, 1],
        [1, 3, 0, 4, 1, 18, 1, 0, 2],
        [0, 3, 0, 3, 7, 15, 3, 1, 3],
        [1, 2, 0, 6, 0, 15, 1, 0, 4],
        [1, 5, 0, 4, 1, 18, 0, 0, 2],
    ],
    'units': units,
    # 5 total households
    'total': 6,
spark_loglevel = "ERROR" analysis = setuptools.setup(save_location=analysis_results_save_location, spark_loglevel=spark_loglevel) spark = analysis.spark S3_BASE="s3://uscb-decennial-ite-das/users" save_location_linux = f"/mnt/users/rao00316/bias/" path = [ "s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP_reRun1/td16/" ] runs = datatools.getDASRuns(path) schema_name = "DHCP_HHGQ" schema = SchemaMaker.fromName(name=schema_name) experiment = analysis.make_experiment("DHCP", path) df = experiment.getDF() schema = experiment.schema geolevels = [C.COUNTY] #, C.COUNTY, C.TRACT, C.BLOCK_GROUP, C.BLOCK, C.SLDL, C.SLDU, C.CD] queries = ['total'] def NEbias(spark,df,geolevels,queries,schema): u=sdftools.getAnswers(spark,df,geolevels,schema,queries) # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production u=u.withColumn('diff',sf.col('priv')-sf.col('orig')) z=u.groupby(['geolevel']).avg()
import numpy as np

from programs.constraints.Constraints_DHCP_HHGQ import ConstraintsCreator
from programs.schema.schemas.schemamaker import SchemaMaker
from programs.constraints.tests.UnitSimpleRecodedTestdata import units
from das_utils import table2hists
from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_REDUCED_DHCP_HHGQ)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10_SIMPLE_RECODED)

# Need hhgq_cap to put correct answers into the data below
hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape), invariants=(), constraint_names=()).hhgq_cap

# Testing data with answers (right hand sides of the constraints)
data1 = {
    'data': [
        # columns: 'hhgq', 'sex', 'age', 'hispanic', 'cenrace', 'citizen', 'unit UID' (shape (8, 2, 116, 2, 63, 2) + unit UID)
        # each row is a person
        [0, 1, 10, 1, 20, 1, 0],
        [2, 0, 20, 0, 1, 0, 1],
        [5, 1, 45, 0, 10, 1, 2],
        [5, 0, 80, 0, 15, 0, 2],
        [0, 1, 90, 0, 15, 1, 3],
        [1, 1, 10, 1, 20, 1, 4],
        [1, 0, 20, 0, 1, 0, 5],
        [1, 1, 45, 0, 10, 1, 6],
        [1, 0, 80, 0, 15, 0, 7],
        [1, 1, 90, 0, 15, 1, 8]
import numpy as np

from programs.constraints.Constraints_DHCP import ConstraintsCreator
from programs.schema.schemas.schemamaker import SchemaMaker
from das_utils import table2hists
from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_DHCP)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10_DHCP)

# Need hhgq_cap to put correct answers into the data below
hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape), invariants=(), constraint_names=()).hhgq_cap

# Testing data with answers (right hand sides of the constraints)
data1 = {
    'units': [[3, 0], [0, 1], [18, 2], [0, 3], [5, 4], [23, 5], [5, 6], [9, 7], [0, 8]],
    'data': [
        # columns: 'relgq', 'sex', 'age', 'hispanic', 'cenrace', 'unit UID' (shape (42, 2, 116, 2, 63) + unit UID)
        # each row is a person; there are 10 people in 9 units:
        #   3 people in 3 different households (11, 17, 15)
        #   3 people in 2 different GQs of type 22
        #   1 person in 1 of each GQ in [20, 35, 40, 26]
        [3 + 17, 0, 67, 1, 29, 0],
        [11, 1, 46, 0, 1, 1],  # Grandchild
        [18 + 17, 1, 42, 0, 47, 2],
def convertPL94(d):
    return table2hists(d, SchemaMaker.fromName(CC.SCHEMA_PL94), CC.ATTR_HHGQ)
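# A minimal usage sketch of convertPL94 (the row layout is borrowed from the PL94 tests in
# this section; the returned values are the sparse histogram objects produced by table2hists):
import numpy as np

person_table = np.array([[0, 1, 1, 20, 0],   # 'hhgq', 'votingage', 'hispanic', 'cenrace', unit UID
                         [1, 0, 0, 1, 1]])
person_hist, unit_hist = convertPL94(person_table)
print(person_hist.shape)  # (8, 2, 2, 63), the PL94 person-histogram shape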
import numpy as np
import scipy.sparse as ss
from collections import Counter

from programs.constraints.Constraints_SF1 import ConstraintsCreator
from programs.schema.schemas.schemamaker import SchemaMaker
from das_utils import table2hists
from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_SF1)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10)

# Need hhgq_cap to put correct answers into the data below
hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape), invariants=(), constraint_names=()).hhgq_cap

units1 = [
    # each row is a unit; columns: 'hhgq', 'unit UID'
    [0, 0],
    [0, 1],
    [0, 2],
    [0, 3],
    [1, 4],
    [15, 5],
    [16, 6],
    [20, 7],
    [29, 8],
]

counts1 = Counter(np.array(units1)[:, 0])
class TestMDF2020HouseholdWriter:

    config = """
[setup]
spark.name: DAS
spark.loglevel: ERROR

[schema]
schema: Household2010

[geodict]
geolevel_names: Stub
geolevel_leng: 1

[budget]
geolevel_budget_prop: 0.5, 0.5

[writer]
produce_flag: 1
output_path:
"""

    schema = SchemaMaker.fromName(CC.DAS_Household2010)
    unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10)

    def makeNode(self, hholds, units, geocode='0'):
        hhold_hist, unit_hist = (table2hists(np.array(hholds), self.schema),
                                 table2hists(np.array(units), self.unit_schema, CC.ATTR_HHGQ, units=True))
        invar = InvariantsMaker.make(schema=CC.SCHEMA_HOUSEHOLD2010, raw=hhold_hist, raw_housing=unit_hist,
                                     invariant_names=('tot', 'gqhh_vect'))
        cons = HHConstraintsCreator(
            hist_shape=(hhold_hist.shape, unit_hist.shape),
            invariants=invar,
            constraint_names=('no_vacant', 'living_alone', 'size2')
        ).calculateConstraints().constraints_dict
        node = GeounitNode(raw=hhold_hist, raw_housing=unit_hist, invar=invar, cons=cons,
                           geocode_dict={1: 'Stub'}, geocode=geocode)
        node.syn = node.raw
        return node

    @pytest.mark.parametrize("recode", [True, False])
    @pytest.mark.parametrize("as_dict", [True, False])
    @pytest.mark.parametrize("hholds, units", [
        (Household2010_testdata.data['data'], Household2010_testdata.data['units']),
        (hhdata['households'], hhdata['units']),
    ])
    def test_makeHistRowsFromMultiSparse(self, hholds, units, as_dict, recode):
        node = self.makeNode(hholds, units)
        if as_dict:
            node = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))
        if recode:
            rows = makeHistRowsFromMultiSparse(node, self.schema, row_recoder=Household2010ToMDFUnit2020Recoder)
            assert len(rows) == len(hholds)
            rows = addEmptyAndGQ(node, self.schema, rows, row_recoder=Household2010ToMDFUnit2020Recoder)
            assert len(rows) == len(units)
        else:
            rows = makeHistRowsFromMultiSparse(node, self.schema, add_schema_name=False)
        input_rows = ["|".join(map(str, row[:-1])) for row in hholds]
        if not recode:
            match_cnt = 0
            for row in rows:
                row_str = "|".join([row[var] for var in self.schema.dimnames])
                for inp_row in input_rows:
                    if row_str == inp_row:
                        match_cnt += 1
                        input_rows.remove(inp_row)
                        break
            assert match_cnt == len(rows) == len(hholds)
        else:
            assert len(rows) == len(units)

    @pytest.mark.parametrize("hholds, units", [
        (Household2010_testdata.data['data'], Household2010_testdata.data['units']),
        (hhdata['households'], hhdata['units']),
    ])
    def test_makeHistRowsFromMultiSparseRecode(self, hholds, units):
        node = self.makeNode(hholds, units)
        rows = makeHistRowsFromMultiSparse(node.toDict(keep_attrs=(SYN, GEOCODE, INVAR)),
                                           self.schema, row_recoder=Household2010ToMDFUnit2020Recoder)
        # TODO: Some testing of the MDF spec output should probably be done here, maybe on just one case.
        # Maybe not, and just test it within the writer test below
        assert len(rows) == len(hholds)
        rows = addEmptyAndGQ(node, self.schema, rows, row_recoder=Household2010ToMDFUnit2020Recoder)
        assert len(rows) == len(units)

    def test_transformRDDForSaving(self, spark, dd_das_stub):
        dd_das_stub.reader = get_reader_stub()

        config = ConfigParser()
        config.read_string(self.config)

        import programs.das_setup as ds
        setup_instance = ds.DASDecennialSetup(config=config, name='setup', das=dd_das_stub)

        w = MDF2020HouseholdWriter(config=setup_instance.config, setup=setup_instance, name='writer', das=dd_das_stub)

        hholds = hhdata['households']
        units = hhdata['units']
        node1 = self.makeNode(hholds[:4], units[:4], geocode='0')
        node2 = self.makeNode(hholds[4:], units[4:], geocode='1')

        spark = SparkSession.builder.getOrCreate()
        node_rdd = spark.sparkContext.parallelize([node1, node2])
        df = w.transformRDDForSaving(node_rdd)
        df.show()

        assert df.count() == len(units)

        for val in df.select('P18').collect():
            assert val['P18'] == 9

        for val in df.select('PAC').collect():
            assert val['PAC'] == '9'

        def len_cond(cond):
            return len(np.where(cond)[0])

        num_gq = len_cond(np.array(units)[:, 0] > 1)
        rtype = np.array(df.select('RTYPE').collect())
        assert len_cond(rtype[:, 0] == '4') == num_gq
        assert len_cond(rtype[:, 0] == '2') == len(units) - num_gq
import numpy as np

from programs.schema.schemas.schemamaker import SchemaMaker
from programs.constraints.Constraints_PL94 import ConstraintsCreator
from das_utils import table2hists
from constants import CC

schema = SchemaMaker.fromName(CC.SCHEMA_PL94)
unit_schema = SchemaMaker.fromName(CC.SCHEMA_UNIT_TABLE_10_SIMPLE_RECODED)

# Need hhgq_cap to put correct answers into the data below
hhgq_cap = ConstraintsCreator(hist_shape=(schema.shape, unit_schema.shape), invariants=(), constraint_names=()).hhgq_cap

# Testing data with answers (right hand sides of the constraints)
data1 = {
    'data': [
        # columns: 'hhgq', 'votingage', 'hispanic', 'cenrace', 'unique unitid' (shape 8,2,2,63 + unitUID)
        # each row is a person
        [0, 1, 1, 20, 0],
        [1, 0, 0, 1, 1],
        [3, 1, 0, 10, 2],
        [3, 0, 0, 15, 2],
        [1, 1, 0, 15, 3]
    ],
    # 5 total people (5 rows)
    'total': 5,
    # 3 rows (1, 3, 5) have va=1
    'voting_age': 3,
    # 1 row (the 4th) with nursing home (3 in the first column) and 0 (va) in the second column
"run_17": "DA-Run B", "run_23": "DA-Run C" } run_ids.sort(key=lambda s: int(s.split("_")[1])) for r, run_id in enumerate(run_ids): if r == 0: wide['State'] = df[df['run_id'] == run_id].reset_index()['State'] wide['plb'] = df[df['run_id'] == run_id].reset_index()['plb'] wide['agecat'] = df[df['run_id'] == run_id].reset_index()['agecat'] wide['orig'] = df[df['run_id'] == run_id].reset_index()['orig'] wide[run_id] = df[df['run_id'] == run_id].reset_index()['priv'] # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production widedf = pandas.DataFrame(wide) schema = SchemaMaker.fromName(name="PL94_P12") levels = schema.levels['agecat'] widedf['agecat'] = pandas.Categorical(widedf['agecat'], categories=levels) widedf = widedf.sort_values('agecat') widedf = widedf[['State', 'plb', 'agecat', 'orig'] + [f"run_{i}" for i in range(25)]] widedf = widedf.reset_index() widedf = widedf[widedf.columns[1:]] widedf = widedf[["State", "plb", "agecat", "orig"] + list(keep_run_ids.keys())] widedf = widedf.rename(columns=keep_run_ids) va_ind = numpy.array( schema.getQuerySeed("voting").groupings['agecat']).flatten().tolist()