def _transform(self, mdf):
    df = mdf
    df = self._recode_geocode(df).persist()
    if self.debugmode:
        sdftools.show(df, "Priv DF after recoding to get geocode variable")
    df = self._recode_schema(df).persist()
    if self.debugmode:
        sdftools.show(df, "Priv DF after recoding schema variables")
    df = self._add_metadata_columns(df).persist()
    if self.debugmode:
        sdftools.show(df, "Priv DF after adding the metadata columns")
    df = self._drop_unneeded_columns(df).persist()
    if self.debugmode:
        sdftools.show(df, "Priv DF after dropping the columns not needed for Analysis")
    df = self._get_counts(df).persist()
    if self.debugmode:
        sdftools.show(df, "Priv DF after counting number of records per record type (i.e. after forming the sparse histogram)")
    return df
def _recode_hhgq(self, df):
    hhgq_sql = ['case']
    hhgq_sql += ["when RTYPE = '3' and GQTYPE = '000' then '0'"]
    hhgq_sql += ["when RTYPE = '5' and GQTYPE = '101' then '1'"]
    hhgq_sql += ["when RTYPE = '5' and GQTYPE = '201' then '2'"]
    hhgq_sql += ["when RTYPE = '5' and GQTYPE = '301' then '3'"]
    hhgq_sql += ["when RTYPE = '5' and GQTYPE = '401' then '4'"]
    hhgq_sql += ["when RTYPE = '5' and GQTYPE = '501' then '5'"]
    hhgq_sql += ["when RTYPE = '5' and GQTYPE = '601' then '6'"]
    hhgq_sql += ["when RTYPE = '5' and GQTYPE = '701' then '7'"]
    hhgq_sql += ['else -1']
    hhgq_sql += ['end']
    hhgq_sql = "\n".join(hhgq_sql)
    if self.debugmode:
        sdftools.show(hhgq_sql, "HHGQ recode sql statement")
    df = df.withColumn("hhgq", sf.expr(hhgq_sql)).persist()
    if self.debugmode:
        sdftools.show(df, "Priv DF with hhgq recode")
    return df
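# The recode above follows a regular pattern (RTYPE '5' with GQTYPE 'x01' maps to
# hhgq 'x'), so the CASE expression could also be generated from a mapping table.
# A minimal standalone sketch, not part of the original code -- the mapping dict
# simply restates the literals already present in _recode_hhgq:
def _build_hhgq_case_sql(mapping):
    """Build a SQL CASE expression from a {(rtype, gqtype): hhgq} dict."""
    clauses = ['case']
    for (rtype, gqtype), hhgq in mapping.items():
        clauses.append(f"when RTYPE = '{rtype}' and GQTYPE = '{gqtype}' then '{hhgq}'")
    clauses += ['else -1', 'end']
    return "\n".join(clauses)

# Example usage (reproduces the statement built above):
# HHGQ_MAP = {('3', '000'): '0', **{('5', f'{x}01'): str(x) for x in range(1, 8)}}
# hhgq_sql = _build_hhgq_case_sql(HHGQ_MAP)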
def __init__(self, analysis, schema, mdf_path, cef_path=None, budget_group="Unspecified",
             plb="Unspecified", run_id="Unspecified", debugmode=True):
    self.debugmode = debugmode
    self.analysis = analysis
    self.spark = self.analysis.spark
    self.mdf_path = mdf_path
    self.schema = schema
    self.mdf = self.spark.read.csv(self.mdf_path, header=True, sep='|', comment='#')
    sdftools.show(self.mdf, "MDF to be transformed into an Analysis Hist DF using 'DHCP_HHGQ' Schema")
    self.budget_group = budget_group
    self.plb = plb
    self.run_id = run_id
    self.priv_df = self._transform(self.mdf).persist()
    if cef_path is not None:
        self.cef_path = cef_path
        # assume cef_path's data is a pickled rdd
        cef_experiment = self.analysis.make_experiment("_CEF_DATA_", cef_path,
                                                       schema_name=self.schema.name,
                                                       dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
        self.orig_df = datatools.getOrigDF(cef_experiment).persist()
        if self.debugmode:
            sdftools.show(self.orig_df, "CEF sparse histogram DF -- loaded from pickled data")
        self.df = self._join_mdf_and_cef().persist()
    else:
        self.df = self.priv_df
    sdftools.show(self.df, "Analysis Hist DF with 'DHCP_HHGQ' Schema")
def queryHellinger(df, groupby=[AC.GEOLEVEL, AC.GEOCODE, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]):
    """
    Calculates the Hellinger metric for each unique
    (GEOLEVEL, GEOCODE, QUERY, RUN_ID, PLB, BUDGET_GROUP) group

    g = AC.ORIG = raw = CEF
    h = AC.PRIV = syn = MDF
    # 'priv' means "protected via the differential privacy routines in this code base"; variable to be renamed after P.L. 94-171 production

    H(g,h) = sqrt( sum_i [sqrt(h_i) - sqrt(g_i)]^2 / [2 * sum_i g_i] )
           = sqrt( A / B )
    A = sum_i [sqrt(h_i) - sqrt(g_i)]^2
    B = 2 * sum_i g_i
    """
    df = df.withColumn("H_A", sf.pow(sf.sqrt(sf.col(AC.PRIV)) - sf.sqrt(sf.col(AC.ORIG)), sf.lit(2))).persist()
    sdftools.show(df, "H_A = [sqrt(priv) - sqrt(orig)]^2")
    df = df.withColumn("H_B", sf.lit(2) * sf.col(AC.ORIG)).persist()
    sdftools.show(df, "H_B = 2 * orig")
    df = df.groupBy(groupby).sum().persist()
    sdftools.show(df, "H_A and H_B after summing over groups")
    df = sdftools.stripSQLFromColumns(df).persist()
    df = df.withColumn("H", sf.sqrt(sf.col("H_A") / sf.col("H_B"))).persist()
    sdftools.show(df, "H = sqrt(sum([sqrt(priv) - sqrt(orig)]^2) / [2*sum(orig)])")
    return df
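# A minimal pure-Python check of the same formula on toy counts (illustration
# only, not part of the pipeline). For g == h the metric is exactly 0 and it
# grows as the two histograms diverge.
import math

def hellinger(g, h):
    """H(g,h) = sqrt( sum([sqrt(h_i) - sqrt(g_i)]^2) / (2 * sum(g_i)) )"""
    A = sum((math.sqrt(hi) - math.sqrt(gi)) ** 2 for gi, hi in zip(g, h))
    B = 2 * sum(g)
    return math.sqrt(A / B)

assert hellinger([4, 9, 16], [4, 9, 16]) == 0.0
print(hellinger([4, 9, 16], [1, 9, 25]))  # sqrt(2/58) ~= 0.186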
def _join_mdf_and_cef(self):
    # AC.PRIV means "protected via the differential privacy routines in this code base"; variable to be renamed after P.L. 94-171 production
    label = {
        AC.PLB: self.plb,
        AC.RUN_ID: self.run_id,
        AC.BUDGET_GROUP: self.budget_group,
        AC.ORIG: 0,
        AC.PRIV: 0
    }
    order_cols = [AC.GEOCODE] + self.schema.dimnames
    df = self.priv_df.join(self.orig_df, on=order_cols, how="full_outer").persist()
    if self.debugmode:
        sdftools.show(df, "Joined Priv and Orig sparse histogram DFs")
    df = df.fillna(label).persist()
    if self.debugmode:
        sdftools.show(label, "Fill in NAs with these values")
        sdftools.show(df, "Joined sparse histogram DF with NAs replaced by the appropriate values")
    column_order = [AC.GEOCODE, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP] + self.schema.dimnames + [AC.ORIG, AC.PRIV]
    df = df.select(column_order).persist()
    return df
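# Why the full outer join plus fillna: both inputs are *sparse* histograms, so a
# (geocode, cell) combination present in only one of them comes back with NULLs
# in the other side's columns, and those NULLs mean "count 0". A minimal pandas
# sketch of the same pattern (illustration only; the column names below are
# placeholders, not the AC constants):
import pandas as pd

priv = pd.DataFrame({"geocode": ["44001", "44002"], "priv": [3, 5]})
orig = pd.DataFrame({"geocode": ["44002", "44003"], "orig": [4, 7]})
joined = priv.merge(orig, on="geocode", how="outer").fillna({"priv": 0, "orig": 0})
print(joined)  # 44001 gets orig=0, 44003 gets priv=0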
if __name__ == "__main__":
    ################################################################
    # Set the save_location to your own JBID (and other folder(s))
    # it will automatically find your JBID
    # if something different is desired, just pass what is needed
    # into the setuptools.setup function.
    ################################################################
    jbid = os.environ.get('JBID', 'temp_jbid')
    save_folder = "analysis_results/"
    save_location = du.addslash(f"{jbid}/{save_folder}")

    spark_loglevel = "ERROR"
    analysis = setuptools.setup(save_location=save_location, spark_loglevel=spark_loglevel)

    # save the analysis script?
    # toggle to_linux=True|False to save|not save this analysis script locally
    # toggle to_s3=True|False to save|not save this analysis script to s3
    analysis.save_analysis_script(to_linux=False, to_s3=False)

    # save/copy the log file?
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark
    sdftools.show(analysis.__dict__, "Analysis attributes")
    C.SLDL,
    C.SLDU
]
queries = [
    #'total',
    #'hhgq',
    #'votingage * citizen',
    #'numraces * hispanic',
    #'cenrace * hispanic',
    #'sex * age',
    #'detailed'
    'cenrace'
]

sdftools.show(df, "df with geolevel crosswalk columns")
df = sdftools.aggregateGeolevels(spark, df, geolevels)
sdftools.show(df, "df after geolevel aggregation", 1000)

qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False)
sdftools.show(qdf, "Query df with the query 'total'", 1000)

rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY])
#sdftools.show(rdd.collect(), "Row groups")

path = save_location_linux + "Gel.csv"
q = qdf.toPandas()
du.makePath(du.getdir(path))
q.to_csv(path, index=False)

def Qualbins(rows, column, bins):
    # only works if bins > 2
    pandas_df = pandas.DataFrame(rows)
# 1. going to use experiment.getDF() twice and will
#    drop the orig column in one and the priv column
#    in the other
# 2. going to drop all zero rows in each of them (since
#    the zeros only exist due to (orig > 0 or priv > 0))
schema = experiment.schema
order_cols = [AC.GEOCODE, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP] + schema.dimnames
# AC.PRIV means "protected via the differential privacy routines in this code base"; variable to be renamed after P.L. 94-171 production
all_cols = order_cols + [AC.ORIG, AC.PRIV]

limit_num = None
limit_df = experiment.getDF().sort(order_cols).persist()
if limit_num is not None:
    limit_df = limit_df.sort(order_cols).limit(limit_num).persist()
    sdftools.show(limit_df, f"DF with {limit_num} rows", limit_num)
exp_df = limit_df.persist()

# 1.
orig_df = limit_df.drop(AC.PRIV).sort(order_cols).persist()
priv_df = limit_df.drop(AC.ORIG).sort(order_cols).persist()
sdftools.show(orig_df, "DF with only CEF values", 40)
sdftools.show(priv_df, "DF with only MDF values", 40)
sdftools.show(orig_df.count(), "CEF row count")
sdftools.show(priv_df.count(), "MDF row count")

# 2.
orig_df = orig_df.filter(sf.col(AC.ORIG) > 0).sort(order_cols).persist()
priv_df = priv_df.filter(sf.col(AC.PRIV) > 0).sort(order_cols).persist()
sdftools.show(orig_df, "CEF DF with only nonzeros", 40)
df = experiment.getDF()
schema = experiment.schema
sdftools.print_item(df, "Experiment DF")

##############################
# Accuracy Metrics
##############################
"""
Mean / Median Absolute Error (MAE):
1. Calculate total population at County geographic level
2. Calculate |MDF - CEF| for the total populations for each county
3. Calculate the mean or median across all county total populations
"""
# 1a. Aggregate to County geographic level
county_df = sdftools.aggregateGeolevels(spark, df, [C.COUNTY])
sdftools.show(county_df, "Counties")

# 1b. Answer the "total" query for all counties
county_totals_df = sdftools.answerQueries(county_df, schema, "total", labels=True)
sdftools.show(county_totals_df, "County total pops")

# 2. Calculate L1(MDF, CEF)
# 'priv' means "protected via the differential privacy routines in this code base"; variable to be renamed after P.L. 94-171 production
abs_error_county_totals_df = sdftools.getL1(county_totals_df, colname="AbsError", col1=AC.PRIV, col2=AC.ORIG)
sdftools.show(abs_error_county_totals_df,
get5, get6 = MAPE(get)
path5 = save_location_linux + "MAPE.csv"
pdf5 = get5.toPandas()
du.makePath(du.getdir(path5))
pdf5.to_csv(path5, index=False)

#get6 = MAPE_avg(get5)
path6 = save_location_linux + "MAPE_avg.csv"
pdf6 = get6.toPandas()
du.makePath(du.getdir(path6))
pdf6.to_csv(path6, index=False)

quantile = [0.9]
zef = sdftools.getGroupQuantiles(get5, 'MAPE', 'geolevel', quantile)
sdftools.show(zef)

path = f"/mnt/users/rao00316/amaz/"
all_files = glob.glob(os.path.join(path, "*.csv"))

# collect the per-metric csv files into one workbook, one sheet per file
# (the pandas engine name is 'xlsxwriter', and it writes .xlsx workbooks)
writer = pandas.ExcelWriter('out.xlsx', engine='xlsxwriter')
for f in all_files:
    df = pandas.read_csv(f)
    df.to_excel(writer, sheet_name=os.path.splitext(os.path.basename(f))[0], index=False)
writer.save()
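# For reference, the usual MAPE definition that the MAPE(...) helper presumably
# implements (an assumption -- the helper's source is not shown in this file):
# MAPE = mean over units of |(priv - orig) / orig| * 100, skipping orig == 0.
def mape(orig_counts, priv_counts):
    terms = [abs((h - g) / g) for g, h in zip(orig_counts, priv_counts) if g != 0]
    return 100 * sum(terms) / len(terms)

print(mape([100, 200, 50], [103, 190, 50]))  # (0.03 + 0.05 + 0.0) / 3 * 100 ~= 2.67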
geounits = toytools.getToyGeounitData_GeounitNode(schema, geocodes, geocode_dict, raw_params={'low': 0, 'high': 100})
rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.print_item(rdd.take(1), "One of the toy example geounits")

# use Analysis to transform the rdd of geounitnodes into a spark dataframe
df = datatools.rdd2df(rdd, schema)
sdftools.print_item(df, "Toy example DF", 300)

# aggregate geolevels
df = df.withColumn("block", sf.col(AC.GEOCODE)[0:3]).persist()
df = df.withColumn("county", sf.col(AC.GEOCODE)[0:2]).persist()
df = df.withColumn("nation", sf.col(AC.GEOCODE)[0:1]).persist()
sdftools.show(df, "df with geolevel crosswalk columns")
df = sdftools.aggregateGeolevels(spark, df, ['block', 'county', 'nation'])
sdftools.show(df, "df after geolevel aggregation", 1000)

# answer total query
qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False)
sdftools.show(qdf, "Query df with the query 'total'", 1000)

# select geounits by quantile bins
rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY])
sdftools.show(rdd.collect(), "Row groups")

def row_selection_mapper(rows, selection_function, **selection_kwargs):
    pandas_df = pandas.DataFrame(rows)
    pandas_df = selection_function(pandas_df, **selection_kwargs)
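# row_selection_mapper accepts an arbitrary selection_function that operates on a
# pandas DataFrame of row groups. A minimal example of such a function
# (hypothetical, not part of the original code) that keeps only the geounits
# whose query answer falls at or above a given quantile:
def select_top_quantile(pandas_df, column='total', quantile=0.9):
    """Keep rows where `column` is >= the within-group `quantile` cutoff."""
    cutoff = pandas_df[column].quantile(quantile)
    return pandas_df[pandas_df[column] >= cutoff]

# e.g., assuming row_selection_mapper ultimately returns the selected rows as records:
# rdd.flatMapValues(lambda rows: row_selection_mapper(rows, select_top_quantile, quantile=0.9))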
def queryLp(df, p, groupby=[AC.GEOLEVEL, AC.GEOCODE, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]):
    """
    Calculates the L^p norm for each unique
    (GEOLEVEL, GEOCODE, QUERY, RUN_ID, PLB, BUDGET_GROUP) group
    """
    sdftools.show(p, "Value of p in the L^p metric")
    if p == "inf":
        # 'priv' means "protected via the differential privacy routines in this code base"; variable to be renamed after P.L. 94-171 production
        df = df.withColumn("L^inf_norm", sf.abs(sf.col(AC.PRIV) - sf.col(AC.ORIG))).persist()
        sdftools.show(df, "L^inf_norm as | protected - orig | before taking the max")
        df = df.groupBy(groupby).agg(sf.max(sf.col("L^inf_norm"))).persist()
        sdftools.show(df, "L^inf_norm after taking the max per group")
        df = sdftools.stripSQLFromColumns(df).persist()
    else:
        df = df.withColumn(f"L^{p}", sf.pow(sf.abs(sf.col(AC.PRIV) - sf.col(AC.ORIG)), sf.lit(p))).persist()
        sdftools.show(df, f"L^{p} after taking | protected - orig | ^ {p}")
        df = df.groupBy(groupby).sum().persist()
        sdftools.show(df, f"L^{p} after groupby and sum")
        df = sdftools.stripSQLFromColumns(df).persist()
        df = df.withColumn(f"L^{p}_norm", sf.pow(sf.col(f"L^{p}"), sf.lit(1/p))).persist()
        sdftools.show(df, f"L^{p}_norm after taking the {p}-th root of the sum")
        df = sdftools.stripSQLFromColumns(df).persist()
    return df
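# A minimal pure-Python check of the norms computed above (toy counts, not part
# of the pipeline): L^1 is the sum of absolute errors, L^2 the Euclidean
# distance, and L^inf the largest single-cell error.
def lp_norm(orig, priv, p):
    errors = [abs(h - g) for g, h in zip(orig, priv)]
    if p == "inf":
        return max(errors)
    return sum(e ** p for e in errors) ** (1 / p)

orig, priv = [10, 20, 30], [12, 19, 30]
print(lp_norm(orig, priv, 1))      # 3.0
print(lp_norm(orig, priv, 2))      # sqrt(5) ~= 2.236
print(lp_norm(orig, priv, "inf"))  # 2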
# Calculating L^p and Hellinger Metrics
########################################
# 0a. Aggregate Blocks to get Geographic Units at all desired Geographic Levels
geoleveldf = sdftools.aggregateGeolevels(spark, df, geolevels)

# 0b. Answer Queries
querydf = sdftools.answerQueries(geoleveldf, schema, queries, labels=True)

# 1. Calculate L(orig, priv) and H(orig, priv) for
#    detailed cells, marginals, total (for the queries listed in "queries")
df_L1 = queryLp(querydf, 1)
df_L2 = queryLp(querydf, 2)
df_Linf = queryLp(querydf, "inf")
sdftools.show(df_L1, "L^1 norm for the queries")
sdftools.show(df_L2, "L^2 norm for the queries")
sdftools.show(df_Linf, "L^inf norm for the queries")

df_H = queryHellinger(querydf)
sdftools.show(df_H, "Hellinger metric for the queries")

# 2. Average L^p and H across geounits in the geolevel
#    removed AC.GEOCODE from the groupby to aggregate across all geounits
groupby = [AC.GEOLEVEL, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]
df_L1_avg = df_L1.groupBy(groupby).agg(sf.avg(sf.col("L^1_norm"))).persist()
df_L2_avg = df_L2.groupBy(groupby).agg(sf.avg(sf.col("L^2_norm"))).persist()
df_Linf_avg = df_Linf.groupBy(groupby).agg(sf.avg(sf.col("L^inf_norm"))).persist()
sdftools.show(df_L1_avg, "Average L^1 norm for the queries")
# toggle to_s3=True|False to save|not save this analysis script to s3
analysis.save_analysis_script(to_linux=False, to_s3=False)

# save/copy the log file?
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark

# build an example schema
schema = Schema("example", ['a', 'b', 'c'], (2, 3, 5))
sdftools.show(schema, "Toy example Schema")

# build a set of GeounitNodes to use
geocodes = ['000', '001', '002', '003', '010', '011', '012', '020', '022']
geocode_dict = {3: 'block', 2: 'county'}

# build geounits
geounits = toytools.getToyGeounitData(schema, geocodes, geocode_dict)
rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.show(rdd.take(1), "One of the toy example geounits")

# use Analysis to transform the rdd of geounitnodes into a spark dataframe
df = datatools.rdd2df(rdd, schema)
sdftools.show(df, "Toy example DF", 300)
analysis.save_log(to_linux=False, to_s3=False)

# zip the local results to s3?
analysis.zip_results_to_s3(flag=False)

spark = analysis.spark

######################################################################
# Use the orig and priv run data to generate Spark DFs,
# then join those DFs on the 'geocode' and schema dimname columns
######################################################################
# 1. Need to create an experiment for the cef-related DAS runs AND
#    an experiment for the mdf-related DAS runs
path_without_orig = f"{AC.S3_BASE}lecle301/withoutRawForBrett_topdown_ri44/"
syn_experiment = analysis.make_experiment("without_orig", path_without_orig)

path_with_orig = f"{AC.S3_BASE}lecle301/withRawForBrett_topdown_ri44/"
raw_experiment = analysis.make_experiment("with_orig", path_with_orig)

# 2. Pass the cef experiment object and the mdf experiment object to
#    datatools.getJoinedDF, which will create the joined MDF and CEF DF
df = datatools.getJoinedDF(syn_experiment, raw_experiment)
sdftools.show(df, "Joined Experiment DF")
sdftools.show(df.count(), "# rows in Joined Experiment DF")

groupby = [AC.GEOCODE, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]
sdftools.show(
    df.agg(*(sf.countDistinct(sf.col(c)).alias(c) for c in groupby)),
    f"Distinct counts of each column in {groupby}")
# 3. Calculate the Signed Error, with the CEF counts binned
bins = [0, 1, 10, 100, 1000, 10000]
df = sdftools.getSignedErrorByTrueCountRuns(querydf, bins)

# 4. Calculate quantiles within groups
quantiles = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
groupby = ['orig_count_bin', AC.GEOLEVEL, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]

# 4a. Calculate within-group quantiles for 'signed error'
signed_error_quantile_df = sdftools.getGroupQuantiles(df, columns=['signed_error'],
                                                      groupby=groupby, quantiles=quantiles)
sdftools.show(signed_error_quantile_df.persist(), f"Signed error quantiles by group '{groupby}'", 1000)

# 4b. Calculate within-group quantiles for 'relative error'
re_quantile_df = sdftools.getGroupQuantiles(df, columns=['re'],
                                            groupby=groupby, quantiles=quantiles)
sdftools.show(re_quantile_df, f"Relative error quantiles by group '{groupby}'", 1000)

# 5. Calculate averages within groups
# 5a. Calculate signed error averages by group
signed_error_avg_df = df.groupBy(groupby).agg(sf.avg("signed_error")).persist()
sdftools.show(signed_error_avg_df, "signed_error_average_by_run", 1000)
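# For reference, a minimal sketch of the error measures being summarized above
# (an assumption about the conventions behind the 'signed_error' and 're'
# columns, not a quote of getSignedErrorByTrueCountRuns' implementation):
# signed error compares the protected count to the true count, and relative
# error rescales it by the true count.
def signed_error(priv, orig):
    return priv - orig

def relative_error(priv, orig):
    return (priv - orig) / orig if orig != 0 else float('nan')

print(signed_error(103, 100))    # 3
print(relative_error(103, 100))  # 0.03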
def sepBounds(rows, column, buckets):
    pandas_df = pandas.DataFrame(rows)
    # mark each row's bucket membership with one boolean "Bin" column per bucket
    for bindex in range(len(buckets)):
        pandas_df[f"Bin{bindex}"] = (buckets[bindex][0] <= pandas_df[column]) & (pandas_df[column] <= buckets[bindex][1])
    rows = pandas_df.to_dict('records')
    return rows

def binIndexToInteger(row, buckets):
    for bindex, bucket in enumerate(buckets):
        if row[f"Bin{bindex}"]:
            return str(bucket)

# loop over each table, calculating the queries for each one
for table_name, queries in tabledict.items():
    sdftools.show(queries, f"The queries associated with table '{table_name}'")
    get, get2, get3 = MAE(spark, df, geolevels, queries, schema)
    # answer the queries within the table
    #df_table = sdftools.answerQueries(df_geolevel, schema, queries, labels=True).persist()
    rdd = sdftools.getRowGroupsAsRDD(get, groupby=[AC.GEOLEVEL, AC.QUERY])
    rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    rdd = rdd.map(lambda row: Row(**row[1]))
    df = rdd.toDF().persist()

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'
    df = df.groupby([
        'geocode', 'geolevel', 'level',
        'Bin0', 'Bin1', 'Bin2', 'Bin3', 'Bin4', 'Bin5'
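# A quick standalone check of sepBounds on toy rows (illustration only; the
# buckets below are hypothetical (lo, hi) pairs matching how sepBounds reads
# buckets[bindex][0] and buckets[bindex][1]):
toy_buckets = [(0, 0), (1, 9), (10, 99)]
toy_rows = [{'orig': 0}, {'orig': 5}, {'orig': 42}]
for r in sepBounds(toy_rows, 'orig', toy_buckets):
    print(r)
# {'orig': 0, 'Bin0': True, 'Bin1': False, 'Bin2': False}
# {'orig': 5, 'Bin0': False, 'Bin1': True, 'Bin2': False}
# {'orig': 42, 'Bin0': False, 'Bin1': False, 'Bin2': True}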
# Get the DF and schema
schema = experiment.schema
df = experiment.getDF()

# Get the geolevels (faster to do this before looping if the queries
# are to be answered over the same geolevels; otherwise, can perform this
# step in the loop)
geolevels = [C.US, C.STATE, C.COUNTY, C.TRACT_GROUP, C.TRACT, C.BLOCK_GROUP, C.BLOCK, C.SLDL, C.SLDU]
df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)

# loop over each table, calculating the queries for each one
for table_name, queries in tabledict.items():
    sdftools.show(queries, f"The queries associated with table '{table_name}'")

    # answer the queries within the table
    df_table = sdftools.answerQueries(df_geolevel, schema, queries, labels=True).persist()
    sdftools.show(df_table, f"The DF with answers for the table '{table_name}'")

    # further computations...
    # plot and/or save the results