def run(self, engine_tuple):
    """Compute error metrics (geolevel 1-TVD and cell sparsity) for the configured geolevels/queries.

    engine_tuple -- pair of (block-level GeounitNode RDD, feasibility dict); the
    feasibility dict is unpacked but not used here.
    """
    block_nodes, feas_dict = engine_tuple

    # A SparkSession is required for the geolevel aggregation step below.
    spark = SparkSession(SparkContext.getOrCreate())

    # Turn the RDD of block-level nodes into a "sparse histogram" Spark DataFrame.
    sparse_df = datatools.rdd2df(block_nodes, self.setup.schema_obj)
    sdftools.show(sparse_df, "The Block-level Geounit Nodes as Sparse Histogram DF", 1000)

    # Geolevels come from the [error_metrics] section of the config file,
    # as a ", "-separated list.
    geolevels = self.setup.config['error_metrics']['geolevels'].split(", ")
    #geolevels = self.setup.levels

    # Roll the block-level rows up to each requested geolevel.
    sparse_df = sdftools.aggregateGeolevels(spark, sparse_df, geolevels)
    sdftools.show(sparse_df, f"DF with all Geolevels in {geolevels}", 1000)

    # Queries also come from the [error_metrics] config section; answer each one.
    queries = self.setup.config['error_metrics']['queries'].split(", ")
    sparse_df = sdftools.answerQueries(sparse_df, self.setup.schema_obj, queries)
    sdftools.show(sparse_df, f"DF with all Queries in {queries}", 1000)

    # Geolevel 1-TVD metric, one row per (geolevel, query), ordered for display.
    tvd_df = sdftools.getGeolevelTVD(sparse_df, groupby=[AC.GEOLEVEL, AC.QUERY])
    tvd_df = tvd_df.orderBy([AC.QUERY, AC.GEOLEVEL])
    sdftools.show(tvd_df, f"Geolevel 1-TVD per geolevel per query", 1000)

    # Cell-sparsity change, grouped per (geocode, geolevel, query).
    sparsity_df = sdftools.getCellSparsityByGroup(
        sparse_df,
        self.setup.schema_obj,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.QUERY],
    )
    sdftools.show(sparsity_df, f"Query and Geolevel DF with Sparsity per group", 1000)
# Toy example: exercise the analysis pipeline on a small hand-built schema.
schema = Schema("example", ['a', 'b', 'c'], (2, 3, 5))
sdftools.print_item(schema, "Toy example Schema")

# Hand-built geography: 3-digit geocodes, with digit counts mapping to levels.
geocodes = ['000', '001', '002', '003', '010', '011', '012', '020', '022']
geocode_dict = {3: 'block', 2: 'county'}

# Build toy geounit nodes and distribute them as a persisted RDD.
geounits = toytools.getToyGeounitData(schema, geocodes, geocode_dict)
rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.print_item(rdd.take(1), "One of the toy example geounits")

# Analysis transforms the geounit-node RDD into a Spark DataFrame.
toy_df = datatools.rdd2df(rdd, schema)
sdftools.print_item(toy_df, "Toy example DF", 300)

# L1 metric between the protected and original counts.
# 'priv' means "protected via the differential privacy routines in this code
# base" — variable to be renamed after P.L.94-171 production.
toy_df = sdftools.getL1(toy_df, colname="L1_cell", col1='priv', col2='orig')
sdftools.print_item(toy_df, "Toy example L1", 300)

# Attach a constant-1 column so downstream aggregates can count rows.
toy_df = toy_df.withColumn("row_count", sf.lit(1)).persist()
sdftools.print_item(toy_df, "Totals + rowcounter column")

# Total query within each geocode.
toy_df = sdftools.answerQuery(toy_df, schema, "total", labels=False)
sdftools.print_item(toy_df, "Totals within each geounit", 300)