# Build sparse random geounit data (one geounit per geocode), lift it into a
# Spark DataFrame, aggregate to the STATE geolevel, answer a query, and turn
# each geounit's query answers into a probability vector.

# One randomly generated geounit per geocode; density/scale control sparsity
# and magnitude of the random counts (semantics defined by the generator).
geounits = [testdata_random_geounit_generator(x, schema, density=0.00001, scale=10) for x in geocodes]
sdftools.print_item(geounits, "Random Geounit Data")

# Distribute the geounits across the cluster and cache them.
rdd = spark.sparkContext.parallelize(geounits).persist()
sdftools.print_item(rdd, "Parallelized RDD data")

# Explode each geounit node into sparse-histogram row dicts, then convert the
# dicts to Rows and build a DataFrame.
df = rdd.flatMap(lambda node: mappers.getSparseDF_mapper(node, schema)).map(lambda row: Row(**row)).toDF().persist()
sdftools.print_item(df, "DF of Random Geounit Data")

# STATE code is the first two characters of the geocode string.
df = df.withColumn("STATE", sf.col("geocode")[0:2]).persist()
sdftools.print_item(df, "DF with STATE code")

# Sum the rows within each STATE geounit.
df = sdftools.aggregateGeolevels(spark, df, 'STATE')
sdftools.print_item(df, "Aggregated to the STATE geolevel")

query = 'sex * age'
df = sdftools.answerQuery(df, schema, query, labels=False)
# Fixed label: previous hard-coded "sex" did not match the 'sex * age' query.
sdftools.print_item(df, f"Answering the {query} query")

# Group rows by geounit so each group's answers can be normalized into a
# probability vector by prob_vector_mapper.
groupby = ['geocode', 'geolevel']
rdd = sdftools.getRowGroupsAsRDD(df, groupby)
df = rdd.flatMapValues(prob_vector_mapper).map(lambda row: Row(**row[1])).toDF()

# Cast 'age' to int so the sort below is numeric rather than lexicographic.
df = df.withColumn('age', sf.col('age').cast("int")).persist()
df = df.sort(['geocode', 'age', 'sex']).persist()
sdftools.print_item(df, f"Prob vector for {query} query", show=1000)
geocode_dict = {3: 'block', 2: 'county'} # build geounits geounits = toytools.getToyGeounitData(schema, geocodes, geocode_dict) rdd = spark.sparkContext.parallelize(geounits).persist() sdftools.print_item(rdd.take(1), "One of the toy example geounits") # use Analysis to transform the rdd of geounitnodes into a spark dataframe df = datatools.rdd2df(rdd, schema) sdftools.print_item(df, "Toy example DF", 300) # perform analyses # L1 # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production df = sdftools.getL1(df, colname="L1_cell", col1='priv', col2='orig') sdftools.print_item(df, "Toy example L1", 300) # adding in a simple row-counting column df = df.withColumn("row_count", sf.lit(1)).persist() sdftools.print_item(df, "Totals + rowcounter column") # total within each geocode df = sdftools.answerQuery(df, schema, "total", labels=False) sdftools.print_item(df, "Totals within each geounit", 300) # L1 of total df = sdftools.getL1(df, colname="L1_total", col1="priv", col2="orig") sdftools.print_item(df, "Totals + rowcounter column + L1")
# Aggregate the DataFrame to each requested geolevel, then answer a list of
# queries against the result, once with merge_dims=False and once with
# merge_dims=True, printing each answer DataFrame.
# NOTE(review): assumes `df`, `spark`, `schema`, and `geolevels` are defined
# earlier in this file/session — confirm against the surrounding script.
df = sdftools.aggregateGeolevels(spark, df, geolevels)
sdftools.print_item(df, "Geolevel DF")

####################
# Answering Queries
####################
# Query strings use the schema's variable names; '*' denotes a crossed
# (marginal-product) query over multiple variables.
queries = [
    "total",
    "sex",
    "sex * hispanic",
    "votingage * sex * hispanic",
    "numraces"
]

for query in queries:
    # Answer without labels and without merging dimensions.
    querydf = sdftools.answerQuery(df, schema, query, labels=False, merge_dims=False)
    sdftools.print_item(querydf, f"Query DF for {query}, labels=False, merge_dims=False", 2000)

    # Same query with merged dimensions for comparison.
    querydf = sdftools.answerQuery(df, schema, query, labels=False, merge_dims=True)
    sdftools.print_item(querydf, f"Query DF for {query}, labels=False, merge_dims=True", 2000)