Example #1: computing geolevel error metrics (1-TVD and sparsity)
    def run(self, engine_tuple):
        block_nodes, feas_dict = engine_tuple
        
        # access the SparkSession (needed for aggregating geolevels)
        spark = SparkSession.builder.getOrCreate()

        # transform the rdd of block-level nodes into a 'sparse histogram' spark df
        df = datatools.rdd2df(block_nodes, self.setup.schema_obj)
        sdftools.show(df, "The Block-level Geounit Nodes as Sparse Histogram DF", 1000)

        # read the geolevels from the error_metrics section of the config file
        geolevels = self.setup.config['error_metrics']['geolevels'].split(", ")
        # alternatively: geolevels = self.setup.levels
        
        # aggregate blocks to get the different geolevels
        df = sdftools.aggregateGeolevels(spark, df, geolevels)
        sdftools.show(df, f"DF with all Geolevels in {geolevels}", 1000)
        
        # access the queries from the error_metrics section of the config file
        queries = self.setup.config['error_metrics']['queries'].split(", ")
        # and answer the queries
        df = sdftools.answerQueries(df, self.setup.schema_obj, queries)
        sdftools.show(df, f"DF with all Queries in {queries}", 1000)

        # compute the Geolevel 1-TVD metric
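        # assumed definition: 1-TVD = 1 - sum(|priv - orig|) / (2 * sum(orig)),
        # computed within each (geolevel, query) group; values near 1 mean the
        # protected counts closely match the originals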
        geolevel_tvd = sdftools.getGeolevelTVD(df, groupby=[AC.GEOLEVEL, AC.QUERY])
        geolevel_tvd = geolevel_tvd.orderBy([AC.QUERY, AC.GEOLEVEL])
        sdftools.show(geolevel_tvd, "Geolevel 1-TVD per geolevel per query", 1000)

        # calculate sparsity change
        sparsity_df = sdftools.getCellSparsityByGroup(df, self.setup.schema_obj, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.QUERY])
        sdftools.show(sparsity_df, "Query and Geolevel DF with Sparsity per group", 1000)
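
For reference, here is a minimal sketch of the config section this example reads (the [error_metrics] section name and keys are taken from the code above; the geolevel and query values are illustrative only and must match your schema):

    [error_metrics]
    geolevels = County, State
    queries = total, votingage

Note that the values are split on ", " (comma plus single space), so entries must be separated exactly that way.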

Example #2: toy schema and geounit data

This standalone snippet assumes the usual analysis-toolkit imports (datatools, sdftools, toytools, Schema) and pyspark.sql.functions as sf.

    # create or reuse the SparkSession used by the code below
    spark = SparkSession.builder.getOrCreate()

    # build a toy schema: variables 'a', 'b', 'c' with 2, 3, and 5 levels each
    schema = Schema("example", ['a', 'b', 'c'], (2, 3, 5))
    sdftools.print_item(schema, "Toy example Schema")

    # build a set of GeounitNodes to use
    geocodes = ['000', '001', '002', '003', '010', '011', '012', '020', '022']
    geocode_dict = {3: 'block', 2: 'county'}  # geocode length -> geolevel name

    # build geounits
    geounits = toytools.getToyGeounitData(schema, geocodes, geocode_dict)

    # distribute the toy geounits as an RDD and cache it
    rdd = spark.sparkContext.parallelize(geounits).persist()

    sdftools.print_item(rdd.take(1), "One of the toy example geounits")

    # use Analysis to transform the rdd of geounitnodes into a spark dataframe
    df = datatools.rdd2df(rdd, schema)
    sdftools.print_item(df, "Toy example DF", 300)

    # perform analyses
    # L1
    # 'priv' means "protected via the differential privacy routines in this code base" (variable to be renamed after P.L. 94-171 production)
    df = sdftools.getL1(df, colname="L1_cell", col1='priv', col2='orig')
    sdftools.print_item(df, "Toy example L1", 300)

    # adding in a simple row-counting column
    df = df.withColumn("row_count", sf.lit(1)).persist()
    sdftools.print_item(df, "Totals + rowcounter column")

    # total within each geocode
    df = sdftools.answerQuery(df, schema, "total", labels=False)
    sdftools.print_item(df, "Totals within each geounit", 300)