예제 #1
0
    # Build one random geounit per geocode using the test-data generator.
    geounits = [
        testdata_random_geounit_generator(x, schema, density=0.00001, scale=10)
        for x in geocodes
    ]
    sdftools.print_item(geounits, "Random Geounit Data")

    # Distribute the geounits as an RDD; persisted because it is both
    # printed and transformed below.
    rdd = spark.sparkContext.parallelize(geounits).persist()
    sdftools.print_item(rdd, "Parallelized RDD data")

    # Each node is expanded by getSparseDF_mapper into zero or more mappings,
    # each of which is unpacked into a Row before converting to a DataFrame.
    df = rdd.flatMap(lambda node: mappers.getSparseDF_mapper(node, schema)
                     ).map(lambda row: Row(**row)).toDF().persist()
    sdftools.print_item(df, "DF of Random Geounit Data")

    # Derive a STATE column from a slice of the geocode column.
    # NOTE(review): PySpark turns Column[0:2] into substr(0, 2); presumably
    # this yields the leading two characters of the geocode - confirm.
    df = df.withColumn("STATE", sf.col("geocode")[0:2]).persist()
    sdftools.print_item(df, "DF with STATE code")

    df = sdftools.aggregateGeolevels(spark, df, 'STATE')
    sdftools.print_item(df, "Aggregated to the STATE geolevel")

    query = 'sex * age'
    df = sdftools.answerQuery(df, schema, query, labels=False)
    # Label is derived from `query` so it cannot drift from the query that
    # was actually answered (it previously said "sex" for 'sex * age').
    sdftools.print_item(df, f"Answering the {query} query")

    # Group rows by (geocode, geolevel), run prob_vector_mapper (defined
    # outside this excerpt) over each group's values, then drop the key and
    # rebuild a DataFrame from the resulting mappings (row[1]).
    groupby = ['geocode', 'geolevel']
    rdd = sdftools.getRowGroupsAsRDD(df, groupby)
    df = rdd.flatMapValues(prob_vector_mapper).map(
        lambda row: Row(**row[1])).toDF()
    # Cast 'age' to int before sorting - presumably so the sort on 'age' is
    # numeric rather than lexicographic; confirm against the mapper output.
    df = df.withColumn('age', sf.col('age').cast("int")).persist()
    df = df.sort(['geocode', 'age', 'sex']).persist()
    sdftools.print_item(df, f"Prob vector for {query} query", show=1000)
    # Passed to toytools.getToyGeounitData below.
    # NOTE(review): presumably maps geocode length -> geolevel name; confirm
    # against toytools.getToyGeounitData.
    geocode_dict = {3: 'block', 2: 'county'}

    # Build the toy geounits for the given schema and geocodes.
    geounits = toytools.getToyGeounitData(schema, geocodes, geocode_dict)

    # Distribute the geounits as an RDD; persisted because it is reused below.
    rdd = spark.sparkContext.parallelize(geounits).persist()

    # take(1) pulls a single element back to the driver for inspection.
    sdftools.print_item(rdd.take(1), "One of the toy example geounits")

    # Convert the RDD of geounit nodes into a Spark DataFrame via datatools.rdd2df.
    df = datatools.rdd2df(rdd, schema)
    # 300 is passed positionally to print_item (likely a row-display limit - confirm).
    sdftools.print_item(df, "Toy example DF", 300)

    # Perform analyses.
    # Cell-level L1: getL1 is asked for a column named "L1_cell" computed from
    # the 'priv' and 'orig' columns.
    # 'priv' means "protected via the differential privacy routines in this
    # code base"; the variable is to be renamed after P.L.94-171 production.
    df = sdftools.getL1(df, colname="L1_cell", col1='priv', col2='orig')
    sdftools.print_item(df, "Toy example L1", 300)

    # Add a constant column of 1s to serve as a simple row counter.
    # NOTE(review): the label below says "Totals", but the "total" query is
    # not answered until the next step - confirm the label is intended.
    df = df.withColumn("row_count", sf.lit(1)).persist()
    sdftools.print_item(df, "Totals + rowcounter column")

    # Answer the "total" query (total within each geocode).
    df = sdftools.answerQuery(df, schema, "total", labels=False)
    sdftools.print_item(df, "Totals within each geounit", 300)

    # L1 of the totals, stored under "L1_total" (again from 'priv' and 'orig').
    df = sdftools.getL1(df, colname="L1_total", col1="priv", col2="orig")
    sdftools.print_item(df, "Totals + rowcounter column + L1")
예제 #3
0
    # Aggregate df to the geolevels in `geolevels` (defined outside this excerpt).
    df = sdftools.aggregateGeolevels(spark, df, geolevels)
    sdftools.print_item(df, "Geolevel DF")

    # ------------------------------------------------------------------
    # Answer every query twice: first with merge_dims=False, then True.
    # ------------------------------------------------------------------
    queries = [
        "total",
        "sex",
        "sex * hispanic",
        "votingage * sex * hispanic",
        "numraces",
    ]

    for query in queries:
        for merge_flag in (False, True):
            querydf = sdftools.answerQuery(
                df, schema, query, labels=False, merge_dims=merge_flag
            )
            # A bool formats as "False"/"True", so each title names the
            # merge_dims value that was actually used.
            title = (
                f"Query DF for {query}, labels=False, merge_dims={merge_flag}"
            )
            sdftools.print_item(querydf, title, 2000)