Example #1
 def _transform(self, mdf):
     df = mdf
     df = self._recode_geocode(df).persist()
     if self.debugmode: sdftools.show(df, "Priv DF after recoding to get geocode variable")
     
     df = self._recode_schema(df).persist()
     if self.debugmode: sdftools.show(df, "Priv DF after recoding schema variables")
     
     df = self._add_metadata_columns(df).persist()
     if self.debugmode: sdftools.show(df, "Priv DF after adding the metadata columns")
     
     df = self._drop_unneeded_columns(df).persist()
     if self.debugmode: sdftools.show(df, "Priv DF after dropping the columns not needed for Analysis")
     
     df = self._get_counts(df).persist()
     if self.debugmode: sdftools.show(df, "Priv DF after counting number of records per record type (i.e. after forming the sparse histogram)")
     return df
Example #2
 def _recode_hhgq(self, df):
     hhgq_sql = ['case']
     hhgq_sql += [f"when RTYPE = '3' and GQTYPE = '000' then '0'"]
     hhgq_sql += [f"when RTYPE = '5' and GQTYPE = '101' then '1'"]
     hhgq_sql += [f"when RTYPE = '5' and GQTYPE = '201' then '2'"]
     hhgq_sql += [f"when RTYPE = '5' and GQTYPE = '301' then '3'"]
     hhgq_sql += [f"when RTYPE = '5' and GQTYPE = '401' then '4'"]
     hhgq_sql += [f"when RTYPE = '5' and GQTYPE = '501' then '5'"]
     hhgq_sql += [f"when RTYPE = '5' and GQTYPE = '601' then '6'"]
     hhgq_sql += [f"when RTYPE = '5' and GQTYPE = '701' then '7'"]
     hhgq_sql += ['else -1']
     hhgq_sql += ['end']
     hhgq_sql = "\n".join(hhgq_sql)
     if self.debugmode: sdftools.show(hhgq_sql, "HHGQ recode sql statement")
     df = df.withColumn("hhgq", sf.expr(hhgq_sql)).persist()
     if self.debugmode: sdftools.show(df, "Priv DF with hhgq recode")
     return df
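 # A minimal alternative sketch (not part of the original class): the same hhgq
 # recode expressed with pyspark.sql.functions when/otherwise chains instead of
 # a SQL expression string. Assumes `sf` is pyspark.sql.functions; it returns
 # the string '-1' where the SQL above uses the integer -1.
 def _recode_hhgq_with_when(self, df):
     gqtype_to_hhgq = {'101': '1', '201': '2', '301': '3', '401': '4',
                       '501': '5', '601': '6', '701': '7'}
     expr = sf.when((sf.col("RTYPE") == '3') & (sf.col("GQTYPE") == '000'), '0')
     for gqtype, hhgq in gqtype_to_hhgq.items():
         expr = expr.when((sf.col("RTYPE") == '5') & (sf.col("GQTYPE") == gqtype), hhgq)
     expr = expr.otherwise('-1')
     return df.withColumn("hhgq", expr)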
Example #3
    def __init__(self, analysis, schema, mdf_path, cef_path=None, budget_group="Unspecified", plb="Unspecified", run_id="Unspecified", debugmode=True):
        self.debugmode = debugmode
        self.analysis = analysis
        self.spark = self.analysis.spark
        self.mdf_path = mdf_path
        self.schema = schema

        self.mdf = self.spark.read.csv(self.mdf_path, header=True, sep='|', comment='#')
        sdftools.show(self.mdf, "MDF to be transformed into an Analysis Hist DF using 'DHCP_HHGQ' Schema")

        self.budget_group = budget_group
        self.plb = plb
        self.run_id = run_id

        self.priv_df = self._transform(self.mdf).persist()
        
        if cef_path is not None:
            self.cef_path = cef_path
        
            # assume cef_path's data is a pickled rdd
            cef_experiment = self.analysis.make_experiment("_CEF_DATA_", cef_path, schema_name=self.schema.name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
            self.orig_df = datatools.getOrigDF(cef_experiment).persist()
            if self.debugmode: sdftools.show(self.orig_df, "CEF sparse histogram DF -- loaded from pickled data")
        
            self.df = self._join_mdf_and_cef().persist()
        else:
            self.df = self.priv_df

        sdftools.show(self.df, "Analysis Hist DF with 'DHCP_HHGQ' Schema")
 def queryHellinger(df, groupby=[AC.GEOLEVEL, AC.GEOCODE, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]):
     """
     Calculates the Hellinger metric for each unique (GEOLEVEL, GEOCODE, QUERY, RUN_ID, PLB, BUDGET_GROUP) group
     
     g = AC.ORIG = raw = CEF
     # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
     h = AC.PRIV = syn = MDF
     
     H(g,h) = sqrt(sum([sqrt(h_i) - sqrt(g_i)]^2) / [2*sum(g_i)])
     H(g,h) = sqrt( A / B )
     
     A = sum([sqrt(h_i) - sqrt(g_i)]^2)
     B = 2 * sum(g_i)
     """
     df = df.withColumn("H_A", sf.pow(sf.sqrt(sf.col(AC.PRIV)) - sf.sqrt(sf.col(AC.ORIG)), sf.lit(2))).persist()
     sdftools.show(df, "H_A = [sqrt(priv) - sqrt(orig)]^2")
     df = df.withColumn("H_B", sf.lit(2) * sf.col(AC.ORIG)).persist()
     sdftools.show(df, "H_B = 2 * orig")
     df = df.groupBy(groupby).sum().persist()
     sdftools.show(df, "H_A and H_B after summing over groups")
     df = sdftools.stripSQLFromColumns(df).persist()
     df = df.withColumn("H", sf.sqrt(sf.col("H_A") / sf.col("H_B"))).persist()
     sdftools.show(df, "H = sqrt(sum([sqrt(priv) - sqrt(orig)]^2) / [2*sum(orig)])")
     return df
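 # A small self-contained sanity check of the Hellinger formula above (an
 # illustrative sketch, not part of the original module). It computes
 # H(g, h) = sqrt(sum_i [sqrt(h_i) - sqrt(g_i)]^2 / [2 * sum_i g_i])
 # for one toy pair of histograms in plain Python.
 def _hellinger_toy_check():
     import math
     g = [4, 9, 1]   # stand-in for the CEF (orig) counts
     h = [1, 16, 1]  # stand-in for the MDF (priv) counts
     A = sum((math.sqrt(hi) - math.sqrt(gi)) ** 2 for gi, hi in zip(g, h))
     B = 2 * sum(g)
     return math.sqrt(A / B)  # sqrt((1 + 1 + 0) / 28) ≈ 0.267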
Example #5
    def _join_mdf_and_cef(self):
        # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
        label = {
            AC.PLB: self.plb,
            AC.RUN_ID: self.run_id,
            AC.BUDGET_GROUP: self.budget_group,
            AC.ORIG: 0,
            AC.PRIV: 0
        }
        order_cols = [AC.GEOCODE] + self.schema.dimnames
        
        df = self.priv_df.join(self.orig_df, on=order_cols, how="full_outer").persist()
        if self.debugmode: sdftools.show(df, "Joined Priv and Orig sparse histogram DFs")
        df = df.fillna(label).persist()
        if self.debugmode: sdftools.show(label, "Fill in NAs with these values")
        if self.debugmode: sdftools.show(df, "Joined sparse histogram DF with NAs replaced by the appropriate values")

        # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
        column_order = [AC.GEOCODE, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP] + self.schema.dimnames + [AC.ORIG, AC.PRIV]
        df = df.select(column_order).persist()

        return df
Example #6
if __name__ == "__main__":
    ################################################################
    # Set the save_location to your own JBID (and other folder(s)).
    # The JBID is found automatically from the JBID environment
    # variable; if something different is desired, pass the desired
    # path to the setuptools.setup function.
    ################################################################
    jbid = os.environ.get('JBID', 'temp_jbid')
    save_folder = "analysis_results/"

    save_location = du.addslash(f"{jbid}/{save_folder}")

    spark_loglevel = "ERROR"
    analysis = setuptools.setup(save_location=save_location,
                                spark_loglevel=spark_loglevel)

    # save the analysis script?
    # toggle to_linux=True|False to save|not save this analysis script locally
    # toggle to_s3=True|False to save|not save this analysis script to s3
    analysis.save_analysis_script(to_linux=False, to_s3=False)

    # save/copy the log file?
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark

    sdftools.show(analysis.__dict__, "Analysis attributes")
Example #7
        C.SLDL,
        C.SLDU
]

queries = [
        #'total',
        #'hhgq',
        #'votingage * citizen',
        #'numraces * hispanic',
        #'cenrace * hispanic',
        #'sex * age',
        #'detailed'
        'cenrace'
]

sdftools.show(df, "df with geolevel crosswalk columns")
sdftools.show(df, "df with geolevel crosswalk columns")
df = sdftools.aggregateGeolevels(spark, df, geolevels)
sdftools.show(df, "df after geolevel aggregation", 1000)
qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False)
sdftools.show(qdf, "Query df with the query 'total'", 1000)
rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY])
#sdftools.show(rdd.collect(), "Row groups")

path = save_location_linux + "Gel.csv"
q = qdf.toPandas()
du.makePath(du.getdir(path))
q.to_csv(path, index=False)
def Qualbins(rows, column, bins):
    #only works if bins > 2
    pandas_df = pandas.DataFrame(rows)
    # 1. going to use experiment.getDF() twice and will
    #    drop the orig column in one and the priv column
    #    in the other
    # 2. going to drop all zero rows in each of them (a zero in one column
    #    exists only because the other, dropped column was > 0 in the join)
    schema = experiment.schema
    order_cols = [AC.GEOCODE, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP] + schema.dimnames
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    all_cols = order_cols + [AC.ORIG, AC.PRIV]
    
    limit_num = None
    
    limit_df = experiment.getDF().sort(order_cols).persist()
    if limit_num is not None:
        limit_df = limit_df.sort(order_cols).limit(limit_num).persist()
        sdftools.show(limit_df, f"DF with {limit_num} rows", limit_num)
    
    exp_df = limit_df.persist()

    # 1.
    orig_df = limit_df.drop(AC.PRIV).sort(order_cols).persist()
    priv_df = limit_df.drop(AC.ORIG).sort(order_cols).persist()
    sdftools.show(orig_df, "DF with only CEF values", 40)
    sdftools.show(priv_df, "DF with only MDF values", 40)
    sdftools.show(orig_df.count(), "CEF row count")
    sdftools.show(priv_df.count(), "MDF row count")
    
    # 2.
    orig_df = orig_df.filter(sf.col(AC.ORIG) > 0).sort(order_cols).persist()
    priv_df = priv_df.filter(sf.col(AC.PRIV) > 0).sort(order_cols).persist()
    sdftools.show(orig_df, "CEF DF with only nonzeros", 40)
    df = experiment.getDF()
    schema = experiment.schema
    sdftools.print_item(df, "Experiment DF")

    ##############################
    # Accuracy Metrics
    ##############################
    """
    Mean / Median Absolute Error (MAE):
        1. Calculate total population at County geographic level
        2. Calculate |MDF-CEF| for the total populations for each county
        3. Calculate the mean or median across all county total populations
    """
    # 1a. Aggregate to County geographic level
    county_df = sdftools.aggregateGeolevels(spark, df, [C.COUNTY])
    sdftools.show(county_df, "Counties")

    # 1b. Answer the "total" query for all counties
    county_totals_df = sdftools.answerQueries(county_df,
                                              schema,
                                              "total",
                                              labels=True)
    sdftools.show(county_totals_df, "County total pops")

    # 2. Calculate L1(MDF, CEF)
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    abs_error_county_totals_df = sdftools.getL1(county_totals_df,
                                                colname="AbsError",
                                                col1=AC.PRIV,
                                                col2=AC.ORIG)
    sdftools.show(abs_error_county_totals_df, "Absolute error of county total populations")
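    # 3. (Sketch, not from the original script.) Take the mean and median of the
    #    absolute errors across all county total populations. "AbsError" is the
    #    column created by getL1 above; using percentile_approx for the median is
    #    an assumption about how one might finish this step.
    mae_df = abs_error_county_totals_df.agg(
        sf.avg("AbsError").alias("MAE"),
        sf.expr("percentile_approx(AbsError, 0.5)").alias("MedianAbsError"))
    sdftools.show(mae_df, "Mean and median absolute error of county total pops")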
Example #10
get5, get6 = MAPE(get)
path5 = save_location_linux + "MAPE.csv"
pdf5 = get5.toPandas()
du.makePath(du.getdir(path5))
pdf5.to_csv(path5, index=False)

#get6=MAPE_avg(get5)
path6 = save_location_linux + "MAPE_avg.csv"
pdf6 = get6.toPandas()
du.makePath(du.getdir(path6))
pdf6.to_csv(path6, index=False)

quantile = [0.9]
zef = sdftools.getGroupQuantiles(get5, 'MAPE', 'geolevel', quantile)
sdftools.show(zef)

path = f"/mnt/users/rao00316/amaz/"

all_files = glob.glob(os.path.join(path, "*.csv"))

writer = pandas.ExcelWriter('out.xlsx', engine='xlsxwriter')

for f in all_files:
    df = pandas.read_csv(f)
    df.to_excel(writer,
                sheet_name=os.path.splitext(os.path.basename(f))[0],
                index=False)

writer.save()
    geounits = toytools.getToyGeounitData_GeounitNode(schema, geocodes, geocode_dict, raw_params={'low': 0, 'high': 100})
        
    rdd = spark.sparkContext.parallelize(geounits).persist()

    sdftools.print_item(rdd.take(1), "One of the toy example geounits")

    # use Analysis to transform the rdd of geounitnodes into a spark dataframe
    df = datatools.rdd2df(rdd, schema)
    sdftools.print_item(df, "Toy example DF", 300)


    # aggregate geolevels
    df = df.withColumn("block", sf.col(AC.GEOCODE)[0:3]).persist()
    df = df.withColumn("county", sf.col(AC.GEOCODE)[0:2]).persist()
    df = df.withColumn("nation", sf.col(AC.GEOCODE)[0:1]).persist()
    sdftools.show(df, "df with geolevel crosswalk columns")
    df = sdftools.aggregateGeolevels(spark, df, ['block', 'county', 'nation'])
    sdftools.show(df, "df after geolevel aggregation", 1000)

    # answer total query
    qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False)
    sdftools.show(qdf, "Query df with the query 'total'", 1000)
    
    # select geounits by quantile bins
    rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY])
    sdftools.show(rdd.collect(), "Row groups")
    

    def row_selection_mapper(rows, selection_function, **selection_kwargs):
        pandas_df = pandas.DataFrame(rows)
        pandas_df = selection_function(pandas_df, **selection_kwargs)
 def queryLp(df, p, groupby=[AC.GEOLEVEL, AC.GEOCODE, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]):
     """
     Calculates the L^p-norm for each unique (GEOLEVEL, GEOCODE, QUERY, RUN_ID, PLB, BUDGET_GROUP) group
     """
     sdftools.show(p, "Value of p in the L^p metric")
     if p == "inf":
         # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
         df = df.withColumn("L^inf_norm", sf.abs(sf.col(AC.PRIV) - sf.col(AC.ORIG))).persist()
         sdftools.show(df, "L^inf_norm as | protected - orig | before taking the max")
         df = df.groupBy(groupby).agg(sf.max(sf.col("L^inf_norm"))).persist()
         sdftools.show(df, "L^inf_norm after taking the max per group")
         df = sdftools.stripSQLFromColumns(df).persist()
     else:
         df = df.withColumn(f"L^{p}", sf.pow(sf.abs(sf.col(AC.PRIV) - sf.col(AC.ORIG)), sf.lit(p))).persist()
         sdftools.show(df, f"L^{p} after taking | protected - orig | ^ {p}")
         df = df.groupBy(groupby).sum().persist()
         sdftools.show(df, f"L^{p} after groupby and sum")
         df = sdftools.stripSQLFromColumns(df).persist()
         df = df.withColumn(f"L^{p}_norm", sf.pow(sf.col(f"L^{p}"), sf.lit(1/p))).persist()
         sdftools.show(df, f"L^{p} after taking {p}-th root of the sum")
         df = sdftools.stripSQLFromColumns(df).persist()
     return df
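 # A small self-contained sanity check of the L^p metric above (an illustrative
 # sketch, not part of the original module): for one toy pair of count vectors,
 # L^1 sums |priv - orig|, L^2 is the Euclidean distance, and L^inf is the
 # largest single-cell deviation.
 def _lp_toy_check():
     orig = [10, 0, 5]
     priv = [8, 1, 5]
     diffs = [abs(p - o) for o, p in zip(orig, priv)]
     l1 = sum(diffs)                          # 3
     l2 = sum(d ** 2 for d in diffs) ** 0.5   # sqrt(5) ≈ 2.236
     linf = max(diffs)                        # 2
     return l1, l2, linf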
 ########################################
 # Calculating L^p and Hellinger Metrics
 ########################################
 
 # 0a. Aggregate Blocks to get Geographic Units at all desired Geographic Levels
 geoleveldf = sdftools.aggregateGeolevels(spark, df, geolevels)
 
 # 0b. Answer Queries
 querydf = sdftools.answerQueries(geoleveldf, schema, queries, labels=True)
 
 # 1. Calculate L(orig, priv) and H(orig, priv) for
 #    detailed cells, marginals, total (for the queries listed in "queries")
 df_L1 = queryLp(querydf, 1)
 df_L2 = queryLp(querydf, 2)
 df_Linf = queryLp(querydf, "inf")
 
 sdftools.show(df_L1, "L^1 norm for the queries")
 sdftools.show(df_L2, "L^2 norm for the queries")
 sdftools.show(df_Linf, "L^inf norm for the queries")
 
 df_H = queryHellinger(querydf)
 
 sdftools.show(df_H, "Hellinger metric for the queries")
 
 # 2. Average L^p and H across geounits in the geolevel
 # removed AC.GEOCODE from the groupby to aggregate across all geounits
 groupby = [AC.GEOLEVEL, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]
 df_L1_avg = df_L1.groupBy(groupby).agg(sf.avg(sf.col("L^1_norm"))).persist()
 df_L2_avg = df_L2.groupBy(groupby).agg(sf.avg(sf.col("L^2_norm"))).persist()
 df_Linf_avg = df_Linf.groupBy(groupby).agg(sf.avg(sf.col("L^inf_norm"))).persist()
 
 sdftools.show(df_L1_avg, "L^1 norm for the queries, averaged across geounits in each geolevel")
Example #14
    # toggle to_s3=True|False to save|not save this analysis script to s3
    analysis.save_analysis_script(to_linux=False, to_s3=False)
    
    # save/copy the log file?
    analysis.save_log(to_linux=False, to_s3=False)
    
    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark



    # build an example schema
    schema = Schema("example", ['a', 'b', 'c'], (2,3,5))
    sdftools.show(schema, "Toy example Schema")
    
    # build a set of GeounitNodes to use
    geocodes = ['000', '001', '002', '003', '010', '011', '012', '020', '022']
    geocode_dict = {3: 'block', 2: 'county'}
    
    # build geounits
    geounits = toytools.getToyGeounitData(schema, geocodes, geocode_dict)
    
    rdd = spark.sparkContext.parallelize(geounits).persist()

    sdftools.show(rdd.take(1), "One of the toy example geounits")

    # use Analysis to transform the rdd of geounitnodes into a spark dataframe
    df = datatools.rdd2df(rdd, schema)
    sdftools.show(df, "Toy example DF", 300)
Example #15
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)

    spark = analysis.spark

    ######################################################################
    # Use the orig and priv run data to generate Spark DFs
    # then join those DFs on the 'geocode' and schema dimnames columns
    ######################################################################
    # 1. Need to create an experiment for the cef-related DAS runs AND
    #    an experiment for the mdf-related DAS runs
    path_without_orig = f"{AC.S3_BASE}lecle301/withoutRawForBrett_topdown_ri44/"
    syn_experiment = analysis.make_experiment("without_orig",
                                              path_without_orig)

    path_with_orig = f"{AC.S3_BASE}lecle301/withRawForBrett_topdown_ri44/"
    raw_experiment = analysis.make_experiment("with_orig", path_with_orig)

    # 2. Pass the cef experiment object and the mdf experiment object to
    #    datatools.getJoinedDF, which will create the joined MDF and CEF DF
    df = datatools.getJoinedDF(syn_experiment, raw_experiment)
    sdftools.show(df, "Joined Experiment DF")
    sdftools.show(df.count(), "# rows in Joined Experiment DF")

    groupby = [AC.GEOCODE, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]
    sdftools.show(
        df.agg(*(sf.countDistinct(sf.col(c)).alias(c) for c in groupby)),
        f"Distinct counts of each column in {groupby}")
    # 3. Calculate the Signed Error, with the CEF counts binned
    bins = [0, 1, 10, 100, 1000, 10000]
    df = sdftools.getSignedErrorByTrueCountRuns(querydf, bins)
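    # (Sketch) The signed error and relative error used below are assumed to be
    # priv - orig and (priv - orig) / orig, with each row assigned to the bin of
    # `bins` that contains its CEF (orig) count. The exact column logic lives in
    # sdftools.getSignedErrorByTrueCountRuns; illustratively:
    #   signed_error   = sf.col(AC.PRIV) - sf.col(AC.ORIG)
    #   re             = (sf.col(AC.PRIV) - sf.col(AC.ORIG)) / sf.col(AC.ORIG)
    #   orig_count_bin = label of the interval in `bins` containing sf.col(AC.ORIG)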

    # 4. Calculate quantiles within groups
    quantiles = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    groupby = [
        'orig_count_bin', AC.GEOLEVEL, AC.QUERY, AC.RUN_ID, AC.PLB,
        AC.BUDGET_GROUP
    ]

    # 4a. Calculate within-group quantiles for 'signed error'
    signed_error_quantile_df = sdftools.getGroupQuantiles(
        df, columns=['signed_error'], groupby=groupby, quantiles=quantiles)
    sdftools.show(signed_error_quantile_df.persist(),
                  f"Signed error quantiles by group '{groupby}'", 1000)

    # 4b. Calculate within-group quantiles for 'relative error'
    re_quantile_df = sdftools.getGroupQuantiles(df,
                                                columns=['re'],
                                                groupby=groupby,
                                                quantiles=quantiles)
    sdftools.show(re_quantile_df,
                  f"Relative error quantiles by group '{groupby}'", 1000)

    # 5. Calculate averages within groups
    # 5a. Calculate signed error averages by group
    signed_error_avg_df = df.groupBy(groupby).agg(
        sf.avg("signed_error")).persist()
    sdftools.show(signed_error_avg_df, "signed_error_average_by_run", 1000)
        pandas_df = pandas.DataFrame(rows)
        for bindex in range(0, len(buckets)):
            pandas_df[f"Bin{bindex}"] = (
                buckets[bindex][0] <=
                pandas_df[column]) & (pandas_df[column] <= buckets[bindex][1])
            rows = pandas_df.to_dict('records')
        return rows

    def binIndexToInteger(row, buckets):
        for bindex, bucket in enumerate(buckets):
            if row[f"Bin{bindex}"] == True:
                return str(bucket)

    # loop over each table, calculating the queries for each one
    for table_name, queries in tabledict.items():
        sdftools.show(queries,
                      f"The queries associated with table '{table_name}'")
        get, get2, get3 = MAE(spark, df, geolevels, queries, schema)

        # answer the queries within the table
        #df_table = sdftools.answerQueries(df_geolevel, schema, queries, labels=True).persist()
        rdd = sdftools.getRowGroupsAsRDD(get, groupby=[AC.GEOLEVEL, AC.QUERY])
        rdd = rdd.flatMapValues(
            lambda rows: sepBounds(rows, 'orig', buckets)).persist()
        rdd = rdd.map(lambda row: Row(**row[1]))
        df = rdd.toDF().persist()
        metric_name = "Avg( |q(MDF) - q(CEF)| )"
        x_axis_variable_name = 'CEF Count, Binned'

        df = df.groupby([
            'geocode', 'geolevel', 'level', 'Bin0', 'Bin1', 'Bin2', 'Bin3',
            'Bin4', 'Bin5'
Example #18
    # Get the DF and schema
    schema = experiment.schema
    df = experiment.getDF()

    # Get the geolevels (faster to do this before looping if the queries
    # are to be answered over the same geolevels; otherwise, can perform this
    # step in the loop)
    geolevels = [
        C.US, C.STATE, C.COUNTY, C.TRACT_GROUP, C.TRACT, C.BLOCK_GROUP,
        C.BLOCK, C.SLDL, C.SLDU
    ]
    df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)

    # loop over each table, calculating the queries for each one
    for table_name, queries in tabledict.items():
        sdftools.show(queries,
                      f"The queries associated with table '{table_name}'")

        # answer the queries within the table
        df_table = sdftools.answerQueries(df_geolevel,
                                          schema,
                                          queries,
                                          labels=True).persist()
        sdftools.show(df_table,
                      f"The DF with answers for the table '{table_name}'")

        # further computations...

        # plot and/or save the results