def age_quantile_lineplot(df, saveloc, product, state):
    """
    Plot -avg(quantile_L1) as a function of privacy-loss budget, one line
    per geolevel, producing one PDF per percentile present in df.

    Parameters
    ----------
    df : pandas.DataFrame
        Expected columns: geolevel, plb, run_id, percentile, 'category',
        AC.QUERY, and 'avg(quantile_L1)'.  Assumed to contain exactly one
        distinct query and one distinct category.
    saveloc : str
        Directory the PDFs are written to (slash-terminated via du.addslash).
    product : str
        Data product name, used only in the plot title.
    state : str
        State name, used only in the plot title.
    """
    # Order the geolevel column by the global CATEGORIES ordering,
    # restricted to the geolevels actually present in df.
    unique_geolevels = df.geolevel.unique()
    categories = [x for x in CATEGORIES if x in unique_geolevels]
    df.geolevel = pandas.Categorical(df.geolevel, categories=categories)
    query = df[AC.QUERY].unique().tolist().pop()
    category = df['category'].unique().tolist().pop()
    category = " * ".join(category.split("."))
    df.plb = df.plb.astype('float')
    for percentile in df['percentile'].unique():
        data = df[df['percentile'].isin([percentile])]
        print(data)
        columns = [AC.GEOLEVEL, AC.PLB, AC.RUN_ID, 'avg(quantile_L1)']
        print(columns)
        data = data[columns]
        data = data.sort_values(columns)
        # Negate so that larger (closer to zero) means more accurate.
        data['negative_avg(quantile_L1)'] = data['avg(quantile_L1)'] * -1

        minval = min(data['negative_avg(quantile_L1)'])
        maxval = max(data['negative_avg(quantile_L1)']) + 1
        title = f"Statistic: {query} | Category: {category}\nAccuracy as a Fxn of Privacy-Loss Budget (for {state}), Geolevel\n(Data Product: {product})"
        fig, ax = plt.subplots()

        plbs = [float(x) for x in data.plb.unique()]
        plb_max = max(plbs)

        # BUGFIX: group/iterate by AC.GEOLEVEL — the column actually selected
        # into `data` above.  The previous AC.GEOLEVELS (plural) is not a
        # column of `data` and raised a KeyError; the sibling
        # geolevel_tvd_lineplot groups by AC.GEOLEVEL as well.
        data = data.groupby([AC.PLB, AC.GEOLEVEL], as_index=False).mean()
        print(data)
        for label, group in data.groupby(AC.GEOLEVEL):
            plot = group.plot(x='plb',
                              y='negative_avg(quantile_L1)',
                              ylim=(minval, maxval),
                              style=".-",
                              fontsize=6,
                              alpha=0.85,
                              ax=ax,
                              xlim=(-0.5, plb_max + 0.5),
                              markersize=3,
                              linewidth=1.0,
                              label=label)
            plot.set_ylabel("", fontsize=7)
            plot.set_xlabel("Privacy Loss Budget (PLB)", fontsize=7)
            ax.set_title(title, {"fontsize": 8})

        legend = plt.legend(loc='lower right',
                            frameon=False,
                            fontsize=6,
                            ncol=4,
                            title='Geolevels')
        legend.get_title().set_fontsize(7)

        path = du.addslash(saveloc)
        fsqueryname = ".".join(query.split(" * "))
        query_saveloc = f"{path}age_quantile_{fsqueryname}_lineplot_percentile_{percentile}.pdf"
        print(query_saveloc)
        plt.savefig(query_saveloc)
        plt.clf()

    plt.close()
# ---- Beispiel #2 (scraped-example separator; "0" was vote-count residue) ----
    def __init__(self, data_path, schema_name, budget_group=None, run_id=None):
        """
        Load metadata for a DAS run rooted at data_path, e.g.
        .../data-run8.0-epsilon4.0-BlockNodeDicts/

        Parameters
        ----------
        data_path : str
            Path to the run directory; its final component encodes run
            metadata as dash-separated fields (parsed by parseDataInfo).
        schema_name : str
            Name handed to SchemaMaker.fromName to build self.schema.
        budget_group : optional
            If given, run_id must also be given; both are used directly
            instead of being parsed out of data_path.
        run_id : str, optional
            Run identifier such as "run8.0"; the number after the "run"
            prefix (before the first '.') becomes self.run_num.
        """
        self.data_path = du.addslash(data_path)
        self.schema_name = schema_name
        self.schema = SchemaMaker.fromName(self.schema_name)

        # Extract the run metadata from the last path component
        # (index -2 because du.addslash guarantees a trailing slash).
        data_info = self.data_path.split("/")[-2]
        #assert data_info.startswith('data'), "The wrong data path has been provided... Cannot load DASrun"
        #TODO: Replace above assert with something more appropriate ('data' was overly narrow)

        print(f"data_info.split(-): {data_info.split('-')}")
        #_, self.run_id, self.budget_group, _ = data_info.split('-')
        # budget_group and run_id must be supplied together or not at all.
        if budget_group is None:
            assert run_id is None
            self.parseDataInfo(data_info)
        else:
            assert run_id is not None
            self.budget_group = budget_group
            self.run_id = run_id
        # "runN.M" -> "N"
        self.run_num = self.run_id[3:].split('.')[0]
        self.plb = self.budget_group
        print(f"Detected plb, run_id: {self.plb}, {self.run_id}")
# ---- Beispiel #3 (scraped-example separator; "0" was vote-count residue) ----
def geolevel_tvd_heatmap(df, saveloc, product, state):
    """
    Draw one heatmap per query: 1-TVD with geolevels as rows and
    privacy-loss budgets as columns, each saved as a PDF under saveloc.
    """
    present = df.geolevel.unique()
    ordered_levels = [g for g in GEOLEVEL_CATEGORIES if g in present]
    df.geolevel = pandas.Categorical(df.geolevel, categories=ordered_levels)
    all_queries = df[AC.QUERY].unique()
    print(all_queries)
    df.plb = df.plb.astype("float")
    for query in all_queries:
        subset = df[df[AC.QUERY].isin([query])]
        print(subset)
        keep = [AC.GEOLEVEL, AC.PLB, AC.RUN_ID, "1-TVD"]
        # Average the 1-TVD metric over runs for each (geolevel, plb) pair.
        subset = (subset[keep]
                  .sort_values([AC.GEOLEVEL])
                  .groupby([AC.GEOLEVEL, AC.PLB], as_index=False)
                  .mean())
        print(subset)
        # One row per geolevel, one column per PLB.
        grid = subset.pivot(index=AC.GEOLEVEL, columns=AC.PLB, values="1-TVD")
        print(grid.to_string())
        sns.set(font_scale=0.4)
        fig, ax = plt.subplots()
        title = f"Statistic: {query}\nAccuracy as a Fxn of Privacy-Loss Budget (for {state}), Geolevel\n(Data Product: {product})"
        plt.title(title, fontsize=6)
        heat = sns.heatmap(grid, annot=True, linewidths=0.5, ax=ax, cbar=False, vmin=0.0, vmax=1.0, cmap="Blues", fmt=".4f")
        figure = heat.get_figure()
        fsqueryname = ".".join(query.split(" * "))
        query_saveloc = f"{du.addslash(saveloc)}geolevel_tvd_{fsqueryname}_heatmap.pdf"
        print(query_saveloc)
        figure.savefig(query_saveloc)
        plt.clf()

    plt.close()
def geolevel_tvd_lineplot(df, saveloc, product, state):
    """
    Draw one lineplot per query: 1-TVD as a function of privacy-loss
    budget with one line per geolevel, each saved as a PDF under saveloc.
    """
    present = df.geolevel.unique()
    level_order = [g for g in CATEGORIES if g in present]
    df.geolevel = pandas.Categorical(df.geolevel, categories=level_order)
    all_queries = df[AC.QUERY].unique()
    print(all_queries)
    df.plb = df.plb.astype('float')
    for query in all_queries:
        subset = df[df[AC.QUERY].isin([query])]
        print(subset)
        keep = [AC.GEOLEVEL, AC.PLB, AC.RUN_ID, "1-TVD"]
        subset = subset[keep].sort_values([AC.PLB, AC.RUN_ID, AC.GEOLEVEL])

        # 1-TVD lives in [0, 1]; a hair of headroom keeps markers visible.
        y_lo = 0.0
        y_hi = 1.01
        title = f"Statistic: {query}\nAccuracy as a Fxn of Privacy-Loss Budget (for {state}), Geolevel\n(Data Product: {product})"
        fig, ax = plt.subplots()

        plb_max = max(float(p) for p in subset.plb.unique())

        # Average 1-TVD over runs for each (geolevel, plb) pair.
        subset = subset.groupby([AC.GEOLEVEL, AC.PLB], as_index=False).mean()
        print(subset)
        for level_name, level_rows in subset.groupby(AC.GEOLEVEL):
            axes = level_rows.plot(x='plb',
                                   y='1-TVD',
                                   ylim=(y_lo, y_hi),
                                   style=".-",
                                   fontsize=6,
                                   alpha=0.85,
                                   ax=ax,
                                   xlim=(-0.5, plb_max + 0.5),
                                   markersize=3,
                                   linewidth=1.0,
                                   label=level_name)
            axes.set_ylabel("1-TVD", fontsize=7)
            axes.set_xlabel("Privacy Loss Budget (PLB)", fontsize=7)
            ax.set_title(title, {"fontsize": 8})

        legend = plt.legend(loc='lower right',
                            frameon=False,
                            fontsize=6,
                            ncol=4,
                            title="Geolevels")
        legend.get_title().set_fontsize(7)

        fsqueryname = ".".join(query.split(" * "))
        query_saveloc = f"{du.addslash(saveloc)}geolevel_tvd_{fsqueryname}_lineplot.pdf"
        print(query_saveloc)
        plt.savefig(query_saveloc)
        plt.clf()

    plt.close()
# ---- Beispiel #5 (scraped-example separator; "0" was vote-count residue) ----
from programs.schema.schemas.schemamaker import SchemaMaker

import programs.sparse as sp
import scipy.sparse as ss

if __name__ == "__main__":
    ################################################################
    # Set the save_location to your own JBID (and other folder(s))
    # it will automatically find your JBID
    # if something different is desired, just pass what is needed
    # into the setuptools.setup function.
    ################################################################
    jbid = os.environ.get('JBID', 'temp_jbid')
    save_folder = "analysis_results/"

    save_location = du.addslash(f"{jbid}/{save_folder}")

    spark_loglevel = "ERROR"
    analysis = setuptools.setup(save_location=save_location,
                                spark_loglevel=spark_loglevel)

    # save the analysis script?
    # toggle to_linux=True|False to save|not save this analysis script locally
    # toggle to_s3=True|False to save|not save this analysis script to s3
    analysis.save_analysis_script(to_linux=False, to_s3=False)

    # save/copy the log file?
    analysis.save_log(to_linux=False, to_s3=False)

    # zip the local results to s3?
    analysis.zip_results_to_s3(flag=False)
        #   "hispanic * cenrace"
        #  ],
    # } NOTE(review): stray closing brace from a truncated queries-dict
    # fragment fused in by the scraper; commented out so the file parses.

    # Get the DF and schema
    schema = experiment.schema
    df = experiment.getDF()

    # Get the geolevels (faster to do this before looping if the queries
    # are to be answered over the same geolevels; otherwise, can perform this
    # step in the loop)
    geolevels = [C.COUNTY]
    df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)
    buckets = [(0, 0), (1, 10), (11, 100), (100, 1000), (1000, 10000),
               (10000, float('inf'))]
    path = du.addslash(save_location)

    def MAE(spark, df, geolevels, queries, schema):
        """
        Attach signed and absolute error columns to the query answers and
        return them together with their per-geocode and per-geolevel means.

        Returns a 3-tuple: (row-level DF with 'diff' and 'abs diff',
        averages grouped by geocode/geolevel/level, averages by geolevel).
        """
        answers = sdftools.getAnswers(spark, df, geolevels, schema, queries)
        # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
        answers = answers.withColumn('diff', sf.col('priv') - sf.col('orig'))
        answers = answers.withColumn('abs diff', sf.abs(sf.col('diff')))
        per_geocode = answers.groupby(['geocode', 'geolevel', 'level']).avg()
        per_geolevel = answers.groupby(['geolevel']).avg()
        return answers, per_geocode, per_geolevel

    def sepBounds(rows, column, buckets):
        pandas_df = pandas.DataFrame(rows)
        for bindex in range(0, len(buckets)):
            pandas_df[f"Bin{bindex}"] = (
                buckets[bindex][0] <=