def age_quantile_lineplot(df, saveloc, product, state):
    """Plot -1 * avg(quantile_L1) as a function of PLB, one line per geolevel,
    saving one PDF per unique percentile found in df.

    Args:
        df: pandas DataFrame with (at least) columns geolevel, plb, run_id,
            percentile, 'avg(quantile_L1)', 'category', and the AC.QUERY column.
            Exactly one distinct query and one distinct category are expected.
        saveloc: directory in which the PDFs are written.
        product: data-product label, used only in the plot title.
        state: state label, used only in the plot title.

    Side effects: mutates df (categorical geolevel, float plb), prints debug
    output, and writes one PDF per percentile under saveloc.
    """
    unique_geolevels = df.geolevel.unique()
    # Impose the canonical geolevel ordering, restricted to levels present.
    categories = [x for x in CATEGORIES if x in unique_geolevels]
    df.geolevel = pandas.Categorical(df.geolevel, categories=categories)
    # df is expected to contain a single query/category; .pop() takes it.
    query = df[AC.QUERY].unique().tolist().pop()
    category = df['category'].unique().tolist().pop()
    category = " * ".join(category.split("."))
    df.plb = df.plb.astype('float')
    for percentile in df['percentile'].unique():
        data = df[df['percentile'].isin([percentile])]
        print(data)
        columns = [AC.GEOLEVEL, AC.PLB, AC.RUN_ID, 'avg(quantile_L1)']
        print(columns)
        data = data[columns]
        data = data.sort_values(columns)
        # Negate the error so that "higher is better" on the y-axis.
        data['negative_avg(quantile_L1)'] = data['avg(quantile_L1)'] * -1
        minval = min(data['negative_avg(quantile_L1)'])
        maxval = max(data['negative_avg(quantile_L1)']) + 1
        title = f"Statistic: {query} | Category: {category}\nAccuracy as a Fxn of Privacy-Loss Budget (for {state}), Geolevel\n(Data Product: {product})"
        fig, ax = plt.subplots()
        plbs = [float(x) for x in data.plb.unique()]
        plb_max = max(plbs)
        # BUGFIX: was AC.GEOLEVELS, but the column selected above is
        # AC.GEOLEVEL (matching geolevel_tvd_lineplot); mean() averages
        # across runs for each (plb, geolevel) pair.
        data = data.groupby([AC.PLB, AC.GEOLEVEL], as_index=False).mean()
        print(data)
        for label, group in data.groupby(AC.GEOLEVEL):
            plot = group.plot(x='plb', y='negative_avg(quantile_L1)', ylim=(minval, maxval), style=".-", fontsize=6, alpha=0.85, ax=ax, xlim=(-0.5, plb_max + 0.5), markersize=3, linewidth=1.0, label=label)
            plot.set_ylabel("", fontsize=7)
            plot.set_xlabel("Privacy Loss Budget (PLB)", fontsize=7)
        ax.set_title(title, {"fontsize": 8})
        legend = plt.legend(loc='lower right', frameon=False, fontsize=6, ncol=4, title='Geolevels')
        legend.get_title().set_fontsize(7)
        path = du.addslash(saveloc)
        # Turn "a * b" query names into filesystem-friendly "a.b".
        fsqueryname = ".".join(query.split(" * "))
        query_saveloc = f"{path}age_quantile_{fsqueryname}_lineplot_percentile_{percentile}.pdf"
        print(query_saveloc)
        plt.savefig(query_saveloc)
        plt.clf()
        plt.close()
def __init__(self, data_path, schema_name, budget_group=None, run_id=None):
    """Load run metadata for a DAS run from a data path.

    Args:
        data_path: path like .../data-run8.0-epsilon4.0-BlockNodeDicts/ ;
            the second-to-last path component encodes the run info.
        schema_name: name passed to SchemaMaker.fromName to build the schema.
        budget_group: optional explicit budget group; if given, run_id must
            also be given (and vice versa).
        run_id: optional explicit run id, e.g. "run8.0".

    Raises:
        AssertionError: if exactly one of budget_group / run_id is provided.
    """
    self.data_path = du.addslash(data_path)
    self.schema_name = schema_name
    self.schema = SchemaMaker.fromName(self.schema_name)

    # extract data from the data_path
    data_info = self.data_path.split("/")[-2]
    #assert data_info.startswith('data'), "The wrong data path has been provided... Cannot load DASrun"
    #TODO: Replace above assert with something more appropriate ('data' was overly narrow)
    print(f"data_info.split(-): {data_info.split('-')}")
    #_, self.run_id, self.budget_group, _ = data_info.split('-')
    # budget_group and run_id must be supplied together or not at all;
    # use identity comparison with None per PEP 8 (was "== None"/"!= None").
    if budget_group is None:
        assert run_id is None
        self.parseDataInfo(data_info)
    else:
        assert run_id is not None
        self.budget_group = budget_group
        self.run_id = run_id
        # run_id looks like "run8.0": strip the "run" prefix, keep the
        # integer part before the dot as the run number.
        self.run_num = self.run_id[3:].split('.')[0]
        self.plb = self.budget_group
    print(f"Detected plb, run_id: {self.plb}, {self.run_id}")
def geolevel_tvd_heatmap(df, saveloc, product, state):
    """For each query in df, save a heatmap PDF of mean 1-TVD with
    geolevels as rows and PLB values as columns.

    Mutates df (categorical geolevel, float plb), prints debug output,
    and writes one PDF per query under saveloc.
    """
    present = df.geolevel.unique()
    # Canonical geolevel ordering, restricted to the levels present.
    ordered = [g for g in GEOLEVEL_CATEGORIES if g in present]
    df.geolevel = pandas.Categorical(df.geolevel, categories=ordered)
    queries = df[AC.QUERY].unique()
    print(queries)
    df.plb = df.plb.astype("float")
    for query in queries:
        subset = df[df[AC.QUERY].isin([query])]
        print(subset)
        subset = subset[[AC.GEOLEVEL, AC.PLB, AC.RUN_ID, "1-TVD"]]
        subset = subset.sort_values([AC.GEOLEVEL])
        # Average across runs for each (geolevel, plb) cell.
        subset = subset.groupby([AC.GEOLEVEL, AC.PLB], as_index=False).mean()
        print(subset)
        pivoted = subset.pivot(index=AC.GEOLEVEL, columns=AC.PLB, values="1-TVD")
        print(pivoted.to_string())
        sns.set(font_scale=0.4)
        fig, ax = plt.subplots()
        title = f"Statistic: {query}\nAccuracy as a Fxn of Privacy-Loss Budget (for {state}), Geolevel\n(Data Product: {product})"
        plt.title(title, fontsize=6)
        heat = sns.heatmap(pivoted, annot=True, linewidths=0.5, ax=ax, cbar=False, vmin=0.0, vmax=1.0, cmap="Blues", fmt=".4f")
        figure = heat.get_figure()
        path = du.addslash(saveloc)
        # Turn "a * b" query names into filesystem-friendly "a.b".
        fsqueryname = ".".join(query.split(" * "))
        destination = f"{path}geolevel_tvd_{fsqueryname}_heatmap.pdf"
        print(destination)
        figure.savefig(destination)
        plt.clf()
        plt.close()
def geolevel_tvd_lineplot(df, saveloc, product, state):
    """For each query in df, save a lineplot PDF of mean 1-TVD versus PLB,
    drawing one line per geolevel.

    Mutates df (categorical geolevel, float plb), prints debug output,
    and writes one PDF per query under saveloc.
    """
    present = df.geolevel.unique()
    # Canonical geolevel ordering, restricted to the levels present.
    ordered = [g for g in CATEGORIES if g in present]
    df.geolevel = pandas.Categorical(df.geolevel, categories=ordered)
    queries = df[AC.QUERY].unique()
    print(queries)
    df.plb = df.plb.astype('float')
    for query in queries:
        subset = df[df[AC.QUERY].isin([query])]
        print(subset)
        subset = subset[[AC.GEOLEVEL, AC.PLB, AC.RUN_ID, "1-TVD"]]
        subset = subset.sort_values([AC.PLB, AC.RUN_ID, AC.GEOLEVEL])
        # 1-TVD lives in [0, 1]; a hair above 1 keeps the top marker visible.
        minval, maxval = 0.0, 1.01
        title = f"Statistic: {query}\nAccuracy as a Fxn of Privacy-Loss Budget (for {state}), Geolevel\n(Data Product: {product})"
        fig, ax = plt.subplots()
        plb_max = max(float(v) for v in subset.plb.unique())
        # Average across runs for each (geolevel, plb) pair.
        subset = subset.groupby([AC.GEOLEVEL, AC.PLB], as_index=False).mean()
        print(subset)
        for geolevel, group in subset.groupby(AC.GEOLEVEL):
            axes = group.plot(x='plb', y='1-TVD', ylim=(minval, maxval), style=".-", fontsize=6, alpha=0.85, ax=ax, xlim=(-0.5, plb_max + 0.5), markersize=3, linewidth=1.0, label=geolevel)
            axes.set_ylabel("1-TVD", fontsize=7)
            axes.set_xlabel("Privacy Loss Budget (PLB)", fontsize=7)
        ax.set_title(title, {"fontsize": 8})
        legend = plt.legend(loc='lower right', frameon=False, fontsize=6, ncol=4, title="Geolevels")
        legend.get_title().set_fontsize(7)
        path = du.addslash(saveloc)
        # Turn "a * b" query names into filesystem-friendly "a.b".
        fsqueryname = ".".join(query.split(" * "))
        destination = f"{path}geolevel_tvd_{fsqueryname}_lineplot.pdf"
        print(destination)
        plt.savefig(destination)
        plt.clf()
        plt.close()
from programs.schema.schemas.schemamaker import SchemaMaker
import programs.sparse as sp
import scipy.sparse as ss

if __name__ == "__main__":
    ################################################################
    # Results are saved under the current user's JBID (falling back
    # to 'temp_jbid'). To save somewhere else, pass a different
    # save_location into the setuptools.setup function.
    ################################################################
    jbid = os.environ.get('JBID', 'temp_jbid')
    save_folder = "analysis_results/"
    save_location = du.addslash(f"{jbid}/{save_folder}")

    analysis = setuptools.setup(save_location=save_location, spark_loglevel="ERROR")

    # Toggle to_linux / to_s3 to control whether this analysis script
    # is copied locally and/or to s3.
    analysis.save_analysis_script(to_linux=False, to_s3=False)
    # Likewise for the log file...
    analysis.save_log(to_linux=False, to_s3=False)
    # ...and for zipping the local results up to s3.
    analysis.zip_results_to_s3(flag=False)
# "hispanic * cenrace" # ], } # Get the DF and schema schema = experiment.schema df = experiment.getDF() # Get the geolevels (faster to do this before looping if the queries # are to be answered over the same geolevels; otherwise, can perform this # step in the loop) geolevels = [C.COUNTY] df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels) buckets = [(0, 0), (1, 10), (11, 100), (100, 1000), (1000, 10000), (10000, float('inf'))] path = du.addslash(save_location) def MAE(spark, df, geolevels, queries, schema): u = sdftools.getAnswers(spark, df, geolevels, schema, queries) # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production u = u.withColumn('diff', sf.col('priv') - sf.col('orig')) u = u.withColumn('abs diff', sf.abs(sf.col('diff'))) y = u.groupby(['geocode', 'geolevel', 'level']).avg() z = u.groupby(['geolevel']).avg() return u, y, z def sepBounds(rows, column, buckets): pandas_df = pandas.DataFrame(rows) for bindex in range(0, len(buckets)): pandas_df[f"Bin{bindex}"] = ( buckets[bindex][0] <=