def __init__(self, spark, save_location, analysis_script_path):
    """
    Example of save_location (since s3_base and linux_base are expected to be static):

        save_location = "moran331/analysis_results/logname_and_other_info/"

    which, for save_location_s3, becomes
        "s3://uscb-decennial-ite-das/users/moran331/analysis_results/logname_and_other_info/"
    and, for save_location_linux,
        "/mnt/users/moran331/analysis_results/logname_and_other_info/"
    """
    self.save_location = save_location

    self.save_location_s3 = f"{AC.S3_BASE}{save_location}"
    print("")
    print(f"___ANALYSIS_RESULTS_LOCATION_S3___: {self.save_location_s3}")
    print("")

    self.save_location_linux = f"{AC.LINUX_BASE}{save_location}"
    print("")
    print(f"___ANALYSIS_RESULTS_LOCATION_LINUX___: {self.save_location_linux}")
    print("")

    du.makePath(self.save_location_linux)

    self.spark = spark
    self.analysis_script_path = analysis_script_path
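# A minimal, self-contained sketch of how the two result paths compose from the shared
# save_location suffix. The literal base strings below are illustrative stand-ins taken from
# the docstring example; the real values come from the AC.S3_BASE / AC.LINUX_BASE constants.
S3_BASE = "s3://uscb-decennial-ite-das/users/"
LINUX_BASE = "/mnt/users/"

save_location = "moran331/analysis_results/logname_and_other_info/"

save_location_s3 = f"{S3_BASE}{save_location}"
save_location_linux = f"{LINUX_BASE}{save_location}"

assert save_location_s3 == "s3://uscb-decennial-ite-das/users/moran331/analysis_results/logname_and_other_info/"
assert save_location_linux == "/mnt/users/moran331/analysis_results/logname_and_other_info/"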
def saveRunData(path, config=None, feas_dict=None, rdd=None, batchSize=10):
    if path[-1] == '/':
        path = path[0:-1]

    # needed when not an s3 path, as the "with open" context assumes the folder already exists
    if not das_utils.isS3Path(path):
        das_utils.makePath(path)

    if config is not None:
        config_path = path + "/config.ini"
        logging.debug("Saving config to directory: {}".format(config_path))
        das_utils.saveConfigFile(config_path, config)

    if rdd is not None:
        logging.debug("Pickle Batch Size: {}".format(batchSize))
        data_path = path + "/data"
        logging.debug("Saving data to directory: {}".format(data_path))
        das_utils.savePickledRDD(data_path, rdd, batchSize=batchSize)

    if feas_dict is not None:
        for key in feas_dict.keys():
            # this seems redundant, but is actually needed for the accumulator
            feas_dict[key] = feas_dict[key].value
        logging.info("Feasibility dictionary: {}".format(feas_dict))
        feas_path = path + "/feas_dict.json"
        logging.debug("Saving feas_dict to directory: {}".format(feas_path))
        das_utils.saveJSONFile(feas_path, feas_dict)
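# Side note on the .value unwrapping above: Spark accumulators are not JSON-serializable, so the
# feasibility counters must be converted to plain ints before saveJSONFile can write them.
# A minimal sketch under that assumption (local SparkContext; names illustrative, not the DAS API):
import json
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
feas_dict = {"n_infeasible": sc.accumulator(0)}

def check(x):
    # tasks may add to an accumulator, but only the driver can read it
    if x < 0:
        feas_dict["n_infeasible"].add(1)
    return x

sc.parallelize([-1, 2, -3]).map(check).count()

plain = {key: acc.value for key, acc in feas_dict.items()}  # unwrap before serializing
print(json.dumps(plain))  # e.g. {"n_infeasible": 2}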
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, buckets=default_buckets, schema="DHCP_HHGQ"): """ Main plotting fxn. query : str, name of a valid query for the target experiment's schema table_name : str, name of a table (used for file-naming conventions) analysis : Analysis setuptools.setup object, organizes Analysis metadata spark : SparkSession object, attached to analysis object geolevels : [str, ...], geolevels to compute over for the current query buckets : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing schema : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3. """ print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}") schema_name = schema num_trials, paths, experiment_name, eps_str = getPathsAndName(schema_name, query, table_name, eps) print(f"Passing paths to Analysis experiment maker: {paths}") experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT) sdftools.print_item(experiment.__dict__, "Experiment Attributes") spark_df = experiment.getDF() print("df looks like:") spark_df.show() schema = experiment.schema sdftools.print_item(spark_df, "Flat Experiment DF") queries = [query] spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels) spark_df = sdftools.answerQueries(spark_df, schema, queries) spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist() missing_rows_pandas_df = sdftools.getMissingRowCounts(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.PLB, AC.BUDGET_GROUP]) missing_rows_dict = defaultdict(int) for index, row in missing_rows_pandas_df.iterrows(): #print(f"missing df row # {index} geolevel, sum(missing) = {row['geolevel']},{row['sum(missing)']}") missing_rows_dict[row['geolevel']] = row['sum(missing)'] spark_df.show() print("^^^^ with abs error, DF looks like ^^^^") metric_name = "Avg( |q(MDF) - q(CEF)| )" x_axis_variable_name = 'CEF Count, Binned' pandas_df = spark_df.toPandas() pandas_df = pandas_df.rename(columns={"abs_error":metric_name, "orig_count_bin":x_axis_variable_name}) plt.figure(1, figsize=(11,8.5)) plt.rc('axes', labelsize=8) print(f"pandas df before plotting has cols: {pandas_df.columns.values}") print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}") buckets = pandas_df[x_axis_variable_name].unique() buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name)) print(f"Sorted bucket names: {buckets}") # Saving data frame csv_savepath = experiment.save_location_linux + f"{experiment_name}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False) makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name, geolevels, pandas_df, buckets, schema_name, eps_str, missing_rows_dict, num_trials)
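# A small, self-contained sketch of the missing-row bookkeeping loop above, with synthetic data
# (column names mirror the snippet). Note the original assigns with '=', which assumes at most one
# row per geolevel in missing_rows_pandas_df; use '+=' if several rows per geolevel can appear.
from collections import defaultdict
import pandas as pd

missing_rows_pandas_df = pd.DataFrame({
    "geolevel": ["State", "County"],
    "sum(missing)": [0, 4],
})

missing_rows_dict = defaultdict(int)
for index, row in missing_rows_pandas_df.iterrows():
    missing_rows_dict[row['geolevel']] = row['sum(missing)']

print(dict(missing_rows_dict))  # {'State': 0, 'County': 4}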
def saveFigs(plot_savepath, filetypes, filename, plt):
    for filetype in filetypes:
        filetype_savepath = plot_savepath + filetype + "/"
        full_path = filetype_savepath + filename + "." + filetype
        print(f"Making dir-path to {filetype_savepath}\nAnd saving to {full_path}")
        du.makePath(filetype_savepath)
        if filetype != "pdf":
            plt.savefig(full_path)
        else:
            plt.savefig(full_path, dpi=300)
def saveRunData(self, path, feas_dict=None, rdd=None): self.annotate("saveRunData", verbose=True) if path[-1] == '/': path = path[0:-1] # RDD must be saved first, because it needs an empty prefix. if rdd is not None: output_datafile_name = os.path.join(path, self.output_datafname) if self.overwrite_flag: das_utils.clearPath(output_datafile_name) # needed when not an s3 path, as the with open context assumes the folder already exists if not das_utils.isS3Path(output_datafile_name): das_utils.makePath(output_datafile_name) output_metadata_file_name = output_datafile_name + "/0_metadata" # sorts before 'p' output_header_file_name = output_datafile_name + "/1_header" # sorts before 'p' but after '1' self.annotate(f"writing RDD to {output_datafile_name}") self.saveRDD(output_datafile_name, rdd) if self.write_metadata: now = datetime.datetime.now().isoformat() self.saveMetadata(path=output_metadata_file_name, now=now, count=rdd.count()) self.saveHeader(path=output_header_file_name) if self.s3cat: self.annotate(f"combining {output_datafile_name} with s3cat") s3cat.s3cat(output_datafile_name, demand_success=True, suffix=self.s3cat_suffix, verbose=self.s3cat_verbose) self.add_output_path(output_datafile_name + self.s3cat_suffix) else: self.add_output_path(output_datafile_name) config_path = os.path.join(path, C.CONFIG_INI) self.annotate("Saving config to directory: {}".format(config_path)) das_utils.saveConfigFile(config_path, self.config) if feas_dict is not None: for key in feas_dict.keys(): if hasattr(feas_dict[key], 'value'): feas_dict[key] = feas_dict[ key].value # this seems redundant, but is actually needed for the accumulator self.log_and_print(f"Feasibility dictionary: {feas_dict}") feas_path = os.path.join(path, C.FEAS_DICT_JSON) self.annotate(f"Saving feas_dict to directory: {feas_path}") das_utils.saveJSONFile(feas_path, feas_dict)
def main():
    spark = SparkSession.builder.appName('RI Redistricting Data - PL94_P12 - Extracting agecat totals').getOrCreate()

    experiments = [
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td10_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td1_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td3_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td025_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td05_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td001_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td01_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td2_1/"
    ]

    # add das_decennial zip file to the spark context (to be sent to the core nodes)
    spark.sparkContext.addPyFile("/mnt/users/moran331/das_decennial.zip")

    schema_name = "PL94_P12"
    schema = SchemaMaker.fromName(name=schema_name)

    for path in experiments:
        tree = treetools.RunTree(path)
        runs = tree.runs

        # union the sparse DFs for all runs of this experiment
        for r, run in enumerate(runs):
            if r == 0:
                df = sdftools.getSparseDF(spark, run, schema, run_columns=True).persist()
            else:
                df = df.union(sdftools.getSparseDF(spark, run, schema, run_columns=True)).persist()
        df = df.persist()

        plb = str(du.algset2plb(du.findallSciNotationNumbers(tree.algsets[0])[0]))
        df = df.withColumn("plb", sf.lit(plb))
        df.show()

        geodict = aggtools.getGeodict()
        geolevel = "State"
        mapping = geodict[geolevel]
        group = ["plb", "run_id", geolevel] + schema.dimnames

        # aggregate block-level geocodes up to the State prefix and sum
        geodf = df.withColumn(geolevel, df.geocode[0:mapping]).groupBy(group).sum().persist()
        geodf = sdftools.stripSQLFromColumns(geodf)
        geodf.show()

        queryname = "votingage"
        querydf = sdftools.getQueryDF(geodf, queryname, schema, basegroup=["run_id", geolevel, "plb"]).persist()
        querydf.show()

        savepath = f"/mnt/users/moran331/redistricting/agecat_redistricting_state_totals_2019_06_27/{tree.algsets[0]}/"
        du.makePath(savepath)
        # write the per-run votingage totals for this experiment (to_csv returns None)
        querydf.toPandas().to_csv(savepath + "votingage_all_25_runs.csv", index=False)
        print(f"---Agecat--- | \n{querydf.toPandas().to_string()}")
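# A minimal sketch of the geocode-prefix aggregation used above (df.geocode[0:mapping] followed by
# groupBy().sum()), with synthetic block geocodes and an assumed State prefix length of 2; the real
# prefix lengths come from aggtools.getGeodict().
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("4400700010011000", 3), ("4400700010011001", 5), ("2500100020021000", 7)],
    ["geocode", "count"],
)

state_prefix_len = 2  # assumed value of geodict["State"]
state_df = (df.withColumn("State", df.geocode[0:state_prefix_len])  # substring of the geocode
              .groupBy("State")
              .sum("count"))
state_df.show()
# expect two rows: State 44 -> sum(count) 8 and State 25 -> sum(count) 7 (row order may vary)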
def save_log(self, to_linux=True, to_s3=True):
    """
    Saves a copy of the output log locally (in linux) or in S3 (or both)

    Parameters
    ==========
    to_linux : bool (default is True)
        - Save the log locally?

    to_s3 : bool (default is True)
        - Save the log to S3?

    Notes
    =====
    - Will save the log in the analysis results folder (i.e. the save_location_linux/s3
      attributes) for 'this' instance of Analysis.
    """
    if to_linux:
        du.makePath(self.save_location_linux)
        print("___SAVE_LOG_TO_LINUX___")

    if to_s3:
        print("___SAVE_LOG_TO_S3___")
def save_analysis_script(self, to_linux=True, to_s3=True):
    """
    Saves a copy of the analysis script (.py file) currently being run locally (in linux) or in S3 (or both)

    Parameters
    ==========
    to_linux : bool (default is True)
        - Save the analysis script locally?

    to_s3 : bool (default is True)
        - Save the analysis script to S3?

    Notes
    =====
    - Will save the analysis script in the analysis results folder (i.e. the
      save_location_linux/s3 attributes) for 'this' instance of Analysis.
    """
    if to_linux:
        du.makePath(self.save_location_linux)
        print(self.save_location_linux)
        print("^^^^^ Created save location for analysis results ^^^^^\n")

        # copy the analysis script to the local save location (i.e. in Linux)
        shutil.copy2(self.analysis_script_path, self.save_location_linux)
        print(f"{self.save_location_linux}{self.analysis_script_path.split('/')[-1]}")
        print("^^^^^ Copied analysis script to save location for analysis results ^^^^^\n")

    if to_s3:
        # copy the analysis script to the s3 save location
        script_path = self.analysis_script_path.split("/")[-1]
        s3.put_s3url(f"{self.save_location_s3}{script_path}", self.analysis_script_path)
        print(f"{self.save_location_s3}{script_path}")
        print("^^^^^ Copied analysis script to save location in s3 ^^^^^\n")
data = [{ "Area": 'State', 'Value': no_state }, { "Area": 'County', 'Value': no_county }, { "Area": 'Tract', 'Value': no_tract }, { "Area": 'Place', 'Value': no_place }, { "Area": 'Block Group', 'Value': no_group }, { "Area": 'Block', 'Value': no_block }, { "Area": 'SLDL', 'Value': no_sldl }, { "Area": 'SLDU', 'Value': no_sldu }] df = spark.createDataFrame(data) pandas_df = df.toPandas() csv_savepath = "s3://uscb-decennial-ite-das/rao001/Geolevel_counts.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False)
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    Main plotting fxn.

    query       : str, name of a valid query for the target experiment's schema
    table_name  : str, name of a table (used for file-naming conventions)
    analysis    : Analysis setuptools.setup object, organizes Analysis metadata
    spark       : SparkSession object, attached to analysis object
    geolevels   : [str, ...], geolevels to compute over for the current query
    buckets     : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

    Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    df = experiment.getDF()
    if TEST:
        df = df.limit(TEST_NUM)
    print("df looks like:")
    df.show()
    schema = experiment.schema
    sdftools.print_item(df, "Flat Experiment DF")

    queries = [query]
    #y=sdftools.getAnswers(spark,df,geolevels,schema,queries)
    rddWithAbsDiff = getRddWithAbsDiff(spark, df, geolevels, queries, schema)
    rddWithAbsDiff = sdftools.getFullWorkloadDF(rddWithAbsDiff, schema, queries,
                                                groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    rdd = sdftools.getRowGroupsAsRDD(rddWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    rdd = rdd.map(lambda row: Row(**row[1]))
    df = rdd.toDF().persist()

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    df = df.groupby(['geocode', 'geolevel', 'level', 'Bin0', 'Bin1', 'Bin2', 'Bin3', 'Bin4', 'Bin5']).avg()
    pandas_df = df.toPandas()
    pandas_df = pandas_df.rename(columns={"avg(abs diff)": metric_name, "avg(orig)": "orig"})
    pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)

    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)

    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name,
              geolevels, pandas_df, buckets, schema_name)
def makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name,
              geolevels, pandas_df, buckets, schema_name):
    EPT = table_name[:4] + "_" + schema_name
    for query in queries:
        for geolevel in geolevels:
            subsetted_df = pandas_df[pandas_df['geolevel'] == geolevel]

            race_iterates = major_omb_race_names if ("EPT3" in table_name and "allraces *" in query) else [""]
            # keep the prior assignment when the "tomr *" / "combo" conditions don't apply
            race_iterates = ["2+ Races"] if (race_iterates == [""] and "tomr *" in query) else race_iterates
            race_iterates = major_combination_race_names if ("EPT3" in table_name and "combo" in query) else race_iterates
            print(f"table {table_name}, query {query}, geolevel {geolevel}, race_iterates {race_iterates})")

            # TODO: graph titling/subsetting for distinct queries is getting convoluted. Re-factor?
            for race in race_iterates:
                plotting_df = subsetted_df
                if race in major_omb_race_names:
                    plotting_df = subsetted_df[subsetted_df['level'].str.contains(race)]
                if race in major_combination_race_names:
                    plotting_df = subsetted_df[subsetted_df['level'].str.contains(race)]

                bucket_counts = {}
                for bindex, bucket in enumerate(buckets):
                    trueRows = plotting_df[plotting_df[f"Bin{bindex}"] == True]
                    bucket_counts[bucket] = trueRows.shape[0]
                print(f"Geolevel {geolevel} has bucket_counts: {bucket_counts}")
                bucket_names = [str(bucket) for bucket in buckets if bucket_counts[bucket] != 0]

                if (np.array(list(bucket_counts.values())) > 0).any():
                    graph_title = f"Average L1 Error (over trials) for {query}\nGeolevel: {geolevel}"
                    if race in major_omb_race_names:
                        graph_title += f"Race Alone: {race.title()}"
                    if race == "2+ Races":
                        graph_title += "2+ Races"
                    if race in major_combination_race_names:
                        graph_title += f"Race: {race}"
                    graph_title += "\n(binned by CEF count)\nDisclosure Prohibited - Title 13 U.S.C."

                    # Scatterplot strips superimposed on violin plots
                    sns.set(style="whitegrid")
                    sns.violinplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names,
                                   inner=None, color="0.8") \
                       .set_title(graph_title)
                    sns.stripplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names) \
                       .set_title(graph_title)

                    plot_savepath = f"{experiment.save_location_linux}plots/{EPT}/scatteredViolin/{experiment_name}_"
                    plot_savepath += f"{table_name}_{query.replace(' ', '_')}_{geolevel}_{race.replace(' ','_')}.pdf"
                    du.makePath(du.getdir(plot_savepath))
                    print(f"Saving scatterstrips w/ violins for query {query}, geolevel {geolevel}, & race {race} to: {plot_savepath}")
                    plt.savefig(plot_savepath)
                    plt.clf()
                else:
                    print(f"No observations for {table_name}, {query}, {geolevel}, {race}. Plot not generated.")
#'detailed' 'cenrace' ]

sdftools.show(df, "df with geolevel crosswalk columns")
df = sdftools.aggregateGeolevels(spark, df, geolevels)
sdftools.show(df, "df after geolevel aggregation", 1000)
qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False)
sdftools.show(qdf, "Query df with the query 'total'", 1000)
rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY])
#sdftools.show(rdd.collect(), "Row groups")

path = save_location_linux + "Gel.csv"
q = qdf.toPandas()
du.makePath(du.getdir(path))
q.to_csv(path, index=False)


def Qualbins(rows, column, bins):
    # quantile-based bucketing; only works if bins > 2
    pandas_df = pandas.DataFrame(rows)
    q = 1 / bins
    p = bins + 1
    for i in range(1, p):
        k = str(i)
        pandas_df['Bin' + k] = ((np.quantile(pandas_df[column], q) >= pandas_df[column])
                                & (pandas_df[column] >= np.quantile(pandas_df[column], q - 1 / bins)))
        q = q + 1 / bins
    rows = pandas_df.to_dict('records')
    return rows
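# For comparison, a compact sketch of the same quantile-bucketing idea using pandas.qcut
# (hypothetical column name and bin count; an alternative formulation, not the project's helper):
import pandas as pd

def quantile_bins(rows, column, bins):
    """Label each row with a quantile-based bin index for `column`."""
    pandas_df = pd.DataFrame(rows)
    pandas_df["quantile_bin"] = pd.qcut(pandas_df[column], q=bins, labels=False, duplicates="drop")
    return pandas_df.to_dict("records")

rows = [{"orig": v} for v in [0, 1, 3, 7, 20, 150, 900, 12000]]
print(quantile_bins(rows, "orig", bins=4))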
queri = ["allraces"] df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels) df_table = sdftools.answerQueries(df_geolevel, schema, queri, labels=True).persist() df_withmissingrows=sdftools.getFullWorkloadDF(df_table, schema, queri,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]) #print(df_withmissingrows.head(200)) sparse = sdftools.getCellSparsityByGroup(df_withmissingrows,schema,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP,AC.QUERY]) zero=ReturnZeroCounts(df_withmissingrows, geolevels) print("This is sparse:") print(sparse.head(20)) print("This is zero") print(zero.head(20)) csv_savepath = save_location_linux + f"origtable.csv" csv_savepath2 = save_location_linux + f"missingrows.csv" du.makePath(du.getdir(csv_savepath)) du.makePath(du.getdir(csv_savepath2)) pandas_df_table=df_table.toPandas() pandas_df_table.to_csv(csv_savepath, index=False) pandas_dfmissing=df_withmissingrows.toPandas() pandas_dfmissing.to_csv(csv_savepath2, index=False) # df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels) # races_names1=['White','Black or African American','American Indian and Alaskan Native'] # races_names2=['Asian','Native Hawaiian and Other Pacific Islander','Some Other Race'] # white1=['Aian'] # buckets=[(0,0),(1,10),(11,100),(100,1000),(1000,10000),(10000,float('inf'))] # path = du.addslash(save_location) # plt.figure(figsize=(11,8.5))
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, schema_name="DHCP_HHGQ"): """ Main plotting fxn. query : str, name of a valid query for the target experiment's schema table_name : str, name of a table (used for file-naming conventions) analysis : Analysis setuptools.setup object, organizes Analysis metadata spark : SparkSession object, attached to analysis object geolevels : [str, ...], geolevels to compute over for the current query buckets : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing schema : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3. """ geolevel = geolevels[0] EPT = table_name[:4] + "_" + schema_name graph_title = f"Error for query: {table_name}-{query}, eps: {int(eps)}, geography: {geolevels}\nDisclosure Prohibited - Title 13 U.S.C." plt.figure(1, figsize=(20, 40)) sns.set(style="ticks") fig, axes = plt.subplots(ncols=3, nrows=2, sharey=True, sharex=True) axes_flat = axes.ravel() sns.despine(fig=fig) #.set_title(graph_title) print( f"For table {table_name}, analyzing query {query} at geolevel {geolevel} with schema_name {schema_name} and eps: {eps}" ) num_trials, paths, experiment_name, eps_str, spines, mechanisms = getPathsAndName( schema_name, query, table_name, eps) plt.xscale('log') plt.yscale('symlog', linthreshy=100) for k, path in enumerate(paths): axes_flat[k].set_title(spines[k] + '_' + mechanisms[k]) experiment = analysis.make_experiment( experiment_name, [path], schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT) spark_df = experiment.getDF() sdftools.print_item(experiment.__dict__, "Experiment Attributes") schema = experiment.schema sdftools.print_item(spark_df, "Flat Experiment DF") queries = [query] spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels) # jitter points to make them visually distinct: spark_df = sdftools.answerQueries(spark_df, schema, queries) \ .withColumn("Error", sf.col("priv") - sf.col("orig") + sf.rand() - 1/2.) \ .withColumn("orig", sf.col("orig") + sf.rand() - 1/2.) if geolevel == "AIAN_AREAS": spark_df = spark_df.filter(spark_df.geocode != "9999") elif geolevel == 'OSE': spark_df = spark_df.filter( sf.col(AC.GEOCODE).substr( sf.length(sf.col(AC.GEOCODE)) - 4, sf.length(sf.col(AC.GEOCODE))) != "99999") elif geolevel == 'AIANTract': spark_df = spark_df.filter(spark_df.geocode != "9" * 11) elif geolevel == 'AIANState': spark_df = spark_df.filter(spark_df.geocode != "99") elif geolevel == 'AIANBlock': spark_df = spark_df.filter(spark_df.geocode != "9" * 16) # t = spark_df.filter(sf.abs(spark_df.Error) > 1000) spark_df = spark_df.select(["orig", "Error"]) pandas_df = spark_df.toPandas() #if pandas_df.max()["Error"] == pandas_df.min()["Error"]: # continue sns.scatterplot(x="orig", y="Error", data=pandas_df, alpha=.6, s=10, marker="+", ax=axes_flat[k]) axes_flat[k].axhline(0., ls='--') filename = f"{table_name}_{query.replace(' ', '_')}_{geolevel}" plot_path = f"{experiment.save_location_linux}epsilon_{eps_str}/" du.makePath(plot_path) plt.savefig(plot_path + filename + ".png") plt.clf()
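# A minimal sketch of the jitter trick applied above: adding rand() - 0.5 to the integer-valued
# counts and errors spreads coincident points over a unit-wide band so they remain visually
# distinct on the log/symlog scatterplot (synthetic data; assumes an active SparkSession).
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(10, 10), (10, 11), (10, 9)], ["orig", "priv"])

jittered = (df
            .withColumn("Error", sf.col("priv") - sf.col("orig") + sf.rand() - 0.5)  # jittered error
            .withColumn("orig", sf.col("orig") + sf.rand() - 0.5))                   # jittered CEF count
jittered.show()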
def MattsMetrics(query, table_name, analysis, spark, geolevels, schema="DHCP_HHGQ"): """ This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds" """ print( f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}" ) schema_name = schema paths, experiment_name, eps_str = getPathsAndName(schema_name) experiment = analysis.make_experiment( experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT) sdftools.print_item(experiment.__dict__, "Experiment Attributes") spark_df = experiment.getDF() print("df looks like:") spark_df.show() schema = experiment.schema sdftools.print_item(spark_df, "Flat Experiment DF") queries = [query] spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels) spark_df = sdftools.answerQueries(spark_df, schema, queries) spark_df = sdftools.getFullWorkloadDF( spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]) #spark_df.show(spark_df.count(), False) # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG) spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG) # apply bin functions for particular tables if (table_name in table_bucket_list1): spark_df = sdftools.getCountBins( spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000, 100000]).persist() if (table_name in table_bucket_list2): spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist() # This finds overall metrics spark_df.show(100, False) if table_name not in (table_list_3_plus_list_age): for g in geolevels: spark_df1 = spark_df[spark_df['geolevel'] == g] # Separate data for each geolevel if table_name in table_default_no_bucket: # If data is not in buckets bucket_size = "NA" metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="A") file_name = f"{table_name}_{g}.csv" if table_name in table_bucket_list2: # if data is bucketed in 3 buckets, bucket_size = default_buckets2 print("BUCKET SIZE IS:", bucket_size) metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="B") file_name = f"{table_name}_{g}.csv" if table_name in table_bucket_list1: # Table 1 and 2, six buckets bucket_size = default_buckets1 print("BUCKET SIZE IS:", bucket_size) metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="B") file_name = f"{table_name}_{g}.csv" if table_name in table_list_3geolevels: #three geolevels, state, county, place, Tables 10,14,18,22 metrics_result = sdftools.metrics_with_3geolevels( spark_df, spark, geolevels) file_name = f"{table_name}.csv" if table_name in table_list_age: # Tables 32-35 if table_name in table_age_bracket1: metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list, key="A") if table_name in table_age_bracket2: metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list, key="B") if table_name in table_age_bracket3: metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list2, key="A") if table_name in table_age_bracket4: metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list2, key="B") file_name = f"{table_name}.csv" pandas_df = metrics_result.toPandas() csv_savepath = experiment.save_location_linux + file_name du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, 
index=False)
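# A minimal sketch of the per-row L1/L2 error columns that getL1/getL2 add in the snippet above,
# assuming AC.PRIV and AC.ORIG name the protected and original count columns (synthetic data;
# the real helpers live in sdftools and may differ in detail).
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(10, 12), (7, 7), (0, 3)], ["orig", "priv"])

df = (df.withColumn("L1", sf.abs(sf.col("priv") - sf.col("orig")))              # absolute error
        .withColumn("L2", sf.pow(sf.col("priv") - sf.col("orig"), sf.lit(2))))  # squared error
df.show()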
def saveRunData(self, path, feas_dict=None, rdd=None): self.annotate("saveRunData", verbose=True) if path[-1] == '/': path = path[0:-1] # RDD must be saved first, because it needs an empty prefix. if rdd is not None: output_datafile_name = os.path.join(path, self.output_datafname) if self.overwrite_flag: das_utils.clearPath(output_datafile_name) # needed when not an s3 path, as the with open context assumes the folder already exists if not das_utils.isS3Path(output_datafile_name): das_utils.makePath(output_datafile_name) output_metadata_file_name = output_datafile_name+"/0_metadata" # sorts before 'p' output_header_file_name = output_datafile_name+"/1_header" # sorts before 'p' but after '1' self.annotate(f"writing RDD to {output_datafile_name}") self.saveRDD(output_datafile_name, rdd) if self.write_metadata: now = datetime.datetime.now().isoformat() self.saveMetadata(path=output_metadata_file_name, now=now, count=rdd.count()) self.saveHeader(path=output_header_file_name) if self.s3cat: # If we combine the data with s3cat # note the combined filename in the annotated output, the DFXML file, the DVS object, and do it. self.annotate(f"combining {output_datafile_name} with s3cat") # Record this with DFXML ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_S3CAT, {'output_datafile_name':output_datafile_name, 'demand_success':'True', 'suffix':self.s3cat_suffix, 'verbose':str(self.s3cat_verbose)}) self.add_output_path(output_datafile_name + self.s3cat_suffix) s3cat.s3_cat(output_datafile_name) else: # Otherwise just note the prefix in DFS and DFXML ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_OUTPUT).text=output_datafile_name+"/" self.add_output_path(output_datafile_name + "/") config_path = os.path.join(path, f"{self.output_datafname}_{CC.CONFIG_INI}") self.annotate("Saving the flattened config to directory: {}".format(config_path)) das_utils.saveConfigFile(config_path, self.config) f = io.StringIO() self.config.write(f) ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_CONFIG).text = f.getvalue() if feas_dict is not None: for key in feas_dict.keys(): if hasattr(feas_dict[key], 'value'): feas_dict[key] = feas_dict[key].value # this seems redundant, but is actually needed for the accumulator self.log_and_print(f"Feasibility dictionary: {feas_dict}") feas_path = os.path.join(path, f"{self.output_datafname}_{CC.FEAS_DICT_JSON}") self.annotate(f"Saving feas_dict to directory: {feas_path}") das_utils.saveJSONFile(feas_path, feas_dict)
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    Main plotting fxn.

    query       : str, name of a valid query for the target experiment's schema
    table_name  : str, name of a table (used for file-naming conventions)
    analysis    : Analysis setuptools.setup object, organizes Analysis metadata
    spark       : SparkSession object, attached to analysis object
    geolevels   : [str, ...], geolevels to compute over for the current query
    buckets     : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

    Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    #y=sdftools.getAnswers(spark,df,geolevels,schema,queries)

    # Old approach to computing df with abs diff, bucketed by true count:
    #sparkDFWithAbsDiff = getSparkDFWithAbsDiff(spark, spark_df, geolevels, queries, schema)
    #getSignedErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]):
    #rdd = sdftools.getRowGroupsAsRDD(sparkDFWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    #rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    #rdd = rdd.map(lambda row: Row(**row[1]))
    #spark_df = rdd.toDF().persist()

    # New (actually preexisting) approach to computing spark_df with abs diff, bucketed by true count:
    # (avoids pandas dfs inside mappers, which is RAM-hungry)
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries,
                                          groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0, 1, 10, 100, 1000, 10000]).persist()
    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    # spark_df = spark_df.groupby(['geocode','geolevel','level','Bin0','Bin1','Bin2','Bin3','Bin4','Bin5']).avg()
    # Below spark_df has cols: geocode, geolevel, run_id, plb, budget_group, query, orig_count_bin, signed_error, re
    #spark_df = spark_df.groupby(['geocode', 'geolevel', 'plb', 'budget_group', 'query', 'orig_count_bin']).avg()
    #print("^^^^ after averaging, spark_df looks like ^^^^")
    pandas_df = spark_df.toPandas()
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig)":"orig"})
    #pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig_count_bin)":"orig"})
    pandas_df = pandas_df.rename(columns={"abs_error": metric_name, "orig_count_bin": x_axis_variable_name})

    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")

    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")
    new_bucket_order = [0, 1, 2, 3, 5, 4]  # Apply ordering system to make 10000+ the last bucket
    buckets = [buckets[i] for i in new_bucket_order]
    print(f"Sorted bucket names: {buckets}")

    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}_{table_name}_{query}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name,
              geolevels, pandas_df, buckets, schema_name, eps_str)
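# Hedged sketch of a largestIntInStr-style sort key (the project's helper may differ): it pulls the
# largest integer out of each bucket label so labels sort numerically rather than lexicographically.
# "[1000-10000)" and "10000+" share the same largest integer, which is why the snippet above still
# applies a manual new_bucket_order to force "10000+" to the end.
import re

def largest_int_in_str(bucket_name):
    ints = [int(tok) for tok in re.findall(r"\d+", bucket_name)]
    return max(ints) if ints else -1

bucket_labels = ["[100-1000)", "10000+", "[0-1)", "[1-10)", "[1000-10000)", "[10-100)"]
print(sorted(bucket_labels, key=largest_int_in_str))
# ['[0-1)', '[1-10)', '[10-100)', '[100-1000)', '10000+', '[1000-10000)']  <- tie leaves "10000+" misplaced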
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"): """ This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds" """ print( f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}" ) schema_name = schema paths, experiment_name, eps_str = getPathsAndName(schema_name) experiment = analysis.make_experiment( experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT) sdftools.print_item(experiment.__dict__, "Experiment Attributes") spark_df = experiment.getDF() print("df looks like:") spark_df.show() schema = experiment.schema sdftools.print_item(spark_df, "Flat Experiment DF") queries = [query] spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels) spark_df = sdftools.answerQueries(spark_df, schema, queries) spark_df = sdftools.getFullWorkloadDF( spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]) #spark_df.show(spark_df.count(), False) # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG) spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG) # apply bin functions for particular tables if (table_name in table_default_bucket_list): spark_df = sdftools.getCountBins( spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000, 100000]).persist() if (table_name in table_default_bucket_list2): spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist() # This finds overall metrics spark_df.show(100, False) for g in geolevels: spark_df1 = spark_df[spark_df['geolevel'] == g] # Separate data for each geolevel if table_name in table_default_no_bucket: # If data is not in buckets if table_name in table_race_query: # Table 17, 18, 21 and others print("no buckets, with race query") spark_df2 = spark_df1.subtract( spark_df1.filter(spark_df1.level.rlike("Not"))) spark_df2.show(100, False) print("Make sure 'Not' values are removed") metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark) Counts = spark_df2.count() print("Counts are", Counts) newRow = spark.createDataFrame([(Counts, "Counts")]) metrics_dataframe = metrics_dataframe.union(newRow) pandas_df = metrics_dataframe.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False) else: print("no buckets, without race query") spark_df1.show(100, False) spark_df2 = spark_df1.subtract( spark_df1.filter(spark_df1.level.rlike("Not"))) print("with Not removed") spark_df2.show(100, False) metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark) Counts = spark_df2.count() print("Counts are", Counts) newRow = spark.createDataFrame([(Counts, "Counts")]) metrics_dataframe = metrics_dataframe.union(newRow) pandas_df = metrics_dataframe.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False) if table_name in table_age_bracket1: print("Data is in age brackets, 0 to 17, 18 to 64, 65+") spark_df1.show(100, False) for age_range in age_range_list: subset_sparkdf1 = spark_df1.filter( spark_df1.level.rlike(age_range)) subset_sparkdf1.show(100, False) metrics_dataframe = sdftools.mattsmetrics( subset_sparkdf1, spark) 
#subset_sparkdf1.show(100, False) Counts = subset_sparkdf1.count() print("Counts are", Counts) newRow = spark.createDataFrame([(Counts, "Counts")]) metrics_dataframe = metrics_dataframe.union(newRow) pandas_df = metrics_dataframe.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False) if table_name in table_age_bracket2: print("Data is age buckets, with sex query") spark_df1.show(100, False) for sexlevel in sex_range_list: subset_sparkdf1 = spark_df1.filter( spark_df1.level.rlike(sexlevel)) for age_range in age_range_list: subset_sparkdf2 = subset_sparkdf1.filter( subset_sparkdf1.level.rlike(age_range)) subset_sparkdf2.show(100, False) metrics_dataframe = sdftools.mattsmetrics( subset_sparkdf2, spark) #subset_sparkdf1.show(100, False) Counts = subset_sparkdf2.count() print("Counts are", Counts) newRow = spark.createDataFrame([(Counts, "Counts")]) metrics_dataframe = metrics_dataframe.union(newRow) pandas_df = metrics_dataframe.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False) if table_name in table_age_bracket3: print("Data is in age brackets of 5 year age groups") spark_df1.show(100, False) for age_range in age_range_list2: subset_sparkdf1 = spark_df1.filter( spark_df1.level.rlike(age_range)) subset_sparkdf1.show(100, False) metrics_dataframe = sdftools.mattsmetrics( subset_sparkdf1, spark) #subset_sparkdf1.show(100, False) Counts = subset_sparkdf1.count() print("Counts are", Counts) newRow = spark.createDataFrame([(Counts, "Counts")]) metrics_dataframe = metrics_dataframe.union(newRow) pandas_df = metrics_dataframe.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False) if table_name in table_age_bracket4: print("Data is age buckets of 5 year age groups, with sex query") spark_df1.show(100, False) for sexlevel in sex_range_list: subset_sparkdf1 = spark_df1.filter( spark_df1.level.rlike(sexlevel)) for age_range in age_range_list2: subset_sparkdf2 = subset_sparkdf1.filter( subset_sparkdf1.level.rlike(age_range)) subset_sparkdf2.show(100, False) metrics_dataframe = sdftools.mattsmetrics( subset_sparkdf2, spark) #subset_sparkdf1.show(100, False) Counts = subset_sparkdf2.count() print("Counts are", Counts) newRow = spark.createDataFrame([(Counts, "Counts")]) metrics_dataframe = metrics_dataframe.union(newRow) pandas_df = metrics_dataframe.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False) if table_name in table_default_bucket_list2: # If data is in buckets [0,10],[10,100),[100+) print("data is bucketed and treated accordingly") #if table_name in table_race_query: for b in default_buckets2: # calculate Metrics at each bucket print("Bucket is:", b) subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b] #subset into bins subset_sparkdf.show(100, False) print("Bucketed data") subset_sparkdf1 = subset_sparkdf.subtract( subset_sparkdf.filter(subset_sparkdf.level.rlike("Not"))) subset_sparkdf1.show(100, False) print("Make sure its bucketed and 'Not' values are removed") subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark) Counts = subset_sparkdf1.count() newRow = 
spark.createDataFrame([(b, "Bucket")]) newRow1 = spark.createDataFrame([(Counts, "Counts")]) subset_metrics = subset_metrics.union(newRow).union(newRow1) pandas_df = subset_metrics.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False) if table_name in table_default_bucket_list: # If data is in buckets [0,1000],[1000,5000),etc. Table 1 and 2 print("data is bucketed and treated accordingly") #if table_name in table_race_query: for b in default_buckets: # calculate Metrics at each bucket print("Bucket is:", b) subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b] #subset into bins subset_sparkdf.show(100, False) print("Bucketed data") subset_sparkdf1 = subset_sparkdf.subtract( subset_sparkdf.filter(subset_sparkdf.level.rlike("Not"))) subset_sparkdf1.show(100, False) print("Make sure its bucketed and 'Not' values are removed") subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark) Counts = subset_sparkdf1.count() newRow = spark.createDataFrame([(b, "Bucket")]) newRow1 = spark.createDataFrame([(Counts, "Counts")]) subset_metrics = subset_metrics.union(newRow).union(newRow1) pandas_df = subset_metrics.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False)
def MattsMetrics(query, table_name, analysis, spark, geolevels, key, agekey, sexkey, bucketkey, schema="DHCP_HHGQ"): """ This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds" """ print( f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}" ) schema_name = schema paths, experiment_name, eps_str = getPathsAndName(schema_name) experiment = analysis.make_experiment( experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT) sdftools.print_item(experiment.__dict__, "Experiment Attributes") spark_df = experiment.getDF() print("df looks like:") spark_df.show() schema = experiment.schema sdftools.print_item(spark_df, "Flat Experiment DF") queries = [query] spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels) spark_df = sdftools.answerQueries(spark_df, schema, queries) spark_df = sdftools.getFullWorkloadDF( spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]) #spark_df.show(spark_df.count(), False) # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG) spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG) # apply bin functions for particular tables if (table_name in table_bucket_list1): spark_df = sdftools.getCountBins( spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000, 100000]).persist() if (table_name in table_bucket_list2): spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist() # This finds overall metrics spark_df.show(100, False) metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels, agekey, sexkey, bucketkey, key) file_name = f"{table_name}.csv" pandas_df = metrics_result.toPandas() csv_savepath = experiment.save_location_linux + file_name du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False)
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"): """ This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds" """ print( f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}" ) schema_name = schema paths, experiment_name, eps_str = getPathsAndName(schema_name) experiment = analysis.make_experiment( experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT) sdftools.print_item(experiment.__dict__, "Experiment Attributes") spark_df = experiment.getDF() print("df looks like:") spark_df.show() schema = experiment.schema sdftools.print_item(spark_df, "Flat Experiment DF") queries = [query] spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels) spark_df = sdftools.answerQueries(spark_df, schema, queries) #spark_df.show(spark_df.count(), False) # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG) spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG) # apply bin functions for particular tables if (table_name in table_default_bucket_list): spark_df = sdftools.getCountBins( spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000, 100000]).persist() if (table_name in table_default_bucket_list2): spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist() # This finds overall metrics #spark_df.show(spark_df.count(), False) for g in geolevels: spark_df = spark_df[spark_df['geolevel'] == g] print("This has all levels") spark_df.show(150, False) metrics_dataframe = sdftools.mattsmetrics(spark_df, spark) Counts = spark_df.count() print("Counts are", Counts) newRow = spark.createDataFrame([(Counts, "Counts")]) metrics_dataframe = metrics_dataframe.union(newRow) pandas_df = metrics_dataframe.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False) if table_name in table_default_bucket_list2: # If data needs bucketing for b in default_buckets2: # calculate Metrics at each bucket subset_sparkdf = spark_df[spark_df['orig_count_bin'] == b] #subset into bins subset_sparkdf = subset_sparkdf.subtract( subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")) ) # Removes instances of Not Hispanic..from dataframe subset_sparkdf.show(100, False) print("Make sure its bucketed and without 'Not' values") subset_metrics = sdftools.mattsmetrics(subset_sparkdf, spark) Counts = subset_sparkdf.count() newRow = spark.createDataFrame([(b, "Bucket")]) newRow1 = spark.createDataFrame([(Counts, "Counts")]) subset_metrics = subset_metrics.union(newRow).union(newRow1) pandas_df = subset_metrics.toPandas() csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv" du.makePath(du.getdir(csv_savepath)) pandas_df.to_csv(csv_savepath, index=False)
print("This is q") q.show(10) q=q.groupby(['geolevel','run_id']).sum() columnstodrop=['sum(diff)','sum(sum(orig))','sum(sum(priv))','sum(MDF/sum)','sum(CEF/sum)','sum(difference)'] print("this is q2") q=q.drop(*columnstodrop) q.show(10) z=q.groupby(['geolevel']).avg() print("this is z") z.show(10) return q,z get,get2=NEbias(spark,df,geolevels,queries,schema) path=save_location_linux+"NEbias.csv" pdf=get.toPandas() du.makePath(du.getdir(path)) pdf.to_csv(path,index=False) path2=save_location_linux+"NEbias_av.csv" pdf2=get2.toPandas() du.makePath(du.getdir(path2)) pdf2.to_csv(path2,index=False) get3,get4=MMbias(get) path3=save_location_linux+"MMbias.csv" pdf3=get3.toPandas() du.makePath(du.getdir(path3)) pdf3.to_csv(path3,index=False) path4=save_location_linux+"MMbias_av.csv" pdf4=get4.toPandas()
def makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name, metric_name, geolevels, pandas_df, buckets, schema_name, eps_str): EPT = table_name[:4]+"_"+schema_name for query in queries: for geolevel in geolevels: subsetted_df = pandas_df[pandas_df['geolevel'] == geolevel] race_iterates = major_omb_race_names if ("EPT3" in table_name and "allraces *" in query) else [""] race_iterates = race_combo_names if (race_iterates == [""] and "racecombos *" in query) else race_iterates race_iterates = ["2+ Races"] if (race_iterates == [""] and "tomr *" in query) else race_iterates print(f"table {table_name}, query {query}, geolevel {geolevel}, race_iterates {race_iterates})") # TODO: graph titling/subsetting for distinct queries is getting convoluted. Re-factor? for race in race_iterates: plotting_df = subsetted_df if race in major_omb_race_names: plotting_df = subsetted_df[subsetted_df['level'].str.contains(race)] if race in race_combo_names: plotting_df = subsetted_df[subsetted_df['level'].str.contains(race)] bucket_counts = {} for bucket in buckets: trueRows = plotting_df[plotting_df[x_axis_variable_name] == bucket] bucket_counts[bucket] = trueRows.shape[0] print(f"Geolevel {geolevel} has bucket_counts: {bucket_counts}") bucket_names = [str(bucket) for bucket in buckets if bucket_counts[bucket] != 0] if (np.array(list(bucket_counts.values())) > 0).any(): graph_title = f"Average L1 Error (over trials) for {query}\nGeolevel: {geolevel}" if race in major_omb_race_names: graph_title += f"\n Race Alone: {race.title()}" if race in race_combo_names: graph_title += f"\n Race Combination: {race.title()}" if race == "2+ Races": graph_title += f"\n 2+ Races" graph_title += f"\n(binned by CEF count)\nDisclosure Prohibited - Title 13 U.S.C." pandas.set_option('display.max_columns', None) pandas.set_option('display.max_rows', None) #print(f"Before plotting, plotting_df looks like:") #print(plotting_df) #print(f"Feeding plotting_df to seaborn with xvar {x_axis_variable_name} & yvar {metric_name}") #print(f"And bucket_names are: {bucket_names}") # Scatterplot strips superimposed on violin plots sns.set(style="whitegrid") ax = sns.violinplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names, inner = None, color="0.8") \ .set_title(graph_title) maxVal = plotting_df[metric_name].max() minVal = plotting_df[metric_name].min() print(f"maxVal type, val: {type(maxVal)}, {maxVal}") print(f"minVal type, val: {type(minVal)}, {minVal}") if abs(maxVal - minVal) < 0.1: #print(f"violin ax has type {type(ax)} & methods: {dir(ax)}") #print(f"violin ax.axes has type {type(ax.axes)} & methods: {dir(ax.axes)}") ax.axes.set(ylim=(minVal-10, maxVal+10)) ax = sns.stripplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names) \ .set_title(graph_title) #if geolevel == C.US: # #print(f"strip ax has type {type(ax)} & methods: {dir(ax)}") # ax.axes.set(ylim=(plotting_df[x_axis_variable_name].min - 10,plotting_df[x_axis_variable_name].max + 10)) plot_savepath = f"{experiment.save_location_linux}plots/{EPT}/epsilon{eps_str}/scatteredViolin/{experiment_name}_" plot_savepath += f"{table_name}_{query.replace(' ', '_')}_{geolevel}_{race.replace(' ','_')}.pdf" du.makePath(du.getdir(plot_savepath)) print(f"Saving scatterstrips w/ violins for query {query}, geolevel {geolevel}, & race {race} to: {plot_savepath}") plt.savefig(plot_savepath) plt.clf() else: print(f"No observations for {table_name}, {query}, {geolevel}, {race}. Plot not generated.")
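# A minimal, self-contained sketch of the scatter-strips-over-violins pattern used above, with
# synthetic data; column names mirror the plotting_df columns and the output path is illustrative.
import matplotlib
matplotlib.use("Agg")  # headless backend, as on a cluster node
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

rng = np.random.default_rng(0)
demo = pd.DataFrame({
    "CEF Count, Binned": np.repeat(["[0-1)", "[1-10)", "[10-100)"], 50),
    "Avg( |q(MDF) - q(CEF)| )": np.abs(rng.normal(size=150)),
})

sns.set(style="whitegrid")
sns.violinplot(x="CEF Count, Binned", y="Avg( |q(MDF) - q(CEF)| )", data=demo, inner=None, color="0.8")
sns.stripplot(x="CEF Count, Binned", y="Avg( |q(MDF) - q(CEF)| )", data=demo)
plt.title("Average L1 Error (synthetic demo)\n(binned by CEF count)")
plt.savefig("/tmp/scattered_violin_demo.pdf")
plt.clf()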