def getCategoryByAgeQuantilesFast(sdf, queries, product, state, plot=False):
    """
    Collect category-by-age quantile results for every query and optionally plot them.

    sdf     : object exposing .df, .schema and .metric_save_location
    queries : iterable of query names, each handed to sdftools.categoryByAgeQuantiles
    product : forwarded unchanged to rp.age_quantile_lineplot
    state   : forwarded unchanged to rp.age_quantile_lineplot
    plot    : when True, every result whose key parses to datatype "quantile_df" is
              converted to pandas and line-plotted

    Returns the dict built by merging each query's result via dict.update
    (on a key collision the later query wins).
    """
    source_df = sdf.df
    results = {}
    for query in queries:
        results.update(
            sdftools.categoryByAgeQuantiles(source_df, sdf.schema, query, labels=True)
        )

    if not plot:
        return results

    for key, result_df in results.items():
        # Only the datatype component of the parsed key decides whether to plot.
        _, _, datatype = parseAgeQuantileKey(key, fsname=False)
        sdftools.print_item(
            result_df.count(),
            "Number of rows in the Spark DF before transforming to Pandas DF"
        )
        if datatype != "quantile_df":
            # e.g. datatype == "survival_props": nothing is plotted for these
            continue
        quantile_pdf = result_df.toPandas()
        plot_dir = du.getdir(sdf.metric_save_location)
        rp.age_quantile_lineplot(quantile_pdf, plot_dir, product, state)

    return results
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    Main plotting fxn.

        query : name of the query to answer (wrapped in a one-element list internally)
        table_name : table label, forwarded to getPathsAndName and makePlots
        analysis : object whose make_experiment(...) builds the experiment to analyze
        spark : SparkSession handed to sdftools.aggregateGeolevels
        geolevels : geolevels to aggregate to before answering the query
        eps : forwarded to getPathsAndName
        buckets : accepted for interface compatibility only; it is replaced by the
                  bin labels found in the data before anything reads it
        schema : schema name passed to getPathsAndName / make_experiment

    Side effects: prints progress, writes "<experiment_name>.csv" under
    experiment.save_location_linux, and calls makePlots. Returns None.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    num_trials, paths, experiment_name, eps_str = getPathsAndName(schema_name, query, table_name, eps)
    print(f"Passing paths to Analysis experiment maker: {paths}")
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT,
    )
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    # From here on `schema` is the experiment's schema object, not the name string.
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()

    missing_rows_pandas_df = sdftools.getMissingRowCounts(
        spark_df, schema, queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.PLB, AC.BUDGET_GROUP])
    # geolevel -> 'sum(missing)' value; geolevels absent from the frame default to 0.
    missing_rows_dict = defaultdict(int)
    for _, missing_row in missing_rows_pandas_df.iterrows():
        missing_rows_dict[missing_row['geolevel']] = missing_row['sum(missing)']

    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'
    renamed_columns = {"abs_error": metric_name, "orig_count_bin": x_axis_variable_name}
    pandas_df = spark_df.toPandas().rename(columns=renamed_columns)

    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)

    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    distinct_bins = pandas_df[x_axis_variable_name].unique()
    print(f"{x_axis_variable_name} column has distinct levels: {distinct_bins}")
    buckets = sorted(distinct_bins, key=largestIntInStr)
    print(f"Sorted bucket names: {buckets}")

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
              metric_name, geolevels, pandas_df, buckets, schema_name, eps_str,
              missing_rows_dict, num_trials)
def getGeolevelTVDFast(sdf, product, state, plot=False):
    """
    Run geolevel_tvd on an sdf and optionally plot the result.

    sdf     : object exposing geolevel_tvd(groupby=...); the object it returns must
              expose .df, .toPandas() and .metric_save_location when plot is True
    product : forwarded unchanged to the rp plotting helpers
    state   : forwarded unchanged to the rp plotting helpers
    plot    : when True, draw the TVD line plot and heatmap

    Returns {'geolevel_tvd': <object returned by sdf.geolevel_tvd>}.
    """
    tvd_sdf = sdf.geolevel_tvd(groupby=[AC.GEOLEVEL, AC.RUN_ID, AC.QUERY, AC.PLB])

    if plot:
        plot_dir = du.getdir(tvd_sdf.metric_save_location)
        row_count = tvd_sdf.df.count()
        sdftools.print_item(row_count, "Number of rows in the Spark DF before transforming to Pandas DF")
        tvd_pdf = tvd_sdf.toPandas()
        rp.geolevel_tvd_lineplot(tvd_pdf, plot_dir, product, state)
        rp.geolevel_tvd_heatmap(tvd_pdf, plot_dir, product, state)

    return {'geolevel_tvd': tvd_sdf}
def getGeolevelTVD(sdf, geolevels, queries, product, state, plot=False):
    """
    Calculates the following:
        0. Query answers at the specified geolevels
        1. Per-geolevel TVD

    sdf       : object exposing getGeolevels(...), whose result exposes
                getQueryAnswers(...), whose result exposes geolevel_tvd(groupby=...)
    geolevels : passed to sdf.getGeolevels
    queries   : passed to getQueryAnswers
    product   : forwarded unchanged to the rp plotting helpers
    state     : forwarded unchanged to the rp plotting helpers
    plot      : when True, draw the TVD line plot and heatmap

    Returns the object produced by geolevel_tvd.
    """
    answered = sdf.getGeolevels(geolevels).getQueryAnswers(queries)
    tvd_sdf = answered.geolevel_tvd(groupby=[AC.GEOLEVEL, AC.RUN_ID, AC.QUERY, AC.PLB])

    if not plot:
        return tvd_sdf

    plot_dir = du.getdir(tvd_sdf.metric_save_location)
    tvd_pdf = tvd_sdf.toPandas()
    rp.geolevel_tvd_lineplot(tvd_pdf, plot_dir, product, state)
    rp.geolevel_tvd_heatmap(tvd_pdf, plot_dir, product, state)
    return tvd_sdf
# Answer the "allraces" query at the requested geolevels, fill in the full
# workload, report sparsity / zero counts, and dump both tables to csv.
# (Variable names are kept as-is: code outside this view may still read them.)
queri = ["allraces"]
df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)
df_table = sdftools.answerQueries(df_geolevel, schema, queri, labels=True).persist()
df_withmissingrows = sdftools.getFullWorkloadDF(
    df_table,
    schema,
    queri,
    groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP],
)
#print(df_withmissingrows.head(200))

# Sparsity is grouped by the same columns as above plus the query column.
sparse = sdftools.getCellSparsityByGroup(
    df_withmissingrows,
    schema,
    groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP, AC.QUERY],
)
zero = ReturnZeroCounts(df_withmissingrows, geolevels)

print("This is sparse:")
print(sparse.head(20))
print("This is zero")
print(zero.head(20))

# Target paths first, then make sure each parent directory exists before writing.
csv_savepath = save_location_linux + "origtable.csv"
csv_savepath2 = save_location_linux + "missingrows.csv"
du.makePath(du.getdir(csv_savepath))
du.makePath(du.getdir(csv_savepath2))

pandas_df_table = df_table.toPandas()
pandas_df_table.to_csv(csv_savepath, index=False)
pandas_dfmissing = df_withmissingrows.toPandas()
pandas_dfmissing.to_csv(csv_savepath2, index=False)

# df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)
# races_names1=['White','Black or African American','American Indian and Alaskan Native']
# races_names2=['Asian','Native Hawaiian and Other Pacific Islander','Some Other Race']
# white1=['Aian']
# buckets=[(0,0),(1,10),(11,100),(100,1000),(1000,10000),(10000,float('inf'))]
# path = du.addslash(save_location)
# plt.figure(figsize=(11,8.5))
#'detailed' 'cenrace' ] sdftools.show(df, "df with geolevel crosswalk columns") sdftools.show(df, "df with geolevel crosswalk columns") df = sdftools.aggregateGeolevels(spark, df, geolevels) sdftools.show(df, "df after geolevel aggregation", 1000) qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False) sdftools.show(qdf, "Query df with the query 'total'", 1000) rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY]) #sdftools.show(rdd.collect(), "Row groups") path = save_location_linux + "Gel.csv" q = qdf.toPandas() du.makePath(du.getdir(path)) q.to_csv(path, index=False) def Qualbins(rows, column, bins): #only works if bins > 2 pandas_df = pandas.DataFrame(rows) q = 1/bins p = bins+1 for i in range (1, p): k=str(i) pandas_df['Bin'+k]=(np.quantile(pandas_df[column],q)>=pandas_df[column])&(pandas_df[column]>=np.quantile(pandas_df[column],q-1/bins)) q=q+1/bins rows = pandas_df.to_dict('records')
def MattsMetrics(query, table_name, analysis, spark, geolevels, schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds
    (per the original author's description; the metric math lives in the
    sdftools.metrics_with_* helpers) and writes them to csv.

        query : name of the query to answer (wrapped in a one-element list internally)
        table_name : table label; selects the bucketing / age-bracket branch via the
                     module-level table lists and names the output file(s)
        analysis : object whose make_experiment(...) builds the experiment to analyze
        spark : SparkSession handed to the sdftools helpers
        geolevels : geolevels to aggregate to and report on
        schema : schema name passed to getPathsAndName / make_experiment

    Output files, all under experiment.save_location_linux:
        "<table_name>_<geolevel>.csv" : one per geolevel for the per-geolevel tables
        "<table_name>.csv"            : for the three-geolevel and age tables

    Raises ValueError when table_name selects no metrics branch at all (previously
    this surfaced as an UnboundLocalError on `metrics_result`).
    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    # From here on `schema` is the experiment's schema object, not the name string.
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_bucket_list1:
        spark_df = sdftools.getCountBins(
            spark_df, column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    spark_df.show(100, False)

    def save_metrics(metrics_df, file_name):
        """Write one Spark metrics DF as csv under the experiment's linux save location."""
        pandas_df = metrics_df.toPandas()
        csv_savepath = experiment.save_location_linux + file_name
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)

    saved_any = False

    if table_name not in (table_list_3_plus_list_age):
        for g in geolevels:
            spark_df1 = spark_df[spark_df['geolevel'] == g]  # Separate data for each geolevel
            metrics_result = None

            if table_name in table_default_no_bucket:  # If data is not in buckets
                bucket_size = "NA"
                metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="A")

            if table_name in table_bucket_list2:  # if data is bucketed in 3 buckets,
                bucket_size = default_buckets2
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="B")

            if table_name in table_bucket_list1:  # Table 1 and 2, six buckets
                bucket_size = default_buckets1
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1, bucket_size, spark, key="B")

            # Save inside the loop: the file name carries the geolevel, and a single
            # save after the loop would keep only the last geolevel's metrics.
            if metrics_result is not None:
                save_metrics(metrics_result, f"{table_name}_{g}.csv")
                saved_any = True

    # Tables handled across all geolevels at once produce a single file.
    metrics_result = None

    if table_name in table_list_3geolevels:  # three geolevels, state, county, place, Tables 10,14,18,22
        metrics_result = sdftools.metrics_with_3geolevels(spark_df, spark, geolevels)

    if table_name in table_list_age:  # Tables 32-35
        if table_name in table_age_bracket1:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list, key="A")
        if table_name in table_age_bracket2:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list, key="B")
        if table_name in table_age_bracket3:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list2, key="A")
        if table_name in table_age_bracket4:
            metrics_result = sdftools.metrics_with_age(spark_df, spark, age_range_list2, key="B")

    if metrics_result is not None:
        save_metrics(metrics_result, f"{table_name}.csv")
        saved_any = True

    if not saved_any:
        raise ValueError(
            f"No metrics were computed for table {table_name!r}: it matched none of "
            f"the table lists (or geolevels was empty)"
        )
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    Main plotting fxn.

        query : name of the query to answer (wrapped in a one-element list internally)
        table_name : table label, used in the csv file name and forwarded to makePlots
        analysis : object whose make_experiment(...) builds the experiment to analyze
        spark : SparkSession handed to sdftools.aggregateGeolevels
        geolevels : geolevels to aggregate to before answering the query
        buckets : accepted for interface compatibility only; it is replaced by the
                  bin labels found in the data before anything reads it
        schema : schema name passed to getPathsAndName / make_experiment

    Side effects: prints progress, writes
    "Executive_Priority_Tabulations_#1_<experiment_name>_<table_name>_<query>.csv"
    under experiment.save_location_linux, and calls makePlots. Returns None.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT,
    )
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    # From here on `schema` is the experiment's schema object, not the name string.
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]

    # Abs error bucketed by true count, computed entirely on the Spark side
    # (avoids pandas dfs inside mappers, which is RAM-hungry).
    workload_groupby = [AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries, groupby=workload_groupby)
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()

    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'
    renamed_columns = {"abs_error": metric_name, "orig_count_bin": x_axis_variable_name}
    pandas_df = spark_df.toPandas().rename(columns=renamed_columns)

    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)

    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    distinct_bins = pandas_df[x_axis_variable_name].unique()
    print(f"{x_axis_variable_name} column has distinct levels: {distinct_bins}")
    buckets = sorted(distinct_bins, key=largestIntInStr)
    print(f"Sorted bucket names: {buckets}")

    # Apply ordering system to make 10000+ the last bucket
    # NOTE(review): this fixed permutation needs at least six distinct bin labels
    # in the data and raises IndexError otherwise -- confirm that always holds.
    buckets = [buckets[position] for position in (0, 1, 2, 3, 5, 4)]
    print(f"Sorted bucket names: {buckets}")

    # Saving data frame
    csv_name = f"Executive_Priority_Tabulations_#1_{experiment_name}_{table_name}_{query}.csv"
    csv_savepath = experiment.save_location_linux + csv_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
              metric_name, geolevels, pandas_df, buckets, schema_name, eps_str)
def MattsMetrics(query, table_name, analysis, spark, geolevels, key, agekey, sexkey, bucketkey, schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds
    (per the original author's description; the metric math lives in
    sdftools.combined_metrics) and writes them to "<table_name>.csv".

        query : name of the query to answer (wrapped in a one-element list internally)
        table_name : table label; selects the count bins and names the output file
        analysis : object whose make_experiment(...) builds the experiment to analyze
        spark : SparkSession handed to the sdftools helpers
        geolevels : geolevels to aggregate to; also forwarded to combined_metrics
        key, agekey, sexkey, bucketkey : forwarded unchanged to sdftools.combined_metrics
        schema : schema name passed to getPathsAndName / make_experiment

    Returns None; the csv is written under experiment.save_location_linux.
    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT,
    )
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    # From here on `schema` is the experiment's schema object, not the name string.
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    workload_groupby = [AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries, groupby=workload_groupby)

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    for error_column, add_error in (("L1", sdftools.getL1), ("L2", sdftools.getL2)):
        spark_df = add_error(spark_df, colname=error_column, col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_bucket_list1:
        six_bin_edges = [0, 1000, 5000, 10000, 50000, 100000]
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=six_bin_edges).persist()
    if table_name in table_bucket_list2:
        three_bin_edges = [0, 10, 100]
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=three_bin_edges).persist()

    # This finds overall metrics
    spark_df.show(100, False)
    metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels, agekey, sexkey, bucketkey, key)

    metrics_pdf = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + f"{table_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    metrics_pdf.to_csv(csv_savepath, index=False)
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds
    (per the original author's description; the metric math lives in
    sdftools.mattsmetrics) and writes them to csv, one file per geolevel and,
    for tables in table_default_bucket_list2, one more per geolevel and count bucket.

        query : name of the query to answer (wrapped in a one-element list internally)
        table_name : table label; selects the count bins and names the output files
        analysis : object whose make_experiment(...) builds the experiment to analyze
        spark : SparkSession handed to the sdftools helpers and used to build extra rows
        geolevels : geolevels to aggregate to and report on
        buckets : unused here; kept so existing callers keep working
        schema : schema name passed to getPathsAndName / make_experiment

    Output files, all under experiment.save_location_linux:
        "<table_name>_<geolevel>.csv"
        "<table_name>_<geolevel>_<bucket>.csv"   (bucketed tables only)
    Returns None.
    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    # From here on `schema` is the experiment's schema object, not the name string.
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_default_bucket_list:
        spark_df = sdftools.getCountBins(
            spark_df, column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_default_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    def save_metrics(metrics_df, file_name):
        """Write one Spark metrics DF as csv under the experiment's linux save location."""
        pandas_df = metrics_df.toPandas()
        csv_savepath = experiment.save_location_linux + file_name
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)

    # This finds overall metrics
    for g in geolevels:
        # Filter into a separate name. Rebinding spark_df here would make every
        # later geolevel filter an already-filtered frame and come out empty.
        geolevel_df = spark_df[spark_df['geolevel'] == g]
        print("This has all levels")
        geolevel_df.show(150, False)

        metrics_dataframe = sdftools.mattsmetrics(geolevel_df, spark)
        Counts = geolevel_df.count()
        print("Counts are", Counts)
        newRow = spark.createDataFrame([(Counts, "Counts")])
        metrics_dataframe = metrics_dataframe.union(newRow)
        save_metrics(metrics_dataframe, f"{table_name}_{g}.csv")

        if table_name in table_default_bucket_list2:  # If data needs bucketing
            for b in default_buckets2:  # calculate Metrics at each bucket
                subset_sparkdf = geolevel_df[geolevel_df['orig_count_bin'] == b]  # subset into bins
                # Removes instances of Not Hispanic..from dataframe
                subset_sparkdf = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf.show(100, False)
                print("Make sure its bucketed and without 'Not' values")

                subset_metrics = sdftools.mattsmetrics(subset_sparkdf, spark)
                Counts = subset_sparkdf.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                save_metrics(subset_metrics, f"{table_name}_{g}_{b}.csv")
# Write the Spark DF as csv to the `path` set earlier, partitioned by `partitionby`.
df.write.partitionBy(partitionby).option("header", "true").format("csv").save(path)

path = analysis.save_location_s3 + "toy_data_df_results_by_partition.csv"
sdftools.show(path, "S3 location of the toy df analysis results in csv form, partitioned by columns")
# can be a list; spark will split by each column requested, in the order specified
partitionby = ['query', 'geocode']
df.write.partitionBy(partitionby).option("header", "true").format("csv").save(path)


### Saving Spark DF as csv locally (via Pandas DF)
path = analysis.save_location_linux + "toy_data_df_results.csv"
sdftools.show(path, "Linux location of the toy df analysis results in csv form")
pdf = df.toPandas()
sdftools.show(pdf, "The Pandas DF based on the Spark DF")

# create the directory locally, otherwise pandas_df.to_csv will throw an error
sdftools.show(du.getdir(path), "The directory being created to house the pandas df data")
du.makePath(du.getdir(path))
pdf.to_csv(path, index=False)


### Saving Pandas DF to S3
# save locally and then copy to s3
path_linux = analysis.save_location_linux + "toy_data_pandas_df_results.csv"
# Create the directory of the file actually being written (path_linux), not the
# directory of the previous example's `path`.
du.makePath(du.getdir(path_linux))
pdf.to_csv(path_linux, index=False)
path_s3 = analysis.save_location_s3 + "toy_data_pandas_df_results.csv"
s3.put_s3url(path_s3, path_linux)

sdftools.show(analysis.save_location_s3, "S3 Analysis Results Save Location")
def MattsMetrics(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds
    (per the original author's description; the metric math lives in
    sdftools.mattsmetrics) and writes one csv per geolevel and per
    age / sex / count-bucket subset, depending on which module-level table lists
    contain table_name.

        query : name of the query to answer (wrapped in a one-element list internally)
        table_name : table label; selects the branches below and names the output files
        analysis : object whose make_experiment(...) builds the experiment to analyze
        spark : SparkSession handed to the sdftools helpers and used to build extra rows
        geolevels : geolevels to aggregate to and report on
        buckets : unused here; kept so existing callers keep working
        schema : schema name passed to getPathsAndName / make_experiment

    Output files, all under experiment.save_location_linux:
        "<table>_<geolevel>.csv"                 unbucketed tables
        "<table>_<geolevel>_<age>.csv"           age-bracket tables (1 and 3)
        "<table>_<geolevel>_<age>_<sex>.csv"     age-by-sex tables (2 and 4)
        "<table>_<geolevel>_<bucket>.csv"        count-bucketed tables
    Returns None.
    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    # From here on `schema` is the experiment's schema object, not the name string.
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)

    # apply bin functions for particular tables
    if table_name in table_default_bucket_list:
        spark_df = sdftools.getCountBins(
            spark_df, column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_default_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df, column=AC.ORIG, bins=[0, 10, 100]).persist()

    # This finds overall metrics
    spark_df.show(100, False)

    def save_metrics(metrics_df, file_name):
        """Write one Spark metrics DF as csv under the experiment's linux save location."""
        pandas_df = metrics_df.toPandas()
        csv_savepath = experiment.save_location_linux + file_name
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)

    def metrics_with_counts(subset_df, bucket_row=None, announce=True):
        """Metrics for subset_df with an optional bucket row and a row-count row appended."""
        metrics = sdftools.mattsmetrics(subset_df, spark)
        counts = subset_df.count()
        if announce:
            print("Counts are", counts)
        if bucket_row is not None:
            metrics = metrics.union(spark.createDataFrame([bucket_row]))
        return metrics.union(spark.createDataFrame([(counts, "Counts")]))

    def without_not_levels(df):
        """Drop rows whose `level` matches "Not" (subtract also de-duplicates rows)."""
        return df.subtract(df.filter(df.level.rlike("Not")))

    def save_by_age(df, g, age_ranges, suffix=""):
        """Save one metrics csv per age range found in df's `level` column."""
        for age_range in age_ranges:
            age_df = df.filter(df.level.rlike(age_range))
            age_df.show(100, False)
            save_metrics(metrics_with_counts(age_df), f"{table_name}_{g}_{age_range}{suffix}.csv")

    def save_by_sex_and_age(df, g, age_ranges):
        """Save one metrics csv per (sex level, age range) combination."""
        for sexlevel in sex_range_list:
            sex_df = df.filter(df.level.rlike(sexlevel))
            save_by_age(sex_df, g, age_ranges, suffix=f"_{sexlevel}")

    def save_by_bucket(df, g, bucket_values):
        """Save one metrics csv per count bucket, with "Not" levels removed."""
        print("data is bucketed and treated accordingly")
        for b in bucket_values:  # calculate Metrics at each bucket
            print("Bucket is:", b)
            bucket_df = df[df['orig_count_bin'] == b]  # subset into bins
            bucket_df.show(100, False)
            print("Bucketed data")
            bucket_df = without_not_levels(bucket_df)
            bucket_df.show(100, False)
            print("Make sure its bucketed and 'Not' values are removed")
            save_metrics(
                metrics_with_counts(bucket_df, bucket_row=(b, "Bucket"), announce=False),
                f"{table_name}_{g}_{b}.csv")

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] == g]  # Separate data for each geolevel

        if table_name in table_default_no_bucket:  # If data is not in buckets
            # Both sub-branches compute the same thing; only the logging differs.
            if table_name in table_race_query:  # Table 17, 18, 21 and others
                print("no buckets, with race query")
                spark_df2 = without_not_levels(spark_df1)
                spark_df2.show(100, False)
                print("Make sure 'Not' values are removed")
            else:
                print("no buckets, without race query")
                spark_df1.show(100, False)
                spark_df2 = without_not_levels(spark_df1)
                print("with Not removed")
                spark_df2.show(100, False)
            save_metrics(metrics_with_counts(spark_df2), f"{table_name}_{g}.csv")

        if table_name in table_age_bracket1:
            print("Data is in age brackets, 0 to 17, 18 to 64, 65+")
            spark_df1.show(100, False)
            save_by_age(spark_df1, g, age_range_list)

        if table_name in table_age_bracket2:
            print("Data is age buckets, with sex query")
            spark_df1.show(100, False)
            save_by_sex_and_age(spark_df1, g, age_range_list)

        if table_name in table_age_bracket3:
            print("Data is in age brackets of 5 year age groups")
            spark_df1.show(100, False)
            save_by_age(spark_df1, g, age_range_list2)

        if table_name in table_age_bracket4:
            print("Data is age buckets of 5 year age groups, with sex query")
            spark_df1.show(100, False)
            save_by_sex_and_age(spark_df1, g, age_range_list2)

        if table_name in table_default_bucket_list2:  # If data is in buckets [0,10],[10,100),[100+)
            save_by_bucket(spark_df1, g, default_buckets2)

        if table_name in table_default_bucket_list:  # If data is in buckets [0,1000],[1000,5000),etc. Table 1 and 2
            save_by_bucket(spark_df1, g, default_buckets)
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
    Main plotting fxn.

        query           : str, name of a valid query for the target experiment's schema
        table_name      : str, name of a table (used for file-naming conventions)
        analysis        : Analysis setuptools.setup object, organizes Analysis metadata
        spark           : SparkSession object, attached to analysis object
        geolevels       : [str, ...], geolevels to compute over for the current query
        buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
        schema          : str, schema name handed to getPathsAndName and analysis.make_experiment

    Returns nothing. Side effects: prints diagnostics, writes the averaged per-bin
    data frame to a CSV under experiment.save_location_linux, then calls makePlots.

    Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name,
                                          dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    df = experiment.getDF()
    if TEST:
        # Test mode: work on a truncated data frame only.
        df = df.limit(TEST_NUM)
    print("df looks like:")
    df.show()

    # From here on `schema` is the experiment's schema object, not the name string;
    # the name is preserved in schema_name.
    schema = experiment.schema
    sdftools.print_item(df, "Flat Experiment DF")

    queries = [query]
    rddWithAbsDiff = getRddWithAbsDiff(spark, df, geolevels, queries, schema)
    # Fixed: the original passed the undefined name `queri` here (NameError).
    rddWithAbsDiff = sdftools.getFullWorkloadDF(
        rddWithAbsDiff, schema, queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    rdd = sdftools.getRowGroupsAsRDD(rddWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    rdd = rdd.map(lambda row: Row(**row[1]))
    df = rdd.toDF().persist()

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    # NOTE(review): the six Bin columns are hard-coded; presumably sepBounds emits one
    # Bin<i> column per bucket, so this assumes len(buckets) == 6 -- confirm.
    df = df.groupby(['geocode', 'geolevel', 'level', 'Bin0', 'Bin1', 'Bin2', 'Bin3', 'Bin4', 'Bin5']).avg()
    pandas_df = df.toPandas()
    pandas_df = pandas_df.rename(columns={"avg(abs diff)": metric_name, "avg(orig)": "orig"})
    pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)

    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
              metric_name, geolevels, pandas_df, buckets, schema_name)
def makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
              metric_name, geolevels, pandas_df, buckets, schema_name):
    """
    Save one strip-over-violin PDF per (query, geolevel, race) combination.

        experiment           : object exposing save_location_linux (root of the output path)
        experiment_name      : str, used as a prefix of each plot file name
        table_name           : str, table name; its first 4 characters name the output sub-directory
        queries              : [str, ...], queries to plot
        x_axis_variable_name : str, pandas_df column holding the bin label for each row
        metric_name          : str, pandas_df column plotted on the y axis
        geolevels            : [str, ...], values of the 'geolevel' column to plot
        pandas_df            : pandas DataFrame with 'geolevel', 'level' and Bin<i> columns
        buckets              : list of buckets; Bin<i> flags membership in buckets[i]
        schema_name          : str, appended to the output sub-directory name

    Returns nothing. When no bucket has any rows for a combination, a message is
    printed and no plot is written.
    """
    EPT = table_name[:4] + "_" + schema_name
    for query in queries:
        for geolevel in geolevels:
            subsetted_df = pandas_df[pandas_df['geolevel'] == geolevel]

            race_iterates = major_omb_race_names if ("EPT3" in table_name and "allraces *" in query) else [""]
            # Fixed: the fallback used to be [""], which threw away the OMB race list chosen above.
            race_iterates = ["2+ Races"] if (race_iterates == [""] and "tomr *" in query) else race_iterates
            # Fixed: query is a plain str, so `query.str.contains(...)` raised AttributeError.
            race_iterates = major_combination_race_names if (
                "EPT3" in table_name and "combo" in query) else race_iterates
            print(f"table {table_name}, query {query}, geolevel {geolevel}, race_iterates {race_iterates})")

            # TODO: graph titling/subsetting for distinct queries is getting convoluted. Re-factor?
            for race in race_iterates:
                plotting_df = subsetted_df
                if race in major_omb_race_names or race in major_combination_race_names:
                    # Keep only the rows whose 'level' mentions this race.
                    plotting_df = subsetted_df[subsetted_df['level'].str.contains(race)]

                bucket_counts = {}
                for bindex, bucket in enumerate(buckets):
                    # `== True` is an element-wise pandas comparison, not a truthiness test.
                    trueRows = plotting_df[plotting_df[f"Bin{bindex}"] == True]
                    bucket_counts[bucket] = trueRows.shape[0]
                print(f"Geolevel {geolevel} has bucket_counts: {bucket_counts}")

                # Only non-empty buckets are plotted; counts are never negative, so an
                # empty list means no bucket holds any observation.
                bucket_names = [str(bucket) for bucket in buckets if bucket_counts[bucket] != 0]
                if not bucket_names:
                    print(f"No observations for {table_name}, {query}, {geolevel}, {race}. Plot not generated.")
                    continue

                graph_title = f"Average L1 Error (over trials) for {query}\nGeolevel: {geolevel}"
                # Fixed: race fragments start on their own line instead of being glued to the geolevel.
                if race in major_omb_race_names:
                    graph_title += f"\n Race Alone: {race.title()}"
                if race == "2+ Races":
                    graph_title += "\n 2+ Races"
                if race in major_combination_race_names:
                    graph_title += f"\n Race: {race}"
                graph_title += "\n(binned by CEF count)\nDisclosure Prohibited - Title 13 U.S.C."

                # Scatterplot strips superimposed on violin plots
                sns.set(style="whitegrid")
                sns.violinplot(x=x_axis_variable_name, y=metric_name, data=plotting_df,
                               order=bucket_names, inner=None, color="0.8").set_title(graph_title)
                sns.stripplot(x=x_axis_variable_name, y=metric_name, data=plotting_df,
                              order=bucket_names).set_title(graph_title)

                plot_savepath = f"{experiment.save_location_linux}plots/{EPT}/scatteredViolin/{experiment_name}_"
                plot_savepath += f"{table_name}_{query.replace(' ', '_')}_{geolevel}_{race.replace(' ','_')}.pdf"
                du.makePath(du.getdir(plot_savepath))
                print(f"Saving scatterstrips w/ violins for query {query}, geolevel {geolevel}, & race {race} to: {plot_savepath}")
                plt.savefig(plot_savepath)
                # Clear the figure so the next plot starts from an empty canvas.
                plt.clf()
# Collect the per-area values into a two-column (Area, Value) table and write it
# out as a CSV. The no_* variables are defined outside this chunk; presumably
# they hold a count per geographic level -- confirm against their definitions.
data = [
    {"Area": area_label, 'Value': area_value}
    for area_label, area_value in (
        ('State', no_state),
        ('County', no_county),
        ('Tract', no_tract),
        ('Place', no_place),
        ('Block Group', no_group),
        ('Block', no_block),
        ('SLDL', no_sldl),
        ('SLDU', no_sldu),
    )
]
df = spark.createDataFrame(data)
pandas_df = df.toPandas()

csv_savepath = "s3://uscb-decennial-ite-das/rao001/Geolevel_counts.csv"
# Make sure the destination directory exists before writing.
du.makePath(du.getdir(csv_savepath))
pandas_df.to_csv(csv_savepath, index=False)
print("This is q") q.show(10) q=q.groupby(['geolevel','run_id']).sum() columnstodrop=['sum(diff)','sum(sum(orig))','sum(sum(priv))','sum(MDF/sum)','sum(CEF/sum)','sum(difference)'] print("this is q2") q=q.drop(*columnstodrop) q.show(10) z=q.groupby(['geolevel']).avg() print("this is z") z.show(10) return q,z get,get2=NEbias(spark,df,geolevels,queries,schema) path=save_location_linux+"NEbias.csv" pdf=get.toPandas() du.makePath(du.getdir(path)) pdf.to_csv(path,index=False) path2=save_location_linux+"NEbias_av.csv" pdf2=get2.toPandas() du.makePath(du.getdir(path2)) pdf2.to_csv(path2,index=False) get3,get4=MMbias(get) path3=save_location_linux+"MMbias.csv" pdf3=get3.toPandas() du.makePath(du.getdir(path3)) pdf3.to_csv(path3,index=False) path4=save_location_linux+"MMbias_av.csv" pdf4=get4.toPandas()
def makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
              metric_name, geolevels, pandas_df, buckets, schema_name, eps_str):
    """
    Save one strip-over-violin PDF per (query, geolevel, race) combination.

        experiment           : object exposing save_location_linux (root of the output path)
        experiment_name      : str, used as a prefix of each plot file name
        table_name           : str, table name; its first 4 characters name the output sub-directory
        queries              : [str, ...], queries to plot
        x_axis_variable_name : str, pandas_df column compared against each bucket
        metric_name          : str, pandas_df column plotted on the y axis
        geolevels            : [str, ...], values of the 'geolevel' column to plot
        pandas_df            : pandas DataFrame with 'geolevel' and 'level' columns
        buckets              : list of bucket values matched against x_axis_variable_name
        schema_name          : str, appended to the output sub-directory name
        eps_str              : str, inserted into the output path as "epsilon<eps_str>"

    Returns nothing. When no bucket has any rows for a combination, a message is
    printed and no plot is written.
    """
    ept_dir = table_name[:4] + "_" + schema_name
    for query in queries:
        for geolevel in geolevels:
            geolevel_df = pandas_df[pandas_df['geolevel'] == geolevel]

            # Pick the race labels to iterate over; each later rule only applies
            # while the selection is still the [""] placeholder.
            race_iterates = [""]
            if "EPT3" in table_name and "allraces *" in query:
                race_iterates = major_omb_race_names
            if race_iterates == [""] and "racecombos *" in query:
                race_iterates = race_combo_names
            if race_iterates == [""] and "tomr *" in query:
                race_iterates = ["2+ Races"]
            print(f"table {table_name}, query {query}, geolevel {geolevel}, race_iterates {race_iterates})")

            # TODO: graph titling/subsetting for distinct queries is getting convoluted. Re-factor?
            for race in race_iterates:
                if race in major_omb_race_names or race in race_combo_names:
                    # Keep only the rows whose 'level' mentions this race.
                    plotting_df = geolevel_df[geolevel_df['level'].str.contains(race)]
                else:
                    plotting_df = geolevel_df

                # Number of rows falling in each bucket.
                bucket_counts = {
                    bucket: len(plotting_df[plotting_df[x_axis_variable_name] == bucket])
                    for bucket in buckets
                }
                print(f"Geolevel {geolevel} has bucket_counts: {bucket_counts}")
                bucket_names = [str(bucket) for bucket in buckets if bucket_counts[bucket] != 0]

                if not any(count > 0 for count in bucket_counts.values()):
                    print(f"No observations for {table_name}, {query}, {geolevel}, {race}. Plot not generated.")
                    continue

                title_parts = [f"Average L1 Error (over trials) for {query}\nGeolevel: {geolevel}"]
                if race in major_omb_race_names:
                    title_parts.append(f"\n Race Alone: {race.title()}")
                if race in race_combo_names:
                    title_parts.append(f"\n Race Combination: {race.title()}")
                if race == "2+ Races":
                    title_parts.append("\n 2+ Races")
                title_parts.append("\n(binned by CEF count)\nDisclosure Prohibited - Title 13 U.S.C.")
                graph_title = "".join(title_parts)

                # Process-wide pandas display settings (unlimited columns/rows when printing).
                pandas.set_option('display.max_columns', None)
                pandas.set_option('display.max_rows', None)

                # Scatterplot strips superimposed on violin plots
                sns.set(style="whitegrid")
                violin_axes = sns.violinplot(x=x_axis_variable_name, y=metric_name, data=plotting_df,
                                             order=bucket_names, inner=None, color="0.8")
                violin_axes.set_title(graph_title)

                metric_max = plotting_df[metric_name].max()
                metric_min = plotting_df[metric_name].min()
                print(f"maxVal type, val: {type(metric_max)}, {metric_max}")
                print(f"minVal type, val: {type(metric_min)}, {metric_min}")
                if abs(metric_max - metric_min) < 0.1:
                    # Nearly constant metric: pad the y axis by 10 on each side.
                    violin_axes.set(ylim=(metric_min - 10, metric_max + 10))

                strip_axes = sns.stripplot(x=x_axis_variable_name, y=metric_name, data=plotting_df,
                                           order=bucket_names)
                strip_axes.set_title(graph_title)

                plot_savepath = (
                    f"{experiment.save_location_linux}plots/{ept_dir}/epsilon{eps_str}/scatteredViolin/{experiment_name}_"
                    + f"{table_name}_{query.replace(' ', '_')}_{geolevel}_{race.replace(' ','_')}.pdf"
                )
                du.makePath(du.getdir(plot_savepath))
                print(f"Saving scatterstrips w/ violins for query {query}, geolevel {geolevel}, & race {race} to: {plot_savepath}")
                plt.savefig(plot_savepath)
                # Clear the figure so the next plot starts from an empty canvas.
                plt.clf()