def ReturnZeroCounts(df, geolevels):
    """Return, per geolevel, the number of cells whose CEF ('orig') and
    protected ('priv') counts are both zero. Expects a pandas DataFrame."""
    zerocounts = []
    for k in geolevels:
        level_df = df.loc[df['geolevel'] == k]
        # Parenthesize the comparisons: & binds tighter than == in pandas expressions.
        counter = int(((level_df['orig'] == 0) & (level_df['priv'] == 0)).sum())
        zerocounts.append({'geolevel': k, 'zero_count': counter})
    return pandas.DataFrame(zerocounts)


# Get the geolevels (faster to do this before looping if the queries
# are to be answered over the same geolevels; otherwise, this step can
# be performed inside the loop)
geolevels = [C.COUNTY, C.TRACT]
queries = ["allraces"]
df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)
df_table = sdftools.answerQueries(df_geolevel, schema, queries, labels=True).persist()
df_withmissingrows = sdftools.getFullWorkloadDF(
    df_table, schema, queries,
    groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
#print(df_withmissingrows.head(200))
sparse = sdftools.getCellSparsityByGroup(
    df_withmissingrows, schema,
    groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP, AC.QUERY])
# ReturnZeroCounts uses the pandas API, so convert the Spark DataFrame first.
zero = ReturnZeroCounts(df_withmissingrows.toPandas(), geolevels)
print("This is sparse:")
print(sparse.head(20))
print("This is zero:")
print(zero.head(20))
csv_savepath = save_location_linux + "origtable.csv"
csv_savepath2 = save_location_linux + "missingrows.csv"
du.makePath(du.getdir(csv_savepath))
du.makePath(du.getdir(csv_savepath2))
pandas_df_table = df_table.toPandas()
pandas_df_table.to_csv(csv_savepath, index=False)
pandas_dfmissing = df_withmissingrows.toPandas()
pandas_dfmissing.to_csv(csv_savepath2, index=False)
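# A minimal, self-contained sketch of the zero-count logic above on toy data.
# The frame below is hypothetical; only the column names mirror the real schema.
def _demo_return_zero_counts():
    toy = pandas.DataFrame({
        'geolevel': ['County', 'County', 'Tract', 'Tract'],
        'orig':     [0, 5, 0, 0],
        'priv':     [0, 0, 2, 0],
    })
    # Expect one all-zero cell for County and one for Tract.
    print(ReturnZeroCounts(toy, ['County', 'Tract']))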
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 schema="DHCP_HHGQ"):
    """Compute metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds."""
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")
    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")
    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this
    # code base"; the variable is to be renamed after P.L. 94-171 production.
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)
    # Apply bin functions for particular tables.
    if table_name in table_bucket_list1:
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics.
    spark_df.show(100, False)
    if table_name not in table_list_3_plus_list_age:
        for g in geolevels:
            # Separate the data for each geolevel.
            spark_df1 = spark_df[spark_df['geolevel'] == g]
            if table_name in table_default_no_bucket:  # data is not bucketed
                bucket_size = "NA"
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="A")
                file_name = f"{table_name}_{g}.csv"
            if table_name in table_bucket_list2:  # data falls in three buckets
                bucket_size = default_buckets2
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"
            if table_name in table_bucket_list1:  # Tables 1 and 2, six buckets
                bucket_size = default_buckets1
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"
            # Save one CSV per geolevel (file_name embeds the geolevel); saving
            # inside the loop so earlier geolevels are not overwritten.
            pandas_df = metrics_result.toPandas()
            csv_savepath = experiment.save_location_linux + file_name
            du.makePath(du.getdir(csv_savepath))
            pandas_df.to_csv(csv_savepath, index=False)
    if table_name in table_list_3geolevels:
        # Three geolevels (state, county, place); Tables 10, 14, 18, 22.
        metrics_result = sdftools.metrics_with_3geolevels(spark_df, spark, geolevels)
        file_name = f"{table_name}.csv"
    if table_name in table_list_age:  # Tables 32-35
        if table_name in table_age_bracket1:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="A")
        if table_name in table_age_bracket2:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="B")
        if table_name in table_age_bracket3:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="A")
        if table_name in table_age_bracket4:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="B")
        file_name = f"{table_name}.csv"
    if table_name in table_list_3_plus_list_age:
        # These branches produce a single CSV covering all geolevels.
        pandas_df = metrics_result.toPandas()
        csv_savepath = experiment.save_location_linux + file_name
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)
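# Hypothetical invocation of the MattsMetrics routine above; the table name and
# geolevel constants are illustrative assumptions, not values taken from this script:
# MattsMetrics("allraces", "table1", analysis, spark, [C.COUNTY, C.TRACT],
#              schema="DHCP_HHGQ")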
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 key,
                 agekey,
                 sexkey,
                 bucketkey,
                 schema="DHCP_HHGQ"):
    """Compute metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds,
    delegating the age/sex/bucket handling to sdftools.combined_metrics."""
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")
    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")
    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this
    # code base"; the variable is to be renamed after P.L. 94-171 production.
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)
    # Apply bin functions for particular tables.
    if table_name in table_bucket_list1:
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics.
    spark_df.show(100, False)
    metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels,
                                               agekey, sexkey, bucketkey, key)
    file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
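# Hypothetical invocation of the combined-metrics variant above; the key values
# and range lists passed here are assumptions about what sdftools.combined_metrics
# accepts, chosen to mirror the age/sex/bucket lists used elsewhere in this file:
# MattsMetrics("allraces", "table32", analysis, spark, geolevels, key="A",
#              agekey=age_range_list, sexkey=sex_range_list,
#              bucketkey=default_buckets2)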
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """Compute metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds."""
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")
    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")
    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this
    # code base"; the variable is to be renamed after P.L. 94-171 production.
    spark_df = sdftools.getL1(spark_df, colname="L1", col1=AC.PRIV, col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df, colname="L2", col1=AC.PRIV, col2=AC.ORIG)
    # Apply bin functions for particular tables.
    if table_name in table_default_bucket_list:
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if table_name in table_default_bucket_list2:
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics.
    spark_df.show(100, False)
    for g in geolevels:
        # Separate the data for each geolevel.
        spark_df1 = spark_df[spark_df['geolevel'] == g]
        if table_name in table_default_no_bucket:  # data is not bucketed
            if table_name in table_race_query:  # Tables 17, 18, 21, and others
                print("no buckets, with race query")
                # Drop the complement levels (labels containing "Not").
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                spark_df2.show(100, False)
                print("Make sure 'Not' values are removed")
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
            else:
                print("no buckets, without race query")
                spark_df1.show(100, False)
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                print("with Not removed")
                spark_df2.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket1:
            print("Data is in age brackets: 0 to 17, 18 to 64, 65+")
            spark_df1.show(100, False)
            for age_range in age_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket2:
            print("Data is in age buckets, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf2, spark)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket3:
            print("Data is in age brackets of 5-year age groups")
            spark_df1.show(100, False)
            for age_range in age_range_list2:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket4:
            print("Data is in age buckets of 5-year age groups, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list2:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(subset_sparkdf2, spark)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_default_bucket_list2:
            # Data is in buckets [0,10), [10,100), [100+).
            print("data is bucketed and treated accordingly")
            for b in default_buckets2:  # calculate metrics for each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure it's bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_default_bucket_list:
            # Data is in buckets [0,1000), [1000,5000), etc.; Tables 1 and 2.
            print("data is bucketed and treated accordingly")
            for b in default_buckets:  # calculate metrics for each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure it's bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
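# The branches above repeatedly drop complement levels (labels containing "Not")
# via subtract(filter(rlike("Not"))). A minimal sketch of the same pattern on toy
# data (the rows are hypothetical); note that filter(~col.rlike(...)) is an
# equivalent alternative that avoids the extra shuffle and deduplication that
# subtract() performs.
def _demo_drop_not_levels(spark):
    toy = spark.createDataFrame(
        [("White", 3), ("Not White", 7), ("Black", 2)], ["level", "orig"])
    kept = toy.subtract(toy.filter(toy.level.rlike("Not")))
    kept.show()  # only the "White" and "Black" rows remain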
def analyzeQuery(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """Main plotting function.

        query       : str, name of a valid query for the target experiment's schema
        table_name  : str, name of a table (used for file-naming conventions)
        analysis    : Analysis setuptools.setup object, organizes Analysis metadata
        spark       : SparkSession object attached to the analysis object
        geolevels   : [str, ...], geolevels to compute over for the current query
        buckets     : [(int, int), ...], mutually exclusive bucket boundaries for
                      Tab(CEF) bucketing

        Note also the major control parameters hard-coded in getPaths for setting
        experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")
    df = experiment.getDF()
    if TEST:
        df = df.limit(TEST_NUM)
    print("df looks like:")
    df.show()
    schema = experiment.schema
    sdftools.print_item(df, "Flat Experiment DF")
    queries = [query]
    #y = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    rddWithAbsDiff = getRddWithAbsDiff(spark, df, geolevels, queries, schema)
    rddWithAbsDiff = sdftools.getFullWorkloadDF(
        rddWithAbsDiff,
        schema,
        queries,  # was `queri`, which is undefined in this scope
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    rdd = sdftools.getRowGroupsAsRDD(rddWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    rdd = rdd.map(lambda row: Row(**row[1]))
    df = rdd.toDF().persist()
    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'
    df = df.groupby([
        'geocode', 'geolevel', 'level', 'Bin0', 'Bin1', 'Bin2', 'Bin3', 'Bin4',
        'Bin5'
    ]).avg()
    pandas_df = df.toPandas()
    pandas_df = pandas_df.rename(columns={
        "avg(abs diff)": metric_name,
        "avg(orig)": "orig"
    })
    pandas_df[x_axis_variable_name] = pandas_df.apply(
        lambda row: binIndexToInteger(row, buckets), axis=1)
    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101, 3:9])
    print(pandas_df.iloc[0:101, -1])
    """
    # Save the data frame.
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
    makePlots(experiment, experiment_name, table_name, queries,
              x_axis_variable_name, metric_name, geolevels, pandas_df, buckets,
              schema_name)
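# Hypothetical invocation of the plotting routine above (the table name and
# geolevels are illustrative assumptions, not values taken from this script):
# analyzeQuery("allraces", "table1", analysis, spark, [C.COUNTY, C.TRACT],
#              buckets=default_buckets)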
def analyzeQuery(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """Main plotting function.

        query       : str, name of a valid query for the target experiment's schema
        table_name  : str, name of a table (used for file-naming conventions)
        analysis    : Analysis setuptools.setup object, organizes Analysis metadata
        spark       : SparkSession object attached to the analysis object
        geolevels   : [str, ...], geolevels to compute over for the current query
        buckets     : [(int, int), ...], mutually exclusive bucket boundaries for
                      Tab(CEF) bucketing

        Note also the major control parameters hard-coded in getPaths for setting
        experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")
    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")
    queries = [query]
    #y = sdftools.getAnswers(spark, df, geolevels, schema, queries)

    # Old approach to computing the df with abs diff, bucketed by true count:
    #sparkDFWithAbsDiff = getSparkDFWithAbsDiff(spark, spark_df, geolevels, queries, schema)
    #getSignedErrorByTrueCountRuns(spark_df, bins=[0, 1, 10, 100, 1000, 10000])
    #rdd = sdftools.getRowGroupsAsRDD(sparkDFWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    #rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    #rdd = rdd.map(lambda row: Row(**row[1]))
    #spark_df = rdd.toDF().persist()

    # New (actually preexisting) approach to computing spark_df with abs diff,
    # bucketed by true count (avoids pandas dfs inside mappers, which is RAM-hungry):
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(
        spark_df, bins=[0, 1, 10, 100, 1000, 10000]).persist()
    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")
    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'
    #spark_df = spark_df.groupby(['geocode', 'geolevel', 'level', 'Bin0', 'Bin1', 'Bin2', 'Bin3', 'Bin4', 'Bin5']).avg()
    # Below, spark_df has cols: geocode, geolevel, run_id, plb, budget_group,
    # query, orig_count_bin, signed_error, re
    #spark_df = spark_df.groupby(['geocode', 'geolevel', 'plb', 'budget_group', 'query', 'orig_count_bin']).avg()
    #print("^^^^ after averaging, spark_df looks like ^^^^")
    pandas_df = spark_df.toPandas()
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)": metric_name, "avg(orig)": "orig"})
    #pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)": metric_name, "avg(orig_count_bin)": "orig"})
    pandas_df = pandas_df.rename(columns={
        "abs_error": metric_name,
        "orig_count_bin": x_axis_variable_name
    })
    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")
    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")
    # Apply an explicit ordering to make 10000+ the last bucket.
    new_bucket_order = [0, 1, 2, 3, 5, 4]
    buckets = [buckets[i] for i in new_bucket_order]
    print(f"Sorted bucket names: {buckets}")
    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101, 3:9])
    print(pandas_df.iloc[0:101, -1])
    """
    # Save the data frame.
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}_{table_name}_{query}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
    makePlots(experiment, experiment_name, table_name, queries,
              x_axis_variable_name, metric_name, geolevels, pandas_df, buckets,
              schema_name, eps_str)
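# The bucket labels above are ordered numerically with largestIntInStr. A sketch
# of how such a helper could be implemented (an assumed implementation, not
# necessarily the one in this code base):
#
#   import re
#
#   def largest_int_in_str(s):
#       ints = [int(tok) for tok in re.findall(r"\d+", s)]
#       return max(ints) if ints else -1
#
# Sorting by the largest embedded integer leaves a tie between the top closed
# bin and the open-ended "10000+" bucket, which is why the code applies the
# manual new_bucket_order reordering afterward.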