def getCategoryByAgeQuantilesFast(sdf, queries, product, state, plot=False):
    df = sdf.df
    results = {}
    for query in queries:
        res = sdftools.categoryByAgeQuantiles(df,
                                              sdf.schema,
                                              query,
                                              labels=True)
        results.update(res)

    if plot:
        for key, result_df in results.items():
            queryname, category, datatype = parseAgeQuantileKey(key,
                                                                fsname=False)
            sdftools.print_item(
                result_df.count(),
                "Number of rows in the Spark DF before transforming to Pandas DF"
            )
            if datatype == "quantile_df":
                age_quantile_pandas_df = result_df.toPandas()
                saveloc = du.getdir(sdf.metric_save_location)
                rp.age_quantile_lineplot(age_quantile_pandas_df, saveloc,
                                         product, state)
            else:  # datatype == "survival_props"
                pass

    return results
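
# --- Hedged illustration, not part of the pipeline above. ---
# sdftools.categoryByAgeQuantiles computes age quantiles per category on Spark
# DataFrames; the toy pandas sketch below shows the same idea with made-up
# column names ("category", "age" are assumptions for illustration only).
import pandas as pd

_toy = pd.DataFrame({
    "category": ["A", "A", "A", "B", "B", "B"],
    "age":      [12, 30, 67, 5, 41, 80],
})
# 25th/50th/75th percentile of age within each category
print(_toy.groupby("category")["age"].quantile([0.25, 0.5, 0.75]))
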
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting function.

            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            eps             : epsilon value (privacy-loss budget), passed to getPathsAndName to locate the experiment and build eps_str
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
            schema          : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data

        Note also that major control parameters are hard-coded in getPathsAndName for setting experiment ingest locations from S3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    num_trials, paths, experiment_name, eps_str = getPathsAndName(schema_name, query, table_name, eps)
    print(f"Passing paths to Analysis experiment maker: {paths}")
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()
    missing_rows_pandas_df = sdftools.getMissingRowCounts(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.PLB, AC.BUDGET_GROUP])
    missing_rows_dict = defaultdict(int)
    for index, row in missing_rows_pandas_df.iterrows():
        #print(f"missing df row # {index} geolevel, sum(missing) = {row['geolevel']},{row['sum(missing)']}")
        missing_rows_dict[row['geolevel']] = row['sum(missing)']
    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    pandas_df = spark_df.toPandas()
    pandas_df = pandas_df.rename(columns={"abs_error":metric_name, "orig_count_bin":x_axis_variable_name})
    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")
    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
                                           metric_name, geolevels, pandas_df, buckets,
                                           schema_name, eps_str, missing_rows_dict, num_trials)
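
# --- Hedged sketch (toy data; not the sdftools implementation). ---
# Illustrates the "average |q(MDF) - q(CEF)| by true-count bin" metric that
# sdftools.getAvgAbsErrorByTrueCountRuns produces, using plain pandas.
# The column names "orig"/"priv" mirror the CEF/protected counts used elsewhere here.
import pandas as pd

_toy = pd.DataFrame({
    "orig": [0, 3, 15, 250, 4200, 60000],   # true (CEF) counts
    "priv": [1, 2, 18, 240, 4300, 59800],   # protected (MDF) counts
})
_bins = [0, 1, 10, 100, 1000, 10000, float("inf")]
_toy["orig_count_bin"] = pd.cut(_toy["orig"], bins=_bins, right=False)
_toy["abs_error"] = (_toy["priv"] - _toy["orig"]).abs()
print(_toy.groupby("orig_count_bin", observed=True)["abs_error"].mean())
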
def getGeolevelTVDFast(sdf, product, state, plot=False):
    sdf = sdf.geolevel_tvd(groupby=[AC.GEOLEVEL, AC.RUN_ID, AC.QUERY, AC.PLB])

    if plot:
        saveloc = du.getdir(sdf.metric_save_location)
        sdftools.print_item(
            sdf.df.count(),
            "Number of rows in the Spark DF before transforming to Pandas DF")
        geolevel_tvd_pandas_df = sdf.toPandas()
        rp.geolevel_tvd_lineplot(geolevel_tvd_pandas_df, saveloc, product,
                                 state)
        rp.geolevel_tvd_heatmap(geolevel_tvd_pandas_df, saveloc, product,
                                state)

    results = {'geolevel_tvd': sdf}

    return results
def getGeolevelTVD(sdf, geolevels, queries, product, state, plot=False):
    """ 
    Calculates the following:
    0. Query answers at the specified geolevels
    1. Per-geolevel TVD
    """
    sdf = sdf.getGeolevels(geolevels).getQueryAnswers(queries)
    sdf = sdf.geolevel_tvd(groupby=[AC.GEOLEVEL, AC.RUN_ID, AC.QUERY, AC.PLB])

    if plot:
        saveloc = du.getdir(sdf.metric_save_location)
        geolevel_tvd_pandas_df = sdf.toPandas()
        rp.geolevel_tvd_lineplot(geolevel_tvd_pandas_df, saveloc, product,
                                 state)
        rp.geolevel_tvd_heatmap(geolevel_tvd_pandas_df, saveloc, product,
                                state)

    return sdf
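
# --- Hedged sketch (toy vectors; not the sdf.geolevel_tvd implementation). ---
# The per-geolevel TVD aggregated above is the total variation distance,
# TVD = 0.5 * sum_i |p_i - q_i|, between the normalized CEF and MDF count vectors.
import numpy as np

_cef = np.array([10.0, 20.0, 70.0])
_mdf = np.array([12.0, 18.0, 70.0])
_tvd = 0.5 * np.abs(_cef / _cef.sum() - _mdf / _mdf.sum()).sum()
print(f"toy TVD = {_tvd:.4f}")
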
Example #5
    queri = ["allraces"]
    
    
    df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)
    df_table = sdftools.answerQueries(df_geolevel, schema, queri, labels=True).persist()
    df_withmissingrows=sdftools.getFullWorkloadDF(df_table, schema, queri,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    #print(df_withmissingrows.head(200))
    sparse = sdftools.getCellSparsityByGroup(df_withmissingrows,schema,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP,AC.QUERY])
    zero=ReturnZeroCounts(df_withmissingrows, geolevels)
    print("This is sparse:")
    print(sparse.head(20))
    print("This is zero")
    print(zero.head(20))
    csv_savepath = save_location_linux + f"origtable.csv"
    csv_savepath2 = save_location_linux + f"missingrows.csv"
    du.makePath(du.getdir(csv_savepath))
    du.makePath(du.getdir(csv_savepath2))
    pandas_df_table=df_table.toPandas()
    
    pandas_df_table.to_csv(csv_savepath, index=False)
    pandas_dfmissing=df_withmissingrows.toPandas()
    pandas_dfmissing.to_csv(csv_savepath2, index=False)


#    df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)
#    races_names1=['White','Black or African American','American Indian and Alaskan Native']
#    races_names2=['Asian','Native Hawaiian and Other Pacific Islander','Some Other Race']
#    white1=['Aian']
#    buckets=[(0,0),(1,10),(11,100),(100,1000),(1000,10000),(10000,float('inf'))]
#    path = du.addslash(save_location)
#    plt.figure(figsize=(11,8.5))
Example #6
        #'detailed'
        'cenrace'
]

sdftools.show(df, "df with geolevel crosswalk columns")
sdftools.show(df, "df with geolevel crosswalk columns")
df = sdftools.aggregateGeolevels(spark, df, geolevels)
sdftools.show(df, "df after geolevel aggregation", 1000)
qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False)
sdftools.show(qdf, "Query df with the query 'total'", 1000)
rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY])
#sdftools.show(rdd.collect(), "Row groups")

path = save_location_linux + "Gel.csv"
q = qdf.toPandas()
du.makePath(du.getdir(path))
q.to_csv(path, index=False)
def Qualbins(rows, column, bins):
    # Only works if bins > 2.
    # Adds boolean columns Bin1..Bin<bins> marking whether each row's value of
    # `column` falls between consecutive quantiles of that column.
    pandas_df = pandas.DataFrame(rows)
    q = 1 / bins
    for i in range(1, bins + 1):
        k = str(i)
        lower = np.quantile(pandas_df[column], q - 1 / bins)
        upper = np.quantile(pandas_df[column], q)
        pandas_df['Bin' + k] = (upper >= pandas_df[column]) & (pandas_df[column] >= lower)
        q = q + 1 / bins
    rows = pandas_df.to_dict('records')
    return rows
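
# --- Hedged usage sketch for Qualbins (toy records; supplies the `pandas` and
# `numpy as np` imports that Qualbins itself relies on at module level). ---
import pandas
import numpy as np

_toy_rows = [{"orig": v} for v in [1, 4, 9, 16, 25, 36, 49, 64]]
_binned = Qualbins(_toy_rows, "orig", bins=4)
print(_binned[0])   # first record now carries Bin1..Bin4 boolean flags
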
Example #7
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    if table_name not in (table_list_3_plus_list_age):

        for g in geolevels:
            spark_df1 = spark_df[spark_df['geolevel'] ==
                                 g]  # Separate data for each geolevel
            if table_name in table_default_no_bucket:  # If data is not in buckets
                bucket_size = "NA"
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="A")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list2:  # if data is bucketed in 3 buckets,
                bucket_size = default_buckets2
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list1:  # Table 1 and 2, six buckets
                bucket_size = default_buckets1
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

    if table_name in table_list_3geolevels:  #three geolevels, state, county, place, Tables 10,14,18,22

        metrics_result = sdftools.metrics_with_3geolevels(
            spark_df, spark, geolevels)
        file_name = f"{table_name}.csv"

    if table_name in table_list_age:  # Tables 32-35

        if table_name in table_age_bracket1:

            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="A")
        if table_name in table_age_bracket2:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="B")
        if table_name in table_age_bracket3:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="A")
        if table_name in table_age_bracket4:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="B")

        file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting function.

            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

        Note also that major control parameters are hard-coded in getPathsAndName for setting experiment ingest locations from S3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    #y=sdftools.getAnswers(spark,df,geolevels,schema,queries)

    # Old approach to computing df with abs diff, bucketed by true count:
    #sparkDFWithAbsDiff = getSparkDFWithAbsDiff(spark, spark_df, geolevels, queries, schema)
    #getSignedErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]):
    #rdd = sdftools.getRowGroupsAsRDD(sparkDFWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    #rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    #rdd = rdd.map(lambda row: Row(**row[1]))
    #spark_df = rdd.toDF().persist()

    # New (actually preexisting) approach to computing spark_df with abs diff, bucketed by true count:
    # (avoids pandas dfs inside mappers, which is RAM-hungry)
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()

    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    # spark_df = spark_df.groupby(['geocode','geolevel','level','Bin0','Bin1','Bin2','Bin3','Bin4','Bin5']).avg()
    # Below spark_df has cols: geocode, geolevel, run_id, plb, budget_group, query, orig_count_bin, signed_error, re
    #spark_df = spark_df.groupby(['geocode', 'geolevel', 'plb', 'budget_group', 'query', 'orig_count_bin']).avg()
    #print("^^^^ after averaging, spark_df looks like ^^^^")
    pandas_df = spark_df.toPandas()
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig)":"orig"})
    #pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig_count_bin)":"orig"})
    pandas_df = pandas_df.rename(columns={"abs_error":metric_name, "orig_count_bin":x_axis_variable_name})
    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")
    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")
    new_bucket_order = [0,1,2,3,5,4] # Apply ordering system to make 10000+ the last bucket
    buckets = [buckets[i] for i in new_bucket_order]
    print(f"Sorted bucket names: {buckets}")


    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}_{table_name}_{query}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
                                           metric_name, geolevels, pandas_df, buckets,
                                           schema_name, eps_str)
Example #9
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 key,
                 agekey,
                 sexkey,
                 bucketkey,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels,
                                               agekey, sexkey, bucketkey, key)
    file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    #spark_df.show(spark_df.count(), False)

    for g in geolevels:
        # Filter into a separate variable so the full spark_df is preserved for later geolevels
        spark_df_g = spark_df[spark_df['geolevel'] == g]
        print("This has all levels")
        spark_df_g.show(150, False)

        metrics_dataframe = sdftools.mattsmetrics(spark_df_g, spark)
        Counts = spark_df_g.count()
        print("Counts are", Counts)
        newRow = spark.createDataFrame([(Counts, "Counts")])
        metrics_dataframe = metrics_dataframe.union(newRow)
        pandas_df = metrics_dataframe.toPandas()
        csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_default_bucket_list2:  # If data needs bucketing

            for b in default_buckets2:  # calculate Metrics at each bucket
                subset_sparkdf = spark_df_g[spark_df_g['orig_count_bin'] ==
                                            b]  #subset into bins
                subset_sparkdf = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not"))
                )  # Removes instances of Not Hispanic..from dataframe
                subset_sparkdf.show(100, False)
                print("Make sure its bucketed and without 'Not' values")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf, spark)
                Counts = subset_sparkdf.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
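
# --- Hedged, self-contained sketch of the "append a summary row via union" pattern
# used above (local SparkSession and toy column names are assumptions). ---
from pyspark.sql import SparkSession

_spark = SparkSession.builder.master("local[1]").appName("union_sketch").getOrCreate()
_metrics = _spark.createDataFrame([(0.12, "MAE"), (1.05, "RMS")], ["value", "label"])
_counts_row = _spark.createDataFrame([(42.0, "Counts")], ["value", "label"])
_metrics.union(_counts_row).show()   # union matches columns by position
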
Example #11
    df.write.partitionBy(partitionby).option("header", "true").format("csv").save(path)
    
    path = analysis.save_location_s3 + "toy_data_df_results_by_partition.csv"
    sdftools.show(path, "S3 location of the toy df analysis results in csv form, partitioned by columns")
    # can be a list; spark will split by each column requested, in the order specified
    partitionby = ['query', 'geocode']
    df.write.partitionBy(partitionby).option("header", "true").format("csv").save(path)
    
    
    ### Saving Spark DF as csv locally (via Pandas DF)
    path = analysis.save_location_linux + "toy_data_df_results.csv"
    sdftools.show(path, "Linux location of the toy df analysis results in csv form")
    pdf = df.toPandas()
    sdftools.show(pdf, "The Pandas DF based on the Spark DF")
    # create the directory locally, otherwise pandas_df.to_csv will throw an error
    sdftools.show(du.getdir(path), "The directory being created to house the pandas df data")
    du.makePath(du.getdir(path))
    pdf.to_csv(path, index=False)
   
    
    ### Saving Pandas DF to S3
    # save locally and then copy to s3
    path_linux = analysis.save_location_linux + "toy_data_pandas_df_results.csv"
    du.makePath(du.getdir(path_linux))
    pdf.to_csv(path_linux, index=False)
    
    path_s3 = analysis.save_location_s3 + "toy_data_pandas_df_results.csv"
    s3.put_s3url(path_s3, path_linux)

    
    sdftools.show(analysis.save_location_s3, "S3 Analysis Results Save Location")
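
# --- Hedged, self-contained sketch of the partitioned CSV write shown above
# (local path and toy data are assumptions; the snippet above targets S3). ---
from pyspark.sql import SparkSession

_spark = SparkSession.builder.master("local[1]").appName("csv_sketch").getOrCreate()
_toy = _spark.createDataFrame(
    [("total", "01", 10), ("total", "02", 12)],
    ["query", "geocode", "count"],
)
# One subdirectory per (query, geocode) value pair, each holding CSV part files.
_toy.write.partitionBy("query", "geocode").option("header", "true") \
    .format("csv").mode("overwrite").save("/tmp/toy_partitioned_csv")
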
Example #12
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] ==
                             g]  # Separate data for each geolevel
        if table_name in table_default_no_bucket:  # If data is not in buckets
            if table_name in table_race_query:  # Table 17, 18, 21 and others
                print("no buckets, with race query")
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                spark_df2.show(100, False)
                print("Make sure 'Not' values are removed")
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

            else:
                print("no buckets, without race query")
                spark_df1.show(100, False)
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                print("with Not removed")
                spark_df2.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket1:
            print("Data is in age brackets, 0 to 17, 18 to 64, 65+")
            spark_df1.show(100, False)
            for age_range in age_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket2:
            print("Data is age buckets, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket3:
            print("Data is in age brackets of 5 year age groups")
            spark_df1.show(100, False)
            for age_range in age_range_list2:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket4:
            print("Data is age buckets of 5 year age groups, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list2:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list2:  # If data is in buckets [0,10],[10,100),[100+)
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets2:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] ==
                                           b]  #subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list:  # If data is in buckets [0,1000],[1000,5000),etc. Table 1 and 2
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] ==
                                           b]  #subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
Example #13
def analyzeQuery(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
        Main plotting function.

            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

        Note also that major control parameters are hard-coded in getPathsAndName for setting experiment ingest locations from S3.
    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    df = experiment.getDF()
    if TEST:
        df = df.limit(TEST_NUM)
    print("df looks like:")
    df.show()
    schema = experiment.schema
    sdftools.print_item(df, "Flat Experiment DF")

    queries = [query]
    #y=sdftools.getAnswers(spark,df,geolevels,schema,queries)
    rddWithAbsDiff = getRddWithAbsDiff(spark, df, geolevels, queries, schema)
    rddWithAbsDiff = sdftools.getFullWorkloadDF(
        rddWithAbsDiff,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    rdd = sdftools.getRowGroupsAsRDD(rddWithAbsDiff,
                                     groupby=[AC.GEOLEVEL, AC.QUERY])
    rdd = rdd.flatMapValues(
        lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    rdd = rdd.map(lambda row: Row(**row[1]))
    df = rdd.toDF().persist()

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    df = df.groupby([
        'geocode', 'geolevel', 'level', 'Bin0', 'Bin1', 'Bin2', 'Bin3', 'Bin4',
        'Bin5'
    ]).avg()
    pandas_df = df.toPandas()
    pandas_df = pandas_df.rename(columns={
        "avg(abs diff)": metric_name,
        "avg(orig)": "orig"
    })
    pandas_df[x_axis_variable_name] = pandas_df.apply(
        lambda row: binIndexToInteger(row, buckets), axis=1)
    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries,
              x_axis_variable_name, metric_name, geolevels, pandas_df, buckets,
              schema_name)
Example #14
def makePlots(experiment, experiment_name, table_name, queries,
              x_axis_variable_name, metric_name, geolevels, pandas_df, buckets,
              schema_name):
    EPT = table_name[:4] + "_" + schema_name
    for query in queries:
        for geolevel in geolevels:
            subsetted_df = pandas_df[pandas_df['geolevel'] == geolevel]
            race_iterates = major_omb_race_names if (
                "EPT3" in table_name and "allraces *" in query) else [""]
            race_iterates = ["2+ Races"] if (
                race_iterates == [""] and "tomr *" in query) else race_iterates
            race_iterates = major_combination_race_names if (
                "EPT3" in table_name
                and "combo" in query) else race_iterates
            print(
                f"table {table_name}, query {query}, geolevel {geolevel}, race_iterates {race_iterates})"
            )

            # TODO: graph titling/subsetting for distinct queries is getting convoluted. Re-factor?
            for race in race_iterates:
                plotting_df = subsetted_df
                if race in major_omb_race_names:
                    plotting_df = subsetted_df[
                        subsetted_df['level'].str.contains(race)]
                if race in major_combination_race_names:
                    plotting_df = subsetted_df[
                        subsetted_df['level'].str.contains(race)]
                bucket_counts = {}
                for bindex, bucket in enumerate(buckets):
                    trueRows = plotting_df[plotting_df[f"Bin{bindex}"] == True]
                    bucket_counts[bucket] = trueRows.shape[0]
                print(
                    f"Geolevel {geolevel} has bucket_counts: {bucket_counts}")
                bucket_names = [
                    str(bucket) for bucket in buckets
                    if bucket_counts[bucket] != 0
                ]
                if (np.array(list(bucket_counts.values())) > 0).any():
                    graph_title = f"Average L1 Error (over trials) for {query}\nGeolevel: {geolevel}"
                    if race in major_omb_race_names:
                        graph_title += f"\n Race Alone: {race.title()}"
                    if race == "2+ Races":
                        graph_title += "\n 2+ Races"
                    if race in major_combination_race_names:
                        graph_title += f"\n Race: {race}"

                    graph_title += f"\n(binned by CEF count)\nDisclosure Prohibited - Title 13 U.S.C."

                    # Scatterplot strips superimposed on violin plots
                    sns.set(style="whitegrid")
                    sns.violinplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names,
                                            inner = None, color="0.8") \
                                            .set_title(graph_title)
                    sns.stripplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names)  \
                                            .set_title(graph_title)
                    plot_savepath = f"{experiment.save_location_linux}plots/{EPT}/scatteredViolin/{experiment_name}_"
                    plot_savepath += f"{table_name}_{query.replace(' ', '_')}_{geolevel}_{race.replace(' ','_')}.pdf"
                    du.makePath(du.getdir(plot_savepath))
                    print(
                        f"Saving scatterstrips w/ violins for query {query}, geolevel {geolevel}, & race {race} to: {plot_savepath}"
                    )
                    plt.savefig(plot_savepath)
                    plt.clf()
                else:
                    print(
                        f"No observations for {table_name}, {query}, {geolevel}, {race}. Plot not generated."
                    )
Example #15
    data = [{
        "Area": 'State',
        'Value': no_state
    }, {
        "Area": 'County',
        'Value': no_county
    }, {
        "Area": 'Tract',
        'Value': no_tract
    }, {
        "Area": 'Place',
        'Value': no_place
    }, {
        "Area": 'Block Group',
        'Value': no_group
    }, {
        "Area": 'Block',
        'Value': no_block
    }, {
        "Area": 'SLDL',
        'Value': no_sldl
    }, {
        "Area": 'SLDU',
        'Value': no_sldu
    }]
    df = spark.createDataFrame(data)
    pandas_df = df.toPandas()
    csv_savepath = "s3://uscb-decennial-ite-das/rao001/Geolevel_counts.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
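    # Note (descriptive): pandas.DataFrame.to_csv can write straight to an s3:// URL
    # only if s3fs is available; otherwise write locally and copy up, as in the
    # save-then-put_s3url pattern shown in an earlier example.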
Example #16
    print("This is q")
    q.show(10)
    q=q.groupby(['geolevel','run_id']).sum()
    columnstodrop=['sum(diff)','sum(sum(orig))','sum(sum(priv))','sum(MDF/sum)','sum(CEF/sum)','sum(difference)']
    print("this is q2")
    q=q.drop(*columnstodrop)
    q.show(10)
    z=q.groupby(['geolevel']).avg()
    print("this is z")
    z.show(10)
    return q,z

get,get2=NEbias(spark,df,geolevels,queries,schema)
path=save_location_linux+"NEbias.csv"
pdf=get.toPandas()
du.makePath(du.getdir(path))
pdf.to_csv(path,index=False)

path2=save_location_linux+"NEbias_av.csv"
pdf2=get2.toPandas()
du.makePath(du.getdir(path2))
pdf2.to_csv(path2,index=False)

get3,get4=MMbias(get)
path3=save_location_linux+"MMbias.csv"
pdf3=get3.toPandas()
du.makePath(du.getdir(path3))
pdf3.to_csv(path3,index=False)

path4=save_location_linux+"MMbias_av.csv"
pdf4=get4.toPandas()
du.makePath(du.getdir(path4))
pdf4.to_csv(path4,index=False)
def makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
                                           metric_name, geolevels, pandas_df, buckets,
                                           schema_name, eps_str):
    EPT = table_name[:4]+"_"+schema_name
    for query in queries:
        for geolevel in geolevels:
            subsetted_df = pandas_df[pandas_df['geolevel'] == geolevel]
            race_iterates = major_omb_race_names if ("EPT3" in table_name and "allraces *" in query) else [""]
            race_iterates = race_combo_names if (race_iterates == [""] and "racecombos *" in query) else race_iterates
            race_iterates = ["2+ Races"] if (race_iterates == [""] and "tomr *" in query) else race_iterates
            print(f"table {table_name}, query {query}, geolevel {geolevel}, race_iterates {race_iterates})")

            # TODO: graph titling/subsetting for distinct queries is getting convoluted. Re-factor?
            for race in race_iterates:
                plotting_df = subsetted_df
                if race in major_omb_race_names:
                    plotting_df = subsetted_df[subsetted_df['level'].str.contains(race)]
                if race in race_combo_names:
                    plotting_df = subsetted_df[subsetted_df['level'].str.contains(race)]
                bucket_counts = {}
                for bucket in buckets:
                    trueRows = plotting_df[plotting_df[x_axis_variable_name] == bucket]
                    bucket_counts[bucket] = trueRows.shape[0]
                print(f"Geolevel {geolevel} has bucket_counts: {bucket_counts}")
                bucket_names = [str(bucket) for bucket in buckets if bucket_counts[bucket] != 0]
                if (np.array(list(bucket_counts.values())) > 0).any():
                    graph_title = f"Average L1 Error (over trials) for {query}\nGeolevel: {geolevel}"
                    if race in major_omb_race_names:
                        graph_title += f"\n Race Alone: {race.title()}"
                    if race in race_combo_names:
                        graph_title += f"\n Race Combination: {race.title()}"

                    if race == "2+ Races":
                        graph_title += f"\n 2+ Races"
                    graph_title += f"\n(binned by CEF count)\nDisclosure Prohibited - Title 13 U.S.C."

                    pandas.set_option('display.max_columns', None)
                    pandas.set_option('display.max_rows', None)
                    #print(f"Before plotting, plotting_df looks like:")
                    #print(plotting_df)
                    #print(f"Feeding plotting_df to seaborn with xvar {x_axis_variable_name} & yvar {metric_name}")
                    #print(f"And bucket_names are: {bucket_names}")

                    # Scatterplot strips superimposed on violin plots
                    sns.set(style="whitegrid")
                    ax = sns.violinplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names,
                                            inner = None, color="0.8") \
                                            .set_title(graph_title)

                    maxVal = plotting_df[metric_name].max()
                    minVal = plotting_df[metric_name].min()
                    print(f"maxVal type, val: {type(maxVal)}, {maxVal}")
                    print(f"minVal type, val: {type(minVal)}, {minVal}")
                    if abs(maxVal - minVal) < 0.1:
                        #print(f"violin ax has type {type(ax)} & methods: {dir(ax)}")
                        #print(f"violin ax.axes has type {type(ax.axes)} & methods: {dir(ax.axes)}")
                        ax.axes.set(ylim=(minVal-10, maxVal+10))
                    ax = sns.stripplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names)  \
                                            .set_title(graph_title)
                    #if geolevel == C.US:
                    #    #print(f"strip ax has type {type(ax)} & methods: {dir(ax)}")
                    #    ax.axes.set(ylim=(plotting_df[x_axis_variable_name].min - 10,plotting_df[x_axis_variable_name].max + 10))
                    plot_savepath = f"{experiment.save_location_linux}plots/{EPT}/epsilon{eps_str}/scatteredViolin/{experiment_name}_"
                    plot_savepath += f"{table_name}_{query.replace(' ', '_')}_{geolevel}_{race.replace(' ','_')}.pdf"
                    du.makePath(du.getdir(plot_savepath))
                    print(f"Saving scatterstrips w/ violins for query {query}, geolevel {geolevel}, & race {race} to: {plot_savepath}")
                    plt.savefig(plot_savepath)
                    plt.clf()
                else:
                    print(f"No observations for {table_name}, {query}, {geolevel}, {race}. Plot not generated.")