Example #1
    def run(self, engine_tuple):
        block_nodes, feas_dict = engine_tuple
        
        # access the SparkSession (needed for aggregating geolevels)
        spark = SparkSession(SparkContext.getOrCreate())

        # transform the rdd of block-level nodes into a 'sparse histogram' spark df
        df = datatools.rdd2df(block_nodes, self.setup.schema_obj)
        sdftools.show(df, "The Block-level Geounit Nodes as Sparse Histogram DF", 1000)

        # read the geolevels from the error_metrics section of the config file
        geolevels = self.setup.config['error_metrics']['geolevels'].split(", ")
        #geolevels = self.setup.levels
        
        # aggregate blocks to get the different geolevels
        df = sdftools.aggregateGeolevels(spark, df, geolevels)
        sdftools.show(df, f"DF with all Geolevels in {geolevels}", 1000)
        
        # access the queries from the error_metrics section of the config file
        queries = self.setup.config['error_metrics']['queries'].split(", ")
        # and answer the queries
        df = sdftools.answerQueries(df, self.setup.schema_obj, queries)
        sdftools.show(df, f"DF with all Queries in {queries}", 1000)

        # compute the Geolevel 1-TVD metric
        geolevel_tvd = sdftools.getGeolevelTVD(df, groupby=[AC.GEOLEVEL, AC.QUERY])
        geolevel_tvd = geolevel_tvd.orderBy([AC.QUERY, AC.GEOLEVEL])
        sdftools.show(geolevel_tvd, f"Geolevel 1-TVD per geolevel per query", 1000)

        # calculate sparsity change
        sparsity_df = sdftools.getCellSparsityByGroup(df, self.setup.schema_obj, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.QUERY])
        sdftools.show(sparsity_df, f"Query and Geolevel DF with Sparsity per group", 1000)
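The two config reads above assume an [error_metrics] section whose values are separated by ", ". Below is a minimal sketch of such a section and of the same parsing convention; the section and key names mirror the code above, while the specific geolevels and queries listed are illustrative assumptions.

import configparser

# Hypothetical config excerpt; keys mirror the reads in run() above.
EXAMPLE_CONFIG = """
[error_metrics]
geolevels = STATE, COUNTY, TRACT
queries = total, hhgq
"""

config = configparser.ConfigParser()
config.read_string(EXAMPLE_CONFIG)
geolevels = config['error_metrics']['geolevels'].split(", ")   # ['STATE', 'COUNTY', 'TRACT']
queries = config['error_metrics']['queries'].split(", ")       # ['total', 'hhgq']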
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting fxn.
            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            eps             : float or str, privacy-loss budget (epsilon) used to locate the target experiment runs
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
            schema          : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    num_trials, paths, experiment_name, eps_str = getPathsAndName(schema_name, query, table_name, eps)
    print(f"Passing paths to Analysis experiment maker: {paths}")
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()
    missing_rows_pandas_df = sdftools.getMissingRowCounts(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.PLB, AC.BUDGET_GROUP])
    missing_rows_dict = defaultdict(int)
    for index, row in missing_rows_pandas_df.iterrows():
        #print(f"missing df row # {index} geolevel, sum(missing) = {row['geolevel']},{row['sum(missing)']}")
        missing_rows_dict[row['geolevel']] = row['sum(missing)']
    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    pandas_df = spark_df.toPandas()
    pandas_df = pandas_df.rename(columns={"abs_error":metric_name, "orig_count_bin":x_axis_variable_name})
    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")
    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
                                           metric_name, geolevels, pandas_df, buckets,
                                           schema_name, eps_str, missing_rows_dict, num_trials)
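The bucket sort above relies on a largestIntInStr helper that is not shown in this snippet. Here is a minimal stand-in with the behavior the sort needs; this is a hypothetical sketch, and the project's own helper may differ in detail.

import re

def largest_int_in_str(bucket_name):
    # Hypothetical stand-in for largestIntInStr: return the largest integer
    # embedded in a bucket label such as '[100-1000)'.
    matches = re.findall(r"\d+", str(bucket_name))
    return max(int(m) for m in matches) if matches else -1

# sorted(['[0-1)', '[1000-10000)', '[10-100)'], key=largest_int_in_str)
# -> ['[0-1)', '[10-100)', '[1000-10000)']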
def analyzeQuery(query, analysis, spark, geolevel, schema_name, path):
    """
        Main plotting fxn.
            query           : str, name of a valid query for the target experiment's schema
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevel        : str, single geolevel to compute over for the current query
            schema_name     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
            path            : str, location (e.g., on s3) of the target experiment's data
        Note, also, module-level control parameters (denom_query, denom_level, POPULATION_BIN_STARTS, THRESHOLD, POPULATION_CUTOFF, QUANTILES) are hard-coded and control the metric computation.
    """
    # To avoid cases in which max(numerator_query_levels)/denom_query_level >= 1:
    assert query != denom_query

    experiment_name = "NA"
    experiment = analysis.make_experiment(
        experiment_name, [path],
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT,
        budget_group='1',
        run_id='run1.0')
    spark_df = experiment.getDF()
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevel)
    spark_df = sdftools.remove_not_in_area(spark_df, [geolevel])
    spark_df = sdftools.answerQueries(spark_df, schema, [query, denom_query])

    spark_df = sdftools.getL1Relative(spark_df,
                                      colname="L1Relative",
                                      denom_query=denom_query,
                                      denom_level=denom_level).persist()
    query_counts = spark_df.rdd.map(lambda row: (row[AC.QUERY], )).countByKey()
    query_counts_keys = list(query_counts.keys())
    assert len(query_counts_keys) == 1 and query_counts_keys[0] == query

    spark_rdd_prop_lt = spark_df.rdd.map(
        lambda row: (int(np.digitize(row["orig"], POPULATION_BIN_STARTS)), 1.
                     if row["L1Relative"] <= THRESHOLD else 0.))
    spark_df_prop_lt = spark_rdd_prop_lt.toDF(["pop_bin", "prop_lt"])

    # Find the proportion of geounits that have L1Relative errors less than threshold for each bin:
    grouped_df_prop_lt = spark_df_prop_lt.groupBy("pop_bin").agg({
        "prop_lt": "avg",
        "*": "count"
    })
    prop_lt = grouped_df_prop_lt.collect()
    n_bins = len(POPULATION_BIN_STARTS) + 1
    prop_lt_list = [None] * n_bins
    prop_lt_counts = [0] * n_bins
    for row in prop_lt:
        prop_lt_list[int(row["pop_bin"])] = np.round(row["avg(prop_lt)"], 5)
        prop_lt_counts[int(row["pop_bin"])] = int(row["count(1)"])
    print(prop_lt_list)
    print(
        f"geounits counts for each bin: {[(POPULATION_BIN_STARTS[k], prop_lt_counts[k]) for k in range(len(POPULATION_BIN_STARTS))]}"
    )

    population_bin_starts = np.concatenate(
        ([-np.inf], POPULATION_BIN_STARTS, [np.inf]))
    ranges = list(
        zip(population_bin_starts[:-1], population_bin_starts[1:] - 1))
    assert len(prop_lt_list) == (len(population_bin_starts) - 1)
    prop_lt_reformat = list(zip(ranges, prop_lt_list))

    spark_df = spark_df.filter(spark_df.orig >= POPULATION_CUTOFF)
    # Count above POPULATION_CUTOFF
    count = spark_df.count()
    # For the quantiles and the avg, we will omit geounits that would not have had a well-defined L1Relative metric
    # due to division by zero: (See the comments in the UDF used in sdftools.getL1Relative() for more detail.)
    spark_df = spark_df.filter(spark_df.L1Relative != 2.)
    count_correct_sign = spark_df.count()

    quantiles_df = sdftools.getGroupQuantiles(spark_df,
                                              columns=["L1Relative"],
                                              groupby=[AC.QUERY, AC.GEOLEVEL],
                                              quantiles=QUANTILES).collect()
    avg = spark_df.groupBy([AC.QUERY, AC.GEOLEVEL]).avg("L1Relative").collect()

    quantiles_dict = {}
    for row in quantiles_df:
        quantiles_dict[float(row["quantile"])] = np.round(row["L1Relative"], 5)
    quantiles_reformat = [(quant, quantiles_dict[quant])
                          for quant in QUANTILES]
    error_metrics = [
        np.round(avg[0]["avg(L1Relative)"], 5), count, count_correct_sign
    ] + [quantiles_reformat] + [prop_lt_reformat]

    print("error_metrics:", error_metrics)
    return error_metrics
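The pop_bin indices above come from np.digitize against POPULATION_BIN_STARTS, which is defined outside this snippet. A small worked example of the indexing convention, using illustrative bin starts rather than the project's actual values:

import numpy as np

# Counts below the first start land in bin 0, so the number of bins is
# len(starts) + 1, matching n_bins above.
population_bin_starts = np.array([0, 1000, 5000, 10000, 50000, 100000])
for orig in [-1, 0, 999, 1000, 123456]:
    print(orig, int(np.digitize(orig, population_bin_starts)))
# -1 -> 0, 0 -> 1, 999 -> 1, 1000 -> 2, 123456 -> 6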
Example #4
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    ##############################
    # Work with the Experiment DF
    ##############################
    df = experiment.getDF()
    schema = experiment.schema
    sdftools.print_item(df, "Experiment DF")

    geolevels = [
        C.STATE, C.COUNTY, C.TRACT_GROUP, C.TRACT, C.BLOCK_GROUP, C.BLOCK,
        C.SLDL, C.SLDU
    ]

    queries = [
        'total', 'hhgq', 'votingage * citizen', 'numraces * hispanic',
        'cenrace * hispanic', 'sex * age', 'detailed'
    ]

    #####################################################
    # Binning and Filtering "Large" and "Small" Geounits
    #####################################################

    # 0a. Aggregate Blocks to get Geographic Units at all desired Geographic Levels
    geoleveldf = sdftools.aggregateGeolevels(spark, df, geolevels)

    # 0b. Answer Queries
    querydf = sdftools.answerQueries(geoleveldf, schema, queries, labels=True)

    # TODO: Implement the binning and filtering operations
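    # A minimal sketch of one way the TODO above could go, assuming the
    # answered-query DF carries 'query' and 'orig' (CEF count) columns as in the
    # other examples in this file; the cutoff value is an illustrative choice.
    POPULATION_CUTOFF = 500  # hypothetical threshold separating "large" from "small"
    totals_df = querydf.filter(querydf["query"] == "total")
    large_geounits_df = totals_df.filter(totals_df["orig"] >= POPULATION_CUTOFF)
    small_geounits_df = totals_df.filter(totals_df["orig"] < POPULATION_CUTOFF)
    sdftools.show(large_geounits_df, "Geounits at or above the population cutoff", 1000)
    sdftools.show(small_geounits_df, "Geounits below the population cutoff", 1000)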
Example #5
def analyzeQuery(query, analysis, spark, geolevel, schema_name, path):
    """
        Main plotting fxn.
            query           : str, name of a valid query for the target experiment's schema
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevel        : str, single geolevel to compute over for the current query
            schema_name     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
            path            : str, location (e.g., on s3) of the target experiment's data
        Note, also, module-level control parameters (denom_query, denom_level, POPULATION_BIN_STARTS, THRESHOLD, POPULATION_CUTOFF, QUANTILES) are hard-coded and control the metric computation.
    """

    experiment_name = "NA"
    quantiles = [xi / 20. for xi in np.arange(20)] + [.975, .99, 1.]
    experiment = analysis.make_experiment(
        experiment_name, [path],
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT,
        budget_group='1',
        run_id='run1.0')
    spark_df = experiment.getDF()
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevel)

    if geolevel == C.PLACE:
        spark_df = spark_df.filter(spark_df.geocode[2:7] != "99999")
    elif geolevel == 'AIAN_AREAS':
        spark_df = spark_df.filter(spark_df.geocode != "9999")
    elif geolevel == 'OSE':
        spark_df = spark_df.filter(
            sf.col(AC.GEOCODE).substr(
                sf.length(sf.col(AC.GEOCODE)) -
                4, sf.length(sf.col(AC.GEOCODE))) != "99999")
    elif geolevel == 'AIANTract':
        spark_df = spark_df.filter(spark_df.geocode != "9" * 11)
    elif geolevel == 'AIANState':
        spark_df = spark_df.filter(spark_df.geocode != "99")
    elif geolevel == 'AIANBlock':
        spark_df = spark_df.filter(spark_df.geocode != "9" * 16)
    elif geolevel == 'COUNTY_NSMCD':
        spark_df = spark_df.filter(spark_df.geocode != "999")

    spark_df = sdftools.answerQueries(spark_df, schema, [query, denom_query])

    spark_df = sdftools.getL1Relative(spark_df,
                                      colname="L1Relative",
                                      denom_query=denom_query,
                                      denom_level=denom_level).persist()

    spark_rdd_prop_lt = spark_df.rdd.map(
        lambda row: (int(np.digitize(row["orig"], POPULATION_BIN_STARTS)), 1.
                     if row["L1Relative"] <= THRESHOLD else 0.))
    spark_df_prop_lt = spark_rdd_prop_lt.toDF(["pop_bin", "prop_lt"])

    # Find the proportion of geounits that have L1Relative errors less than threshold for each bin:
    grouped_df_prop_lt = spark_df_prop_lt.groupBy("pop_bin").agg({
        "prop_lt": "avg",
        "*": "count"
    })
    # print("RCM", grouped_df_prop_lt.first())
    prop_lt = grouped_df_prop_lt.collect()
    prop_lt_dict = {}
    prop_lt_counts = {}
    for row in prop_lt:
        prop_lt_dict[int(row["pop_bin"])] = np.round(row["avg(prop_lt)"], 5)
        prop_lt_counts[int(row["pop_bin"])] = int(row["count(1)"])
    print(prop_lt_dict)
    pop_bin_indices = list(prop_lt_dict.keys())
    for k in range(len(POPULATION_BIN_STARTS)):
        if k not in pop_bin_indices:
            prop_lt_dict[k] = None
            prop_lt_counts[k] = 0
    print(
        f"geounits counts for each bin: {[(POPULATION_BIN_STARTS[k], prop_lt_counts[k]) for k in range(len(POPULATION_BIN_STARTS))]}"
    )
    prop_lt_reformat = [(POPULATION_BIN_STARTS[k], prop_lt_dict[k])
                        for k in range(len(POPULATION_BIN_STARTS))]

    spark_df = spark_df.filter(spark_df.orig >= POPULATION_CUTOFF)
    # Count above POPULATION_CUTOFF
    count = spark_df.count()
    # For the quantiles and the avg, we will omit geounits that would not have had a well-defined L1Relative metric
    # due to division by zero: (See the comments in the UDF used in sdftools.getL1Relative() for more detail.)
    spark_df = spark_df.filter(spark_df.L1Relative != 2.)
    count_correct_sign = spark_df.count()

    quantiles_df = sdftools.getGroupQuantiles(spark_df,
                                              columns=["L1Relative"],
                                              groupby=[AC.QUERY, AC.GEOLEVEL],
                                              quantiles=QUANTILES).collect()
    avg = spark_df.groupBy([AC.QUERY, AC.GEOLEVEL]).avg("L1Relative").collect()

    quantiles_dict = {}
    for row in quantiles_df:
        quantiles_dict[float(row["quantile"])] = np.round(row["L1Relative"], 5)
    quantiles_reformat = [(quant, quantiles_dict[quant])
                          for quant in QUANTILES]
    error_metrics = [
        np.round(avg[0]["avg(L1Relative)"], 5), count, count_correct_sign
    ] + [quantiles_reformat] + [prop_lt_reformat]

    print("error_metrics:", error_metrics)
    return error_metrics
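The five-element list returned above can be unpacked positionally. A hedged usage sketch follows; the query, geolevel, and path arguments are placeholders, and the analysis and spark objects are assumed to come from the caller's setup.

# avg L1Relative, geounit count above POPULATION_CUTOFF, count with a well-defined
# metric, (quantile, value) pairs, and (bin start, proportion <= THRESHOLD) pairs:
avg_rel, n_above_cutoff, n_well_defined, quantiles, prop_lt = analyzeQuery(
    "cenrace", analysis, spark, "County", "DHCP_HHGQ", path)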
Example #6
    geounits = [
        testdata_random_geounit_generator(x, schema, density=0.00001, scale=10)
        for x in geocodes
    ]
    sdftools.print_item(geounits, "Random Geounit Data")

    rdd = spark.sparkContext.parallelize(geounits).persist()
    sdftools.print_item(rdd, "Parallelized RDD data")

    df = rdd.flatMap(lambda node: mappers.getSparseDF_mapper(node, schema)
                     ).map(lambda row: Row(**row)).toDF().persist()
    sdftools.print_item(df, "DF of Random Geounit Data")

    df = df.withColumn("STATE", sf.col("geocode")[0:2]).persist()
    sdftools.print_item(df, "DF with STATE code")

    df = sdftools.aggregateGeolevels(spark, df, 'STATE')
    sdftools.print_item(df, "Aggregated to the STATE geolevel")

    query = 'sex * age'
    df = sdftools.answerQuery(df, schema, query, labels=False)
    sdftools.print_item(df, "Answering the sex query")

    groupby = ['geocode', 'geolevel']
    rdd = sdftools.getRowGroupsAsRDD(df, groupby)
    df = rdd.flatMapValues(prob_vector_mapper).map(
        lambda row: Row(**row[1])).toDF()
    df = df.withColumn('age', sf.col('age').cast("int")).persist()
    df = df.sort(['geocode', 'age', 'sex']).persist()
    sdftools.print_item(df, f"Prob vector for {query} query", show=1000)
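prob_vector_mapper is used above but not defined in this snippet. Below is a minimal sketch of a mapper with the behavior the pipeline implies, namely normalizing each group's counts into a probability vector; the 'priv' column name is an assumption carried over from the other examples in this file, and the real mapper may differ.

def prob_vector_mapper(rows):
    # rows: the records of one (geocode, geolevel) group, as produced by
    # sdftools.getRowGroupsAsRDD; emit one dict per record with a 'prob' field.
    records = [row.asDict() if hasattr(row, "asDict") else dict(row) for row in rows]
    total = sum(rec["priv"] for rec in records)
    for rec in records:
        rec["prob"] = rec["priv"] / total if total else 0.0
    return records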
Example #7
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    if table_name not in (table_list_3_plus_list_age):

        for g in geolevels:
            spark_df1 = spark_df[spark_df['geolevel'] ==
                                 g]  # Separate data for each geolevel
            if table_name in table_default_no_bucket:  # If data is not in buckets
                bucket_size = "NA"
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="A")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list2:  # if data is bucketed in 3 buckets,
                bucket_size = default_buckets2
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list1:  # Table 1 and 2, six buckets
                bucket_size = default_buckets1
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

    if table_name in table_list_3geolevels:  #three geolevels, state, county, place, Tables 10,14,18,22

        metrics_result = sdftools.metrics_with_3geolevels(
            spark_df, spark, geolevels)
        file_name = f"{table_name}.csv"

    if table_name in table_list_age:  # Tables 32-35

        if table_name in table_age_bracket1:

            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="A")
        if table_name in table_age_bracket2:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="B")
        if table_name in table_age_bracket3:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="A")
        if table_name in table_age_bracket4:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="B")

        file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
    ##############################
    df = experiment.getDF()
    schema = experiment.schema
    sdftools.print_item(df, "Experiment DF")

    ##############################
    # Accuracy Metrics
    ##############################
    """
    Mean / Median Absolute Error (MAE):
        1. Calculate total population at County geographic level
        2. Calculate |MDF-CEF| for the total populations for each county
        3. Calculate the mean or median across all county total populations
    """
    # 1a. Aggregate to County geographic level
    county_df = sdftools.aggregateGeolevels(spark, df, [C.COUNTY])
    sdftools.show(county_df, "Counties")

    # 1b. Answer the "total" query for all counties
    county_totals_df = sdftools.answerQueries(county_df,
                                              schema,
                                              "total",
                                              labels=True)
    sdftools.show(county_totals_df, "County total pops")

    # 2. Calculate L1(MDF, CEF)
    # 'priv' means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L.94-171 production
    abs_error_county_totals_df = sdftools.getL1(county_totals_df,
                                                colname="AbsError",
                                                col1=AC.PRIV,
                                                col2=AC.ORIG)
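    # 3. A sketch of the mean/median step from the recipe above (illustrative;
    #    assumes the module's usual `from pyspark.sql import functions as sf` import).
    mae_df = abs_error_county_totals_df.groupBy(AC.GEOLEVEL).agg(
        sf.avg("AbsError").alias("MeanAbsError"),
        sf.expr("percentile_approx(AbsError, 0.5)").alias("MedianAbsError"))
    sdftools.show(mae_df, "County-level mean/median absolute error of total population")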
        
    rdd = spark.sparkContext.parallelize(geounits).persist()

    sdftools.print_item(rdd.take(1), "One of the toy example geounits")

    # use Analysis to transform the rdd of geounitnodes into a spark dataframe
    df = datatools.rdd2df(rdd, schema)
    sdftools.print_item(df, "Toy example DF", 300)


    # aggregate geolevels
    df = df.withColumn("block", sf.col(AC.GEOCODE)[0:3]).persist()
    df = df.withColumn("county", sf.col(AC.GEOCODE)[0:2]).persist()
    df = df.withColumn("nation", sf.col(AC.GEOCODE)[0:1]).persist()
    sdftools.show(df, "df with geolevel crosswalk columns")
    df = sdftools.aggregateGeolevels(spark, df, ['block', 'county', 'nation'])
    sdftools.show(df, "df after geolevel aggregation", 1000)

    # answer total query
    qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False)
    sdftools.show(qdf, "Query df with the query 'total'", 1000)
    
    # select geounits by quantile bins
    rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY])
    sdftools.show(rdd.collect(), "Row groups")
    

    def row_selection_mapper(rows, selection_function, **selection_kwargs):
        pandas_df = pandas.DataFrame(rows)
        pandas_df = selection_function(pandas_df, **selection_kwargs)
        rows = pandas_df.to_dict('records')
        return rows
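    # A hedged usage sketch for the mapper above, with a toy selection function;
    # the 'priv' column and the top-k criterion are illustrative assumptions.
    def take_largest_by_priv(pandas_df, n=2):
        # Keep the n rows with the largest 'priv' value within each group.
        return pandas_df.nlargest(n, "priv")

    selected_rdd = rdd.flatMapValues(
        lambda rows: row_selection_mapper(rows, take_largest_by_priv, n=2))
    selected_df = selected_rdd.map(lambda pair: Row(**pair[1])).toDF()
    sdftools.show(selected_df, "Largest geounits within each (geolevel, query) group", 1000)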
Example #10
 
geounit = nm_state.take(1).pop()

print(geounit)
geocode = geounit.geocode
print(geocode)
dp_queries = geounit.dp_queries

experiment_name = "dhcp_eps4_run36"
experiment_path = f"{S3_BASE}/lecle301/dhcp_eps4/run36_of_25/full_person/"
experiment = analysis.make_experiment(experiment_name, experiment_path)

df = experiment.getDF()
sdftools.print_item(df, "Experiment DF", show=100)

geolevel_df = sdftools.aggregateGeolevels(experiment.spark, df, ['STATE'])
sdftools.print_item(geolevel_df, "Geolevel DF")

filtered_df = df.filter(df.geocode == geocode).persist()
sdftools.print_item(filtered_df, "Experiment DF", show=1000)

def analyzeQuery(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 eps,
                 schema_name="DHCP_HHGQ"):
    """
        Main plotting fxn.
            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            eps             : float or str, privacy-loss budget (epsilon) used to locate the target experiment runs
            schema_name     : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    geolevel = geolevels[0]
    EPT = table_name[:4] + "_" + schema_name
    graph_title = f"Error for query: {table_name}-{query}, eps: {int(eps)}, geography: {geolevels}\nDisclosure Prohibited - Title 13 U.S.C."
    plt.figure(1, figsize=(20, 40))
    sns.set(style="ticks")
    fig, axes = plt.subplots(ncols=3, nrows=2, sharey=True, sharex=True)
    axes_flat = axes.ravel()
    sns.despine(fig=fig)
    #.set_title(graph_title)
    print(
        f"For table {table_name}, analyzing query {query} at geolevel {geolevel} with schema_name {schema_name} and eps: {eps}"
    )
    num_trials, paths, experiment_name, eps_str, spines, mechanisms = getPathsAndName(
        schema_name, query, table_name, eps)
    plt.xscale('log')
    plt.yscale('symlog', linthreshy=100)
    for k, path in enumerate(paths):
        axes_flat[k].set_title(spines[k] + '_' + mechanisms[k])
        experiment = analysis.make_experiment(
            experiment_name, [path],
            schema_name=schema_name,
            dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
        spark_df = experiment.getDF()
        sdftools.print_item(experiment.__dict__, "Experiment Attributes")

        schema = experiment.schema
        sdftools.print_item(spark_df, "Flat Experiment DF")

        queries = [query]
        spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
        # jitter points to make them visually distinct:
        spark_df = sdftools.answerQueries(spark_df, schema, queries) \
                           .withColumn("Error", sf.col("priv") - sf.col("orig") + sf.rand() - 1/2.) \
                           .withColumn("orig", sf.col("orig") + sf.rand() - 1/2.)
        if geolevel == "AIAN_AREAS":
            spark_df = spark_df.filter(spark_df.geocode != "9999")
        elif geolevel == 'OSE':
            spark_df = spark_df.filter(
                sf.col(AC.GEOCODE).substr(
                    sf.length(sf.col(AC.GEOCODE)) -
                    4, sf.length(sf.col(AC.GEOCODE))) != "99999")
        elif geolevel == 'AIANTract':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 11)
        elif geolevel == 'AIANState':
            spark_df = spark_df.filter(spark_df.geocode != "99")
        elif geolevel == 'AIANBlock':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 16)
        # t = spark_df.filter(sf.abs(spark_df.Error) > 1000)
        spark_df = spark_df.select(["orig", "Error"])

        pandas_df = spark_df.toPandas()
        #if pandas_df.max()["Error"] == pandas_df.min()["Error"]:
        #    continue
        sns.scatterplot(x="orig",
                        y="Error",
                        data=pandas_df,
                        alpha=.6,
                        s=10,
                        marker="+",
                        ax=axes_flat[k])
        axes_flat[k].axhline(0., ls='--')

    filename = f"{table_name}_{query.replace(' ', '_')}_{geolevel}"
    plot_path = f"{experiment.save_location_linux}epsilon_{eps_str}/"
    du.makePath(plot_path)
    plt.savefig(plot_path + filename + ".png")
    plt.clf()
Example #12
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 key,
                 agekey,
                 sexkey,
                 bucketkey,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels,
                                               agekey, sexkey, bucketkey, key)
    file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    #spark_df.show(spark_df.count(), False)

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] == g]  # Separate data for each geolevel (keeps the full spark_df for later geolevels)
        print("This has all levels")
        spark_df1.show(150, False)

        metrics_dataframe = sdftools.mattsmetrics(spark_df1, spark)
        Counts = spark_df1.count()
        print("Counts are", Counts)
        newRow = spark.createDataFrame([(Counts, "Counts")])
        metrics_dataframe = metrics_dataframe.union(newRow)
        pandas_df = metrics_dataframe.toPandas()
        csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_default_bucket_list2:  # If data needs bucketing

            for b in default_buckets2:  # calculate Metrics at each bucket
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] ==
                                           b]  #subset into bins
                subset_sparkdf = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not"))
                )  # Removes instances of Not Hispanic..from dataframe
                subset_sparkdf.show(100, False)
                print("Make sure its bucketed and without 'Not' values")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf, spark)
                Counts = subset_sparkdf.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
Example #14
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] ==
                             g]  # Separate data for each geolevel
        if table_name in table_default_no_bucket:  # If data is not in buckets
            if table_name in table_race_query:  # Table 17, 18, 21 and others
                print("no buckets, with race query")
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                spark_df2.show(100, False)
                print("Make sure 'Not' values are removed")
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

            else:
                print("no buckets, without race query")
                spark_df1.show(100, False)
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                print("with Not removed")
                spark_df2.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket1:
            print("Data is in age brackets, 0 to 17, 18 to 64, 65+")
            spark_df1.show(100, False)
            for age_range in age_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket2:
            print("Data is age buckets, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket3:
            print("Data is in age brackets of 5 year age groups")
            spark_df1.show(100, False)
            for age_range in age_range_list2:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket4:
            print("Data is age buckets of 5 year age groups, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list2:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list2:  # If data is in buckets [0,10],[10,100),[100+)
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets2:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] ==
                                           b]  #subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list:  # If data is in buckets [0,1000],[1000,5000),etc. Table 1 and 2
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] ==
                                           b]  #subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
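The subtract-of-rlike pattern used throughout this function removes the "Not ..." levels by set difference. Assuming the rows within each frame are distinct, a single negated filter is an equivalent and somewhat more direct formulation:

# Equivalent to subset_sparkdf.subtract(subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
# whenever the frame has no duplicate rows.
subset_sparkdf1 = subset_sparkdf.filter(~subset_sparkdf.level.rlike("Not"))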
Example #15
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)

    spark_df.show()
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getCountBins(
        spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000,
                                        100000]).persist()

    for b in default_buckets:  # calculate Metrics
        subset_sparkdf = spark_df[spark_df['orig_count_bin'] ==
                                  b]  #subset into bins
        subset_sparkdf.show()
        MAE_value = sdftools.MAE(subset_sparkdf)
        print("Bucket size is", b)
        print("MAE value is", MAE_value)

        RMS_value = sdftools.RMS(subset_sparkdf)
        CoV_value = sdftools.Coe_of_variation(subset_sparkdf, RMS_value)

        print("RMS value is", RMS_value)
        print("Coefficient of Variation is", CoV_value)
        MAPE_value = sdftools.MAPE(subset_sparkdf)
        print("MAPE value is", MAPE_value)

        MALPE_value = sdftools.MALPE(subset_sparkdf)

        print("MALPE value is", MALPE_value)

        print("Counts of percent differences between 5 and 10 percent: ")
        # 5to10percentCount = sdftools.Count_percentdiff_5to10percent(subset_spark)
        # This function disabled for now
        greaterthan10percentCount = sdftools.Count_percentdiff_10percent(
            subset_sparkdf)

        #ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef")>0.05, True)),F.count(F.when(F.col("abs diff div cef")<0.1,True))).show()
        #  ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef")>0.05 and F.col("abs diff div cef")<0.1),True)).show()
        print("Counts of percent differences greater than 10 percent: ")

        greaterthan10percentCount.show()
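The sdftools metric helpers called above are not shown in this file. Assuming getL1 above produces a per-row absolute difference column named L1 and that the usual `from pyspark.sql import functions as sf` import is present, minimal sketches of two of the metrics might look like the following; the project's own implementations may differ.

def mae_sketch(df):
    # Mean absolute error over the frame: average of the precomputed L1 column.
    return df.agg(sf.avg("L1").alias("MAE")).collect()[0]["MAE"]

def mape_sketch(df):
    # Mean absolute percent error, skipping geounits with a zero CEF count.
    return (df.filter(sf.col(AC.ORIG) != 0)
              .agg(sf.avg(sf.col("L1") / sf.col(AC.ORIG) * 100).alias("MAPE"))
              .collect()[0]["MAPE"])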
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting fxn.

            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
            schema          : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data

        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    #y=sdftools.getAnswers(spark,df,geolevels,schema,queries)

    # Old approach to computing df with abs diff, bucketed by true count:
    #sparkDFWithAbsDiff = getSparkDFWithAbsDiff(spark, spark_df, geolevels, queries, schema)
    #getSignedErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]):
    #rdd = sdftools.getRowGroupsAsRDD(sparkDFWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    #rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    #rdd = rdd.map(lambda row: Row(**row[1]))
    #spark_df = rdd.toDF().persist()

    # New (actually preexisting) approach to computing spark_df with abs diff, bucketed by true count:
    # (avoids pandas dfs inside mappers, which is RAM-hungry)
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()

    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    # spark_df = spark_df.groupby(['geocode','geolevel','level','Bin0','Bin1','Bin2','Bin3','Bin4','Bin5']).avg()
    # Below spark_df has cols: geocode, geolevel, run_id, plb, budget_group, query, orig_count_bin, signed_error, re
    #spark_df = spark_df.groupby(['geocode', 'geolevel', 'plb', 'budget_group', 'query', 'orig_count_bin']).avg()
    #print("^^^^ after averaging, spark_df looks like ^^^^")
    pandas_df = spark_df.toPandas()
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig)":"orig"})
    #pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig_count_bin)":"orig"})
    pandas_df = pandas_df.rename(columns={"abs_error":metric_name, "orig_count_bin":x_axis_variable_name})
    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")
    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")
    new_bucket_order = [0,1,2,3,5,4] # Apply ordering system to make 10000+ the last bucket
    buckets = [buckets[i] for i in new_bucket_order]
    print(f"Sorted bucket names: {buckets}")


    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}_{table_name}_{query}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
                                           metric_name, geolevels, pandas_df, buckets,
                                           schema_name, eps_str)