geocode_dict = {3: 'block', 2: 'county'}

    # build geounits
    geounits = toytools.getToyGeounitData(schema, geocodes, geocode_dict)

    rdd = spark.sparkContext.parallelize(geounits).persist()

    sdftools.print_item(rdd.take(1), "One of the toy example geounits")

    # use Analysis to transform the rdd of geounitnodes into a spark dataframe
    df = datatools.rdd2df(rdd, schema)
    sdftools.print_item(df, "Toy example DF", 300)

    # perform analyses
    # L1
    # 'priv' means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L. 94-171 production
    df = sdftools.getL1(df, colname="L1_cell", col1='priv', col2='orig')
    sdftools.print_item(df, "Toy example L1", 300)

    # adding in a simple row-counting column
    df = df.withColumn("row_count", sf.lit(1)).persist()
    sdftools.print_item(df, "Totals + rowcounter column")

    # total within each geocode
    df = sdftools.answerQuery(df, schema, "total", labels=False)
    sdftools.print_item(df, "Totals within each geounit", 300)

    # L1 of total
    df = sdftools.getL1(df, colname="L1_total", col1="priv", col2="orig")
    sdftools.print_item(df, "Totals + rowcounter column + L1")
    """
    # 1a. Aggregate to County geographic level
    county_df = sdftools.aggregateGeolevels(spark, df, [C.COUNTY])
    sdftools.show(county_df, "Counties")

    # 1b. Answer the "total" query for all counties
    county_totals_df = sdftools.answerQueries(county_df,
                                              schema,
                                              "total",
                                              labels=True)
    sdftools.show(county_totals_df, "County total pops")

    # 2. Calculate L1(MDF, CEF)
    # 'priv' means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L. 94-171 production
    abs_error_county_totals_df = sdftools.getL1(county_totals_df,
                                                colname="AbsError",
                                                col1=AC.PRIV,
                                                col2=AC.ORIG)
    sdftools.show(abs_error_county_totals_df,
                  "L1 between MDF and CEF County total pops")

    # 3a. Calculate the mean of L1 across all county total pops
    groupby = [AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP, AC.QUERY]
    mae_county_totals_df = abs_error_county_totals_df.groupBy(
        groupby).avg().persist()
    mae_county_totals_df = sdftools.stripSQLFromColumns(
        mae_county_totals_df).persist()
    sdftools.show(
        mae_county_totals_df,
        "Mean Absolute Error of total population counts at the County level")

    #geolevels = [
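The getL1 calls above add a per-cell absolute-error column between the protected ('priv') and original ('orig') counts. A minimal stand-alone sketch of that step in plain PySpark, using a hypothetical toy DataFrame rather than the toytools/datatools helpers:

from pyspark.sql import SparkSession
import pyspark.sql.functions as sf

spark = SparkSession.builder.getOrCreate()

# Hypothetical toy rows: (geocode, orig, priv); not produced by toytools.
toy = spark.createDataFrame(
    [("100", 10, 12), ("101", 7, 7), ("102", 3, 1)],
    ["geocode", "orig", "priv"])

# Assumed behavior of sdftools.getL1: L1_cell = |priv - orig| per row.
toy = toy.withColumn("L1_cell", sf.abs(sf.col("priv") - sf.col("orig")))
toy.show()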
Example #3
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    if table_name not in (table_list_3_plus_list_age):

        for g in geolevels:
            spark_df1 = spark_df[spark_df['geolevel'] == g]  # Separate data for each geolevel
            if table_name in table_default_no_bucket:  # If data is not in buckets
                bucket_size = "NA"
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="A")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list2:  # if data is bucketed in 3 buckets,
                bucket_size = default_buckets2
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list1:  # Table 1 and 2, six buckets
                bucket_size = default_buckets1
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

    if table_name in table_list_3geolevels:  #three geolevels, state, county, place, Tables 10,14,18,22

        metrics_result = sdftools.metrics_with_3geolevels(
            spark_df, spark, geolevels)
        file_name = f"{table_name}.csv"

    if table_name in table_list_age:  # Tables 32-35

        if table_name in table_age_bracket1:

            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="A")
        if table_name in table_age_bracket2:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="B")
        if table_name in table_age_bracket3:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="A")
        if table_name in table_age_bracket4:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="B")

        file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
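The metric names in the docstring above (MAE, MALPE, CoV, RMS, MAPE) are computed inside sdftools helpers that are not shown in this snippet. The sketch below uses conventional definitions of those metrics over a DataFrame with orig and priv columns; it is an illustration, not the actual sdftools implementation, and the MALPE/CoV conventions in particular may differ from what sdftools uses.

import pyspark.sql.functions as sf

def sketch_error_metrics(df, orig="orig", priv="priv"):
    """Conventional error metrics between protected and original counts (sketch only)."""
    err = sf.col(priv) - sf.col(orig)
    agg = df.agg(
        sf.avg(sf.abs(err)).alias("MAE"),          # mean absolute error
        sf.avg(err).alias("MALPE"),                # mean signed error (MALPE conventions vary)
        sf.sqrt(sf.avg(err * err)).alias("RMS"),   # root mean squared error
        sf.avg(sf.col(orig)).alias("mean_orig"),
        # MAPE only where the original count is nonzero; expressed as a fraction, not a percent
        sf.avg(sf.when(sf.col(orig) != 0, sf.abs(err) / sf.col(orig))).alias("MAPE"),
    )
    # One common CoV convention: RMS error relative to the mean original count
    return agg.withColumn("CoV", sf.col("RMS") / sf.col("mean_orig"))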
Example #4
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 key,
                 agekey,
                 sexkey,
                 bucketkey,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels,
                                               agekey, sexkey, bucketkey, key)
    file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
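A hypothetical call to the variant defined above; the table name, geolevels, and key arguments below are placeholders chosen for illustration, and the analysis and spark objects are assumed to come from the analysis framework's usual setup code.

# Illustrative invocation only; argument values are not taken from the original source.
MattsMetrics("total",
             "table1",
             analysis,
             spark,
             geolevels=[C.STATE, C.COUNTY],
             key="A",
             agekey="no",
             sexkey="no",
             bucketkey="no",
             schema="DHCP_HHGQ")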
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    #spark_df.show(spark_df.count(), False)

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] == g]  # filter to this geolevel without overwriting the full DataFrame
        print("Geolevel", g)
        spark_df1.show(150, False)

        metrics_dataframe = sdftools.mattsmetrics(spark_df1, spark)
        Counts = spark_df1.count()
        print("Counts are", Counts)
        newRow = spark.createDataFrame([(Counts, "Counts")])
        metrics_dataframe = metrics_dataframe.union(newRow)
        pandas_df = metrics_dataframe.toPandas()
        csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_default_bucket_list2:  # If data needs bucketing

            for b in default_buckets2:  # calculate Metrics at each bucket
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not"))
                )  # Removes instances of Not Hispanic..from dataframe
                subset_sparkdf.show(100, False)
                print("Make sure its bucketed and without 'Not' values")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf, spark)
                Counts = subset_sparkdf.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
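These examples rely on sdftools.getCountBins to tag each row with an orig_count_bin population bucket. A minimal sketch of that style of binning in plain PySpark; the bucket label strings here are illustrative and are not necessarily the labels sdftools produces:

import pyspark.sql.functions as sf

def sketch_count_bins(df, column="orig", bins=(0, 10, 100)):
    """Label each row with a count bucket derived from its original count (sketch only)."""
    col = sf.col(column)
    # Anything at or above the last cut point falls into the open-ended top bucket.
    expr = sf.lit(f"{bins[-1]} +")
    for lo, hi in reversed(list(zip(bins, bins[1:]))):
        expr = sf.when((col >= lo) & (col < hi), sf.lit(f"[{lo}-{hi})")).otherwise(expr)
    return df.withColumn(f"{column}_count_bin", expr)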
Example #6
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] == g]  # Separate data for each geolevel
        if table_name in table_default_no_bucket:  # If data is not in buckets
            if table_name in table_race_query:  # Table 17, 18, 21 and others
                print("no buckets, with race query")
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                spark_df2.show(100, False)
                print("Make sure 'Not' values are removed")
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

            else:
                print("no buckets, without race query")
                spark_df1.show(100, False)
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                print("with Not removed")
                spark_df2.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket1:
            print("Data is in age brackets, 0 to 17, 18 to 64, 65+")
            spark_df1.show(100, False)
            for age_range in age_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket2:
            print("Data is age buckets, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket3:
            print("Data is in age brackets of 5 year age groups")
            spark_df1.show(100, False)
            for age_range in age_range_list2:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket4:
            print("Data is age buckets of 5 year age groups, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list2:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list2:  # If data is in buckets [0,10],[10,100),[100+)
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets2:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list:  # If data is in buckets [0,1000],[1000,5000),etc. Table 1 and 2
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
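The subtract-of-a-filter pattern used throughout this example drops rows whose level string matches "Not" (e.g. "Not Hispanic or Latino"). When the rows are distinct and level is never null, the same effect can be written as a single negated filter; shown only as an equivalent illustration, since subtract also deduplicates:

# Equivalent (for distinct rows with non-null 'level') to:
#   spark_df2 = spark_df1.subtract(spark_df1.filter(spark_df1.level.rlike("Not")))
spark_df2 = spark_df1.filter(~spark_df1.level.rlike("Not"))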
Example #7
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)

    spark_df.show()
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getCountBins(
        spark_df, column=AC.ORIG, bins=[0, 1000, 5000, 10000, 50000,
                                        100000]).persist()

    for b in default_buckets:  # calculate Metrics
        subset_sparkdf = spark_df[spark_df['orig_count_bin'] == b]  # subset into bins
        subset_sparkdf.show()
        MAE_value = sdftools.MAE(subset_sparkdf)
        print("Bucket size is", b)
        print("MAE value is", MAE_value)

        RMS_value = sdftools.RMS(subset_sparkdf)
        CoV_value = sdftools.Coe_of_variation(subset_sparkdf, RMS_value)

        print("RMS value is", RMS_value)
        print("Coefficient of Variation is", CoV_value)
        MAPE_value = sdftools.MAPE(subset_sparkdf)
        print("MAPE value is", MAPE_value)

        MALPE_value = sdftools.MALPE(subset_sparkdf)

        print("MALPE value is", MALPE_value)

        print("Counts of percent differences between 5 and 10 percent: ")
        # 5to10percentCount = sdftools.Count_percentdiff_5to10percent(subset_spark)
        # This function disabled for now
        greaterthan10percentCount = sdftools.Count_percentdiff_10percent(
            subset_sparkdf)

        #ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef")>0.05, True)),F.count(F.when(F.col("abs diff div cef")<0.1,True))).show()
        #  ze.groupBy().agg(F.count(F.when(F.col("abs diff div cef")>0.05 and F.col("abs diff div cef")<0.1),True)).show()
        print("Counts of percent differences greater than 10 percent: ")

        greaterthan10percentCount.show()
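Count_percentdiff_10percent is another sdftools helper that is not shown here. A sketch of how a greater-than-10-percent count is typically computed from the orig and priv columns, mirroring the commented-out groupBy/agg lines above; the real sdftools behavior may differ:

import pyspark.sql.functions as sf

def sketch_count_over_10_percent(df, orig="orig", priv="priv"):
    """Count rows whose absolute percent difference |priv - orig| / orig exceeds 10% (sketch only)."""
    pct_diff = sf.abs(sf.col(priv) - sf.col(orig)) / sf.col(orig)
    return df.filter(sf.col(orig) != 0).agg(
        sf.count(sf.when(pct_diff > 0.10, True)).alias("count_over_10_percent"))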
Example #8
    # aggregate geolevels
    df = df.withColumn("block", sf.col(AC.GEOCODE)[0:3]).persist()
    df = df.withColumn("county", sf.col(AC.GEOCODE)[0:2]).persist()
    df = df.withColumn("nation", sf.col(AC.GEOCODE)[0:1]).persist()
    sdftools.show(df, "df with geolevel crosswalk columns")
    df = sdftools.aggregateGeolevels(spark, df, ['block', 'county', 'nation'])
    sdftools.show(df, "df after geolevel aggregation", 1000)

    # answer total query
    qdf = sdftools.answerQuery(df,
                               schema,
                               "total",
                               labels=False,
                               merge_dims=False)
    sdftools.show(qdf, "Query df with the query 'total'", 1000)

    # calculate the L1
    qdf = sdftools.getL1(qdf)
    qdf = qdf.orderBy([AC.GEOLEVEL, AC.GEOCODE, AC.QUERY])
    sdftools.show(qdf, "L1", 1000)

    # Calculate the quantiles for each (geolevel, query) group for the orig, priv, and L1 columns
    # i.e. look at the distributions of geocode/geounit (orig, priv, L1) values for each unique (geolevel, query) tuple
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L. 94-171 production
    columns = [AC.ORIG, AC.PRIV, "L1"]
    groupby = [AC.GEOLEVEL, AC.QUERY]
    quantiles = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    qdf = sdftools.getGroupQuantiles(qdf, columns, groupby, quantiles)
    qdf = qdf.orderBy([AC.GEOLEVEL, AC.QUERY, 'quantile'])
    sdftools.show(qdf, "Quantiles", 1000)