Example #1
    def __init__(self, spark, save_location, analysis_script_path):
        """
        example of save_location (since s3_base and linux_base are expected to be static)
        save_location = "moran331/analysis_results/logname_and_other_info/"

        which, for save_location_s3, will become
        "s3://uscb-decennial-ite-das/users/moran331/analysis_results/logname_and_other_info/"

        and for save_location_linux
        "/mnt/users/moran331/analysis_results/logname_and_other_info/"
        """
        self.save_location = save_location

        self.save_location_s3 = f"{AC.S3_BASE}{save_location}"
        print("")
        print(f"___ANALYSIS_RESULTS_LOCATION_S3___: {self.save_location_s3}")
        print("")

        self.save_location_linux = f"{AC.LINUX_BASE}{save_location}"
        print("")
        print(
            f"___ANALYSIS_RESULTS_LOCATION_LINUX___: {self.save_location_linux}"
        )
        print("")
        du.makePath(self.save_location_linux)

        self.spark = spark
        self.analysis_script_path = analysis_script_path
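
A minimal standalone sketch of the save-location composition described in the docstring above; the base prefixes below are inferred from the docstring's example output and stand in for the real AC.S3_BASE and AC.LINUX_BASE constants:

S3_BASE = "s3://uscb-decennial-ite-das/users/"     # assumed value, per the docstring example
LINUX_BASE = "/mnt/users/"                          # assumed value, per the docstring example

save_location = "moran331/analysis_results/logname_and_other_info/"
save_location_s3 = f"{S3_BASE}{save_location}"
save_location_linux = f"{LINUX_BASE}{save_location}"

assert save_location_s3 == "s3://uscb-decennial-ite-das/users/moran331/analysis_results/logname_and_other_info/"
assert save_location_linux == "/mnt/users/moran331/analysis_results/logname_and_other_info/"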
Example #2
def saveRunData(path, config=None, feas_dict=None, rdd=None, batchSize=10):
    if path[-1] == '/':
        path = path[0:-1]

    # needed when not an s3 path, as the with open context assumes the folder already exists
    if not das_utils.isS3Path(path):
        das_utils.makePath(path)

    if config is not None:
        config_path = path + "/config.ini"
        logging.debug("Saving config to directory: {}".format(config_path))
        das_utils.saveConfigFile(config_path, config)

    if rdd is not None:
        logging.debug("Pickle Batch Size: {}".format(batchSize))
        data_path = path + "/data"
        logging.debug("Saving data to directory: {}".format(data_path))
        das_utils.savePickledRDD(data_path, rdd, batchSize=batchSize)

    if feas_dict is not None:
        for key in feas_dict.keys():
            # this seems redundant, but is actually needed to pull the value out of the accumulator
            feas_dict[key] = feas_dict[key].value
        logging.info("Feasibility dictionary: {}".format(feas_dict))
        feas_path = path + "/feas_dict.json"
        logging.debug("Saving feas_dict to directory: {}".format(feas_path))
        das_utils.saveJSONFile(feas_path, feas_dict)
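
A small runnable sketch of the on-disk layout saveRunData produces under a given path (the prefix below is hypothetical; config.ini, data, and feas_dict.json are the names used in the function body):

import os

path = "/tmp/das_run_example/"                    # hypothetical output prefix
path = path.rstrip("/")                            # same effect as the trailing-'/' check above
layout = {
    "config":    os.path.join(path, "config.ini"),      # written via das_utils.saveConfigFile
    "rdd data":  os.path.join(path, "data"),            # pickled RDD parts, batchSize items per pickle batch
    "feas_dict": os.path.join(path, "feas_dict.json"),  # accumulator values, via das_utils.saveJSONFile
}
print(layout)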
def analyzeQuery(query, table_name, analysis, spark, geolevels, eps, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting fxn.
            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
            schema          : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    num_trials, paths, experiment_name, eps_str = getPathsAndName(schema_name, query, table_name, eps)
    print(f"Passing paths to Analysis experiment maker: {paths}")
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]

    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()
    missing_rows_pandas_df = sdftools.getMissingRowCounts(spark_df, schema, queries, groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.PLB, AC.BUDGET_GROUP])
    missing_rows_dict = defaultdict(int)
    for index, row in missing_rows_pandas_df.iterrows():
        #print(f"missing df row # {index} geolevel, sum(missing) = {row['geolevel']},{row['sum(missing)']}")
        missing_rows_dict[row['geolevel']] = row['sum(missing)']
    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    pandas_df = spark_df.toPandas()
    pandas_df = pandas_df.rename(columns={"abs_error":metric_name, "orig_count_bin":x_axis_variable_name})
    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")
    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
                                           metric_name, geolevels, pandas_df, buckets,
                                           schema_name, eps_str, missing_rows_dict, num_trials)
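
The bucket ordering above leans on largestIntInStr pulling the largest integer out of each bin label. A self-contained sketch of that idea (the label strings and the helper below are illustrative assumptions, not the real analysis code) shows why ties can leave a "10000+" style bucket out of place, which Example #16 later corrects with an explicit reorder:

import re

def largest_int_in_str(s):
    # Illustrative stand-in for largestIntInStr: the largest integer appearing in a bin label.
    ints = [int(x) for x in re.findall(r"\d+", s)]
    return max(ints) if ints else -1

# Hypothetical bin labels in arbitrary order:
buckets = ["[100-1000)", "[0-1)", "10000+", "[1-10)", "[10-100)", "[1000-10000)"]
print(sorted(buckets, key=largest_int_in_str))
# ['[0-1)', '[1-10)', '[10-100)', '[100-1000)', '10000+', '[1000-10000)']
# "10000+" and "[1000-10000)" tie on the key, so the stable sort keeps their input order,
# which is why Example #16 applies an explicit reorder to push "10000+" to the end.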
def saveFigs(plot_savepath, filetypes, filename, plt):
    for filetype in filetypes:
        filetype_savepath = plot_savepath + filetype + "/"
        full_path = filetype_savepath + filename + "." + filetype
        print(f"Making dir-path to {filetype_savepath}\nAnd saving to {full_path}")
        du.makePath(filetype_savepath)
        if filetype != "pdf":
            plt.savefig(full_path)
        else:
            plt.savefig(full_path, dpi=300)
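
A self-contained sketch of the per-filetype directory layout that saveFigs produces (paths and figure content below are hypothetical; os.makedirs plays the role of du.makePath):

import os
import matplotlib
matplotlib.use("Agg")  # headless backend, assuming batch/cluster execution
import matplotlib.pyplot as plt

plot_savepath = "/tmp/analysis_plots/"   # hypothetical
filename = "example_query_STATE"         # hypothetical
plt.plot([0, 1, 2], [0, 1, 4])
for filetype in ("png", "pdf"):
    filetype_savepath = os.path.join(plot_savepath, filetype)
    os.makedirs(filetype_savepath, exist_ok=True)
    plt.savefig(os.path.join(filetype_savepath, f"{filename}.{filetype}"), dpi=300)
plt.clf()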
Example #5
    def saveRunData(self, path, feas_dict=None, rdd=None):
        self.annotate("saveRunData", verbose=True)
        if path[-1] == '/':
            path = path[0:-1]

        # RDD must be saved first, because it needs an empty prefix.
        if rdd is not None:
            output_datafile_name = os.path.join(path, self.output_datafname)

            if self.overwrite_flag:
                das_utils.clearPath(output_datafile_name)

            # needed when not an s3 path, as the with open context assumes the folder already exists
            if not das_utils.isS3Path(output_datafile_name):
                das_utils.makePath(output_datafile_name)

            output_metadata_file_name = output_datafile_name + "/0_metadata"  # sorts before 'p'
            output_header_file_name = output_datafile_name + "/1_header"  # sorts before 'p' but after '0'
            self.annotate(f"writing RDD to {output_datafile_name}")
            self.saveRDD(output_datafile_name, rdd)

            if self.write_metadata:
                now = datetime.datetime.now().isoformat()
                self.saveMetadata(path=output_metadata_file_name,
                                  now=now,
                                  count=rdd.count())
                self.saveHeader(path=output_header_file_name)

            if self.s3cat:
                self.annotate(f"combining {output_datafile_name} with s3cat")
                s3cat.s3cat(output_datafile_name,
                            demand_success=True,
                            suffix=self.s3cat_suffix,
                            verbose=self.s3cat_verbose)
                self.add_output_path(output_datafile_name + self.s3cat_suffix)
            else:
                self.add_output_path(output_datafile_name)

        config_path = os.path.join(path, C.CONFIG_INI)
        self.annotate("Saving config to directory: {}".format(config_path))
        das_utils.saveConfigFile(config_path, self.config)

        if feas_dict is not None:
            for key in feas_dict.keys():
                if hasattr(feas_dict[key], 'value'):
                    feas_dict[key] = feas_dict[
                        key].value  # this seems redundant, but is actually needed for the accumulator
            self.log_and_print(f"Feasibility dictionary: {feas_dict}")
            feas_path = os.path.join(path, C.FEAS_DICT_JSON)
            self.annotate(f"Saving feas_dict to directory: {feas_path}")
            das_utils.saveJSONFile(feas_path, feas_dict)
def main():
    spark = SparkSession.builder.appName('RI Redistricting Data - PL94_P12 - Extracting agecat totals').getOrCreate()
    experiments = [
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td10_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td1_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td3_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td025_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td05_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td001_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td01_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td2_1/"
    ]


    # add das_decennial zip file to the spark context (to be sent to the core nodes)
    spark.sparkContext.addPyFile("/mnt/users/moran331/das_decennial.zip")
    schema_name = "PL94_P12"
    schema = SchemaMaker.fromName(name=schema_name)
    for path in experiments:
        tree = treetools.RunTree(path)
        runs = tree.runs
        for r, run in enumerate(runs):
            if r == 0:
                df = sdftools.getSparseDF(spark, run, schema, run_columns=True).persist()
            else:
                df = df.union(sdftools.getSparseDF(spark, run, schema, run_columns=True)).persist()
        df = df.persist()
        plb = str(du.algset2plb(du.findallSciNotationNumbers(tree.algsets[0])[0]))
        df = df.withColumn("plb", sf.lit(plb))
        df.show()
        geodict = aggtools.getGeodict()
        geolevel = "State"
        mapping = geodict[geolevel]
        group = ["plb", "run_id", geolevel] + schema.dimnames
        geodf = df.withColumn(geolevel, df.geocode[0:mapping]).groupBy(group).sum().persist()
        geodf = sdftools.stripSQLFromColumns(geodf)
        geodf.show()
        queryname = "votingage"
        querydf = sdftools.getQueryDF(geodf, queryname, schema, basegroup=["run_id", geolevel, "plb"]).persist()
        querydf.show()
        savepath = f"/mnt/users/moran331/redistricting/agecat_redistricting_state_totals_2019_06_27/{tree.algsets[0]}/"
        du.makePath(savepath)
        querydf.toPandas().to_csv(savepath + "votingage_all_25_runs.csv", index=False)
        print(f"---Agecat--- | \n{querydf.toPandas().to_string()}")
Example #7
    def save_log(self, to_linux=True, to_s3=True):
        """
        Saves a copy of the output log locally (in linux) or in S3 (or both)

        Parameters
        ==========
        to_linux : bool (default is True)
            - Save the log locally?

        to_s3 : bool (default is True)
            - Save the log to S3?

        Notes
        =====
        - Will save the log in the analysis results folder (i.e. the save_location_linux/s3 attributes) for 'this' instance of Analysis.
        """
        if to_linux:
            du.makePath(self.save_location_linux)
            print("___SAVE_LOG_TO_LINUX___")

        if to_s3:
            print("___SAVE_LOG_TO_S3___")
Example #8
    def save_analysis_script(self, to_linux=True, to_s3=True):
        """
        Saves a copy of the analysis script (.py file) currently being run locally (in linux) or in S3 (or both)

        Parameters
        ==========
        to_linux : bool (default is True)
            - Save the analysis script locally?

        to_s3 : bool (default is True)
            - Save the analysis script to S3?

        Notes
        =====
        - Will save the analysis script in the analysis results folder (i.e. the save_location_linux/s3 attributes) for 'this' instance of Analysis.
        """
        if to_linux:
            du.makePath(self.save_location_linux)
            print(self.save_location_linux)
            print("^^^^^ Created save location for analysis results ^^^^^\n")

            # copy the analysis script to the local save location (i.e. in Linux)
            shutil.copy2(self.analysis_script_path, self.save_location_linux)
            print(
                f"{self.save_location_linux}{self.analysis_script_path.split('/')[-1]}"
            )
            print(
                "^^^^^ Copied analysis script to save location for analysis results ^^^^^\n"
            )

        if to_s3:
            # copy the analysis script to the s3 save location
            script_path = self.analysis_script_path.split("/")[-1]
            s3.put_s3url(f"{self.save_location_s3}{script_path}",
                         self.analysis_script_path)
            print(f"{self.save_location_s3}{script_path}")
            print(
                "^^^^^ Copied analysis script to save location in s3 ^^^^^\n")
Example #9
    data = [
        {"Area": "State", "Value": no_state},
        {"Area": "County", "Value": no_county},
        {"Area": "Tract", "Value": no_tract},
        {"Area": "Place", "Value": no_place},
        {"Area": "Block Group", "Value": no_group},
        {"Area": "Block", "Value": no_block},
        {"Area": "SLDL", "Value": no_sldl},
        {"Area": "SLDU", "Value": no_sldu},
    ]
    df = spark.createDataFrame(data)
    pandas_df = df.toPandas()
    csv_savepath = "s3://uscb-decennial-ite-das/rao001/Geolevel_counts.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
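
pandas can write directly to an s3:// URL only when s3fs is available on the cluster; a hedged alternative sketch that builds the CSV in memory and uploads it with boto3 (bucket and key taken from the save path above, row values made up for illustration):

import io
import boto3
import pandas as pd

df = pd.DataFrame([{"Area": "State", "Value": 1}, {"Area": "County", "Value": 5}])  # illustrative values
buf = io.StringIO()
df.to_csv(buf, index=False)
boto3.client("s3").put_object(
    Bucket="uscb-decennial-ite-das",
    Key="rao001/Geolevel_counts.csv",
    Body=buf.getvalue().encode("utf-8"),
)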
Example #10
def analyzeQuery(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
        Main plotting fxn.

            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    df = experiment.getDF()
    if TEST:
        df = df.limit(TEST_NUM)
    print("df looks like:")
    df.show()
    schema = experiment.schema
    sdftools.print_item(df, "Flat Experiment DF")

    queries = [query]
    #y=sdftools.getAnswers(spark,df,geolevels,schema,queries)
    rddWithAbsDiff = getRddWithAbsDiff(spark, df, geolevels, queries, schema)
    rddWithAbsDiff = sdftools.getFullWorkloadDF(
        rddWithAbsDiff,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    rdd = sdftools.getRowGroupsAsRDD(rddWithAbsDiff,
                                     groupby=[AC.GEOLEVEL, AC.QUERY])
    rdd = rdd.flatMapValues(
        lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    rdd = rdd.map(lambda row: Row(**row[1]))
    df = rdd.toDF().persist()

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    df = df.groupby([
        'geocode', 'geolevel', 'level', 'Bin0', 'Bin1', 'Bin2', 'Bin3', 'Bin4',
        'Bin5'
    ]).avg()
    pandas_df = df.toPandas()
    pandas_df = pandas_df.rename(columns={
        "avg(abs diff)": metric_name,
        "avg(orig)": "orig"
    })
    pandas_df[x_axis_variable_name] = pandas_df.apply(
        lambda row: binIndexToInteger(row, buckets), axis=1)
    plt.figure(1, figsize=(11, 8.5))
    plt.rc('axes', labelsize=8)
    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries,
              x_axis_variable_name, metric_name, geolevels, pandas_df, buckets,
              schema_name)
Example #11
def makePlots(experiment, experiment_name, table_name, queries,
              x_axis_variable_name, metric_name, geolevels, pandas_df, buckets,
              schema_name):
    EPT = table_name[:4] + "_" + schema_name
    for query in queries:
        for geolevel in geolevels:
            subsetted_df = pandas_df[pandas_df['geolevel'] == geolevel]
            race_iterates = major_omb_race_names if (
                "EPT3" in table_name and "allraces *" in query) else [""]
            race_iterates = ["2+ Races"] if (race_iterates == [""]
                                             and "tomr *" in query) else race_iterates
            race_iterates = major_combination_race_names if (
                "EPT3" in table_name
                and "combo" in query) else race_iterates
            print(
                f"table {table_name}, query {query}, geolevel {geolevel}, race_iterates {race_iterates})"
            )

            # TODO: graph titling/subsetting for distinct queries is getting convoluted. Re-factor?
            for race in race_iterates:
                plotting_df = subsetted_df
                if race in major_omb_race_names:
                    plotting_df = subsetted_df[
                        subsetted_df['level'].str.contains(race)]
                if race in major_combination_race_names:
                    plotting_df = subsetted_df[
                        subsetted_df['level'].str.contains(race)]
                bucket_counts = {}
                for bindex, bucket in enumerate(buckets):
                    trueRows = plotting_df[plotting_df[f"Bin{bindex}"] == True]
                    bucket_counts[bucket] = trueRows.shape[0]
                print(
                    f"Geolevel {geolevel} has bucket_counts: {bucket_counts}")
                bucket_names = [
                    str(bucket) for bucket in buckets
                    if bucket_counts[bucket] != 0
                ]
                if (np.array(list(bucket_counts.values())) > 0).any():
                    graph_title = f"Average L1 Error (over trials) for {query}\nGeolevel: {geolevel}"
                    if race in major_omb_race_names:
                        graph_title += f"Race Alone: {race.title()}"
                    if race == "2+ Races":
                        graph_title += f"2+ Races"
                    if race in major_combination_race_names:
                        graph_title += f"Race: {race}"

                    graph_title += f"\n(binned by CEF count)\nDisclosure Prohibited - Title 13 U.S.C."

                    # Scatterplot strips superimposed on violin plots
                    sns.set(style="whitegrid")
                    sns.violinplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names,
                                            inner = None, color="0.8") \
                                            .set_title(graph_title)
                    sns.stripplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names)  \
                                            .set_title(graph_title)
                    plot_savepath = f"{experiment.save_location_linux}plots/{EPT}/scatteredViolin/{experiment_name}_"
                    plot_savepath += f"{table_name}_{query.replace(' ', '_')}_{geolevel}_{race.replace(' ','_')}.pdf"
                    du.makePath(du.getdir(plot_savepath))
                    print(
                        f"Saving scatterstrips w/ violins for query {query}, geolevel {geolevel}, & race {race} to: {plot_savepath}"
                    )
                    plt.savefig(plot_savepath)
                    plt.clf()
                else:
                    print(
                        f"No observations for {table_name}, {query}, {geolevel}, {race}. Plot not generated."
                    )
Example #12
        #'detailed'
        'cenrace'
]

sdftools.show(df, "df with geolevel crosswalk columns")
sdftools.show(df, "df with geolevel crosswalk columns")
df = sdftools.aggregateGeolevels(spark, df, geolevels)
sdftools.show(df, "df after geolevel aggregation", 1000)
qdf = sdftools.answerQuery(df, schema, "total", labels=False, merge_dims=False)
sdftools.show(qdf, "Query df with the query 'total'", 1000)
rdd = sdftools.getRowGroupsAsRDD(qdf, groupby=[AC.GEOLEVEL, AC.QUERY])
#sdftools.show(rdd.collect(), "Row groups")

path = save_location_linux + "Gel.csv"
q = qdf.toPandas()
du.makePath(du.getdir(path))
q.to_csv(path, index=False)
def Qualbins(rows, column, bins):
    # Only works if bins > 2
    pandas_df = pandas.DataFrame(rows)
    q = 1 / bins
    for i in range(1, bins + 1):
        k = str(i)
        # Mark rows whose value falls between the (q - 1/bins) and q quantiles of `column`
        pandas_df['Bin' + k] = (np.quantile(pandas_df[column], q) >= pandas_df[column]) \
                               & (pandas_df[column] >= np.quantile(pandas_df[column], q - 1 / bins))
        q = q + 1 / bins
    rows = pandas_df.to_dict('records')
    return rows
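
A hedged usage sketch for Qualbins on toy rows (the column name and values are made up, and `pandas`/`np` are assumed to be imported at module level as in the original script):

rows = [{"geocode": f"{i:02d}", "orig": v}
        for i, v in enumerate([0, 2, 5, 7, 11, 13, 20, 42])]
binned = Qualbins(rows, column="orig", bins=4)
print(pandas.DataFrame(binned)[["orig", "Bin1", "Bin2", "Bin3", "Bin4"]])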
Example #13
    queri = ["allraces"]
    
    
    df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)
    df_table = sdftools.answerQueries(df_geolevel, schema, queri, labels=True).persist()
    df_withmissingrows=sdftools.getFullWorkloadDF(df_table, schema, queri,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    #print(df_withmissingrows.head(200))
    sparse = sdftools.getCellSparsityByGroup(df_withmissingrows,schema,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP,AC.QUERY])
    zero=ReturnZeroCounts(df_withmissingrows, geolevels)
    print("This is sparse:")
    print(sparse.head(20))
    print("This is zero")
    print(zero.head(20))
    csv_savepath = save_location_linux + f"origtable.csv"
    csv_savepath2 = save_location_linux + f"missingrows.csv"
    du.makePath(du.getdir(csv_savepath))
    du.makePath(du.getdir(csv_savepath2))
    pandas_df_table=df_table.toPandas()
    
    pandas_df_table.to_csv(csv_savepath, index=False)
    pandas_dfmissing=df_withmissingrows.toPandas()
    pandas_dfmissing.to_csv(csv_savepath2, index=False)


#    df_geolevel = sdftools.aggregateGeolevels(spark, df, geolevels)
#    races_names1=['White','Black or African American','American Indian and Alaskan Native']
#    races_names2=['Asian','Native Hawaiian and Other Pacific Islander','Some Other Race']
#    white1=['Aian']
#    buckets=[(0,0),(1,10),(11,100),(100,1000),(1000,10000),(10000,float('inf'))]
#    path = du.addslash(save_location)
#    plt.figure(figsize=(11,8.5))
Example #14
def analyzeQuery(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 eps,
                 schema_name="DHCP_HHGQ"):
    """
        Main plotting fxn.
            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis setuptools.setup object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing
            schema          : str, name of ../programs/schema/schemas/schemamaker.py schema associated with target data
        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    geolevel = geolevels[0]
    EPT = table_name[:4] + "_" + schema_name
    graph_title = f"Error for query: {table_name}-{query}, eps: {int(eps)}, geography: {geolevels}\nDisclosure Prohibited - Title 13 U.S.C."
    plt.figure(1, figsize=(20, 40))
    sns.set(style="ticks")
    fig, axes = plt.subplots(ncols=3, nrows=2, sharey=True, sharex=True)
    axes_flat = axes.ravel()
    sns.despine(fig=fig)
    #.set_title(graph_title)
    print(
        f"For table {table_name}, analyzing query {query} at geolevel {geolevel} with schema_name {schema_name} and eps: {eps}"
    )
    num_trials, paths, experiment_name, eps_str, spines, mechanisms = getPathsAndName(
        schema_name, query, table_name, eps)
    plt.xscale('log')
    plt.yscale('symlog', linthreshy=100)
    for k, path in enumerate(paths):
        axes_flat[k].set_title(spines[k] + '_' + mechanisms[k])
        experiment = analysis.make_experiment(
            experiment_name, [path],
            schema_name=schema_name,
            dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
        spark_df = experiment.getDF()
        sdftools.print_item(experiment.__dict__, "Experiment Attributes")

        schema = experiment.schema
        sdftools.print_item(spark_df, "Flat Experiment DF")

        queries = [query]
        spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
        # jitter points to make them visually distinct:
        spark_df = sdftools.answerQueries(spark_df, schema, queries) \
                           .withColumn("Error", sf.col("priv") - sf.col("orig") + sf.rand() - 1/2.) \
                           .withColumn("orig", sf.col("orig") + sf.rand() - 1/2.)
        if geolevel == "AIAN_AREAS":
            spark_df = spark_df.filter(spark_df.geocode != "9999")
        elif geolevel == 'OSE':
            spark_df = spark_df.filter(
                sf.col(AC.GEOCODE).substr(
                    sf.length(sf.col(AC.GEOCODE)) -
                    4, sf.length(sf.col(AC.GEOCODE))) != "99999")
        elif geolevel == 'AIANTract':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 11)
        elif geolevel == 'AIANState':
            spark_df = spark_df.filter(spark_df.geocode != "99")
        elif geolevel == 'AIANBlock':
            spark_df = spark_df.filter(spark_df.geocode != "9" * 16)
        # t = spark_df.filter(sf.abs(spark_df.Error) > 1000)
        spark_df = spark_df.select(["orig", "Error"])

        pandas_df = spark_df.toPandas()
        #if pandas_df.max()["Error"] == pandas_df.min()["Error"]:
        #    continue
        sns.scatterplot(x="orig",
                        y="Error",
                        data=pandas_df,
                        alpha=.6,
                        s=10,
                        marker="+",
                        ax=axes_flat[k])
        axes_flat[k].axhline(0., ls='--')

    filename = f"{table_name}_{query.replace(' ', '_')}_{geolevel}"
    plot_path = f"{experiment.save_location_linux}epsilon_{eps_str}/"
    du.makePath(plot_path)
    plt.savefig(plot_path + filename + ".png")
    plt.clf()
Example #15
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)

    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    if table_name not in (table_list_3_plus_list_age):

        for g in geolevels:
            spark_df1 = spark_df[spark_df['geolevel'] ==
                                 g]  # Separate data for each geolevel
            if table_name in table_default_no_bucket:  # If data is not in buckets
                bucket_size = "NA"
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="A")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list2:  # if data is bucketed in 3 buckets,
                bucket_size = default_buckets2
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

            if table_name in table_bucket_list1:  # Table 1 and 2, six buckets
                bucket_size = default_buckets1
                print("BUCKET SIZE IS:", bucket_size)
                metrics_result = sdftools.metrics_with_popbucket(spark_df1,
                                                                 bucket_size,
                                                                 spark,
                                                                 key="B")
                file_name = f"{table_name}_{g}.csv"

    if table_name in table_list_3geolevels:  #three geolevels, state, county, place, Tables 10,14,18,22

        metrics_result = sdftools.metrics_with_3geolevels(
            spark_df, spark, geolevels)
        file_name = f"{table_name}.csv"

    if table_name in table_list_age:  # Tables 32-35

        if table_name in table_age_bracket1:

            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="A")
        if table_name in table_age_bracket2:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list,
                                                       key="B")
        if table_name in table_age_bracket3:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="A")
        if table_name in table_age_bracket4:
            metrics_result = sdftools.metrics_with_age(spark_df,
                                                       spark,
                                                       age_range_list2,
                                                       key="B")

        file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
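
A minimal PySpark sketch of the per-row error columns that sdftools.getL1/getL2 are assumed to add above (L1 as |priv - orig| and L2 as the squared difference; the real sdftools helpers may differ in detail):

from pyspark.sql import SparkSession, functions as sf

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([("01", 10.0, 12.0), ("02", 100.0, 97.0)],
                            ["geocode", "orig", "priv"])
toy = (toy
       .withColumn("L1", sf.abs(sf.col("priv") - sf.col("orig")))
       .withColumn("L2", sf.pow(sf.col("priv") - sf.col("orig"), sf.lit(2))))
toy.show()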
Example #16
    def saveRunData(self, path, feas_dict=None, rdd=None):
        self.annotate("saveRunData", verbose=True)
        if path[-1] == '/':
            path = path[0:-1]

        # RDD must be saved first, because it needs an empty prefix.
        if rdd is not None:
            output_datafile_name      = os.path.join(path, self.output_datafname)

            if self.overwrite_flag:
                das_utils.clearPath(output_datafile_name)

            # needed when not an s3 path, as the with open context assumes the folder already exists
            if not das_utils.isS3Path(output_datafile_name):
                das_utils.makePath(output_datafile_name)

            output_metadata_file_name = output_datafile_name+"/0_metadata"  # sorts before 'p'
            output_header_file_name   = output_datafile_name+"/1_header"    # sorts before 'p' but after '0'
            self.annotate(f"writing RDD to {output_datafile_name}")
            self.saveRDD(output_datafile_name, rdd)

            if self.write_metadata:
                now = datetime.datetime.now().isoformat()
                self.saveMetadata(path=output_metadata_file_name, now=now, count=rdd.count())
                self.saveHeader(path=output_header_file_name)

            if self.s3cat:
                # If we combine the data with s3cat
                # note the combined filename in the annotated output, the DFXML file, the DVS object, and do it.

                self.annotate(f"combining {output_datafile_name} with s3cat")

                # Record this with DFXML
                ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_S3CAT,
                              {'output_datafile_name':output_datafile_name,
                               'demand_success':'True',
                               'suffix':self.s3cat_suffix,
                               'verbose':str(self.s3cat_verbose)})

                self.add_output_path(output_datafile_name + self.s3cat_suffix)
                s3cat.s3_cat(output_datafile_name)
            else:
                # Otherwise just note the prefix in DFS and DFXML
                ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_OUTPUT).text=output_datafile_name+"/"
                self.add_output_path(output_datafile_name + "/")


        config_path = os.path.join(path, f"{self.output_datafname}_{CC.CONFIG_INI}")

        self.annotate("Saving the flattened config to directory: {}".format(config_path))
        das_utils.saveConfigFile(config_path, self.config)
        f = io.StringIO()
        self.config.write(f)
        ET.SubElement(self.das.dfxml_writer.doc, CC.DAS_CONFIG).text = f.getvalue()


        if feas_dict is not None:
            for key in feas_dict.keys():
                if hasattr(feas_dict[key], 'value'):
                    feas_dict[key] = feas_dict[key].value  # this seems redundant, but is actually needed for the accumulator
            self.log_and_print(f"Feasibility dictionary: {feas_dict}")
            feas_path = os.path.join(path, f"{self.output_datafname}_{CC.FEAS_DICT_JSON}")
            self.annotate(f"Saving feas_dict to directory: {feas_path}")
            das_utils.saveJSONFile(feas_path, feas_dict)
def analyzeQuery(query, table_name, analysis, spark, geolevels, buckets=default_buckets, schema="DHCP_HHGQ"):
    """
        Main plotting fxn.

            query           : str, name of a valid query for the target experiment's schema
            table_name      : str, name of a table (used for file-naming conventions)
            analysis        : Analysis object, organizes Analysis metadata
            spark           : SparkSession object, attached to analysis object
            geolevels       : [str, ...], geolevels to compute over for the current query
            buckets         : [(int,int), ...], list of mutually exclusive bucket boundaries for Tab(CEF) bucketing

        Note, also, major control parameters hard-coded in getPaths for setting experiment ingest locations from s3.
    """
    print(f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}")
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(experiment_name, paths, schema_name=schema_name, dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    #y=sdftools.getAnswers(spark,df,geolevels,schema,queries)

    # Old approach to computing df with abs diff, bucketed by true count:
    #sparkDFWithAbsDiff = getSparkDFWithAbsDiff(spark, spark_df, geolevels, queries, schema)
    #getSignedErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]):
    #rdd = sdftools.getRowGroupsAsRDD(sparkDFWithAbsDiff, groupby=[AC.GEOLEVEL, AC.QUERY])
    #rdd = rdd.flatMapValues(lambda rows: sepBounds(rows, 'orig', buckets)).persist()
    #rdd = rdd.map(lambda row: Row(**row[1]))
    #spark_df = rdd.toDF().persist()

    # New (actually preexisting) approach to computing spark_df with abs diff, bucketed by true count:
    # (avoids pandas dfs inside mappers, which is RAM-hungry)
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(spark_df, schema, queries,groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])
    spark_df = sdftools.getAvgAbsErrorByTrueCountRuns(spark_df, bins=[0,1,10,100,1000,10000]).persist()

    spark_df.show()
    print("^^^^ with abs error, DF looks like ^^^^")

    metric_name = "Avg( |q(MDF) - q(CEF)| )"
    x_axis_variable_name = 'CEF Count, Binned'

    # spark_df = spark_df.groupby(['geocode','geolevel','level','Bin0','Bin1','Bin2','Bin3','Bin4','Bin5']).avg()
    # Below spark_df has cols: geocode, geolevel, run_id, plb, budget_group, query, orig_count_bin, signed_error, re
    #spark_df = spark_df.groupby(['geocode', 'geolevel', 'plb', 'budget_group', 'query', 'orig_count_bin']).avg()
    #print("^^^^ after averaging, spark_df looks like ^^^^")
    pandas_df = spark_df.toPandas()
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig)":"orig"})
    #pandas_df[x_axis_variable_name] = pandas_df.apply(lambda row: binIndexToInteger(row, buckets), axis=1)
    #pandas_df = pandas_df.rename(columns={"avg(signed_error)":metric_name, "avg(orig_count_bin)":"orig"})
    pandas_df = pandas_df.rename(columns={"abs_error":metric_name, "orig_count_bin":x_axis_variable_name})
    plt.figure(1, figsize=(11,8.5))
    plt.rc('axes', labelsize=8)
    print(f"pandas df before plotting has cols: {pandas_df.columns.values}")
    print(f"{x_axis_variable_name} column has distinct levels: {pandas_df[x_axis_variable_name].unique()}")
    buckets = pandas_df[x_axis_variable_name].unique()
    buckets = sorted(buckets, key=lambda bucket_name: largestIntInStr(bucket_name))
    print(f"Sorted bucket names: {buckets}")
    new_bucket_order = [0,1,2,3,5,4] # Apply ordering system to make 10000+ the last bucket
    buckets = [buckets[i] for i in new_bucket_order]
    print(f"Sorted bucket names: {buckets}")


    """
    print(pandas_df.head(30))
    print(f"pandas_df headers: {list(pandas_df.columns.values)}")
    tmpDf = pandas_df[[x_axis_variable_name, 'orig', metric_name]]
    print("tmpDf looks like:")
    with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tmpDf)
    print("^^^^ pandas df looks like ^^^^")
    print("And first 3 rows:")
    print(pandas_df.iloc[:3])
    #print(df.dtypes)
    print("And first 100 rows, subset to Bins:")
    print(pandas_df.iloc[0:101,3:9])
    print(pandas_df.iloc[0:101,-1])
    """

    # Saving data frame
    csv_savepath = experiment.save_location_linux + f"Executive_Priority_Tabulations_#1_{experiment_name}_{table_name}_{query}.csv"
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)

    makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
                                           metric_name, geolevels, pandas_df, buckets,
                                           schema_name, eps_str)
Example #18
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    for g in geolevels:
        spark_df1 = spark_df[spark_df['geolevel'] ==
                             g]  # Separate data for each geolevel
        if table_name in table_default_no_bucket:  # If data is not in buckets
            if table_name in table_race_query:  # Table 17, 18, 21 and others
                print("no buckets, with race query")
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                spark_df2.show(100, False)
                print("Make sure 'Not' values are removed")
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

            else:
                print("no buckets, without race query")
                spark_df1.show(100, False)
                spark_df2 = spark_df1.subtract(
                    spark_df1.filter(spark_df1.level.rlike("Not")))
                print("with Not removed")
                spark_df2.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(spark_df2, spark)
                Counts = spark_df2.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_age_bracket1:
            print("Data is in age brackets, 0 to 17, 18 to 64, 65+")
            spark_df1.show(100, False)
            for age_range in age_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket2:
            print("Data is age buckets, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket3:
            print("Data is in age brackets of 5 year age groups")
            spark_df1.show(100, False)
            for age_range in age_range_list2:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(age_range))
                subset_sparkdf1.show(100, False)
                metrics_dataframe = sdftools.mattsmetrics(
                    subset_sparkdf1, spark)
                #subset_sparkdf1.show(100, False)
                Counts = subset_sparkdf1.count()
                print("Counts are", Counts)
                newRow = spark.createDataFrame([(Counts, "Counts")])
                metrics_dataframe = metrics_dataframe.union(newRow)
                pandas_df = metrics_dataframe.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_age_bracket4:
            print("Data is age buckets of 5 year age groups, with sex query")
            spark_df1.show(100, False)
            for sexlevel in sex_range_list:
                subset_sparkdf1 = spark_df1.filter(
                    spark_df1.level.rlike(sexlevel))
                for age_range in age_range_list2:
                    subset_sparkdf2 = subset_sparkdf1.filter(
                        subset_sparkdf1.level.rlike(age_range))
                    subset_sparkdf2.show(100, False)
                    metrics_dataframe = sdftools.mattsmetrics(
                        subset_sparkdf2, spark)
                    #subset_sparkdf1.show(100, False)
                    Counts = subset_sparkdf2.count()
                    print("Counts are", Counts)
                    newRow = spark.createDataFrame([(Counts, "Counts")])
                    metrics_dataframe = metrics_dataframe.union(newRow)
                    pandas_df = metrics_dataframe.toPandas()
                    csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{age_range}_{sexlevel}.csv"
                    du.makePath(du.getdir(csv_savepath))
                    pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list2:  # If data is in buckets [0,10],[10,100),[100+)
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets2:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] ==
                                           b]  #subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)

        if table_name in table_default_bucket_list:  # If data is in buckets [0,1000],[1000,5000),etc. Table 1 and 2
            print("data is bucketed and treated accordingly")
            #if table_name in table_race_query:

            for b in default_buckets:  # calculate Metrics at each bucket
                print("Bucket is:", b)
                subset_sparkdf = spark_df1[spark_df1['orig_count_bin'] ==
                                           b]  #subset into bins
                subset_sparkdf.show(100, False)
                print("Bucketed data")
                subset_sparkdf1 = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not")))
                subset_sparkdf1.show(100, False)
                print("Make sure its bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf1, spark)
                Counts = subset_sparkdf1.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
Example #19
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 key,
                 agekey,
                 sexkey,
                 bucketkey,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds"

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)
    spark_df = sdftools.getFullWorkloadDF(
        spark_df,
        schema,
        queries,
        groupby=[AC.GEOCODE, AC.GEOLEVEL, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP])

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_bucket_list1):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    spark_df.show(100, False)

    metrics_result = sdftools.combined_metrics(spark_df, spark, geolevels,
                                               agekey, sexkey, bucketkey, key)
    file_name = f"{table_name}.csv"
    pandas_df = metrics_result.toPandas()
    csv_savepath = experiment.save_location_linux + file_name
    du.makePath(du.getdir(csv_savepath))
    pandas_df.to_csv(csv_savepath, index=False)
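
# Hypothetical usage sketch for the function above (argument values are assumptions, not
# taken from the source; the key columns would come from the analysis configuration):
#
# MattsMetrics(query="hispanic * race", table_name="table1", analysis=analysis, spark=spark,
#              geolevels=["STATE", "COUNTY"], key="race", agekey="age", sexkey="sex",
#              bucketkey="orig_count_bin", schema="DHCP_HHGQ")
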
def MattsMetrics(query,
                 table_name,
                 analysis,
                 spark,
                 geolevels,
                 buckets=default_buckets,
                 schema="DHCP_HHGQ"):
    """
    This function computes metrics for MAE, MALPE, CoV, RMS, MAPE, and percent thresholds.

    """
    print(
        f"For table {table_name}, analyzing query {query} at geolevels {geolevels} with schema {schema}"
    )
    schema_name = schema
    paths, experiment_name, eps_str = getPathsAndName(schema_name)
    experiment = analysis.make_experiment(
        experiment_name,
        paths,
        schema_name=schema_name,
        dasruntype=AC.EXPERIMENT_FRAMEWORK_FLAT)
    sdftools.print_item(experiment.__dict__, "Experiment Attributes")

    spark_df = experiment.getDF()
    print("df looks like:")
    spark_df.show()
    schema = experiment.schema
    sdftools.print_item(spark_df, "Flat Experiment DF")

    queries = [query]
    spark_df = sdftools.aggregateGeolevels(spark, spark_df, geolevels)
    spark_df = sdftools.answerQueries(spark_df, schema, queries)

    #spark_df.show(spark_df.count(), False)
    # AC.PRIV means "protected via the differential privacy routines in this code base"; the variable is to be renamed after P.L. 94-171 production
    spark_df = sdftools.getL1(spark_df,
                              colname="L1",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    spark_df = sdftools.getL2(spark_df,
                              colname="L2",
                              col1=AC.PRIV,
                              col2=AC.ORIG)
    # apply bin functions for particular tables
    if (table_name in table_default_bucket_list):
        spark_df = sdftools.getCountBins(
            spark_df,
            column=AC.ORIG,
            bins=[0, 1000, 5000, 10000, 50000, 100000]).persist()
    if (table_name in table_default_bucket_list2):
        spark_df = sdftools.getCountBins(spark_df,
                                         column=AC.ORIG,
                                         bins=[0, 10, 100]).persist()
    # This finds overall metrics
    #spark_df.show(spark_df.count(), False)

    for g in geolevels:
        # Filter to the current geolevel without overwriting spark_df,
        # so later iterations still see the full dataframe
        geolevel_df = spark_df[spark_df['geolevel'] == g]
        print("This has all levels")
        geolevel_df.show(150, False)

        metrics_dataframe = sdftools.mattsmetrics(geolevel_df, spark)
        Counts = geolevel_df.count()
        print("Counts are", Counts)
        newRow = spark.createDataFrame([(Counts, "Counts")])
        metrics_dataframe = metrics_dataframe.union(newRow)
        pandas_df = metrics_dataframe.toPandas()
        csv_savepath = experiment.save_location_linux + f"{table_name}_{g}.csv"
        du.makePath(du.getdir(csv_savepath))
        pandas_df.to_csv(csv_savepath, index=False)
        if table_name in table_default_bucket_list2:  # If data needs bucketing

            for b in default_buckets2:  # calculate metrics at each bucket
                subset_sparkdf = geolevel_df[geolevel_df['orig_count_bin'] == b]  # subset into bins
                subset_sparkdf = subset_sparkdf.subtract(
                    subset_sparkdf.filter(subset_sparkdf.level.rlike("Not"))
                )  # removes "Not Hispanic or Latino ..." rows from the dataframe
                subset_sparkdf.show(100, False)
                print("Make sure it's bucketed and 'Not' values are removed")
                subset_metrics = sdftools.mattsmetrics(subset_sparkdf, spark)
                Counts = subset_sparkdf.count()
                newRow = spark.createDataFrame([(b, "Bucket")])
                newRow1 = spark.createDataFrame([(Counts, "Counts")])
                subset_metrics = subset_metrics.union(newRow).union(newRow1)
                pandas_df = subset_metrics.toPandas()
                csv_savepath = experiment.save_location_linux + f"{table_name}_{g}_{b}.csv"
                du.makePath(du.getdir(csv_savepath))
                pandas_df.to_csv(csv_savepath, index=False)
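
# --- Illustrative sketch, not part of the original example ---
# Minimal, self-contained version of the subsetting used above: keep one geolevel,
# keep one CEF-count bucket, and drop the "Not ..." levels. The toy column names and
# values below are assumptions for illustration only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("State", "[0, 10)", "Hispanic or Latino", 3.0),
     ("State", "[0, 10)", "Not Hispanic or Latino", 1.0),
     ("State", "[10, 100)", "Hispanic or Latino", 7.0)],
    ["geolevel", "orig_count_bin", "level", "L1"])
subset = toy[toy["geolevel"] == "State"]                             # one geolevel
subset = subset[subset["orig_count_bin"] == "[0, 10)"]               # one bucket
subset = subset.subtract(subset.filter(subset.level.rlike("Not")))   # drop "Not ..." rows
subset.show()
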
Ejemplo n.º 21
0
    print("This is q")
    q.show(10)
    q=q.groupby(['geolevel','run_id']).sum()
    columnstodrop=['sum(diff)','sum(sum(orig))','sum(sum(priv))','sum(MDF/sum)','sum(CEF/sum)','sum(difference)']
    print("this is q2")
    q=q.drop(*columnstodrop)
    q.show(10)
    z=q.groupby(['geolevel']).avg()
    print("this is z")
    z.show(10)
    return q,z
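
# --- Illustrative sketch, not part of the original example ---
# GroupedData.sum() names its output columns "sum(<col>)", which is why the drop list
# above contains names like 'sum(diff)' and, after aggregating columns that are already
# named 'sum(...)', names like 'sum(sum(orig))'. Toy demonstration (column names assumed):
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([("State", "run1", 1.0), ("State", "run1", 2.0)],
                            ["geolevel", "run_id", "diff"])
toy.groupby(["geolevel", "run_id"]).sum().show()  # aggregate column is named "sum(diff)"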

get, get2 = NEbias(spark, df, geolevels, queries, schema)
path = save_location_linux + "NEbias.csv"
pdf = get.toPandas()
du.makePath(du.getdir(path))
pdf.to_csv(path, index=False)

path2 = save_location_linux + "NEbias_av.csv"
pdf2 = get2.toPandas()
du.makePath(du.getdir(path2))
pdf2.to_csv(path2, index=False)

get3, get4 = MMbias(get)
path3 = save_location_linux + "MMbias.csv"
pdf3 = get3.toPandas()
du.makePath(du.getdir(path3))
pdf3.to_csv(path3, index=False)

path4 = save_location_linux + "MMbias_av.csv"
pdf4 = get4.toPandas()
du.makePath(du.getdir(path4))
pdf4.to_csv(path4, index=False)

def makePlots(experiment, experiment_name, table_name, queries, x_axis_variable_name,
              metric_name, geolevels, pandas_df, buckets, schema_name, eps_str):
    EPT = table_name[:4] + "_" + schema_name
    for query in queries:
        for geolevel in geolevels:
            subsetted_df = pandas_df[pandas_df['geolevel'] == geolevel]
            if "EPT3" in table_name and "allraces *" in query:
                race_iterates = major_omb_race_names
            elif "racecombos *" in query:
                race_iterates = race_combo_names
            elif "tomr *" in query:
                race_iterates = ["2+ Races"]
            else:
                race_iterates = [""]
            print(f"table {table_name}, query {query}, geolevel {geolevel}, race_iterates {race_iterates}")

            # TODO: graph titling/subsetting for distinct queries is getting convoluted. Re-factor?
            for race in race_iterates:
                plotting_df = subsetted_df
                if race in major_omb_race_names or race in race_combo_names:
                    plotting_df = subsetted_df[subsetted_df['level'].str.contains(race)]
                bucket_counts = {}
                for bucket in buckets:
                    trueRows = plotting_df[plotting_df[x_axis_variable_name] == bucket]
                    bucket_counts[bucket] = trueRows.shape[0]
                print(f"Geolevel {geolevel} has bucket_counts: {bucket_counts}")
                bucket_names = [str(bucket) for bucket in buckets if bucket_counts[bucket] != 0]
                if (np.array(list(bucket_counts.values())) > 0).any():
                    graph_title = f"Average L1 Error (over trials) for {query}\nGeolevel: {geolevel}"
                    if race in major_omb_race_names:
                        graph_title += f"\n Race Alone: {race.title()}"
                    if race in race_combo_names:
                        graph_title += f"\n Race Combination: {race.title()}"

                    if race == "2+ Races":
                        graph_title += f"\n 2+ Races"
                    graph_title += f"\n(binned by CEF count)\nDisclosure Prohibited - Title 13 U.S.C."

                    pandas.set_option('display.max_columns', None)
                    pandas.set_option('display.max_rows', None)
                    #print(f"Before plotting, plotting_df looks like:")
                    #print(plotting_df)
                    #print(f"Feeding plotting_df to seaborn with xvar {x_axis_variable_name} & yvar {metric_name}")
                    #print(f"And bucket_names are: {bucket_names}")

                    # Scatterplot strips superimposed on violin plots
                    sns.set(style="whitegrid")
                    ax = sns.violinplot(x=x_axis_variable_name, y=metric_name, data=plotting_df,
                                        order=bucket_names, inner=None, color="0.8")
                    ax.set_title(graph_title)

                    maxVal = plotting_df[metric_name].max()
                    minVal = plotting_df[metric_name].min()
                    print(f"maxVal type, val: {type(maxVal)}, {maxVal}")
                    print(f"minVal type, val: {type(minVal)}, {minVal}")
                    if abs(maxVal - minVal) < 0.1:
                        #print(f"violin ax has type {type(ax)} & methods: {dir(ax)}")
                        #print(f"violin ax.axes has type {type(ax.axes)} & methods: {dir(ax.axes)}")
                        ax.axes.set(ylim=(minVal-10, maxVal+10))
                    ax = sns.stripplot(x=x_axis_variable_name, y=metric_name, data=plotting_df, order=bucket_names)
                    ax.set_title(graph_title)
                    #if geolevel == C.US:
                    #    #print(f"strip ax has type {type(ax)} & methods: {dir(ax)}")
                    #    ax.axes.set(ylim=(plotting_df[x_axis_variable_name].min - 10,plotting_df[x_axis_variable_name].max + 10))
                    plot_savepath = f"{experiment.save_location_linux}plots/{EPT}/epsilon{eps_str}/scatteredViolin/{experiment_name}_"
                    plot_savepath += f"{table_name}_{query.replace(' ', '_')}_{geolevel}_{race.replace(' ','_')}.pdf"
                    du.makePath(du.getdir(plot_savepath))
                    print(f"Saving scatterstrips w/ violins for query {query}, geolevel {geolevel}, & race {race} to: {plot_savepath}")
                    plt.savefig(plot_savepath)
                    plt.clf()
                else:
                    print(f"No observations for {table_name}, {query}, {geolevel}, {race}. Plot not generated.")