Example #1

import os
import analysis.tools.setuptools as setuptools


def setup():
    jbid = os.environ.get('JBID', 'temp_jbid')
    analysis_results_save_location = f"{jbid}/analysis_reports/"
    spark_loglevel = "ERROR"
    analysis = setuptools.setup(save_location=analysis_results_save_location, spark_loglevel=spark_loglevel)
    analysis.save_log(to_linux=False, to_s3=True)
    spark = analysis.spark
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
    return analysis, spark
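A minimal usage sketch for the helper above (illustrative, not part of the original example, and assuming the imports added at the top): JBID falls back to "temp_jbid" when unset, and the returned Spark session is ready for normal use.

os.environ.setdefault("JBID", "jdoe001")  # hypothetical jbid for illustration
analysis, spark = setup()                 # analysis wraps the Spark session and save location
spark.range(10).show()                    # any Spark job can run on the configured session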
Example #2
import analysis.constants as AC
import constants as C
from constants import CC
from pyspark.sql import functions as sf
from pyspark.sql import Row

import analysis.tools.setuptools as setuptools
import analysis.tools.datatools as datatools  # assumed module path; needed for getDASRuns below
from programs.schema.schemas.schemamaker import SchemaMaker
import operator
import programs.datadict as dd
import sys
import os

analysis_results_save_location = "/mnt/users/rao00316/analysis_reports/"
spark_loglevel = "ERROR"
analysis = setuptools.setup(save_location=analysis_results_save_location, spark_loglevel=spark_loglevel)
spark = analysis.spark
S3_BASE = "s3://uscb-decennial-ite-das/users"
save_location_linux = "/mnt/users/rao00316/bias/"


path = [
    "s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP_reRun1/td16/"
]

runs = datatools.getDASRuns(path)

schema_name = "DHCP_HHGQ"

schema = SchemaMaker.fromName(name=schema_name)
Example #3

import constants as C

import analysis.tools.setuptools as setuptools

if __name__ == "__main__":
    # setup tools will return the analysis object
    # the analysis object contains the spark session and save location path for this run
    # NOTE: You will need to specify a location for the results to be saved
    #       It should be passed into setuptools.setup, where it will be altered to
    #       add a subdirectory matching the logfile's name

    # Recommended location: "/mnt/users/[your_jbid]/analysis_results/"
    save_location = "/mnt/users/moran331/PL94_P12_Geounit_totals/"

    # Most common options are "INFO" and "ERROR"
    # In addition to the analysis print statements...
    # "INFO" provides ALL spark statements, including stage and task info
    # "ERROR" provides only error statements from spark/python
    loglevel = "ERROR"

    analysis = setuptools.setup(save_location=save_location,
                                spark_loglevel=loglevel)

    spark = analysis.spark
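    # Illustrative addition (not in the original example): disable automatic
    # broadcast joins, as Example #1 does via spark.conf.set
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)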

    # Specify the experiment paths
    experiment_paths = [
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td10_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td1_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td3_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td025_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td05_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td001_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td01_1/",
        "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td2_1/"
    ]
    schema = C.CC.SCHEMA_PL94_P12
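    # Hypothetical continuation (the original snippet ends here): gather the
    # DAS runs for each experiment path, mirroring datatools.getDASRuns in
    # Example #2; assumes the function accepts a list of S3 paths and that
    # analysis.tools.datatools has been imported as datatools
    runs = datatools.getDASRuns(experiment_paths)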
Example #4
import constants as C

from pyspark.sql import functions as sf

import analysis.tools.setuptools as setuptools
import analysis.tools.sdftools as sdftools  # assumed module path; needed for MetricBuilder below
import programs.datadict as dd

if __name__ == "__main__":
    # setup tools will return the analysis object
    # the analysis object contains the spark session and save location path for this run
    # NOTE: You will need to specify a location for the results to be saved
    #       It should be passed into setuptools.setup, where it will be altered to
    #       add a subdirectory matching the logfile's name

    # Recommended location: "/mnt/users/[your_jbid]/analysis_results/"
    save_location = "/mnt/users/moran331/large_scale_analysis3/"
    analysis = setuptools.setup(save_location=save_location)

    spark = analysis.spark

    # Specify the experiment paths
    experiment_paths = [
        "s3://uscb-decennial-ite-das/users/lecle301/experiments/full_person/smallCellQuery/avgLE1/"
    ]
    schema_name = "DHCP_HHGQ"

    geolevels = [C.STATE]
    queries = ['detailed']

    # a metric builder object is needed for analysis, as experiments are built using it
    mb = sdftools.MetricBuilder()
    mb.add(desc="Detailed Query, Keep Sparse",
Example #5

import analysis.tools.setuptools as setuptools

if __name__ == "__main__":
    # overwrite save_location with some other local space (not /mnt/users/ ...etc)
    #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_hierarchical_age_range/quantiles_pop_GEQ1/male/"
    #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danvariant1/totalOnly/"
    #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danVariant1/cenraceXhispanic/"
    #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danVariant1/hhgq/"
    #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danVariant1/detailed/"
    save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danVariant1/ageBins4/"
    #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_manualTopDown/totalOnly/"
    #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_manualTopDown/cenraceXhispanic/"
    #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_manualTopDown/hhgq/"
    #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_manualTopDown/detailed/"
    #save_location = "/mnt/users/lecle301/analysis/test/"
    # setup tools will return the spark session and save location path for this run
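    # (note: unlike the earlier examples, this variant of setup() returns a
    # (spark, save_location) tuple rather than an analysis object)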
    spark, save_location = setuptools.setup(save_location=save_location)

    experiment_paths = [
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td001/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td01/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td025/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td05/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td1/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td2/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td4/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_rerun1/td8/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_rerun1/td16/"
        "s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1/"
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP/td001/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP/td01/",
        #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP/td025/",