def setup():
    """Build the Analysis object and Spark session for this run.

    Reads the job ID from the JBID environment variable (falling back to
    'temp_jbid'), saves reports under "<jbid>/analysis_reports/", persists
    the run log to S3 only, and disables Spark's automatic broadcast-join
    threshold. Returns the pair (analysis, spark).
    """
    jbid = os.environ.get('JBID', 'temp_jbid')
    analysis = setuptools.setup(
        save_location=f"{jbid}/analysis_reports/",
        spark_loglevel="ERROR",
    )
    # Keep the log in S3 only; skip the local Linux copy.
    analysis.save_log(to_linux=False, to_s3=True)
    spark = analysis.spark
    # -1 disables auto broadcast joins entirely.
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
    return analysis, spark
import analysis.constants as AC
import constants as C
from constants import CC
from pyspark.sql import functions as sf
from pyspark.sql import Row
import analysis.tools.setuptools as setuptools
# FIX: `datatools` is used below (datatools.getDASRuns) but was never
# imported, which raises NameError at runtime. It presumably lives next to
# setuptools in analysis.tools — TODO confirm module path.
import analysis.tools.datatools as datatools
from programs.schema.schemas.schemamaker import SchemaMaker
import operator
import programs.datadict as dd
import sys
import os

# Where this run's analysis reports are written (per-user mount point).
analysis_results_save_location = f"/mnt/users/rao00316/analysis_reports/"
spark_loglevel = "ERROR"

# setuptools.setup returns the Analysis object, which owns the Spark session.
analysis = setuptools.setup(save_location=analysis_results_save_location, spark_loglevel=spark_loglevel)
spark = analysis.spark

S3_BASE = "s3://uscb-decennial-ite-das/users"
save_location_linux = f"/mnt/users/rao00316/bias/"

# Experiment output to analyze; getDASRuns expands it into individual runs.
path = [
    "s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP_reRun1/td16/"
]
runs = datatools.getDASRuns(path)

schema_name = "DHCP_HHGQ"
schema = SchemaMaker.fromName(name=schema_name)
# setup tools will return the analysis object
# the analysis object contains the spark session and save location path for this run
# NOTE: You will need to specify a location for the results to be saved
# It should be passed into setuptools.setup, where it will be altered to
# add a subdirectory matching the logfile's name
# Recommended location: "/mnt/users/[your_jbid]/analysis_results/"
save_location = "/mnt/users/moran331/PL94_P12_Geounit_totals/"

# Most common options are "INFO" and "ERROR"
# In addition to the analysis print statements...
# "INFO" provides ALL spark statements, including stage and task info
# "ERROR" provides only error statements from spark/python
loglevel = "ERROR"

# FIX: pass the `loglevel` variable instead of re-hardcoding "ERROR", so the
# setting defined above is actually honored if someone changes it.
# (Behavior is unchanged today: loglevel == "ERROR".)
analysis = setuptools.setup(save_location=save_location, spark_loglevel=loglevel)
spark = analysis.spark

# Specify the experiment paths
experiment_paths = [
    "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td10_1/",
    "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td1_1/",
    "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td3_1/",
    "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td025_1/",
    "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td05_1/",
    "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td001_1/",
    "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td01_1/",
    "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td2_1/"
]

schema = C.CC.SCHEMA_PL94_P12
import constants as C from pyspark.sql import functions as sf import analysis.tools.setuptools as setuptools import programs.datadict as dd if __name__ == "__main__": # setup tools will return the spark session and save location path for this run # NOTE: You will need to specify a location for the results to be saved # It should be passed into setuptools.setup, where it will be altered to # add a subdirectory matching the logfile's name # Recommended location: "/mnt/users/[your_jbid]/analysis_results/" save_location = "/mnt/users/moran331/large_scale_analysis3/" analysis = setuptools.setup(save_location=save_location) spark = analysis.spark # Specify the experiment paths experiment_paths = [ "s3://uscb-decennial-ite-das/users/lecle301/experiments/full_person/smallCellQuery/avgLE1/" ] schema_name = "DHCP_HHGQ" geolevels = [C.STATE] queries = ['detailed'] # a metric builder object is needed for analysis, as experiments are built using it mb = sdftools.MetricBuilder() mb.add(desc="Detailed Query, Keep Sparse",
if __name__ == "__main__": # overwrite save_location with some other local space (not /mnt/users/ ...etc) #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_hierarchical_age_range/quantiles_pop_GEQ1/male/" #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danvariant1/totalOnly/" #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danVariant1/cenraceXhispanic/" #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danVariant1/hhgq/" #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danVariant1/detailed/" save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_danVariant1/ageBins4/" #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_manualTopDown/totalOnly/" #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_manualTopDown/cenraceXhispanic/" #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_manualTopDown/hhgq/" #save_location = "/mnt/users/lecle301/analysis/Aug29_DHCP_manualTopDown/detailed/" #save_location = "/mnt/users/lecle301/analysis/test/" # setup tools will return the spark session and save location path for this run spark, save_location = setuptools.setup(save_location=save_location) experiment_paths = [ #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td001/", #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td01/", #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td025/", #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td05/", #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td1/", #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td2/", 
#"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output/td4/", #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_rerun1/td8/", #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_rerun1/td16/" "s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_hierarchicalAgeRangeTopDown_branchFactor4_output_danVariant1/" #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP/td001/", #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP/td01/", #"s3://uscb-decennial-ite-das/users/lecle301/Aug29_experiments_manualTopDown_output_DHCP/td025/",