# NOTE(review): this physical line appears to be multi-line Python collapsed
# onto one line by an extraction/conversion step (the original inline `#`
# comments now sit mid-line).  It contains:
#   (1) the tail of the `experiment_paths` list -- S3 prefixes of converted
#       DAS experiment runs (PL94_SF1 state-level runs at several settings:
#       td1, td3, td025, td05, td001, td01, td2).  The opening
#       `experiment_paths = [` is on an earlier line not shown here;
#   (2) experiment setup: schema = C.CC.SCHEMA_PL94_P12, geolevels
#       (STATE / COUNTY / BLOCK), the "total" query, a MetricBuilder whose
#       metric answers those queries on the requested geolevels, and the
#       analysis.add_experiment(...) call registering the run set with
#       mapper type AC.SPARSE.
# As a single line these statements will not parse; the original line breaks
# must be restored before executing.  TODO confirm against the source script.
"s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td1_1/", "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state/td3_1/", "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td025_1/", "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state2/td05_1/", "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td001_1/", "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td01_1/", "s3://uscb-decennial-ite-das/experiments/convertedFromGeounitNodeToDict/PL94_SF1_c1state3/td2_1/" ] schema = C.CC.SCHEMA_PL94_P12 geolevels = [C.STATE, C.COUNTY, C.BLOCK] queries = ["total"] # a metric builder object is needed for analysis, as experiments are built using it mb = sdftools.MetricBuilder() mb.add(desc="Geounit totals", metric_function=lambda sdf: sdf.getGeolevels(geolevels). answerQueries(queries)) # build an experiment and add it to analysis analysis.add_experiment( name="PL94_P12_Analysis", # need a unique name for each experiment runs= experiment_paths, # can be either the string paths, or the DASRun objects themselves schema=schema, # the schema (or its name) for the data provided metric_builder=mb, # the metric builder object constructed above mtype=AC. SPARSE # mapper type: we almost always will want to use AC.SPARSE )
# NOTE(review): this physical line also appears to be multi-line Python
# collapsed onto one line (original inline `#` comments sit mid-line).  It
# contains:
#   (1) queries = ['sex * agegroup4'], with coarser/finer alternatives
#       (agegroup16, agegroup64, getRangeQNames()) commented out;
#   (2) groupby columns retained in the final csv: AC.RUN_ID, AC.PLB,
#       AC.GEOLEVEL (AC.GEOCODE commented out);
#   (3) a disabled (triple-quoted) custom MetricBuilder computing
#       "L1 Error in Quantiles" via L1_quantile_rdd;
#   (4) an active MetricBuilder adding a "Per Geolevel 1-TVD" metric with
#       split_geolevels=False (single csv) and fill=None (skips the
#       row-by-row fill-in of sampling zeros, described as deeply
#       inefficient);
#   (5) the opening `"""` of a second disabled block ("Total L1 Error")
#       whose closing triple-quote lies beyond this excerpt.
# As a single line this will not parse; restore the original line breaks
# before executing.  TODO confirm against the source script.
queries = ['sex * agegroup4'] #queries = ['sex * agegroup16'] #queries = ['sex * agegroup64'] #queries = getRangeQNames() groupby = [AC.RUN_ID, AC.PLB, AC.GEOLEVEL] #, AC.GEOCODE] # Retain these columns in final csv # a metric builder object is needed for analysis, as experiments are built using it """ mbCustom = sdftools.MetricBuilder(geolevels, queries) mbCustom.addCustom( desc = "L1 Error in Quantiles", metric_function = lambda sdf: L1_quantile_rdd(sdf, geolevels) ) """ mb = sdftools.MetricBuilder(geolevels, queries) mb.add( desc="Per Geolevel 1-TVD", split_geolevels=False, # Only want one csv, not one per geolevel fill= None, # Don't use (deeply inefficient) row-by-row filling in of sampling zeros metric_function=lambda sdf: (sdf.show().geolevel_tvd("Geolevel TVD", groupby=groupby).show())) """ mb.add( desc = "Total L1 Error", split_geolevels = False, # Only want one csv, not one per geolevel fill = None, # Don't use (deeply inefficient) row-by-row filling in of sampling zeros metric_function = lambda sdf: ( sdf.show() .L1("L1").show() # Compute L1 error for each range query