def TAES(spark, df, geolevels, queries, schema, u):
    """Compute Total Absolute Error of Shares (TAES)-style metrics.

    Joins per-geounit answers against per-(geolevel, run_id) totals, computes
    each cell's share of the protected and original totals, and aggregates the
    absolute share differences.

    Parameters
    ----------
    spark     : active SparkSession
    df        : experiment DataFrame to query
    geolevels : geolevels to compute answers for
    queries   : queries to evaluate
    schema    : schema object for the experiment
    u         : precomputed answers DataFrame; assumed to already carry
                'priv', 'orig', 'geolevel', and 'run_id' columns — TODO confirm
                against caller

    Returns
    -------
    (q, z) : q = per-(geolevel, run_id) sum of metrics,
             z = q averaged over run_id per geolevel
    """
    z = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    # Totals per geolevel/run; Spark names the aggregates 'sum(priv)', 'sum(orig)', ...
    z = z.groupby(['geolevel', 'run_id']).sum()
    u.show(10)
    print("this is z")
    z.show(10)
    # Attach the group totals to each row so shares can be computed.
    q = u.join(z, on=['geolevel', 'run_id'])
    columnstodrop = ['plb', 'budget_group']
    q = q.drop(*columnstodrop)
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    # Share of each cell relative to its group's total (MDF = protected, CEF = original).
    q = q.withColumn('MDF/sum', sf.col('priv') / sf.col('sum(priv)'))
    q = q.withColumn('CEF/sum', sf.col('orig') / sf.col('sum(orig)'))
    q = q.withColumn('difference', sf.col('MDF/sum') - sf.col('CEF/sum'))
    q = q.withColumn('abs', sf.abs(sf.col('difference')))
    print("This is q")
    q.show(10)
    # Sum the per-cell metrics back up to (geolevel, run_id); Spark prefixes
    # every column with 'sum(...)', including the already-aggregated ones.
    q = q.groupby(['geolevel', 'run_id']).sum()
    # Drop aggregates that are meaningless after the second groupby
    # (e.g. 'sum(sum(orig))'). NOTE(review): 'sum(diff)' presupposes u carried
    # a 'diff' column — TODO confirm; drop() of a missing column is a no-op.
    columnstodrop = ['sum(diff)', 'sum(sum(orig))', 'sum(sum(priv))', 'sum(MDF/sum)', 'sum(CEF/sum)', 'sum(difference)']
    print("this is q2")
    q = q.drop(*columnstodrop)
    q.show(10)
    # Average over runs to get one row per geolevel.
    z = q.groupby(['geolevel']).avg()
    print("this is z")
    z.show(10)
    return q, z
def getRddWithAbsDiff(spark, df, geolevels, queries, schema):
    """Return the query answers augmented with signed and absolute error columns.

    'diff' is protected minus original ('priv' - 'orig'); 'abs diff' is its
    absolute value.

    Parameters
    ----------
    spark     : active SparkSession
    df        : experiment DataFrame to query
    geolevels : geolevels to compute answers for
    queries   : queries to evaluate
    schema    : schema object for the experiment
    """
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    answers = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    signed = answers.withColumn('diff', sf.col('priv') - sf.col('orig'))
    return signed.withColumn('abs diff', sf.abs(sf.col('diff')))
def MAE(spark, df, geolevels, queries, schema):
    """Mean Absolute Error of the protected answers against the originals.

    Parameters
    ----------
    spark     : active SparkSession
    df        : experiment DataFrame to query
    geolevels : geolevels to compute answers for
    queries   : queries to evaluate
    schema    : schema object for the experiment

    Returns
    -------
    (u, y, z) : u = per-row answers with 'diff' and 'abs diff' columns,
                y = averages per (geocode, geolevel, level),
                z = averages per geolevel
    """
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    u = (sdftools.getAnswers(spark, df, geolevels, schema, queries)
         .withColumn('diff', sf.col('priv') - sf.col('orig')))
    u = u.withColumn('abs diff', sf.abs(sf.col('diff')))
    y = u.groupby(['geocode', 'geolevel', 'level']).avg()
    z = u.groupby(['geolevel']).avg()
    return u, y, z
def AvgSqError(spark, df, geolevels, queries, schema):
    """Average squared error of the protected answers, per geounit and level.

    Parameters
    ----------
    spark     : active SparkSession
    df        : experiment DataFrame to query
    geolevels : geolevels to compute answers for
    queries   : queries to evaluate
    schema    : schema object for the experiment

    Returns
    -------
    DataFrame of column averages grouped by (geocode, geolevel, level),
    including 'avg(sq diff)' — the mean squared error.
    """
    answers = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    print("u is")
    answers.show()
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    answers = answers.withColumn('diff', sf.col('priv') - sf.col('orig'))
    # Constant exponent kept as a real column so the output schema (and the
    # resulting 'avg(sq)' column) matches what downstream consumers expect.
    answers = answers.withColumn('sq', sf.lit(2))
    answers = answers.withColumn('sq diff', sf.pow(sf.col('diff'), sf.col('sq')))
    return answers.groupBy(['geocode', 'geolevel', 'level']).avg()
# --- Script driver: load the DHCP experiment and dump raw answers to CSV. ---
runs = datatools.getDASRuns(path)
schema_name = "DHCP_HHGQ"
schema = SchemaMaker.fromName(name=schema_name)
experiment = analysis.make_experiment("DHCP", path)
df = experiment.getDF()
# NOTE(review): this overwrites the SchemaMaker schema built two lines up;
# the experiment's own schema is the one actually used below — confirm the
# SchemaMaker construction is still needed.
schema = experiment.schema
geolevels = [
    C.COUNTY
]  #, C.COUNTY, C.TRACT, C.BLOCK_GROUP, C.BLOCK, C.SLDL, C.SLDU, C.CD]
queries = ['total']
y = sdftools.getAnswers(spark, df, geolevels, schema, queries)
# Write the raw (unaggregated) answers out for offline inspection.
path2 = save_location_linux + "RAW.csv"
pdf2 = y.toPandas()
du.makePath(du.getdir(path2))
pdf2.to_csv(path2, index=False)
# Removed a redefinition of MAE that was byte-identical to the module-level
# MAE defined earlier in this file; the duplicate shadowed nothing and did
# nothing new.