Code example #1
0
def TAES(spark,df,geolevels,queries,schema,u):
    """Compute total-absolute-error-of-shares metrics per geolevel/run.

    Joins per-(geolevel, run_id) sums of the query answers onto `u`,
    forms the protected and original shares, their difference and its
    absolute value, then aggregates: `q` summed per (geolevel, run_id)
    and `z` averaged per geolevel.

    Params (assumed from usage — confirm against callers):
        spark: active SparkSession.
        df: input Spark DataFrame of DAS results.
        geolevels, queries, schema: passed through to sdftools.getAnswers.
        u: a Spark DataFrame with 'geolevel', 'run_id', 'priv', 'orig'
           columns (presumably produced by an earlier getAnswers call).

    Returns:
        (q, z): per-run summed metrics, and per-geolevel averages.
    """
    z=sdftools.getAnswers(spark,df,geolevels,schema,queries)
    # Per-(geolevel, run_id) totals; yields 'sum(priv)' / 'sum(orig)' columns.
    z=z.groupby(['geolevel','run_id']).sum()
    # Debug output left in from development.
    u.show(10)
    print("this is z")
    z.show(10)
    q=u.join(z, on=['geolevel','run_id'])
    columnstodrop=['plb','budget_group']
    q=q.drop(*columnstodrop)
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    # Share of each row's value within its (geolevel, run_id) total.
    q=q.withColumn('MDF/sum',sf.col('priv')/sf.col('sum(priv)'))
    q=q.withColumn('CEF/sum',sf.col('orig')/sf.col('sum(orig)'))
    q=q.withColumn('difference',sf.col('MDF/sum')-sf.col('CEF/sum'))
    q=q.withColumn('abs',sf.abs(sf.col('difference')))
    print("This is q")
    q.show(10)
    q=q.groupby(['geolevel','run_id']).sum()
    # NOTE(review): 'sum(diff)' is dropped but no 'diff' column is created
    # above — Spark's drop() silently ignores missing columns, so this is
    # harmless; confirm whether it is leftover from an older version.
    columnstodrop=['sum(diff)','sum(sum(orig))','sum(sum(priv))','sum(MDF/sum)','sum(CEF/sum)','sum(difference)']
    print("this is q2")
    q=q.drop(*columnstodrop)
    q.show(10)
    z=q.groupby(['geolevel']).avg()
    print("this is z")
    z.show(10)
    return q,z
Code example #2
0
def getRddWithAbsDiff(spark, df, geolevels, queries, schema):
    """Return the query answers with signed ('diff') and absolute
    ('abs diff') error columns between the protected and original values."""
    answers = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    with_diff = answers.withColumn('diff', sf.col('priv') - sf.col('orig'))
    return with_diff.withColumn('abs diff', sf.abs(sf.col('diff')))
Code example #3
0
 def MAE(spark, df, geolevels, queries, schema):
     """Mean absolute error of protected vs. original answers.

     Returns a triple: the per-row answers with 'diff' and 'abs diff'
     columns, the averages per (geocode, geolevel, level), and the
     averages per geolevel.
     """
     answers = sdftools.getAnswers(spark, df, geolevels, schema, queries)
     # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
     answers = (answers
                .withColumn('diff', sf.col('priv') - sf.col('orig'))
                .withColumn('abs diff', sf.abs(sf.col('diff'))))
     per_geounit = answers.groupby(['geocode', 'geolevel', 'level']).avg()
     per_geolevel = answers.groupby(['geolevel']).avg()
     return answers, per_geounit, per_geolevel
Code example #4
0
def AvgSqError(spark, df, geolevels, queries, schema):
    """Average squared error for levels at each geounit and geolevel.

    Computes ('priv' - 'orig') per row, squares it into 'sq diff', and
    averages per (geocode, geolevel, level).

    Fix: the original created a constant column 'sq' (= 2) just to use it
    as the exponent; that column survived into the groupBy and leaked a
    meaningless 'avg(sq)' column into the result. The exponent is now a
    plain literal.
    """
    u = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    # Debug output left in from development.
    print("u is")
    u.show()
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    u = u.withColumn('diff', sf.col('priv') - sf.col('orig'))
    u = u.withColumn('sq diff', sf.pow(sf.col('diff'), sf.lit(2)))
    u = u.groupBy(['geocode', 'geolevel', 'level']).avg()
    return u
Code example #5
0
# Enumerate the DAS runs found at `path` (`path`, `datatools`, `spark`,
# `analysis`, `du`, `C`, and `save_location_linux` are presumably defined
# earlier in the file / in the analysis harness — not visible here).
runs = datatools.getDASRuns(path)

schema_name = "DHCP_HHGQ"

schema = SchemaMaker.fromName(name=schema_name)

# Build the experiment and pull its Spark DataFrame.
experiment = analysis.make_experiment("DHCP", path)
df = experiment.getDF()
# NOTE(review): this overwrites the SchemaMaker schema built above, making
# the fromName() call (and schema_name) effectively dead — confirm intended.
schema = experiment.schema

geolevels = [
    C.COUNTY
]  #, C.COUNTY, C.TRACT, C.BLOCK_GROUP, C.BLOCK, C.SLDL, C.SLDU, C.CD]

# Answer the 'total' query at each requested geolevel.
queries = ['total']
y = sdftools.getAnswers(spark, df, geolevels, schema, queries)

# Persist the raw answers as CSV (collects to the driver via toPandas —
# assumes the result fits in driver memory).
path2 = save_location_linux + "RAW.csv"
pdf2 = y.toPandas()
du.makePath(du.getdir(path2))
pdf2.to_csv(path2, index=False)


def MAE(spark, df, geolevels, queries, schema):
    """Mean absolute error of protected vs. original answers.

    Returns a triple: the per-row answers with 'diff' and 'abs diff'
    columns, the averages per (geocode, geolevel, level), and the
    averages per geolevel.
    """
    answers = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    answers = (answers
               .withColumn('diff', sf.col('priv') - sf.col('orig'))
               .withColumn('abs diff', sf.abs(sf.col('diff'))))
    per_geounit = answers.groupby(['geocode', 'geolevel', 'level']).avg()
    per_geolevel = answers.groupby(['geolevel']).avg()
    return answers, per_geounit, per_geolevel