def runOneAttack(guessedCol, knownCols, attack, table, numClaims):
    """Run one inference-attack round for guessedCol given knownCols.

    Issues one grouped counting query over guessedCol plus knownCols, then
    makes an inference claim for every knownCols value combination that
    maps to exactly one guessedCol value.

    Returns numClaims incremented once per claim reply received.
    """
    # -------------- Attack phase ------------------
    # And now run the attack for some fraction of the attackable cells
    if v: print(f"RunOneAttack with guessed '{guessedCol}', known {knownCols}")
    allCols = [guessedCol] + list(knownCols)
    sql = "SELECT "
    sql += comma_ize(allCols)
    sql += f"count(*) FROM {table} "
    sql += makeGroupBy(allCols)
    attack.askAttack(dict(sql=sql))
    reply = attack.getAttack()
    if 'error' in reply:
        doQueryErrorAndExit(reply, attack)
    # Map each knownCols value combination to the index of the single
    # answer row holding it.  A combination seen more than once means the
    # known values do not uniquely determine guessedCol: mark it -1 so it
    # is skipped below.
    ans = reply['answer']
    seen = {}
    for idx, row in enumerate(ans):
        # Build one hashable key from the known values; the values may
        # not be strings, so stringify each one
        key = ''.join(f"::{row[i]}" for i in range(1, len(allCols)))
        seen[key] = -1 if key in seen else idx
    for idx in seen.values():
        if idx == -1:
            continue
        # This is a potential inference
        row = ans[idx]
        known = [{'col': allCols[i], 'val': row[i]}
                 for i in range(1, len(allCols))]
        if row[0] is None:
            # NOTE(review): a None guessed value looks suspicious; dump
            # the full answer for diagnosis (original behavior kept: the
            # claim is still made with val=None)
            pp.pprint(ans)
        spec = {'known': known,
                'guess': [{'col': guessedCol, 'val': row[0]}]}
        attack.askClaim(spec)
        # Drain the claim replies for this claim
        while True:
            reply = attack.getClaim()
            numClaims += 1
            if v: pp.pprint(reply)
            if reply['stillToCome'] == 0:
                break
    return numClaims
def _makeHistSql(self, table, columns, colInfo, uid, minCount, maxCount):
    """Build the histogram SQL for the given columns.

    Each column is selected either as-is (condition 'none') or bucketized
    according to colInfo[col]['condition']: a date part for date columns,
    a prefix length for text columns, or a numeric bucket width for
    int/real columns.  The query counts distinct uid values per bucket,
    excluding rows where any selected column is NULL.

    minCount/maxCount are currently unused (a HAVING range filter was
    disabled); they are kept for interface compatibility.
    """
    if self._p: self._pp.pprint(colInfo)
    sql = "select "
    notnull = 'WHERE '
    for col in columns:
        notnull += f"{col} IS NOT NULL AND "
        cond = colInfo[col]['condition']
        if cond == 'none':
            sql += f"{col}, "
        else:
            unit = cond
            colType = colInfo[col]['colType']
            if colType[:4] == 'date':
                sql += f"extract({unit} from {col})::integer, "
            elif colType == 'text':
                sql += f"substring({col} from 1 for {unit}), "
            else:
                # int or real: bucket into widths of 'unit'
                sql += f"floor({col}/{unit})*{unit}, "
    duidClause = f"count(distinct {uid}) "
    groupby = makeGroupBy(columns)
    # Strip the trailing "AND " left by the loop above
    notnull = notnull[:-4]
    sql += duidClause + f"from {table} " + notnull + groupby
    return sql
def dumb_list_linkability_attack(params):
    """ Dumb List attack for the Linkability criteria.

        All it does is request rows with all columns from the anonymized
        link database. The attack succeeds if the anonymized database
        returns rows that single out users, and fails otherwise. It is
        designed to work against raw and pseudonymized data.
        NOTE: This is effectively the same attack as with singling out
        dumb list."""
    attack = gdaAttack(params)
    # ------------------- Exploration Phase ------------------------
    # We need to know the columns that are in the anonymized database
    # and in the raw database. It is these columns that we can attack.
    # (Note that pseudonymization schemes typically delete some columns.)
    table = attack.getAttackTableName()
    rawColNames = attack.getColNames(dbType='rawDb')
    anonColNames = attack.getColNames(dbType='anonDb')
    colNames = list(set(rawColNames) & set(anonColNames))
    # ------------------- Prior Knowledge Phase --------------------
    # This attack doesn't require any prior knowledge
    # ------------------- Attack Phase -----------------------------
    # Ask for rows that are unique across all the common columns
    sql = "SELECT "
    sql += comma_ize(colNames)
    sql += f"count(*) FROM {table} "
    sql += makeGroupBy(colNames)
    sql += " HAVING count(*) = 1 ORDER BY count(*) LIMIT 100"
    query = {'sql': sql}
    print("-------------------- Attack query:")
    print(sql)
    attack.askAttack(query)
    reply = attack.getAttack()
    if v: print("-------------------- Attack reply:")
    if v: pp.pprint(reply)
    # ------------------- Claims Phase ----------------------------
    if 'answer' not in reply:
        print("ERROR: reply to claim query contains no answer")
        pp.pprint(reply)
        attack.cleanUp()
        sys.exit()
    for row in reply['answer']:
        # zip pairs each column with its value; the trailing count(*)
        # entry of the row is ignored
        guess = [{'col': col, 'val': val} for col, val in zip(colNames, row)]
        attack.askClaim({'guess': guess})
    if v: print("------------------- Attack claims:")
    while True:
        reply = attack.getClaim()
        if v: pp.pprint(reply)
        if reply['stillToCome'] == 0:
            break
    # ------------------- Scores Phase ----------------------------
    attackResult = attack.getResults()
    sc = gdaScores(attackResult)
    score = sc.getScores()
    if v: pp.pprint(score)
    attack.cleanUp()
    final = finishGdaAttack(params, score)
    pp.pprint(final)
def dumb_list_inference_attack(params):
    """ Dumb List attack for the Inference criteria.

        In an inference attack, there are 'known' column values, and
        'guessed' column values. An inference claim succeeds when all
        users with the known column values have the same guessed column
        values. There only needs to be one such user, so we can try
        making inferences on all columns by using all the other columns
        as known values. """
    attack = gdaAttack(params)
    # ------------------- Exploration Phase ------------------------
    # We need to know the columns that are in the anonymized database
    # and in the raw database. It is these columns that we can attack.
    # (Note that pseudonymization schemes typically delete some columns.)
    table = attack.getAttackTableName()
    rawColNames = attack.getColNames(dbType='rawDb')
    anonColNames = attack.getColNames(dbType='anonDb')
    colNames = list(set(rawColNames) & set(anonColNames))
    # Get the total number of rows so that we can later determine fraction
    # of cells per column that are susceptible
    sql = f"SELECT count(*) FROM {table}"
    if v: print(sql)
    query = dict(db="raw", sql=sql)
    attack.askExplore(query)
    reply = attack.getExplore()
    if 'error' in reply:
        doQueryErrorAndExit(reply, attack)
    totalRows = reply['answer'][0][0]
    # ------------------- Prior Knowledge Phase --------------------
    # This attack doesn't require any prior knowledge
    # ------------------- Attack Phase -----------------------------
    # Attack each (guessed) column by using the remaining columns as the
    # known columns, looping through attack and claims per guessed column.
    for guessedCol in colNames:
        remainingCols = [c for c in colNames if c != guessedCol]
        # Find known-value combinations that fix the guessed value
        sql = "SELECT "
        sql += comma_ize(remainingCols)
        sql += f"max({guessedCol}) FROM {table} WHERE "
        sql += makeInNotNullConditions(remainingCols)
        sql += makeGroupBy(remainingCols)
        sql += f" HAVING count(DISTINCT {guessedCol}) = 1 "
        sql += "ORDER BY 1 LIMIT 20"
        if v: print(sql)
        attack.askAttack(dict(sql=sql))
        reply = attack.getAttack()
        if 'error' in reply:
            # For this attack, cloak can't deal with max(text_col),
            # so just continue without claims
            continue
        # -------------- Claims phase ------------------
        for row in reply['answer']:
            known = [{'col': col, 'val': val}
                     for col, val in zip(remainingCols, row)]
            # max(guessedCol) is the last selected column of the row
            guess = [{'col': guessedCol, 'val': row[len(remainingCols)]}]
            attack.askClaim({'known': known, 'guess': guess})
            while True:
                reply = attack.getClaim()
                if v: pp.pprint(reply)
                if reply['stillToCome'] == 0:
                    break
    # ------------------- Scores Phase ----------------------------
    attackResult = attack.getResults()
    sc = gdaScores(attackResult)
    # Now we need to assign susceptibility scores, which means making
    # some explore queries
    for guessedCol in colNames:
        remainingCols = [c for c in colNames if c != guessedCol]
        if len(remainingCols) > 20:
            remainingCols = remainingCols[:20]
        # -------------- More exploration phase ------------------
        # First find out how many of the cells are attackable
        sql = "SELECT sum(rows) FROM (SELECT "
        sql += comma_ize(remainingCols)
        sql += f"count(*) AS rows FROM {table} "
        sql += makeGroupBy(remainingCols)
        sql += f" HAVING count(DISTINCT {guessedCol}) = 1) t"
        if v: print("-------------------- Explore query:")
        if v: print(sql)
        query = dict(db="raw", sql=sql)
        attack.askExplore(query)
        reply = attack.getExplore()
        if 'error' in reply:
            doQueryErrorAndExit(reply, attack)
        numRows = reply['answer'][0][0]
        if v: print("-------------------- Explore reply:")
        if v: pp.pprint(reply)
        susValue = numRows / totalRows
        sc.assignColumnSusceptibility(guessedCol, susValue)
    score = sc.getScores()
    if v: pp.pprint(score)
    final = finishGdaAttack(params, score)
    attack.cleanUp()
    pp.pprint(final)
def generateDBSqlForTable(self, argv, dbType):
    """Generate and write per-table characterization SQL scripts.

    For each configured parameter set, queries the database for
    per-column statistics (row count, distinct uids/values, per-value
    standard deviations, min/max) and writes a SQL script that creates
    and populates a <table>_char characterization table.

    argv   -- command-line style parameters passed through to setup
    dbType -- which database to query ('rawDb' or 'anonDb')
    """
    paramsList = self._setupGdaUtilityParametersForSqlScripts(
        argv, criteria="singlingOut")
    if self._v: pp.pprint(paramsList)
    # Map native DB column types onto the four broad types used here
    # (the original dict listed "char" twice; the duplicate is removed)
    mappingDBTypesDict = {
        "bigint": "int",
        "bytea": "int",
        "boolean": "int",
        "integer": "int",
        "int": "int",
        "smallint": "int",
        "char": "text",
        "varchar": "text",
        "text": "text",
        "character varying": "text",
        "real": "real",
        "decimal": "real",
        "double precision": "real",
        "numeric": "real",
        "timestamp without time zone": "datetime",
        "time": "datetime",
        "timestamp": "datetime",
        "date": "date"
    }
    for param in paramsList:
        if param['finished'] == True:
            print(
                "The following Utility script for table has been executed:"
            )
            if self._v: pp.pprint(param)
            print(f"Results may be found at {param['resultsPath']}")
            continue
        # Locate the db config file; it may be up to 5 directories up
        path = self._p['dbConfig']
        for _ in range(5):
            path = "../" + path
            if os.path.isfile(path):
                break
        with open(path, "r") as fh:
            j = json.load(fh)
        # Use the last key of the config as the anonymized db name
        for key in j:
            param['anonDb'] = key
        param['criteria'] = "singlingOut"
        if self._v: pp.pprint(j)
        attack = gdaAttack(param)
        table = attack.getAttackTableName()
        # Quick and dirty fix for a bug that occurred when the given
        # database has tables we don't want to examine: only look at
        # the attack table itself
        tableNames = [table]
        if self._v: print(f"table {table}, tableNames {tableNames}")
        resultsPath = param['resultsPath']
        try:
            f = open(resultsPath, 'w')
        except OSError:
            sys.exit(f"Failed to open {resultsPath} for write")
        for table in tableNames:
            # Table scheme of the characterization table
            createTable = f"create table {table}_char (column_name text, column_type text, num_rows int," \
                          f"num_uids int, num_distinct_vals int, av_rows_per_vals real, av_uids_per_val real,std_rows_per_val real,std_uids_per_val real,max text, min text, column_label text);"
            colNames = attack.getColNamesAndTypes(dbType=dbType,
                                                  tableName=table)
            print(f" column names: {colNames}")
            f.write(createTable + '\n')
            # Total number of rows
            num_rows = 0
            sql = "SELECT "
            sql += f"count(*) FROM {table} "
            answer = self.queryDB(attack, sql, dbType)
            for an in answer:
                num_rows = an[0]
            # Query to get number of distinct UID's
            num_uids = 0
            sql = "SELECT"
            sql += f" count( distinct uid) FROM {table}"
            answer = self.queryDB(attack, sql, dbType)
            for an in answer:
                num_uids = an[0]
            for raCol in colNames:
                column_name = raCol[0]
                column_type = ''
                if raCol[1] in mappingDBTypesDict:
                    column_type += mappingDBTypesDict[raCol[1]]
                # Query to get distinct values of a column
                num_distinct_vals = 0
                sql = "SELECT "
                sql += f"count ( distinct {raCol[0]}) FROM {table} "
                answer = self.queryDB(attack, sql, dbType)
                for an in answer:
                    num_distinct_vals = an[0]
                # Guard against an all-NULL column (0 distinct values),
                # which would otherwise divide by zero
                if num_distinct_vals:
                    av_rows_per_val = num_rows / num_distinct_vals
                    av_uids_per_val = num_uids / num_distinct_vals
                else:
                    av_rows_per_val = 0
                    av_uids_per_val = 0
                # Standard deviation of rows per value
                stdRowsPerVal = []
                stdUidsPerVal = []
                sql = "SELECT "
                sql += (raCol[0])
                sql += f",count(*) FROM {table} "
                sql += makeGroupBy([raCol[0]])
                answer = self.queryDB(attack, sql, dbType)
                for an in answer:
                    stdRowsPerVal.append(an[1])
                if len(stdRowsPerVal) > 1:
                    std_rows_per_val = stdev(stdRowsPerVal)
                else:
                    std_rows_per_val = -1
                # Standard deviation of distinct uids per value
                sql = "SELECT "
                sql += (raCol[0])
                sql += f",count(distinct uid) FROM {table} "
                sql += makeGroupBy([raCol[0]])
                answer = self.queryDB(attack, sql, dbType)
                for an in answer:
                    stdUidsPerVal.append(an[1])
                if len(stdUidsPerVal) > 1:
                    std_uids_per_val = stdev(stdUidsPerVal)
                else:
                    std_uids_per_val = -1
                # Max and min column values (left empty if any NULLs)
                maxi = ''
                mini = ''
                sql = "SELECT "
                sql += f"{raCol[0]} FROM {table}"
                listOfValues = []
                answer = self.queryDB(attack, sql, dbType)
                for an in answer:
                    listOfValues.append(an[0])
                if not None in listOfValues:
                    maxi = max(listOfValues)
                    mini = min(listOfValues)
                # Label the column continuous or enumerative
                continousDBTypeList = ["real", "datetime", "date"]
                enumerateDBTypeList = ["text"]
                if column_type in continousDBTypeList:
                    columnlabel = 'continuous'
                elif column_type in enumerateDBTypeList:
                    columnlabel = 'enumerative'
                elif column_type == 'int':
                    if num_distinct_vals < 100:
                        columnlabel = 'enumerative'
                    else:
                        # Bucket into ten ranges; if the least populated
                        # bucket is well below average, the column is
                        # treated as continuous
                        # NOTE(review): x is 0 when maxi == mini, which
                        # makes the generated SQL divide by zero — confirm
                        x = (int(maxi) - int(mini)) / 10
                        sql = "SELECT "
                        sql += f"floor (({raCol[0]})/{x})*{x}"
                        sql += f",count(*) FROM {table} "
                        sql += makeGroupBy([1])
                        answer = self.queryDB(attack, sql, dbType)
                        countList = []
                        for an in answer:
                            countList.append(an[1])
                        minInList = min(countList)
                        averageOfList = sum(countList) / len(countList)
                        if minInList < (0.5 * averageOfList):
                            columnlabel = 'continuous'
                        else:
                            columnlabel = 'enumerative'
                insert = f"insert into {table}_char values (\'{column_name}\',\'{column_type}\',{num_rows},{num_uids},{num_distinct_vals},{av_rows_per_val},{av_uids_per_val}," \
                         f"{std_rows_per_val},{std_uids_per_val},\'{maxi}\',\'{mini}\',\'{columnlabel}\');"
                print(insert)
                f.write(insert + '\n')
        f.close()
        attack.cleanUp()
def _measureCoverage(self, param, attack, tabChar, table,
                     rawColNames, anonColNames):
    """Measure per-column coverage of the anonymized db vs. the raw db.

    Only individual columns are examined, on the assumption that if an
    individual column can be queried, then combinations of columns can
    be queried as well.  Returns a list with one coverage entry per raw
    column.
    """
    coverageScores = []
    for colName in rawColNames:
        # These hold the per-value query results for raw and anon dbs
        rawDbrowsDict = {}
        anonDbrowsDict = {}
        # A column absent from the anon db is not covered at all
        if colName not in anonColNames:
            entry = copy.deepcopy(self._nonCoveredDict)
            entry['col1'] = colName
            coverageScores.append(entry)
            continue
        # See how much of the column is NULL
        sql = f"SELECT count({colName}) FROM {table}"
        rawAns = self._doExplore(attack, "raw", sql)
        anonAns = self._doExplore(attack, "anon", sql)
        numRawRows = rawAns[0][0]
        numAnonRows = anonAns[0][0]
        if numAnonRows == 0:
            # Column is completely NULL, so also not covered
            entry = copy.deepcopy(self._nonCoveredDict)
            entry['col1'] = colName
            coverageScores.append(entry)
            continue
        # Ok, there is an anonymized column.
        if tabChar[colName]['column_label'] == 'continuous':
            # If a column is continuous, then in any event it can be
            # completely covered with range queries, though only if
            # range queries are possible
            rangePossible = 1
            # TODO: Here we put checks for any anonymization types that
            # don't have range queries. For now there are no such.
            # if (param['anonType'] == 'foobar':
            if rangePossible:
                entry = copy.deepcopy(self._rangeDict)
                entry['col1'] = colName
                entry['coverage']['coveragePerCol'] = numAnonRows / numRawRows
                coverageScores.append(entry)
                continue
        # Ok, the anonymized column is not covered by a range (either
        # enumerative or no range function exists), so query the DB to
        # evaluate coverage
        sql = "SELECT "
        sql += colName
        if param['measureParam'] == "*":
            sql += f", count(*) FROM {table} "
        else:
            sql += f", count( distinct {param['uid']}) FROM {table} "
        sql += makeGroupBy([colName])
        rawDbrows = self._doExplore(attack, "raw", sql)
        anonDbrows = self._doExplore(attack, "anon", sql)
        for row in anonDbrows:
            anonDbrowsDict[row[0]] = row[1]
        for row in rawDbrows:
            rawDbrowsDict[row[0]] = row[1]
        coverageEntry = self._calCoverage(rawDbrowsDict, anonDbrowsDict,
                                          [colName], param)
        coverageScores.append(coverageEntry)
    return coverageScores
def dumb_list_singling_out_attack(params):
    """ Dumb List attack for the Singling Out criteria.

        All it does is request rows with all columns from the anonymized
        database. The attack succeeds if the anonymized database returns
        rows that single out users, and fails otherwise. It is designed
        to work against raw and pseudonymized data."""
    attack = gdaAttack(params)
    # ------------------- Exploration Phase ------------------------
    # We need to know the columns that are in the anonymized database
    # and in the raw database. It is these columns that we can attack.
    # (Note that pseudonymization schemes can delete some columns.)
    table = attack.getAttackTableName()
    rawColNames = attack.getColNames(dbType='rawDb')
    anonColNames = attack.getColNames(dbType='anonDb')
    uid = attack.getUidColName()
    colNamesAll = list(set(rawColNames) & set(anonColNames))
    if v: print(f"Use columns: {colNamesAll}")
    # The cloak can't handle queries with a large number of columns,
    # so we split up the attack into groups of 5 columns each. Each group
    # contains the uid column, so that we are sure that the resulting
    # answer pertains to a single user.
    groupSize = 5
    minAttacksPerGroup = 5
    groups = []
    colsWithoutUid = colNamesAll.copy()
    colsWithoutUid.remove(uid)
    if v: print(colNamesAll)
    if v: print(colsWithoutUid)
    # Build groups of (groupSize - 1) non-uid columns plus uid
    index = 0
    while index < len(colsWithoutUid):
        nextGroup = colsWithoutUid[index:index + groupSize - 1]
        nextGroup.append(uid)
        groups.append(nextGroup)
        index += groupSize - 1
    # This will give us around 100 attack queries total (capped by
    # minAttacksPerGroup, which despite its name acts as a ceiling):
    numAttacksPerGroup = min(int(100 / len(groups)) + 1, minAttacksPerGroup)
    if v: pp.pprint(groups)
    # ------------------- Prior Knowledge Phase --------------------
    # This attack doesn't require any prior knowledge
    # ------------------- Attack Phase -----------------------------
    for colNames in groups:
        sql = "SELECT "
        sql += comma_ize(colNames)
        sql += f"count(*) FROM {table} WHERE "
        sql += makeInNotNullConditions(colNames)
        sql += makeGroupBy(colNames)
        sql += " HAVING count(*) = 1 ORDER BY uid "
        sql += f" LIMIT {numAttacksPerGroup} "
        query = {'sql': sql}
        print("-------------------- Attack query:")
        print(sql)
        attack.askAttack(query)
        reply = attack.getAttack()
        if v: print("-------------------- Attack reply:")
        if v: pp.pprint(reply)
        # ------------------- Claims Phase ----------------------------
        if 'answer' not in reply:
            print("ERROR: reply to claim query contains no answer")
            pp.pprint(reply)
            attack.cleanUp()
            sys.exit()
        for row in reply['answer']:
            # zip pairs each column with its value; the trailing
            # count(*) entry of the row is ignored
            guess = [{'col': col, 'val': val}
                     for col, val in zip(colNames, row)]
            attack.askClaim({'guess': guess})
        if v: print("------------------- Attack claims:")
        while True:
            reply = attack.getClaim()
            if v: pp.pprint(reply)
            if reply['stillToCome'] == 0:
                break
    # ------------------- Scores Phase ----------------------------
    attackResult = attack.getResults()
    sc = gdaScores(attackResult)
    score = sc.getScores()
    if v: pp.pprint(score)
    attack.cleanUp()
    final = finishGdaAttack(params, score)
    pp.pprint(final)
def _doOneMeasure(self, x, params, columns, table, tabChar, minCount,
                  maxCount):
    """Search for bucketizations of `columns` whose per-bucket distinct
    UID counts land roughly between minCount and maxCount, appending
    each acceptable non-duplicate histogram answer to self._ret.

    x        -- attack object used to issue explore queries
    params   -- dict; only params['uid'] (uid column name) is read here
    columns  -- list of one or two column names to bucketize
    tabChar  -- per-column characterization (types, distinct counts, ...)
    """
    # Record the columns' types.
    colInfo = {}
    # The 'condition' variable in colInfo tells us what to do to form the
    # query that looks for the needed bucket sizes. 'none' means don't
    # make any condition at all. This is the default.
    for col in columns:
        colInfo[col] = dict(condition='none')
        colInfo[col]['colType'] = tabChar[col]['column_type']
        colInfo[col]['dVals'] = tabChar[col]['num_distinct_vals']
        colInfo[col]['minVal'] = tabChar[col]['min']
        colInfo[col]['maxVal'] = tabChar[col]['max']
        # While we are at it, record the total number of distinct UIDs
        # and rows which happens to be the same for every column
        dUids = tabChar[col]['num_uids']
        dVals = tabChar[col]['num_distinct_vals']
    uid = params['uid']
    if self._p: self._pp.pprint(colInfo)
    if self._p: print(f"UID: {uid}, num UIDs: {dUids}")
    if len(columns) == 2:
        # Determine the number of distinct value pairs (note that in the
        # case of one column, we'll have already recorded it above)
        sql = str(f"select count(*) from (select ")
        more = ''
        for col in columns:
            more += str(f"{col}, ")
        more = more[0:-2] + ' '
        groupby = makeGroupBy(columns)
        sql += more + str(f"from {table} ") + groupby + ") t"
        if self._p: print(sql)
        # NOTE(review): unconditional debug print — presumably leftover
        print(sql)
        query = dict(db="raw", sql=sql)
        x.askExplore(query)
        ans = x.getExplore()
        if not ans:
            x.cleanUp(exitMsg="Failed query 2")
        if 'error' in ans:
            x.cleanUp(exitMsg="Failed query 2 (error)")
        if self._p: self._pp.pprint(ans)
        dVals = ans['answer'][0][0]
    if self._p: print(f"{dVals} distinct values or value pairs")
    # base is the number of UIDs per combined val
    base = dUids / dVals
    # target here is the number of UIDs per bucket that I want
    target = minCount + ((maxCount - minCount) / 2)
    # I want to compute by what factor I need to grow the uid/bucket
    # count in order to get close to the target
    grow = target / base
    if self._p: print(f"base {base}, target {target}, grow {grow}")
    if grow <= 2:
        # I can't usually grow by anything less than 2x, so let's just
        # go with no conditions
        if self._p: print("Needed growth too small, so use column values as is")
        sql = self._makeHistSql(table, columns, colInfo, uid, minCount,
                                maxCount)
        if self._p: print(sql)
        answer = self._queryAndGather(x, sql, colInfo, columns, minCount,
                                      maxCount)
        # Only record answers that have buckets and aren't duplicates
        if (len(answer['buckets']) > 0) and self._ansNotDup(answer):
            self._ret.append(answer)
        if self._p: self._pp.pprint(self._ret)
        return
    # We'll need to generalize, so see if we have text or datetime columns,
    # and gather the information needed to know roughly the number of
    # distinct UIDs we'll be able to get
    for col in columns:
        if colInfo[col]['colType'] != 'text':
            continue
        # For text columns, count distinct values at prefix lengths 1-3
        sql = str(
            f"select count(distinct ones), count(distinct twos), "
            f"count(distinct threes) from ( "
            f"select substring({col} from 1 for 1) as ones, "
            f"substring({col} from 1 for 2) as twos, "
            f"substring({col} from 1 for 3) as threes from {table}) t")
        if self._p: print(sql)
        # NOTE(review): unconditional debug print — presumably leftover
        print(sql)
        query = dict(db="raw", sql=sql)
        x.askExplore(query)
        ans = x.getExplore()
        if not ans:
            x.cleanUp(exitMsg="Failed query")
        if 'error' in ans:
            x.cleanUp(exitMsg="Failed query (error)")
        if self._p: self._pp.pprint(ans)
        # buckets: [prefix_length, distinct_count] pairs
        colInfo[col]['buckets'] = []
        colInfo[col]['buckets'].append([1, ans['answer'][0][0]])
        colInfo[col]['buckets'].append([2, ans['answer'][0][1]])
        colInfo[col]['buckets'].append([3, ans['answer'][0][2]])
        if self._p: self._pp.pprint(colInfo)
    for col in columns:
        if colInfo[col]['colType'][:4] != 'date':
            continue
        # For date columns, count distinct years, months and days
        sql = str(f"select count(distinct years), count(distinct months), "
                  f"count(distinct days) from ( select "
                  f"extract(year from {col})::integer as years, "
                  f"extract(month from {col})::integer as months, "
                  f"extract(day from {col})::integer as days "
                  f"from {table}) t")
        if self._p: print(sql)
        # NOTE(review): unconditional debug print — presumably leftover
        print(sql)
        query = dict(db="raw", sql=sql)
        x.askExplore(query)
        ans = x.getExplore()
        if not ans:
            x.cleanUp(exitMsg="Failed query")
        if 'error' in ans:
            x.cleanUp(exitMsg="Failed query (error)")
        if self._p: self._pp.pprint(ans)
        # buckets: [date_part, distinct_count] pairs
        colInfo[col]['buckets'] = []
        colInfo[col]['buckets'].append(['year', ans['answer'][0][0]])
        colInfo[col]['buckets'].append(['month', ans['answer'][0][1]])
        colInfo[col]['buckets'].append(['day', ans['answer'][0][2]])
        if self._p: self._pp.pprint(colInfo)
    if len(columns) == 1:
        # If just one column, then simply find a good bucketization
        # NOTE(review): `col` here is whatever value was left by the
        # loops above, not necessarily columns[0] — works for a single
        # column but looks fragile; confirm
        factor = 1
        while (1):
            if colInfo[col]['colType'][:4] == 'date':
                newColInfo = self._generalizeTextOrDatetime(
                    col, colInfo, grow)
            elif colInfo[col]['colType'] == 'text':
                newColInfo = self._generalizeTextOrDatetime(
                    col, colInfo, grow)
            else:
                # int or real
                newColInfo = self._generalizeNumber(col, colInfo, grow=grow)
            if newColInfo is None:
                if self._p: print("Couldn't generalize at all")
                return
            sql = self._makeHistSql(table, columns, newColInfo, uid,
                                    minCount, maxCount)
            if self._p: print(sql)
            answer = self._queryAndGather(x, sql, newColInfo, columns,
                                          minCount, maxCount)
            if self._p: self._pp.pprint(answer)
            if (len(answer['buckets']) > 0) and self._ansNotDup(answer):
                self._ret.append(answer)
            # Try at most one corrective pass: after adjusting `factor`
            # once, the next iteration always breaks
            # NOTE(review): `factor` is never passed back into the
            # generalization calls, so the retry re-runs with the same
            # parameters — confirm whether this is intended
            if factor != 1:
                break
            if answer['guess'] == 'bucketsJustRight':
                break
            if answer['guess'] == 'bucketsTooBig':
                factor = 0.5
            else:
                factor = 2
        if self._p: self._pp.pprint(self._ret)
        return
    # What I'll do is take one of the columns, create an increasing number
    # of buckets for it, and then set the appropriate number of bucket for
    # the other column. Ideal candidate for this is a numerical column with
    # lots of distinct values (cause can make more bucket sizes). Next would
    # be datetime, and last would be text
    numDistinct = 0
    col1 = ''
    for col in columns:
        if ((colInfo[col]['colType'] == "real") or
                ((colInfo[col]['colType'][:3] == "int"))):
            if colInfo[col]['dVals'] > numDistinct:
                numDistinct = colInfo[col]['dVals']
                col1 = col
    if numDistinct == 0:
        # Didn't find a numeric column, so look for datetime
        for col in columns:
            if colInfo[col]['colType'][:4] == "date":
                col1 = col
    if len(col1) == 0:
        # Didn't find a datetime either, so just pick the first one
        col1 = columns[0]
    # col2 is the other of the two columns
    if columns[0] == col1:
        col2 = columns[1]
    else:
        col2 = columns[0]
    if self._p: print(f"col1 is {col1}, type {colInfo[col1]['colType']}")
    if self._p: print(f"col2 is {col2}, type {colInfo[col2]['colType']}")
    if ((colInfo[col1]['colType'] == "real") or
            ((colInfo[col1]['colType'][:3] == "int"))):
        # Outer loop: double the number of col1 buckets each pass
        numBuckets = 2
        while (1):
            partColInfo = self._generalizeNumber(col1, colInfo,
                                                 targetBuckets=numBuckets)
            if partColInfo == None:
                if self._p:
                    print(f"partColInfo == None (numBuckets {numBuckets})")
                break
            # partColInfo now has the structure for col1 set. We need to set
            # the sturcture for col2.
            if self._p: print("partColInfo:")
            if self._p: self._pp.pprint(partColInfo)
            # Inner loop: one corrective 'fudge' retry for col2, as in
            # the single-column case above
            fudge = 1
            allDone = 0
            while (1):
                if colInfo[col2]['colType'][:4] == 'date':
                    # NOTE(review): uses the overall `grow` here rather
                    # than a value scaled by numBuckets — confirm
                    newColInfo = self._generalizeTextOrDatetime(
                        col2, colInfo, grow)
                    allDone = 1
                elif colInfo[col2]['colType'] == 'text':
                    newColInfo = self._generalizeTextOrDatetime(
                        col2, colInfo, grow)
                    allDone = 1
                else:
                    # int or real
                    newColInfo = self._generalizeNumber(
                        col2, partColInfo, grow=grow,
                        factor=(numBuckets * fudge))
                if newColInfo is None:
                    if self._p:
                        print(
                            f"newColInfo == None (numBuckets {numBuckets})"
                        )
                    allDone = 1
                    break
                if self._p: print("newColInfo:")
                if self._p: self._pp.pprint(newColInfo)
                sql = self._makeHistSql(table, columns, newColInfo, uid,
                                        minCount, maxCount)
                if self._p: print(sql)
                answer = self._queryAndGather(x, sql, newColInfo, columns,
                                              minCount, maxCount)
                if self._p: self._pp.pprint(answer)
                if (len(answer['buckets']) > 0) and self._ansNotDup(answer):
                    self._ret.append(answer)
                if fudge != 1:
                    break
                if answer['guess'] == 'bucketsJustRight':
                    break
                if answer['guess'] == 'bucketsTooBig':
                    fudge = 0.5
                else:
                    fudge = 2
            if allDone:
                break
            if self._p: self._pp.pprint(self._ret)
            numBuckets *= 2
        return
    # Neither column is a number. For now, we require that at least one
    # column is numeric
    return
def diffix_infer_1_attack(params):
    ''' This is an inference attack against Diffix

        In this attack, we find attribute groups where the inference
        conditions exist (one one guessed column value exists for some
        set of one or more known column values). This is designed to
        work against Diffix and Full K-anonymity at least.
    '''
    attack = gdaAttack(params)
    # ------------------- Exploration Phase ------------------------
    # We need to know the columns that are in the anonymized database
    # and in the raw database. It is these columns that we can attack.
    table = attack.getAttackTableName()
    rawColNames = attack.getColNames(dbType='rawDb')
    anonColNames = attack.getColNames(dbType='anonDb')
    colNames = list(set(rawColNames) & set(anonColNames))
    if v: print(f"Common columns are: {colNames}")
    # Get the total number of rows so that we can later determine fraction
    # of cells per column that are susceptible
    sql = f"SELECT count(*) FROM {table}"
    query = dict(db="rawDb", sql=sql)
    attack.askExplore(query)
    reply = attack.getExplore()
    if 'error' in reply:
        doQueryErrorAndExit(reply, attack)
    totalRows = reply['answer'][0][0]
    if v: print(f"Total Rows: {totalRows}")
    # There is really no point in trying to find instances of
    # inference where the guessed column has a large number of values.
    # In these cases, the chances of finding an inference instance is
    # very low. We (arbitrarily for now) set the threshold for this at 10
    # By the same token, an attack where the known column has a majority
    # values that are distinct to a single user won't work for an attack,
    # because in the case of Diffix, they will be low-count filtered, and
    # in the case of Full K-anonymity, they may be aggregated
    # So we record the number of distinct values per column. (In practice,
    # this would not be known exactly, but the attacker can be assumed to
    # have a reasonable guess just based on knowledge of the column.)
    distincts = {}
    guessableCols = []
    for col in colNames:
        sql = f"SELECT count(DISTINCT {col}) FROM {table}"
        query = dict(db="rawDb", sql=sql)
        attack.askAttack(query)
        reply = attack.getAttack()
        if 'error' in reply:
            doQueryErrorAndExit(reply, attack)
        totalDistinct = reply['answer'][0][0]
        distincts[col] = totalDistinct
        if totalDistinct <= 10:
            guessableCols.append(col)
    if v: print(f"Distincts: {distincts}")
    if v: print(f"guessableCols: {guessableCols}")
    # ------------------- Prior Knowledge Phase --------------------
    # This attack doesn't require any prior knowledge
    for guessedCol in guessableCols:
        numClaims = 0
        remainingCols = [c for c in colNames if c != guessedCol]
        # We want to try various combinations of the remaining columns,
        # and try the attack if the ratio of distinct values (or expected
        # distinct value combinations) is not too high
        unusedCombinations = 0
        for num in range(len(remainingCols)):
            if unusedCombinations > 1000:
                # If we don't find a useable combination 1000
                # consecutive times, then give up
                break
            if numClaims > 25:
                break
            combs = itertools.combinations(remainingCols, num + 1)
            while True:
                if unusedCombinations > 1000:
                    break
                if numClaims > 25:
                    break
                try:
                    knownCols = next(combs)
                except StopIteration:
                    break
                # Estimate combined distinct values as the product of
                # per-column distinct counts
                totalDistinct = 1
                for c in knownCols:
                    totalDistinct *= distincts[c]
                # (fixed: second string was missing its f prefix, so
                # {knownCols} printed literally)
                if v: print(f"totalDistinct: {totalDistinct} "
                            f"from known columns {knownCols}")
                if (totalDistinct / totalRows) > 0.8:
                    unusedCombinations += 1
                    continue
                unusedCombinations = 0
                numClaims = runOneAttack(guessedCol, knownCols, attack,
                                         table, numClaims)
    # ------------------- Scores Phase ----------------------------
    attackResult = attack.getResults()
    sc = gdaScores(attackResult)
    # Now we need to assign susceptibility scores, which means making
    # some explore queries
    for guessedCol in colNames:
        remainingCols = [c for c in colNames if c != guessedCol]
        # -------------- More exploration phase ------------------
        # First find out how many of the cells are attackable
        sql = "SELECT sum(rows) FROM (SELECT "
        sql += comma_ize(remainingCols)
        sql += f"count(*) AS rows FROM {table} "
        sql += makeGroupBy(remainingCols)
        sql += f" HAVING count(DISTINCT {guessedCol}) = 1) t"
        if v: print("-------------------- Explore query:")
        if v: print(sql)
        query = dict(db="raw", sql=sql)
        attack.askExplore(query)
        reply = attack.getExplore()
        if 'error' in reply:
            doQueryErrorAndExit(reply, attack)
        numRows = reply['answer'][0][0]
        if v: print("-------------------- Explore reply:")
        if v: pp.pprint(reply)
        susValue = numRows / totalRows
        sc.assignColumnSusceptibility(guessedCol, susValue)
    # Get average score (default behavior)
    score = sc.getScores()
    if v: pp.pprint(score)
    score = sc.getScores(numColumns=1)
    if v: pp.pprint(score)
    attack.cleanUp(cleanUpCache=False)
    final = finishGdaAttack(params, score)
    pp.pprint(final)