Ejemplo n.º 1
0
    def distinctUidUtilityMeasureSingleAndDoubleColumn(self,param):
        """ Measure coverage and accuracy and store them in `self._ar`.

            `param` is one of the data structures in the list returned by
            setupGdaUtilityParameters(). Its contents are read from the
            configuration file, whose elements are: <br/>
            `name`: Basis for the name of the output json file. Should be
            unique among all measures. <br/>
            `rawDb`: The raw (non-anonymized) database used. <br/>
            `anonDb`: The anonymized database to use. <br/>
            `table`: The name of the table in the database. <br/>
            `anonType`: The type of anonymization (shown in the GDA Score
            diagram, otherwise does not affect operation). <br/>
            `anonSubType`: Also shown in the GDA Score diagram. <br/>
            `uid`: The name of the uid column. <br/>
            `measureParam`: The thing that gets measured. The only current
            value is "uid", meaning that counts of distinct uids are
            measured. <br/>
            `samples`: The number of samples over which each utility group
            should be measured. <br/>
            `ranges`: A list of ranges. Each range gives the lower and upper
            bound on the number of "things" (as specified by
            `measureParam`) that an answer should contain. <br/>

            The results are written to `self._ar` under the keys
            'coverage', 'accuracy', 'tableStats' and 'operational'.
        """
        gda = gdaAttack(param)
        tableName = gda.getAttackTableName()
        uidCol = gda.getUidColName()
        rawCols = gda.getColNames(dbType='rawDb')
        anonCols = gda.getColNames(dbType='anonDb')
        # The table characteristics say whether a given column is
        # enumerative or continuous.
        characteristics = gda.getTableCharacteristics()
        if self._p:
            pp.pprint(characteristics)

        # Coverage comes first: it decides which columns the accuracy
        # measurement is allowed to use.
        coverage = self._measureCoverage(
            param, gda, characteristics, tableName, rawCols, anonCols)
        usableCols = self._getAllowedColumns(coverage)
        pp.pprint(coverage)
        pp.pprint(usableCols)

        accuracy = self._measureAccuracy(
            param, gda, characteristics, tableName, uidCol, usableCols)

        self._ar['coverage'] = coverage
        self._ar['accuracy'] = accuracy
        self._ar['tableStats'] = characteristics
        self._ar['operational'] = gda.getResults()['operational']
        gda.cleanUp()
Ejemplo n.º 2
0
# Gather a set of query conditions for a single set of bin sizes: between
# 15 and 50 distinct users. (This is the case for building synthetic data.)

results = []

params = {
    'name': 'getBucketSizes2',
    'rawDb': 'gdaScoreTaxiRaw',
    'anonDb': 'cloakTaxi',
    'criteria': 'singlingOut',
    'table': 'rides',
    'uid': 'uid',
    'flushCache': False,
    'verbose': False,
}

# The attack object is only needed to learn the column names and types of
# the anonymized database, so it is cleaned up right away (without exiting).
x = gdaAttack(params)
colsTypes = x.getColNamesAndTypes(dbType="anonDb")
pp.pprint(colsTypes)
x.cleanUp(doExit=False)

f = open('getBucketSizes2.txt', 'w')

# Brute force: look at every column in turn ...
for entries in colsTypes:
    col, colType = entries[0], entries[1]
    # ... but for now only ints and reals are handled
    if colType not in ('real', 'integer'):
        print(f"Skip column {col} with type {colType}")
        continue
    q = findQueryConditions(params, [col], 15, 50)
Ejemplo n.º 3
0
def dumb_list_linkability_attack(params):
    """ Run the Dumb List attack for the Linkability criteria.

        The attack simply requests rows, with all columns, from the
        anonymized link database. It succeeds if the anonymized database
        returns rows that single out users, and fails otherwise. It is
        meant to work against raw and pseudonymized data.

        NOTE: This is effectively the same attack as the singling out
        dumb list.

        `params` is handed to gdaAttack() and finishGdaAttack(). The final
        result is pretty-printed; nothing is returned."""
    attack = gdaAttack(params)

    # -------------------  Exploration Phase  ------------------------
    # Only columns present in both the anonymized and the raw database
    # can be attacked. (Pseudonymization schemes typically delete some
    # columns.)

    table = attack.getAttackTableName()
    rawColNames = attack.getColNames(dbType='rawDb')
    anonColNames = attack.getColNames(dbType='anonDb')
    colNames = list(set(rawColNames) & set(anonColNames))

    # -------------------  Prior Knowledge Phase  --------------------
    # No prior knowledge is needed for this attack

    # -------------------  Attack Phase  -----------------------------

    # One query: every combination of column values that occurs exactly
    # once, limited to 100 rows.
    sql = "".join([
        "SELECT ",
        comma_ize(colNames),
        str(f"count(*) FROM {table} "),
        makeGroupBy(colNames),
        " HAVING count(*) = 1 ORDER BY count(*) LIMIT 100",
    ])
    query = {'sql': sql}
    print("-------------------- Attack query:")
    print(sql)
    attack.askAttack(query)
    reply = attack.getAttack()
    if v:
        print("-------------------- Attack reply:")
        pp.pprint(reply)

    # -------------------  Claims Phase  ----------------------------

    if 'answer' not in reply:
        print("ERROR: reply to claim query contains no answer")
        pp.pprint(reply)
        attack.cleanUp()
        sys.exit()
    # Each returned row becomes one claim guessing all of its column values
    for row in reply['answer']:
        guess = [{'col': name, 'val': row[pos]}
                 for pos, name in enumerate(colNames)]
        attack.askClaim({'guess': guess})

    if v: print("------------------- Attack claims:")
    # Collect claim replies until none are outstanding
    stillToCome = None
    while stillToCome != 0:
        reply = attack.getClaim()
        if v: pp.pprint(reply)
        stillToCome = reply['stillToCome']

    # -------------------  Scores Phase  ----------------------------

    attackResult = attack.getResults()
    score = gdaScores(attackResult).getScores()
    if v: pp.pprint(score)
    attack.cleanUp()
    final = finishGdaAttack(params, score)
    pp.pprint(final)
Ejemplo n.º 4
0
def dumb_list_inference_attack(params):
    """ Dumb List attack for the Inference criteria.

        In an inference attack, there are 'known' column values, and
        'guessed' column values. An inference claim succeeds when all
        users with the known column values have the same guessed column
        values. There only needs to be one such user, so we can try
        making inferences on all columns by using all the other columns
        as known values.

        `params` is handed to gdaAttack() and finishGdaAttack(). The final
        result is pretty-printed; nothing is returned.
        """
    attack = gdaAttack(params)

    # -------------------  Exploration Phase  ------------------------
    # We need to know the columns that are in the anonymized database
    # and in the raw database. It is these columns that we can attack.
    # (Note that pseudonymization schemes typically delete some columns.)

    table = attack.getAttackTableName()
    rawColNames = attack.getColNames(dbType='rawDb')
    anonColNames = attack.getColNames(dbType='anonDb')
    colNames = list(set(rawColNames) & set(anonColNames))

    # Get the total number of rows so that we can later determine fraction
    # of cells per column that are susceptible
    sql = f"SELECT count(*) FROM {table}"
    if v: print(sql)
    query = dict(db="raw", sql=sql)
    attack.askExplore(query)
    reply = attack.getExplore()
    if 'error' in reply:
        doQueryErrorAndExit(reply, attack)
    totalRows = reply['answer'][0][0]

    # -------------------  Prior Knowledge Phase  --------------------
    # This attack doesn't require any prior knowledge

    # -------------------  Attack Phase  -----------------------------
    # Attack each (guessed) column by using the remaining columns as the
    # known columns. The loop below runs the attack and the claims for
    # each guessed column in turn.

    for guessedCol in colNames:
        remainingCols = [x for x in colNames if x != guessedCol]
        # -------------- Attack phase ------------------
        # Ask for groups of known values that have exactly one distinct
        # guessed value (at most 20 of them).
        sql = "SELECT "
        sql += comma_ize(remainingCols)
        sql += f"max({guessedCol}) FROM {table} WHERE "
        sql += makeInNotNullConditions(remainingCols)
        sql += makeGroupBy(remainingCols)
        sql += f" HAVING count(DISTINCT {guessedCol}) = 1 "
        sql += "ORDER BY 1 LIMIT 20"
        if v: print(sql)
        query = dict(sql=sql)
        attack.askAttack(query)
        reply = attack.getAttack()
        if 'error' in reply:
            # For this attack, cloak can't deal with max(text_col),
            # so just continue without claims
            continue
        # -------------- Claims phase ------------------
        # The guessed value is the column that follows the known columns
        guessIndex = len(remainingCols)
        for row in reply['answer']:
            known = [{'col': col, 'val': row[i]}
                     for i, col in enumerate(remainingCols)]
            spec = {
                'known': known,
                'guess': [{'col': guessedCol, 'val': row[guessIndex]}],
            }
            attack.askClaim(spec)
            # Wait for this claim to be processed before making the next
            while True:
                claimReply = attack.getClaim()
                if v: pp.pprint(claimReply)
                if claimReply['stillToCome'] == 0:
                    break

    # -------------------  Scores Phase  ----------------------------

    attackResult = attack.getResults()
    sc = gdaScores(attackResult)
    # Now we need to assign susceptibility scores, which means making
    # some explore queries
    for guessedCol in colNames:
        # Only the first 20 known columns are used here (the attack phase
        # above uses all of them).
        remainingCols = [x for x in colNames if x != guessedCol][:20]
        # -------------- More exploration phase ------------------
        # First find out how many of the cells are attackable
        sql = "SELECT sum(rows) FROM (SELECT "
        sql += comma_ize(remainingCols)
        sql += f"count(*) AS rows FROM {table} "
        sql += makeGroupBy(remainingCols)
        sql += f" HAVING count(DISTINCT {guessedCol}) = 1) t"
        if v: print("-------------------- Explore query:")
        if v: print(sql)
        query = dict(db="raw", sql=sql)
        attack.askExplore(query)
        reply = attack.getExplore()
        if 'error' in reply:
            doQueryErrorAndExit(reply, attack)
        numRows = reply['answer'][0][0]
        if v: print("-------------------- Explore reply:")
        if v: pp.pprint(reply)
        # sum() over zero qualifying groups is SQL NULL (None here), which
        # means no cell of this column is attackable.
        if numRows is None:
            numRows = 0
        susValue = numRows / totalRows
        sc.assignColumnSusceptibility(guessedCol, susValue)
    score = sc.getScores()
    if v: pp.pprint(score)
    final = finishGdaAttack(params, score)
    attack.cleanUp()
    pp.pprint(final)
Ejemplo n.º 5
0
    def generateDBSqlForTable(self, argv, dbType):
        """ Write SQL statements describing the columns of the attack table.

            For every unfinished parameter set returned by
            _setupGdaUtilityParametersForSqlScripts(), a `create table
            <table>_char` statement followed by one `insert` statement per
            column is written to `param['resultsPath']`. Each insert holds
            the column name, its mapped type, row / uid / distinct-value
            counts, averages and standard deviations of rows and uids per
            value, max, min, and a label ('continuous' or 'enumerative').
            The inserts are also printed.

            `argv`: passed to _setupGdaUtilityParametersForSqlScripts(). <br/>
            `dbType`: passed as `dbType` to getColNamesAndTypes() and to
            queryDB(). <br/>

            Sentinel values: a standard deviation is -1 when there are
            fewer than two values, an average is -1 when the column has no
            distinct values, max / min are '' when the column is empty or
            contains NULL, and the mapped type and label are '' when the
            database type is not in the mapping.

            Exits via sys.exit() if the results file cannot be opened.
        """
        paramsList = self._setupGdaUtilityParametersForSqlScripts(
            argv, criteria="singlingOut")
        if self._v: pp.pprint(paramsList)
        # Map database column types onto the coarse types stored in the
        # characteristics table.
        mappingDBTypesDict = {
            "bigint": "int",
            "bytea": "int",
            "boolean": "int",
            "integer": "int",
            "int": "int",
            "smallint": "int",
            "char": "text",
            "varchar": "text",
            "text": "text",
            "character varying": "text",
            "real": "real",
            "decimal": "real",
            "double precision": "real",
            "numeric": "real",
            "timestamp without time zone": "datetime",
            "time": "datetime",
            "timestamp": "datetime",
            "date": "date",
        }
        continousDBTypeList = ["real", "datetime", "date"]
        enumerateDBTypeList = ["text"]

        for param in paramsList:
            if param['finished'] == True:
                print(
                    "The following Utility script for table has been executed:"
                )
                if self._v: pp.pprint(param)
                print(f"Results may be found at {param['resultsPath']}")
                continue
            # Add mandatory fields required for now. To be removed once the
            # scope of these parameters is changed.
            # Look for the db config file up to five directory levels up.
            path = self._p['dbConfig']
            for _ in range(5):
                path = "../" + path
                if os.path.isfile(path):
                    break
            with open(path, "r") as fh:
                j = json.load(fh)
            # The last key of the config ends up as the anonymized database
            for key in j:
                param['anonDb'] = key
            param['criteria'] = "singlingOut"
            if self._v: pp.pprint(j)

            attack = gdaAttack(param)
            table = attack.getAttackTableName()
            # The following is a quick and dirty fix for a bug that occurred
            # when the given database has tables we don't want to examine
            tableNames = [table]
            if self._v: print(f"table {table}, tableNames {tableNames}")
            resultsPath = param['resultsPath']
            try:
                f = open(resultsPath, 'w')
            except OSError:
                sys.exit(f"Failed to open {resultsPath} for write")
            with f:
                for table in tableNames:
                    # Table scheme
                    createTable = f"create table {table}_char  (column_name text, column_type text, num_rows int," \
                                  f"num_uids int, num_distinct_vals int, av_rows_per_vals real, av_uids_per_val real,std_rows_per_val real,std_uids_per_val real,max text, min text, column_label text);"
                    colNames = attack.getColNamesAndTypes(dbType=dbType,
                                                          tableName=table)
                    print(f" column names: {colNames}")
                    f.write(createTable + '\n')

                    # Total number of rows
                    num_rows = 0
                    sql = f"SELECT count(*) FROM {table} "
                    for an in self.queryDB(attack, sql, dbType):
                        num_rows = an[0]

                    # Number of distinct UIDs.
                    # NOTE: the uid column name is hard-coded as `uid` here.
                    num_uids = 0
                    sql = f"SELECT count( distinct uid) FROM {table}"
                    for an in self.queryDB(attack, sql, dbType):
                        num_uids = an[0]

                    for raCol in colNames:
                        column_name = raCol[0]
                        # '' when the database type is not in the mapping
                        column_type = mappingDBTypesDict.get(raCol[1], '')

                        # Number of distinct values of the column
                        num_distinct_vals = 0
                        sql = f"SELECT count ( distinct {column_name}) FROM {table} "
                        for an in self.queryDB(attack, sql, dbType):
                            num_distinct_vals = an[0]

                        if num_distinct_vals:
                            av_rows_per_val = num_rows / num_distinct_vals
                            av_uids_per_val = num_uids / num_distinct_vals
                        else:
                            # No distinct values (empty table or all-NULL
                            # column): average is not computable
                            av_rows_per_val = -1
                            av_uids_per_val = -1

                        # Standard deviation of the number of rows per value
                        sql = f"SELECT {column_name},count(*) FROM {table} "
                        sql += makeGroupBy([column_name])
                        stdRowsPerVal = [
                            an[1] for an in self.queryDB(attack, sql, dbType)]
                        if len(stdRowsPerVal) > 1:
                            std_rows_per_val = stdev(stdRowsPerVal)
                        else:
                            std_rows_per_val = -1

                        # Standard deviation of the number of uids per value
                        sql = f"SELECT {column_name},count(distinct uid) FROM {table} "
                        sql += makeGroupBy([column_name])
                        stdUidsPerVal = [
                            an[1] for an in self.queryDB(attack, sql, dbType)]
                        if len(stdUidsPerVal) > 1:
                            std_uids_per_val = stdev(stdUidsPerVal)
                        else:
                            std_uids_per_val = -1

                        # Max and min, computed from all values of the
                        # column. Left as '' if the column is empty or
                        # contains NULL.
                        maxi = ''
                        mini = ''
                        sql = f"SELECT {column_name} FROM {table}"
                        listOfValues = [
                            an[0] for an in self.queryDB(attack, sql, dbType)]
                        if listOfValues and None not in listOfValues:
                            maxi = max(listOfValues)
                            mini = min(listOfValues)

                        # Label the column; stays '' for unmapped types
                        columnlabel = ''
                        if column_type in continousDBTypeList:
                            columnlabel = 'continuous'
                        elif column_type in enumerateDBTypeList:
                            columnlabel = 'enumerative'
                        elif column_type == 'int':
                            if num_distinct_vals < 100:
                                columnlabel = 'enumerative'
                            else:
                                # Split the value range into ten buckets and
                                # compare the emptiest bucket with the
                                # average bucket. NULLs are ignored when
                                # determining the range.
                                nonNullValues = [
                                    val for val in listOfValues
                                    if val is not None]
                                x = (int(max(nonNullValues))
                                     - int(min(nonNullValues))) / 10
                                sql = f"SELECT floor (({column_name})/{x})*{x},count(*) FROM {table} "
                                sql += makeGroupBy([1])
                                countList = [
                                    an[1]
                                    for an in self.queryDB(attack, sql, dbType)]
                                minInList = min(countList)
                                averageOfList = sum(countList) / len(countList)
                                if minInList < (0.5 * averageOfList):
                                    columnlabel = 'continuous'
                                else:
                                    columnlabel = 'enumerative'

                        insert = f"insert into {table}_char values ('{column_name}','{column_type}',{num_rows},{num_uids},{num_distinct_vals},{av_rows_per_val},{av_uids_per_val}," \
                                 f"{std_rows_per_val},{std_uids_per_val},'{maxi}','{mini}','{columnlabel}');"

                        print(insert)
                        f.write(insert + '\n')
            attack.cleanUp()
Ejemplo n.º 6
0
def dumb_list_singling_out_attack(params):
    """ Dumb List attack for the Singling Out criteria.

        All it does is request rows with all columns from the anonymized
        database. The attack succeeds if the anonymized database returns
        rows that single out users, and fails otherwise. It is designed to
        work against raw and pseudonymized data.

        `params` is handed to gdaAttack() and finishGdaAttack(). The final
        result is pretty-printed; nothing is returned. Exits via
        sys.exit() if an attack reply contains no answer."""
    attack = gdaAttack(params)

    # -------------------  Exploration Phase  ------------------------
    # We need to know the columns that are in the anonymized database
    # and in the raw database. It is these columns that we can attack.
    # (Note that pseudonymization schemes can delete some columns.)

    table = attack.getAttackTableName()
    rawColNames = attack.getColNames(dbType='rawDb')
    anonColNames = attack.getColNames(dbType='anonDb')
    uid = attack.getUidColName()
    colNamesAll = list(set(rawColNames) & set(anonColNames))
    if v: print(f"Use columns: {colNamesAll}")

    # The cloak can't handle queries with a large number of columns,
    # so we split up the attack into groups of 5 columns each. Each group
    # contains the uid column, so that we are sure that the resulting
    # answer pertains to a single user.
    groupSize = 5
    minAttacksPerGroup = 5
    colsWithoutUid = colNamesAll.copy()
    colsWithoutUid.remove(uid)
    if v: print(colNamesAll)
    if v: print(colsWithoutUid)
    # groupSize - 1 non-uid columns per group, plus the uid column
    step = groupSize - 1
    groups = [colsWithoutUid[start:start + step] + [uid]
              for start in range(0, len(colsWithoutUid), step)]

    # This will give us around 100 attack queries total:
    # NOTE(review): min() caps this at minAttacksPerGroup, so the total is
    # at most 5 per group rather than around 100 -- confirm whether max()
    # was intended.
    numAttacksPerGroup = min(int(100 / len(groups)) + 1, minAttacksPerGroup)
    if v: pp.pprint(groups)

    # -------------------  Prior Knowledge Phase  --------------------
    # This attack doesn't require any prior knowledge

    # -------------------  Attack Phase  -----------------------------

    for colNames in groups:
        sql = "SELECT "
        sql += comma_ize(colNames)
        sql += f"count(*) FROM {table} WHERE "
        sql += makeInNotNullConditions(colNames)
        sql += makeGroupBy(colNames)
        # Order by the actual uid column rather than a hard-coded name
        sql += f" HAVING count(*) = 1 ORDER BY {uid} "
        sql += f" LIMIT {numAttacksPerGroup} "
        query = {'sql': sql}
        print("-------------------- Attack query:")
        print(sql)
        attack.askAttack(query)
        reply = attack.getAttack()
        if v: print("-------------------- Attack reply:")
        if v: pp.pprint(reply)

        # -------------------  Claims Phase  ----------------------------

        if 'answer' not in reply:
            print("ERROR: reply to claim query contains no answer")
            pp.pprint(reply)
            attack.cleanUp()
            sys.exit()
        # Each returned row becomes one claim guessing all of its values
        for row in reply['answer']:
            guess = [{'col': col, 'val': row[i]}
                     for i, col in enumerate(colNames)]
            attack.askClaim({'guess': guess})

        if v: print("------------------- Attack claims:")
        # Collect claim replies until none are outstanding
        while True:
            reply = attack.getClaim()
            if v: pp.pprint(reply)
            if reply['stillToCome'] == 0:
                break

    # -------------------  Scores Phase  ----------------------------

    attackResult = attack.getResults()
    sc = gdaScores(attackResult)
    score = sc.getScores()
    if v: pp.pprint(score)
    attack.cleanUp()
    final = finishGdaAttack(params, score)
    pp.pprint(final)
Ejemplo n.º 7
0
def diffix_infer_1_attack(params):
    ''' This is an inference attack against Diffix

        In this attack, we find attribute groups where the inference
        conditions exist (only one guessed column value exists for some
        set of one or more known column values). This is designed to work
        against Diffix and Full K-anonymity at least.

        `params` is handed to gdaAttack() and finishGdaAttack(). The final
        result is pretty-printed; nothing is returned.
    '''
    attack = gdaAttack(params)
    
    # -------------------  Exploration Phase  ------------------------
    # We need to know the columns that are in the anonymized database
    # and in the raw database. It is these columns that we can attack.
    
    table = attack.getAttackTableName()
    rawColNames = attack.getColNames(dbType='rawDb')
    anonColNames = attack.getColNames(dbType='anonDb')
    colNames = list(set(rawColNames) & set(anonColNames))
    if v: print(f"Common columns are: {colNames}")

    # Get the total number of rows so that we can later determine fraction
    # of cells per column that are susceptible
    sql = f"SELECT count(*) FROM {table}"
    query = dict(db="rawDb",sql=sql)
    attack.askExplore(query)
    reply = attack.getExplore()
    if 'error' in reply:
        doQueryErrorAndExit(reply,attack)
    totalRows = reply['answer'][0][0]
    if v: print(f"Total Rows: {totalRows}")

    # There is really no point in trying to find instances of
    # inference where the guessed column has a large number of values.
    # In these cases, the chances of finding an inference instance is
    # very low. We (arbitrarily for now) set the threshold for this at 10

    # By the same token, an attack where the known column has a majority
    # values that are distinct to a single user won't work for an attack,
    # because in the case of Diffix, they will be low-count filtered, and
    # in the case of Full K-anonymity, they may be aggregated

    # So we record the number of distinct values per column. (In practice,
    # this would not be known exactly, but the attacker can be assumed to
    # have a reasonable guess just based on knowledge of the column.)
    distincts = {}
    guessableCols = []
    for col in colNames:
        sql = f"SELECT count(DISTINCT {col}) FROM {table}"
        # NOTE(review): this passes db="rawDb" but goes through
        # askAttack()/getAttack() rather than askExplore()/getExplore()
        # -- confirm which database is intended to be queried here.
        query = dict(db="rawDb",sql=sql)
        attack.askAttack(query)
        reply = attack.getAttack()
        if 'error' in reply:
            doQueryErrorAndExit(reply,attack)
        totalDistinct = reply['answer'][0][0]
        distincts[col] = totalDistinct
        if totalDistinct <= 10:
            guessableCols.append(col)
    if v: print(f"Distincts: {distincts}")
    if v: print(f"guessableCols: {guessableCols}")

    # -------------------  Prior Knowledge Phase  --------------------
    # This attack doesn't require any prior knowledge
    
    for guessedCol in guessableCols:
        numClaims = 0
        remainingCols = [x for x in colNames if x != guessedCol]
        # We want to try various combinations of the remaining columns,
        # and try the attack if the ratio of distinct values (or expected
        # distinct value combinations) is not too high
        unusedCombinations = 0
        for num in range(len(remainingCols)):
            if unusedCombinations > 1000:
                # If we don't find a useable combination 1000
                # consecutive times, then give up
                break
            if numClaims > 25:
                break
            for knownCols in itertools.combinations(remainingCols,num+1):
                if unusedCombinations > 1000 or numClaims > 25:
                    break
                # Upper bound on the number of distinct value combinations
                # of the known columns
                totalDistinct = 1
                for c in knownCols:
                    totalDistinct *= distincts[c]
                if v: print(f"totalDistinct: {totalDistinct} "
                        f"from known columns {knownCols}")
                if (totalDistinct / totalRows) > 0.8:
                    unusedCombinations += 1
                    continue
                unusedCombinations = 0
                numClaims = runOneAttack(guessedCol, knownCols,
                        attack, table, numClaims)

    # -------------------  Scores Phase  ----------------------------
    
    attackResult = attack.getResults()
    sc = gdaScores(attackResult)
    # Now we need to assign susceptibility scores, which means making
    # some explore queries
    for guessedCol in colNames:
        remainingCols = [x for x in colNames if x != guessedCol]
        # -------------- More exploration phase ------------------
        # First find out how many of the cells are attackable
        sql = "SELECT sum(rows) FROM (SELECT "
        sql += comma_ize(remainingCols)
        sql += f"count(*) AS rows FROM {table} "
        sql += makeGroupBy(remainingCols)
        sql += f" HAVING count(DISTINCT {guessedCol}) = 1) t"
        if v: print("-------------------- Explore query:")
        if v: print(sql)
        query = dict(db="raw",sql=sql)
        attack.askExplore(query)
        reply = attack.getExplore()
        if 'error' in reply:
            doQueryErrorAndExit(reply,attack)
        numRows = reply['answer'][0][0]
        if v: print("-------------------- Explore reply:")
        if v: pp.pprint(reply)
        # sum() over zero qualifying groups is SQL NULL (None here), which
        # means no cell of this column is attackable.
        if numRows is None:
            numRows = 0
        susValue = numRows / totalRows
        sc.assignColumnSusceptibility(guessedCol,susValue)
    # Get average score (default behavior); only printed when verbose
    score = sc.getScores()
    if v: pp.pprint(score)
    # The score passed on to finishGdaAttack() is the numColumns=1 one
    score = sc.getScores(numColumns=1)
    if v: pp.pprint(score)
    attack.cleanUp(cleanUpCache=False)
    final = finishGdaAttack(params,score)
    pp.pprint(final)