Beispiel #1
0
def makeDictFromDB(idFields, fname):
    '''
    Build a count dictionary from the rows stored in a database file.

    The database named by fname is opened with dbOpen, every row of the 'source'
    table is read (ordered by user_id), and the rows are handed to
    makeDict(idFields, rows). Whatever makeDict produces is returned unchanged.

    :param idFields: List of indexes of the fields to be concatenated to form the dictionary key
    :param fname: Name of the database file to open (assumed to be SQLite)
    :return: the dictionary built by makeDict -- keyed by the concatenation of the values of
        the indexed fields, with values the number of rows that have those values
    '''
    cursor = dbOpen(fname)
    cursor.execute('SELECT * FROM source ORDER BY user_id')
    return makeDict(idFields, cursor.fetchall())
def makeDictFromDB(idFields, fname):
    '''
    Read a database file and summarize its rows as a dictionary of counts.

    Opens the file named by fname via dbOpen, selects all rows of the 'source'
    table ordered by user_id, and delegates to makeDict(idFields, rows) to build
    the dictionary that is returned.

    :param idFields: List of indexes of the fields to be concatenated to form the dictionary key
    :param fname: Name of the database file to open (assumed to be SQLite)
    :return: the dictionary built by makeDict -- keyed by the concatenation of the values of
        the indexed fields, with values the number of rows that have those values
    '''
    db_cursor = dbOpen(fname)
    db_cursor.execute('SELECT * FROM source ORDER BY user_id')
    rows = db_cursor.fetchall()
    return makeDict(idFields, rows)
Beispiel #3
0
#!/usr/bin/env python
"""
Given some set of databases, count the number of distinct values in a particular field. Right now, it is
assumed that the table is named "source"
"""

from de_id_functions import dbOpen
import sys

def count_fields(c, fname, tbl_name):
    """
    Count the distinct values (or value combinations) of one or more columns.

    :param c: open database cursor used to run the query
    :param fname: column name, or comma-separated list of column names, to group by
    :param tbl_name: name of the table to query
    :return: the number of groups, i.e. the number of distinct values of fname in tbl_name
    """
    # Column and table names cannot be bound as SQL parameters, so the statement
    # is assembled by concatenation; fname and tbl_name must come from trusted
    # code, never from untrusted input.
    db_command = "Select " + fname + " from " + tbl_name + " group by " + fname
    c.execute(db_command)
    # One result row per group, so the row count is the distinct count.
    return len(c.fetchall())

if __name__ == '__main__':
    # Every command-line argument after the script name names a database to report on.
    for db_name in sys.argv[1:]:
        print(db_name)
        c = dbOpen(db_name)
        # The label and the count are joined with a single space, matching
        # what a two-item print statement emits.
        user_total = str(count_fields(c, 'user_id', 'source'))
        print(' '.join(['Number of unique user ids = ', user_total]))
        pair_total = str(count_fields(c, 'user_id, course_id', 'source'))
        print(' '.join(['Number of unique user, class combinations = ', pair_total]))
        print('')
#!/usr/bin/env python
'''
Run buildcountrygeneralizer.py to generate a group of generalization files for countries.
The base name of the files that will be produced is countryGen, to which is appended the string that is
the first member of each pair in bin_info; the bin size used is the second member of that pair.
Current bin sizes are 1000, 2000, 3000, and 4000 (suffixes 01k through 04k). The script assumes that a
pickled dictionary mapping countries to larger regions exists two directories above the script and is
named 'country_continent'.
'''

import buildcountrygeneralizer as bcg
from de_id_functions import dbOpen

# (file-name suffix, bin size) pairs; one output file is generated per pair.
bin_info = [
    ('01k', 1000),
    ('02k', 2000),
    ('03k', 3000),
    ('04k', 4000),
]

# Read the cc_by_ip column of every row once; the same list feeds every run.
cr = dbOpen('year.db')
cr.execute('Select cc_by_ip from source')
cc_list = cr.fetchall()

# Path handed to the generalizer as its country-to-region file; it is the
# same for every bin size, so it is set once before the loop.
cc_to_regFile = '../../country_continent'

for suffix, bin_size in bin_info:
    outfile = 'countryGen' + suffix
    bcg.main(cc_list, outfile, cc_to_regFile, bin_size)
Beispiel #5
0
#!/usr/bin/env python
'''
Run the program that builds the full set of records to suppress, once the binning and the suppression
of records identifiable by the classes for which a user enrolled have been done.
'''
import buildFullSuppressionSet
from de_id_functions import dbOpen

# Alternative bin-size suffixes: ['05', '10', '15', '20', '25']
bin_size = ['00']
k_values = [3, 4, 5, 6]

cr = dbOpen('year.db')
geo_base = 'countryGen'
for k_val in k_values:
    k_str = str(k_val)
    for s in bin_size:
        # Every input file name carries the bin suffix followed by 'k'.
        bin_tag = s + 'k'
        class_supp = 'classSuppressSet' + k_str + 'P'
        geo_suppress = geo_base + bin_tag
        yob_fname = 'yobbin' + bin_tag
        forum_fname = 'postbin' + bin_tag
        # The output name repeats the bin suffix twice, then the k value and 'P'.
        suppress_out = 'fullSuppress' + s + s + k_str + 'P'
        buildFullSuppressionSet.main(cr, class_supp, geo_suppress, yob_fname,
                                     forum_fname, suppress_out, k_val)
Beispiel #6
0
            dropClass(classlist, cdict[classlist], cdict, c, suppressionset, use_suppress)
    print count
    print len(suppressionset)
    sfile = open(outname, 'w')
    pickle.dump(suppressionset, sfile)
    sfile.close()


if __name__ == '__main__':
    # Command line: dbname k-value [P|R].  Stop on too few arguments instead of
    # printing the usage text and then failing with an IndexError below.
    if len(sys.argv) < 3:
        print('Usage: courseSetDeidentify.py dbname k-value {P,R}')
        print('where P is suppression on level of participation and R is random')
        sys.exit(1)
    dbName = sys.argv[1]
    k_val = int(sys.argv[2])
    # Only an explicit 'R' selects random suppression; anything else --
    # including an omitted third argument -- selects 'P'.
    if len(sys.argv) > 3 and sys.argv[3] == 'R':
        suppress_method = 'R'
    else:
        suppress_method = 'P'
    # Output file name: base name + k value + suppression method.
    outname = 'classSuppressSet' + str(k_val) + suppress_method

    c = dbOpen(dbName)
    try:
        try:
            c.execute("Create Index user_id_idx on source ('user_id')")
        except Exception:
            # Best effort: index creation is allowed to fail (presumably
            # because the index already exists from an earlier run), but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
        c.execute('SELECT user_id, course_id FROM source ORDER BY user_id')
        user_class_list = c.fetchall()
        main(user_class_list, c, k_val, suppress_method, outname)
    finally:
        # Release the database handle even when main() raises.
        dbClose(c)
Beispiel #7
0
                      use_suppress)
    print count
    print len(suppressionset)
    sfile = open(outname, 'w')
    pickle.dump(suppressionset, sfile)
    sfile.close()


if __name__ == '__main__':
    # Command line: dbname k-value [P|R].  Stop on too few arguments instead of
    # printing the usage text and then failing with an IndexError below.
    if len(sys.argv) < 3:
        print('Usage: courseSetDeidentify.py dbname k-value {P,R}')
        print('where P is suppression on level of participation and R is random')
        sys.exit(1)
    dbName = sys.argv[1]
    k_val = int(sys.argv[2])
    # Only an explicit 'R' selects random suppression; anything else --
    # including an omitted third argument -- selects 'P'.
    if len(sys.argv) > 3 and sys.argv[3] == 'R':
        suppress_method = 'R'
    else:
        suppress_method = 'P'
    # Output file name: base name + k value + suppression method.
    outname = 'classSuppressSet' + str(k_val) + suppress_method

    c = dbOpen(dbName)
    try:
        try:
            c.execute("Create Index user_id_idx on source ('user_id')")
        except Exception:
            # Best effort: index creation is allowed to fail (presumably
            # because the index already exists from an earlier run), but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
        c.execute('SELECT user_id, course_id FROM source ORDER BY user_id')
        user_class_list = c.fetchall()
        main(user_class_list, k_val, suppress_method, outname)
    finally:
        # Release the database handle even when main() raises.
        dbClose(c)
#!/usr/bin/env python

from de_id_functions import dbOpen
import sys

if __name__ == '__main__':
    # The database to summarize is named by the first command-line argument.
    dbname = sys.argv[1]
    c = dbOpen(dbname)
    c.execute('Select course_id, user_id from source')
    all_rec = c.fetchall()
    # Column 1 of each selected row is user_id; the set keeps each user once.
    users = set(rec[1] for rec in all_rec)

    # Label and number are joined by one space, as a two-item print statement does.
    print(' '.join(['Total number of records = ', str(len(all_rec))]))
    print(' '.join(['Total number of users = ', str(len(users))]))