Esempio n. 1
0
     minimum bin size;

     The mapping from country code to country or region is named countryGenXXk, where XX is the minimum bin size (in
     thousands) of the bins for the country or region

     The file containing the full suppression set is named fullSuppressXXYYk{P,R}, where XX is the bin size for the
     YoB binning and YY is the bin size for the forum binning. It is assumed that the country binning will be the
     same as the YoB binning.
'''
import buildDeIdentifiedCSV as csvBuilder
from deIdentify.Archive.de_id_functions import dbOpen

# File-name tags for the minimum bin sizes (in thousands, zero-padded) and the
# k values to sweep; one de-identified CSV is built per (k, bin size) pair.
k_vals = [3, 4, 5, 6]
bin_size = ['00', '05', '10', '15', '20', '25']
country_base = 'countryGen'

# Fetch the rows once and reuse them for every combination below.
cr = dbOpen('year.db')
select_stmt = csvBuilder.build_select_string('source')
cr.execute(select_stmt)
user_course_list = cr.fetchall()

for k_val in k_vals:
    # Built for every k but not passed to csvBuilder.main in this script.
    class_supp = 'classSuppressSet' + str(k_val) + 'P'
    for size in bin_size:
        # The same bin size is used twice in the suppression/output names.
        bin_tag = size + 'k'
        combo_tag = size + size + str(k_val) + 'P'

        country_file = country_base + bin_tag
        yob_fname = 'yobbin' + bin_tag
        fbin_fname = 'postbin' + bin_tag
        full_suppress = 'fullSuppress' + combo_tag
        de_id_fname = 'deIdFile' + combo_tag + '.csv'

        csvBuilder.main(
            user_course_list,
            de_id_fname,
            full_suppress,
            country_file,
            yob_fname,
            fbin_fname,
        )
Esempio n. 2
0
#!/usr/bin/env python
"""
Given some set of databases (named on the command line), count the number of distinct values in particular
fields. Right now, it is assumed that the table is named "source".
"""

from deIdentify.Archive.de_id_functions import dbOpen
import sys

def count_fields(c, fname, tbl_name):
    """Return the number of distinct values of ``fname`` in ``tbl_name``.

    :param c: database cursor providing ``execute()`` and ``fetchall()``
    :param fname: column name, or a comma-separated list of column names, in
        which case distinct combinations of those columns are counted
    :param tbl_name: name of the table to query
    :return: number of groups produced by grouping the table on ``fname``

    NOTE: ``fname`` and ``tbl_name`` are spliced directly into the SQL text
    (identifiers cannot be bound as parameters), so they must come from
    trusted code, never from untrusted input.
    """
    # One result row per distinct value (or combination), so the row count is
    # the number of distinct values.
    db_command = "Select " + fname + " from " + tbl_name + " group by " + fname
    c.execute(db_command)
    return len(c.fetchall())

if __name__ == '__main__':
    # Every argument after the script name is treated as a database to report on.
    for db_name in sys.argv[1:]:
        print(db_name)
        cursor = dbOpen(db_name)

        user_total = count_fields(cursor, 'user_id', 'source')
        print('Number of unique user ids = ', str(user_total))

        pair_total = count_fields(cursor, 'user_id, course_id', 'source')
        print('Number of unique user, class combinations = ', str(pair_total))

        # Blank separator between the reports for successive databases.
        print('')
Esempio n. 3
0
            dropClass(classlist, cdict[classlist], cdict, c, suppressionset, use_suppress)
    print count
    print len(suppressionset)
    sfile = open(outname, 'w')
    pickle.dump(suppressionset, sfile)
    sfile.close()


if __name__ == '__main__':
    # dbname and k-value are required. Previously the usage text was printed
    # and execution carried on into an IndexError; stop here instead.
    if len(sys.argv) < 3:
        print('Usage: courseSetDeidentify.py dbname k-value {P,R}')
        print('where P is suppression on level of participation and R is random')
        sys.exit(1)

    dbName = sys.argv[1]
    k_val = int(sys.argv[2])

    # Only an explicit 'R' selects random suppression; any other value, or an
    # omitted third argument, falls back to 'P' rather than raising IndexError.
    if len(sys.argv) > 3 and sys.argv[3] == 'R':
        suppress_method = 'R'
    else:
        suppress_method = "P"
    outname = 'classSuppressSet' + str(k_val) + suppress_method

    c = dbOpen(dbName)
    try:
        c.execute("Create Index user_id_idx on source ('user_id')")
    except Exception:
        # Best effort: presumably this fails when the index already exists
        # (TODO confirm). Catching Exception instead of using a bare except
        # lets KeyboardInterrupt and SystemExit propagate.
        pass
    # Rows are ordered by user_id before being handed to main().
    c.execute('SELECT user_id, course_id FROM source ORDER BY user_id')
    user_class_list = c.fetchall()
    main(user_class_list, c, k_val, suppress_method, outname)
    dbClose(c)
Esempio n. 4
0
#!/usr/bin/env python

from deIdentify.Archive.de_id_functions import dbOpen
import sys

if __name__ == '__main__':
    dbname = sys.argv[1]
    c = dbOpen(dbname)
    c.execute('Select course_id, user_id from source')
    all_rec = c.fetchall()
    users = set()
    for l in all_rec:
        users.add(l[1])

    print 'Total number of records = ', len(all_rec)
    print 'Total number of users = ', len(users)