minimum bin size; The mapping from country code to country or region is named countryGenXXk, where XX is the minimum bin size (in thousands) of the bins for the country or region The file containing the full suppression set is named fullSuppressXXYYk{P,R} where XX is the bin size for the YOB binning, YY is the bin size for forum binning. It is assumed that the country binning will be the same as the YoB binning. ''' import buildDeIdentifiedCSV as csvBuilder from deIdentify.Archive.de_id_functions import dbOpen bin_size = ['00','05', '10', '15', '20', '25'] k_vals = [3,4,5,6] cr = dbOpen('year.db') cr.execute(csvBuilder.build_select_string('source')) user_course_list = cr.fetchall() country_base = 'countryGen' for k in k_vals: for s in bin_size: class_supp = 'classSuppressSet' + str(k) + 'P' country_file = country_base + s + 'k' yob_fname = 'yobbin'+ s + 'k' fbin_fname = 'postbin' + s + 'k' full_suppress = 'fullSuppress' + s + s + str(k) + 'P' de_id_fname = 'deIdFile' + s + s + str(k) + 'P.csv' csvBuilder.main(user_course_list, de_id_fname, full_suppress, country_file, yob_fname, fbin_fname)
#!/usr/bin/env python
"""
Given some set of databases, count the number of distinct values in a particular field.
Right now, it is assumed that the table is named "source"
"""
import sys


def count_fields(c, fname, tbl_name):
    """
    Return the number of distinct values of a field in a table.

    :param c: database cursor-like object providing execute() and fetchall()
    :param fname: column name, or a comma-separated list of column names, in
        which case distinct combinations of those columns are counted
    :param tbl_name: name of the table to query
    :return: number of rows produced by grouping the table on fname

    fname and tbl_name are spliced directly into the SQL text (identifiers
    cannot be bound as query parameters), so they must come from trusted code.
    """
    db_command = "Select " + fname + " from " + tbl_name + " group by " + fname
    c.execute(db_command)
    return len(c.fetchall())


if __name__ == '__main__':
    # Imported here rather than at module level so that count_fields can be
    # imported without the deIdentify package being installed.
    from deIdentify.Archive.de_id_functions import dbOpen

    # Every command-line argument is the name of a database to report on.
    for db_name in sys.argv[1:]:
        print(db_name)
        c = dbOpen(db_name)
        print('Number of unique user ids = ', str(count_fields(c, 'user_id', 'source')))
        print('Number of unique user, class combinations = ', str(count_fields(c, 'user_id, course_id', 'source')))
        print('')
dropClass(classlist, cdict[classlist], cdict, c, suppressionset, use_suppress) print count print len(suppressionset) sfile = open(outname, 'w') pickle.dump(suppressionset, sfile) sfile.close() if __name__ == '__main__': if len(sys.argv) < 3: print 'Usage: courseSetDeidentify.py dbname k-value {P,R}' print 'where P is suppression on level of participation and R is random' dbName = sys.argv[1] outname = 'classSuppressSet' k_val = int(sys.argv[2]) if sys.argv[3] == 'R': suppress_method = 'R' else: suppress_method = "P" outname = outname + str(k_val) + suppress_method c = dbOpen(dbName) try: c.execute("Create Index user_id_idx on source ('user_id')") except: pass c.execute('SELECT user_id, course_id FROM source ORDER BY user_id') user_class_list = c.fetchall() main(user_class_list, c, k_val, suppress_method, outname) dbClose(c)
#!/usr/bin/env python
"""
Report the total number of records and the number of distinct users found in
the "source" table of the database named on the command line.
"""
from deIdentify.Archive.de_id_functions import dbOpen
import sys

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: %s dbname' % sys.argv[0])
        sys.exit(1)

    dbname = sys.argv[1]
    c = dbOpen(dbname)
    c.execute('Select course_id, user_id from source')
    all_rec = c.fetchall()

    # Column 1 of each row is user_id (second column of the SELECT above).
    users = {rec[1] for rec in all_rec}

    # Single-argument print() calls behave the same under Python 2 and 3; the
    # explicit '%s %d' reproduces the space the print statement put between
    # its two items.
    print('%s %d' % ('Total number of records = ', len(all_rec)))
    print('%s %d' % ('Total number of users = ', len(users)))