def main():
    """Validate that no user name from the original dataset leaks into the
    processed dataset."""
    # extract arguments
    args = parse_arguments([{'name': 'fields', 'type': str}])
    processed = load_dataset(args.dataset_path)
    original = load_dataset(args.origin_path)
    selected_fields = json.loads(args.fields)

    # collect every user name found in the selected fields of the original
    known_names = []
    iterate_and_apply(original, selected_fields, known_names.append)

    # detect names by traversing the dataset
    hits = []
    find_names(processed, known_names, hits)

    if not hits:
        notify_validation_result(ValidationOutcome.SUCCESS,
                                 'No user name detected')
        return

    # notify the detected user names (at most ten, with an ellipsis)
    report = ["Detected user names: "]
    report.extend(' - %s' % hit for hit in hits[:10])
    if len(hits) > 10:
        report.append(' - ...')
    notify_validation_result(ValidationOutcome.FAILURE, '\n'.join(report))
def main():
    """Check that the dataset is at least k-anonymous over the selected
    quasi-identifiers."""
    args = parse_arguments([
        {'name': 'quasi_identifiers', 'type': str},
        {'name': 'k', 'type': int}
    ])
    dataset = load_dataset(args.dataset_path)
    quasi_identifiers = json.loads(args.quasi_identifiers)

    relevant_arrays = find_selected_arrays(dataset, quasi_identifiers)
    if not relevant_arrays:
        notify_validation_result(
            ValidationOutcome.WARNING, "No quasi identifier was found")
        return

    # compute k-anonymity for each array containing QI's; the dataset's
    # k-anonymity is the lowest of them
    k = min(compute_kanonymity(entry['array'], entry['field_selection'])
            for entry in relevant_arrays)
    if k >= args.k:
        notify_validation_result(
            ValidationOutcome.SUCCESS, '%d-anonymous (>= %d)' % (k, args.k))
    else:
        notify_validation_result(
            ValidationOutcome.FAILURE, '%d-anonymous (< %d)' % (k, args.k))
def main():
    """Pseudonymize the dataset: replace user ids and user names everywhere
    with sha256 hashes of the ids."""
    args = parse_arguments()
    dataset = load_dataset(args.dataset_path)
    users = dataset['data']['users']

    # prepare id and name substitutions
    # id substitution = hash(id)
    id_substitutions = {
        user['_id']: {
            'substitution': sha256_hash(user['_id']),
            'regex': user['_id']
        }
        for user in users
    }
    name_substitutions = {
        user['name']: {
            # name substitution = id substitution
            'substitution': id_substitutions[user['_id']]['substitution'],
            'regex': get_regex(user['name'])
        }
        for user in users
    }
    # snapshot taken now: ids first discovered below (in actions or
    # appInstanceResources) are hashed for replacement but deliberately NOT
    # added to the final users list, matching the original behavior
    substitutions = [v['substitution'] for v in id_substitutions.values()]

    # replace the user ids in actions and appInstanceResources
    # (was duplicated inline twice; factored into one helper)
    _replace_user_ids(dataset['data']['actions'], id_substitutions)
    _replace_user_ids(dataset['data']['appInstanceResources'],
                      id_substitutions)

    # generically search and replace occurrences of usernames and ids
    find_and_replace(dataset, name_substitutions)
    find_and_replace(dataset, id_substitutions)

    # the users fields are now just the hashes
    dataset['data']['users'] = substitutions
    save_dataset(dataset, args.output_path)


def _replace_user_ids(items, id_substitutions):
    """Replace each item's 'user' id with its hash, registering a fresh
    substitution for ids not already known from the users collection."""
    for item in items:
        if 'user' in item:
            userid = item['user']
            if userid not in id_substitutions:
                id_substitutions[userid] = {
                    'substitution': sha256_hash(userid),
                    'regex': userid
                }
            item['user'] = id_substitutions[userid]['substitution']
def main():
    """Check that the dataset is at least l-diverse for the given
    quasi-identifier / sensitive-attribute selection."""
    args = parse_arguments([
        {'name': 'quasi_identifiers', 'type': str},
        {'name': 'sensitive_attributes', 'type': str},
        {'name': 'l', 'type': int}
    ])
    dataset = load_dataset(args.dataset_path)
    quasi_identifiers = json.loads(args.quasi_identifiers)
    sensitive_attributes = json.loads(args.sensitive_attributes)

    relevant_arrays = find_selected_arrays(
        dataset, quasi_identifiers, sensitive_attributes)
    if not relevant_arrays:
        notify_validation_result(
            ValidationOutcome.WARNING, "No sensitive attribute was found")
        return

    # compute l-diversity for each array with SA's; the dataset's
    # l-diversity is the smallest of them
    diversity = min(
        compute_ldiversity(entry['array'], entry['quasi_identifiers'],
                           entry['sensitive_attributes'])
        for entry in relevant_arrays)
    if diversity >= args.l:
        notify_validation_result(
            ValidationOutcome.SUCCESS,
            '%d-diversified (>= %d)' % (diversity, args.l))
    else:
        notify_validation_result(
            ValidationOutcome.FAILURE,
            '%d-diversified (< %d)' % (diversity, args.l))
def main():
    """Transformation template: load the dataset, apply changes, save it."""
    args = parse_arguments()
    dataset = load_dataset(args.dataset_path)

    # write your dataset changes here (the dataset is currently written out
    # unchanged)

    save_dataset(dataset, args.output_path)
def main():
    """Validate the 'positive_integer' parameter, then pass the dataset
    through unchanged."""
    args = parse_arguments([{'name': 'positive_integer', 'type': int}])
    dataset = load_dataset(args.dataset_path)
    # NOTE(review): zero is accepted — only strictly negative values are
    # rejected, despite the parameter's name. Confirm this is intended.
    if args.positive_integer < 0:
        # was a bare `raise Exception` with no message; ValueError (an
        # Exception subclass, so existing handlers still catch it) with a
        # message makes the failure diagnosable
        raise ValueError(
            'positive_integer must be >= 0, got %d' % args.positive_integer)
    save_dataset(dataset, args.output_path)
def main():
    """Suppress (remove) the selected fields throughout the dataset."""
    args = parse_arguments([{'name': 'fields', 'type': str}])
    dataset = load_dataset(args.dataset_path)
    # the fields parameter arrives as a JSON-encoded field selection
    selection = json.loads(args.fields)
    iterate_and_suppress(dataset, selection)
    save_dataset(dataset, args.output_path)
def main():
    """Gate the transformation behind a password parameter."""
    args = parse_arguments([{'name': 'password', 'type': str}])
    dataset = load_dataset(args.dataset_path)
    # NOTE(review): the expected password is hardcoded in source; consider
    # moving it to configuration or an environment variable.
    if args.password != 'PASSWORD':
        # was a bare `raise Exception`; PermissionError (an Exception
        # subclass, so existing handlers still catch it) with a message —
        # never echo the supplied password
        raise PermissionError('Invalid password')
    save_dataset(dataset, args.output_path)
def main():
    """Shuffle the values of the selected fields within each matching array."""
    args = parse_arguments([{'name': 'fields', 'type': str}])
    dataset = load_dataset(args.dataset_path)
    fields = json.loads(args.fields)
    relevant_arrays = find_selected_arrays(dataset, fields)
    for array in relevant_arrays:
        # the original statement ended with a stray trailing comma, turning
        # the call into a discarded one-element tuple; removed
        shuffle_attributes(array['array'], array['field_selection'])
    save_dataset(dataset, args.output_path)
def main():
    """Hash the 'user' reference of every action and appInstanceResource."""
    args = parse_arguments()
    dataset = load_dataset(args.dataset_path)
    data = dataset['data']
    for record in data['actions']:
        if 'user' in record:
            record['user'] = sha256_hash(record['user'])
    for record in data['appInstanceResources']:
        if 'user' in record:
            record['user'] = sha256_hash(record['user'])
    save_dataset(dataset, args.output_path)
def main():
    # Parameters for the algorithm (dataset_path and output_path always come
    # by default); access them as args.parameter_name. Avoid editing the
    # parameters here by hand — use the dedicated utility and the code will
    # change accordingly.
    args = parse_arguments()

    # the JSON dataset, loaded as a plain python dictionary
    dataset = load_dataset(args.dataset_path)

    # write your validation code here

    # notify the validation outcome: one of ValidationOutcome.SUCCESS,
    # ValidationOutcome.WARNING or ValidationOutcome.FAILURE, with the
    # information to display in info
    outcome = ValidationOutcome.SUCCESS
    info = ""
    notify_validation_result(outcome, info)
def main():
    """Anonymize user references: hash the 'user' id on actions and
    appInstanceResources, and reduce every user record to its hashed '_id'."""
    args = parse_arguments()
    dataset = load_dataset(args.dataset_path)
    data = dataset['data']

    # hash the 'user' id on both collections that carry one
    for record in data['actions']:
        if 'user' in record:
            record['user'] = sha256_hash(record['user'])
    for record in data['appInstanceResources']:
        if 'user' in record:
            record['user'] = sha256_hash(record['user'])

    # hash the user '_id' for every user and drop every other attribute
    # (users without an '_id' are removed entirely)
    data['users'] = [
        {'_id': sha256_hash(user['_id'])}
        for user in data['users']
        if '_id' in user
    ]

    save_dataset(dataset, args.output_path)
def main():
    """Warn when the dataset contains attributes that commonly hold personal
    or otherwise sensitive data."""
    args = parse_arguments()
    dataset = load_dataset(args.dataset_path)
    data = dataset['data']

    # verify in 'actions' if 'data' and 'geolocation' are present
    has_action_data = _any_non_empty(data['actions'], 'data')
    has_action_geolocation = _any_non_empty(data['actions'], 'geolocation')

    # verify in 'users' if 'name' is present — kept separate from the helper
    # because this check deliberately only excludes None and '' (not {})
    has_user_name = any(
        user['name'] is not None and user['name'] != ''
        for user in data['users'] if 'name' in user)

    # verify in 'appInstances' if 'settings' is present
    has_AP_settings = _any_non_empty(data['appInstances'], 'settings')

    # verify in 'appInstanceResources' if 'data' is present
    has_API_data = _any_non_empty(data['appInstanceResources'], 'data')

    # issue a warning if any of these potentially dangerous attributes
    # are present
    findings = [
        (has_action_data, '- actions > data'),
        (has_action_geolocation, '- actions > geolocation'),
        (has_user_name, '- users > name'),
        (has_AP_settings, '- appInstances > settings'),
        (has_API_data, '- appInstanceResources > data'),
    ]
    if any(flag for flag, _ in findings):
        messages = ['Potentially dangerous attributes: ']
        messages.extend(label for flag, label in findings if flag)
        notify_validation_result(ValidationOutcome.WARNING,
                                 '\n'.join(messages))
    else:
        notify_validation_result(ValidationOutcome.SUCCESS,
                                 'No potentially dangerous attributes')


def _any_non_empty(records, key):
    """Return True when any record carries a value for `key` that is not
    None, '' or {} (JSON null / empty string / empty object)."""
    for record in records:
        if key in record:
            value = record[key]
            if value is not None and value != '' and value != {}:
                return True
    return False
def main():
    """Generalize action geolocations so every reported (country, region,
    city) value is shared by at least k users; rarer values are blanked."""
    args = parse_arguments([{'name': 'k', 'type': int}])
    dataset = load_dataset(args.dataset_path)
    actions = dataset['data']['actions']

    # get the geolocations for each user
    geolocations_per_user = _collect_geolocations(actions)
    # get most represented geolocation for each user
    geolocation_mapping = _most_common_geolocations(geolocations_per_user)
    # group users by country then region then city
    grouped = _group_users(geolocation_mapping)
    # remove each value that is not represented at least k times
    _suppress_rare_values(grouped, geolocation_mapping, args.k)

    # update actions with the (possibly generalized) per-user geolocation.
    # NOTE(review): all actions of a given user end up sharing ONE mapping
    # dict object, exactly as in the original code — confirm downstream
    # serialization is the only consumer.
    for action in actions:
        if 'geolocation' in action:
            action['geolocation'] = geolocation_mapping[action['user']]

    save_dataset(dataset, args.output_path)


def _collect_geolocations(actions):
    """Gather every (country, region, city) triple observed per user.

    Assumes any action carrying a 'geolocation' also carries a 'user' —
    TODO confirm against the dataset schema.
    """
    geolocations_per_user = {}
    for action in actions:
        if 'geolocation' in action:
            geoloc = action['geolocation']
            triple = (geoloc.get('country'), geoloc.get('region'),
                      geoloc.get('city'))
            geolocations_per_user.setdefault(action['user'], []).append(triple)
    return geolocations_per_user


def _most_common_geolocations(geolocations_per_user):
    """Map each user to their most frequently observed geolocation."""
    mapping = {}
    for user, geolocations in geolocations_per_user.items():
        most_common = Counter(geolocations).most_common(1)
        if len(most_common) == 1:
            [((country, region, city), _)] = most_common
            mapping[user] = {
                'country': country,
                'region': region,
                'city': city,
            }
    return mapping


def _group_users(geolocation_mapping):
    """Build a country > region > city tree with a user count at each level
    and the list of users at the city leaves."""
    grouped = {}
    for user, geo in geolocation_mapping.items():
        country_group = grouped.setdefault(
            geo['country'], {'count': 0, 'regions': {}})
        country_group['count'] += 1
        region_group = country_group['regions'].setdefault(
            geo['region'], {'count': 0, 'cities': {}})
        region_group['count'] += 1
        city_group = region_group['cities'].setdefault(
            geo['city'], {'count': 0, 'users': []})
        city_group['count'] += 1
        city_group['users'].append(user)
    return grouped


def _suppress_rare_values(grouped, geolocation_mapping, k):
    """Blank out ('') each city/region/country value represented by fewer
    than k users, mutating geolocation_mapping in place."""
    for country_group in grouped.values():
        country_user_count = country_group.get('count', 0)
        for region_group in country_group.get('regions', {}).values():
            region_user_count = region_group.get('count', 0)
            for city_group in region_group.get('cities', {}).values():
                city_user_count = city_group.get('count', 0)
                users = city_group.get('users', [])
                if city_user_count < k:
                    for user in users:
                        geolocation_mapping[user]['city'] = ''
                if region_user_count < k:
                    for user in users:
                        geolocation_mapping[user]['region'] = ''
                if country_user_count < k:
                    for user in users:
                        geolocation_mapping[user]['country'] = ''