def main():
    """Replace every user id and user name in the dataset with a hash.

    Each user id is mapped to ``sha256_hash(id)``; each user name is mapped
    to the same substitution as its owner's id. The ``user`` field of every
    action and appInstanceResource is rewritten directly, then
    ``find_and_replace`` is run over the whole dataset for names and ids.
    Finally ``dataset['data']['users']`` is replaced by the list of
    substitutions of the users that were originally listed there.
    """
    args = parse_arguments()
    dataset = load_dataset(args.dataset_path)
    known_users = dataset['data']['users']

    # id substitution = hash(id)
    id_substitutions = {}
    for user in known_users:
        user_id = user['_id']
        id_substitutions[user_id] = {
            'substitution': sha256_hash(user_id),
            'regex': user_id,
        }

    # name substitution = id substitution of the same user
    name_substitutions = {}
    for user in known_users:
        name = user['name']
        name_substitutions[name] = {
            'substitution': id_substitutions[user['_id']]['substitution'],
            'regex': get_regex(name),
        }

    # Snapshot now: ids discovered below (not present in the users list)
    # must not end up in the final users field.
    hashed_known_users = [
        entry['substitution'] for entry in id_substitutions.values()
    ]

    # replace the user ids in actions, then in appInstanceResources
    for collection in ('actions', 'appInstanceResources'):
        for record in dataset['data'][collection]:
            if 'user' not in record:
                continue
            user_id = record['user']
            if user_id not in id_substitutions:
                # id referenced by a record but absent from the users list
                id_substitutions[user_id] = {
                    'substitution': sha256_hash(user_id),
                    'regex': user_id,
                }
            record['user'] = id_substitutions[user_id]['substitution']

    # generically search and replace occurrences of usernames and ids
    find_and_replace(dataset, name_substitutions)
    find_and_replace(dataset, id_substitutions)

    # the users fields are now just the hashes
    dataset['data']['users'] = hashed_known_users

    save_dataset(dataset, args.output_path)
def main():
    """Template entry point: load the dataset and save it unchanged.

    Reads the dataset from ``dataset_path`` and writes it to ``output_path``;
    dataset modifications are meant to be inserted between the two calls.
    """
    cli_args = parse_arguments()
    dataset = load_dataset(cli_args.dataset_path)

    # write your dataset changes here

    save_dataset(dataset, cli_args.output_path)
def main():
    """Copy the dataset to the output path after validating an integer argument.

    Declares an extra ``positive_integer`` argument of type ``int``. The
    dataset is loaded first, then the argument is checked, then the dataset
    is saved unmodified.

    Raises:
        ValueError: if ``positive_integer`` is below zero. Zero is accepted,
            exactly as before.
    """
    args = parse_arguments([{'name': 'positive_integer', 'type': int}])
    dataset = load_dataset(args.dataset_path)

    if args.positive_integer < 0:
        # ValueError is still an Exception, so existing handlers keep
        # working, but the failure now says what went wrong.
        raise ValueError(
            'positive_integer must not be negative, got {}'.format(
                args.positive_integer))

    save_dataset(dataset, args.output_path)
def main():
    """Suppress the selected fields of the dataset and save the result.

    The ``fields`` argument is a JSON document (passed as a string); it is
    decoded and handed to ``iterate_and_suppress`` together with the loaded
    dataset.
    """
    fields_argument = {'name': 'fields', 'type': str}
    args = parse_arguments([fields_argument])
    dataset = load_dataset(args.dataset_path)

    # The field selection arrives as JSON text on the command line.
    iterate_and_suppress(dataset, json.loads(args.fields))

    save_dataset(dataset, args.output_path)
def main():
    """Copy the dataset to the output path if the given password matches.

    Declares an extra ``password`` argument of type ``str`` and compares it
    with the hard-coded literal ``'PASSWORD'``. The dataset is loaded first,
    then the password is checked, then the dataset is saved unmodified.

    Raises:
        ValueError: if the supplied password is not ``'PASSWORD'``.
    """
    args = parse_arguments([{'name': 'password', 'type': str}])
    dataset = load_dataset(args.dataset_path)

    if args.password != 'PASSWORD':
        # ValueError is still an Exception, so existing handlers keep
        # working; the supplied value is deliberately not echoed back.
        raise ValueError('invalid password')

    save_dataset(dataset, args.output_path)
def main():
    """Shuffle the selected attributes of the dataset and save the result.

    The ``fields`` argument is a JSON document (passed as a string). It is
    decoded and used by ``find_selected_arrays`` to locate the relevant
    arrays; ``shuffle_attributes`` is then called once per entry with its
    ``'array'`` and ``'field_selection'`` values.
    """
    args = parse_arguments([{'name': 'fields', 'type': str}])
    dataset = load_dataset(args.dataset_path)
    fields = json.loads(args.fields)

    # NOTE(review): the return value is ignored, so shuffle_attributes
    # presumably mutates the array in place - confirm against its definition.
    for selection in find_selected_arrays(dataset, fields):
        shuffle_attributes(selection['array'], selection['field_selection'])

    save_dataset(dataset, args.output_path)
def main():
    """Hash the ``user`` field of every action and appInstanceResource.

    Records without a ``user`` key are left untouched. The modified dataset
    is written to ``output_path``.
    """
    args = parse_arguments()
    dataset = load_dataset(args.dataset_path)

    # Same treatment for both collections, actions first.
    for collection in ('actions', 'appInstanceResources'):
        for record in dataset['data'][collection]:
            if 'user' not in record:
                continue
            record['user'] = sha256_hash(record['user'])

    save_dataset(dataset, args.output_path)
def main():
    """Hash all user ids and strip every other user attribute.

    The ``user`` field of each action and appInstanceResource is replaced by
    its hash (records without the field are skipped). The users list is then
    rebuilt so that each user that has an ``_id`` keeps only that key, with
    the hashed id as value; users without ``_id`` are dropped.
    """
    args = parse_arguments()
    dataset = load_dataset(args.dataset_path)

    # hash the 'user' id for every action, then every appInstanceResource
    for collection in ('actions', 'appInstanceResources'):
        for record in dataset['data'][collection]:
            if 'user' not in record:
                continue
            record['user'] = sha256_hash(record['user'])

    # hash the user '_id' for every user and remove every other attribute
    dataset['data']['users'] = [
        {'_id': sha256_hash(user['_id'])}
        for user in dataset['data']['users']
        if '_id' in user
    ]

    save_dataset(dataset, args.output_path)
def _most_common_geolocation_per_user(actions):
    """Map each user to the geolocation seen most often in their actions.

    Only actions with a ``geolocation`` key are considered. The result maps
    user -> ``{'country': ..., 'region': ..., 'city': ...}``; a component
    missing from the source geolocation is stored as ``None``.

    Raises:
        KeyError: if an action has a ``geolocation`` but no ``user``.
    """
    geolocations_per_user = {}
    for action in actions:
        if 'geolocation' not in action:
            continue
        user = action['user']
        geoloc = action['geolocation']
        geolocations_per_user.setdefault(user, []).append(
            (geoloc.get('country'), geoloc.get('region'), geoloc.get('city')))

    mapping = {}
    for user, geolocations in geolocations_per_user.items():
        # Every list holds at least one entry, so most_common(1) is never
        # empty; ties are resolved by Counter (first encountered wins).
        (country, region, city), _ = Counter(geolocations).most_common(1)[0]
        mapping[user] = {'country': country, 'region': region, 'city': city}
    return mapping


def _group_users_by_location(geolocation_mapping):
    """Build a country -> region -> city tree with user counts per level.

    Each country node is ``{'count', 'regions'}``, each region node is
    ``{'count', 'cities'}`` and each city node is ``{'count', 'users'}``.
    """
    grouped = {}
    for user, geo in geolocation_mapping.items():
        country_group = grouped.setdefault(
            geo['country'], {'count': 0, 'regions': {}})
        country_group['count'] += 1

        region_group = country_group['regions'].setdefault(
            geo['region'], {'count': 0, 'cities': {}})
        region_group['count'] += 1

        city_group = region_group['cities'].setdefault(
            geo['city'], {'count': 0, 'users': []})
        city_group['count'] += 1
        city_group['users'].append(user)
    return grouped


def _suppress_rare_locations(grouped, geolocation_mapping, k):
    """Blank out, in place, every location level shared by fewer than k users.

    For each user, ``city``, ``region`` and ``country`` in
    ``geolocation_mapping`` are independently set to ``''`` when the
    corresponding group in ``grouped`` has a count below ``k``.
    """
    for country_group in grouped.values():
        country_is_rare = country_group['count'] < k
        for region_group in country_group['regions'].values():
            region_is_rare = region_group['count'] < k
            for city_group in region_group['cities'].values():
                city_is_rare = city_group['count'] < k
                for user in city_group['users']:
                    geo = geolocation_mapping[user]
                    if city_is_rare:
                        geo['city'] = ''
                    if region_is_rare:
                        geo['region'] = ''
                    if country_is_rare:
                        geo['country'] = ''


def main():
    """Generalise action geolocations so each value is shared by >= k users.

    Declares an extra integer argument ``k``. Every user is assigned their
    most frequent (country, region, city); any of those three values that is
    represented by fewer than ``k`` users is replaced by an empty string.
    The ``geolocation`` of every action that has one is then replaced by its
    user's resulting country/region/city dict, and the dataset is saved.

    Raises:
        KeyError: if an action has a ``geolocation`` but no ``user``.
    """
    args = parse_arguments([{'name': 'k', 'type': int}])
    dataset = load_dataset(args.dataset_path)
    actions = dataset['data']['actions']

    # get most represented geolocation for each user
    geolocation_mapping = _most_common_geolocation_per_user(actions)

    # group users by country then region then city
    grouped = _group_users_by_location(geolocation_mapping)

    # remove each value that is not represented at least k times
    _suppress_rare_locations(grouped, geolocation_mapping, args.k)

    # update with new values
    for action in actions:
        if 'geolocation' in action:
            # Give each action its own copy so that actions of the same user
            # do not alias one mutable dict.
            action['geolocation'] = dict(geolocation_mapping[action['user']])

    save_dataset(dataset, args.output_path)