def main():
    """Pseudonymise the users of a dataset.

    Every user id is replaced by its ``sha256_hash``; every user name is
    replaced by the hash of the corresponding id. The 'user' reference of
    each action and appInstanceResource is rewritten explicitly, then
    ``find_and_replace`` is run over the whole dataset for names and ids.
    Finally the 'users' list is reduced to the list of hashed ids.
    """
    args = parse_arguments()
    dataset = load_dataset(args.dataset_path)

    users = dataset['data']['users']

    # raw id -> its hash, plus the raw id under 'regex' (presumably the
    # search pattern consumed by find_and_replace)
    id_substitutions = {}
    for user in users:
        raw_id = user['_id']
        id_substitutions[raw_id] = {
            'substitution': sha256_hash(raw_id),
            'regex': raw_id,
        }

    # raw name -> hash of the owning user's id
    name_substitutions = {}
    for user in users:
        raw_name = user['name']
        name_substitutions[raw_name] = {
            'substitution': id_substitutions[user['_id']]['substitution'],
            'regex': get_regex(raw_name),
        }

    # snapshot taken now: ids discovered below (referenced by records but
    # absent from 'users') must not end up in the final users list
    hashed_user_ids = [
        entry['substitution'] for entry in id_substitutions.values()
    ]

    # rewrite the 'user' reference of actions, then of appInstanceResources
    for collection in ('actions', 'appInstanceResources'):
        for record in dataset['data'][collection]:
            if 'user' not in record:
                continue

            raw_id = record['user']
            entry = id_substitutions.get(raw_id)
            if entry is None:
                # id unknown to the users list: register it so the generic
                # search and replace below also covers it
                entry = {
                    'substitution': sha256_hash(raw_id),
                    'regex': raw_id,
                }
                id_substitutions[raw_id] = entry

            record['user'] = entry['substitution']

    # generic pass over the whole dataset: names first, then ids
    find_and_replace(dataset, name_substitutions)
    find_and_replace(dataset, id_substitutions)

    # the users are now represented by their hashes only
    dataset['data']['users'] = hashed_user_ids

    save_dataset(dataset, args.output_path)
def main():
    """Template entry point: load the dataset and save it back unmodified."""
    cli_args = parse_arguments()
    loaded = load_dataset(cli_args.dataset_path)

    # write your dataset changes here

    save_dataset(loaded, cli_args.output_path)
def main():
    """Copy the dataset to the output path if ``positive_integer`` is valid.

    The dataset itself is not modified; the script only validates its extra
    ``positive_integer`` command-line argument.

    Raises:
        ValueError: if ``positive_integer`` is negative.
    """
    args = parse_arguments([{'name': 'positive_integer', 'type': int}])

    dataset = load_dataset(args.dataset_path)

    # only strictly negative values are rejected; zero is accepted
    if args.positive_integer < 0:
        raise ValueError(
            'positive_integer must not be negative, got {}'.format(
                args.positive_integer))

    save_dataset(dataset, args.output_path)
def main():
    """Suppress the selected fields of a dataset.

    The ``fields`` argument is a JSON string; it is decoded and passed,
    together with the loaded dataset, to ``iterate_and_suppress``.
    """
    cli_args = parse_arguments([{'name': 'fields', 'type': str}])
    loaded = load_dataset(cli_args.dataset_path)

    # the selection is decoded after the dataset has been loaded
    iterate_and_suppress(loaded, json.loads(cli_args.fields))

    save_dataset(loaded, cli_args.output_path)
def main():
    """Copy the dataset to the output path if the expected password is given.

    The dataset itself is not modified; the script only checks its extra
    ``password`` command-line argument against the literal 'PASSWORD'.

    Raises:
        ValueError: if the supplied password does not match.
    """
    args = parse_arguments([{'name': 'password', 'type': str}])

    dataset = load_dataset(args.dataset_path)

    if args.password != 'PASSWORD':
        # deliberately do not echo the supplied value in the error message
        raise ValueError('invalid password')

    save_dataset(dataset, args.output_path)
def main():
    """Shuffle the selected attributes of a dataset.

    The ``fields`` argument is a JSON string. Once decoded it is passed to
    ``find_selected_arrays``; each item it returns provides an 'array' and a
    'field_selection', which are handed to ``shuffle_attributes``.
    """
    args = parse_arguments([{'name': 'fields', 'type': str}])

    dataset = load_dataset(args.dataset_path)
    fields = json.loads(args.fields)

    relevant_arrays = find_selected_arrays(dataset, fields)
    for array in relevant_arrays:
        # plain call statement: the return value is not used
        shuffle_attributes(array['array'], array['field_selection'])

    save_dataset(dataset, args.output_path)
def main():
    """Hash the 'user' reference of every action and appInstanceResource.

    Records without a 'user' key are left untouched.
    """
    cli_args = parse_arguments()
    loaded = load_dataset(cli_args.dataset_path)

    # actions are processed first, then appInstanceResources
    for collection in ('actions', 'appInstanceResources'):
        for record in loaded['data'][collection]:
            if 'user' not in record:
                continue
            record['user'] = sha256_hash(record['user'])

    save_dataset(loaded, cli_args.output_path)
# Example #8
# 0
def main():
    """Hash all user identifiers and strip every other user attribute.

    The 'user' reference of each action and appInstanceResource is replaced
    by its ``sha256_hash``. The 'users' list is rebuilt so that each entry
    holds only the hashed '_id'; users without an '_id' are dropped.
    """
    cli_args = parse_arguments()
    loaded = load_dataset(cli_args.dataset_path)

    # hash the 'user' id of every action, then of every appInstanceResource
    for collection in ('actions', 'appInstanceResources'):
        for record in loaded['data'][collection]:
            if 'user' not in record:
                continue
            record['user'] = sha256_hash(record['user'])

    # keep nothing but the hashed '_id' of each user
    loaded['data']['users'] = [
        {'_id': sha256_hash(user['_id'])}
        for user in loaded['data']['users']
        if '_id' in user
    ]

    save_dataset(loaded, cli_args.output_path)
# Example #9
# 0
def _dominant_geolocation_per_user(actions):
    """Return each user's most frequent geolocation.

    Only actions carrying a 'geolocation' entry contribute. Missing
    'country', 'region' or 'city' keys are recorded as ``None``. When several
    locations are equally frequent for a user, the one encountered first is
    kept (ordering guarantee of ``Counter.most_common``).

    Returns:
        dict mapping user -> {'country': ..., 'region': ..., 'city': ...}
    """
    tallies = {}
    for action in actions:
        if 'geolocation' not in action:
            continue

        user = action['user']
        tally = tallies.setdefault(user, Counter())

        geoloc = action['geolocation']
        location = (
            geoloc.get('country'),
            geoloc.get('region'),
            geoloc.get('city'),
        )
        tally[location] += 1

    mapping = {}
    for user, tally in tallies.items():
        # a tally only exists once it has received a location, so
        # most_common(1) always yields exactly one entry
        [((country, region, city), _)] = tally.most_common(1)
        mapping[user] = {
            'country': country,
            'region': region,
            'city': city,
        }
    return mapping


def _suppress_rare_geolocation_levels(geolocation_mapping, k):
    """Blank out, in place, every geolocation level shared by fewer than k users.

    Users are counted per country, per (country, region) and per
    (country, region, city). For each user, any level whose count is below
    ``k`` is replaced by the empty string.
    """
    country_counts = Counter()
    region_counts = Counter()
    city_counts = Counter()

    # count first, on the untouched values ...
    for geo in geolocation_mapping.values():
        country, region, city = geo['country'], geo['region'], geo['city']
        country_counts[country] += 1
        region_counts[(country, region)] += 1
        city_counts[(country, region, city)] += 1

    # ... then suppress; the original values are read before being overwritten
    for geo in geolocation_mapping.values():
        country, region, city = geo['country'], geo['region'], geo['city']
        if city_counts[(country, region, city)] < k:
            geo['city'] = ''
        if region_counts[(country, region)] < k:
            geo['region'] = ''
        if country_counts[country] < k:
            geo['country'] = ''


def main():
    """Generalise action geolocations so that each value covers at least k users.

    Each user is assigned their most frequent (country, region, city). Any of
    the three levels that is shared by fewer than ``k`` users is replaced by
    an empty string. Every action that has a 'geolocation' then gets its
    user's generalised location instead of the original entry.
    """
    args = parse_arguments([{'name': 'k', 'type': int}])

    dataset = load_dataset(args.dataset_path)

    actions = dataset['data']['actions']

    geolocation_mapping = _dominant_geolocation_per_user(actions)
    _suppress_rare_geolocation_levels(geolocation_mapping, args.k)

    # update with new values; all actions of one user share the same dict
    for action in actions:
        if 'geolocation' in action:
            action['geolocation'] = geolocation_mapping[action['user']]

    save_dataset(dataset, args.output_path)