Beispiel #1
0
def get_all_tile_quadkeys(project_id, verbose=True):
    """Return the quadkeys of all zoom-18 tiles contained in a project's boundary.

    The result is cached at ``<working_dir_path>/<project_id>/all_tiles.pickled``.
    When that file exists it is unpickled and returned immediately. Otherwise
    the project list is downloaded, the project's boundary polygon is converted
    to pixel coordinates, and every tile whose pixel box is contained in that
    polygon is collected, cached and returned.

    Args:
        project_id: Identifier of the project; it is passed through ``int()``
            when matched against the downloaded project list.
        verbose: When true, progress messages are written to stdout while the
            tile list is being calculated (nothing is printed on a cache hit).

    Returns:
        A list with one ``bing_maps.tile_to_quadkey(tile, 18)`` result per
        tile inside the project boundary.

    Raises:
        Exception: If the downloaded project list contains no feature, or more
            than one feature, with the requested project id.
    """
    all_tiles_path = os.path.join(working_dir_path, str(project_id),
                                  'all_tiles.pickled')

    # NOTE: unpickling executes arbitrary code if the file has been tampered
    # with; this is only acceptable because the cache is written by this
    # function itself.
    if os.path.isfile(all_tiles_path):
        with open(all_tiles_path, 'rb') as f:
            return pickle.load(f)

    if verbose:
        sys.stdout.write('Calculating all tiles in project (#' +
                         str(project_id) + ')... ')
        sys.stdout.flush()

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(os.path.dirname(all_tiles_path), exist_ok=True)

    with urllib.request.urlopen(
            'http://mapswipe.geog.uni-heidelberg.de/data/projects.geojson'
    ) as url:
        data = json.loads(url.read().decode())

    # Convert once instead of once per feature.
    target_id = int(project_id)
    features = [
        x for x in data['features']
        if x['properties']['project_id'] == target_id
    ]

    if not features:
        raise Exception('Could not find feature.')
    elif len(features) > 1:
        raise Exception('Found multiple projects with the target id.')

    feature = features[0]

    # Only the first ring of the geometry is used. Each point is swapped from
    # (point[0], point[1]) to (point[1], point[0]) before being handed to
    # latlong_to_pixel.
    bounding_poly = shapely.geometry.Polygon(
        [bing_maps.latlong_to_pixel((point[1], point[0]), 18)
         for point in feature['geometry']['coordinates'][0]])

    # Candidate tiles come from the polygon's bounding box; only those whose
    # pixel box is contained in the polygon itself are kept.
    ret_val = [
        bing_maps.tile_to_quadkey(tile, 18)
        for tile in bing_maps.tiles_in_pixel_box(bounding_poly.bounds)
        if bounding_poly.contains(bing_maps.tile_to_pixel_box(tile))
    ]

    # Write to a temporary file and rename it into place, so that an
    # interrupted run can never leave a truncated cache file that would make
    # every later call fail inside pickle.load().
    tmp_path = '{}.{}.tmp'.format(all_tiles_path, os.getpid())
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(ret_val, f)
        os.replace(tmp_path, all_tiles_path)
    finally:
        # Only still present if dumping or renaming failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    if verbose:
        sys.stdout.write('Done\n')
        sys.stdout.flush()

    return ret_val
def get_all_tile_votes_for_projects(project_ids):
    """Accumulate the votes of every tile listed in the given projects.

    For each project id the project details file is read and parsed as JSON.
    Every entry contributes a ``TileVotes`` built from its ``yes_count``,
    ``maybe_count`` and ``bad_imagery_count`` fields, which is added (with
    ``+=``) to the running value stored under the entry's quadkey.

    Args:
        project_ids: Iterable of project ids to read.

    Returns:
        A ``defaultdict`` mapping quadkey to the accumulated ``TileVotes``;
        keys that were never seen default to ``TileVotes(0, 0, 0)``.
    """
    def _no_votes():
        return TileVotes(0, 0, 0)

    totals = defaultdict(_no_votes)

    for pid in project_ids:
        with mapswipe.get_project_details_file(pid) as details_file:
            raw_details = details_file.read()

        for entry in json.loads(raw_details):
            tile_xy = (int(entry['task_x']), int(entry['task_y']))
            zoom = int(entry['task_z'])
            key = bing_maps.tile_to_quadkey(tile_xy, zoom)

            totals[key] += TileVotes(
                entry['yes_count'],
                entry['maybe_count'],
                entry['bad_imagery_count'],
            )

    return totals
Beispiel #3
0
def main():
    """Generate a three-class tile dataset ("built", "bad_imagery", "empty") from MapSwipe projects.

    Parses the command line, collects candidate tiles for each class from every requested project, then repeatedly
    picks one tile of each class and writes the trio to the subset ("train", "valid" or "test") chosen by a
    ProportionalAllocator configured with proportions 80/10/10. Train and valid tiles are written into per-class
    sub-directories; test tiles are written directly into "test" and their labels are appended to
    "test/solutions.csv".

    If the output directory already exists the user is asked whether to delete it; the program exits unless the
    answer compares equal to 'yes'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('project_ids', metavar='<project_id>', type=int, nargs='+',
                        help='Project IDs to use to generate the dataset.')
    parser.add_argument('--bing-maps-key', '-k', metavar='<bing_maps_api_key>', required=True,
                        help='Bing Maps API key to use to download map tiles.')
    parser.add_argument('--output-dir', '-o', metavar='<output_directory>', default='dataset',
                        help='Output directory to generate dataset in. Default: "dataset".')
    parser.add_argument('--seed', '-s', metavar='<random_seed>', default=0, type=int,
                        help='The random seed to use when picking tiles, this allows generated datasets to be '
                             'reproducible. Default: 0.')
    # NOTE(review): the loop below compares this value against allocator.total; confirm that this matches the
    # "per class, per subset" wording of the help text.
    parser.add_argument('--max-size', '-n', metavar='<class_size>', default=sys.maxsize, type=int,
                        help='The maximum total number of items per class to output for each output subset ("train", '
                             '"valid", etc.)')

    args = parser.parse_args()

    output_dir = args.output_dir
    if os.path.exists(output_dir):
        # NOTE(review): this relies on query_yes_no returning the string 'yes'; if the helper returns a bool the
        # directory is never deleted -- confirm against its definition.
        if query_yes_no('Directory {} already exists. Delete?'.format(output_dir), default='no') == 'yes':
            shutil.rmtree(output_dir)
        else:
            # sys.exit() rather than the interactive-only exit() helper injected by the site module.
            sys.exit()

    bing_maps_client = bing_maps.BingMapsClient(args.bing_maps_key)

    # Minimum number of votes a tile needs to qualify for the corresponding class.
    built_floor = 1
    bad_imagery_floor = 1
    built_tiles = set()
    bad_imagery_tiles = set()
    empty_tiles = set()

    for project_id in args.project_ids:
        print('Preselecting tiles from project (#{})... '.format(project_id))

        with mapswipe.get_project_details_file(project_id, verbose=True) as project_details_file:
            project_details = json.load(project_details_file)

        annotated_tiles = set()
        # Normally, we'd just iterate through project details and do our stuff. But project_details is a big blob,
        # so instead we dismantle it as we go, in the hope that we'll lower our overall memory usage.
        while project_details:
            task = project_details.pop()
            quadkey = bing_maps.tile_to_quadkey((int(task['task_x']), int(task['task_y'])), int(task['task_z']))
            annotated_tiles.add(quadkey)

            # "built": at least built_floor yes votes and no maybe / bad-imagery votes.
            # "bad_imagery": at least bad_imagery_floor bad-imagery votes and no yes / maybe votes.
            # Any other task is only recorded as annotated, which keeps it out of the "empty" class.
            if task['yes_count'] >= built_floor and task['maybe_count'] == 0 and task['bad_imagery_count'] == 0:
                built_tiles.add(quadkey)
            elif task['yes_count'] == 0 and task['maybe_count'] == 0 and task['bad_imagery_count'] >= bad_imagery_floor:
                bad_imagery_tiles.add(quadkey)

        # Tiles of the project that have no entry in the project details are treated as empty.
        all_tiles = set(mapswipe.get_all_tile_quadkeys(project_id))
        empty_tiles |= (all_tiles - annotated_tiles)

    classes_and_proportions = {'train': 80, 'valid': 10, 'test': 10}
    allocator = ProportionalAllocator(classes_and_proportions)

    tile_classes = ['built', 'bad_imagery', 'empty']
    for x in itertools.product(['train', 'valid'], tile_classes):
        os.makedirs(os.path.join(output_dir, *x))

    os.makedirs(os.path.join(output_dir, 'test'))

    # We have to store all of the tiles in sets. This is because some project boundaries overlap, and so it's
    # possible for us to pick a tile twice. We then have to sort these sets, so that when we shuffle them we start
    # from a consistent base. We allow the user to set a random seed for the shuffling, so this means that it's
    # possible to reproduce a dataset.

    random.seed(args.seed)

    # The shuffles must stay in this order (built, bad imagery, empty): they all draw from the same seeded
    # generator, so reordering them would change the generated dataset.
    built_tiles = sorted(built_tiles)
    random.shuffle(built_tiles)

    bad_imagery_tiles = sorted(bad_imagery_tiles)
    random.shuffle(bad_imagery_tiles)

    empty_tiles = sorted(empty_tiles)
    random.shuffle(empty_tiles)

    with open(os.path.join(output_dir, 'test', 'solutions.csv'), 'w') as solutions_file:
        # Presumably pick_from removes the tile it tries from the list and returns None when the tile cannot be
        # used (otherwise this loop could not terminate) -- confirm against its definition.
        while built_tiles and bad_imagery_tiles and empty_tiles and allocator.total < args.max_size:
            sample_built = pick_from(built_tiles, bing_maps_client)
            sample_bad_imagery = pick_from(bad_imagery_tiles, bing_maps_client)
            sample_empty = pick_from(empty_tiles, bing_maps_client)

            # Only complete trios are written, which keeps the three classes the same size.
            if sample_built is not None and sample_bad_imagery is not None and sample_empty is not None:
                clazz = allocator.allocate()

                if clazz == 'test':
                    # Test tiles are not sorted into class directories; their labels go to solutions.csv.
                    output_tile(sample_built, os.path.join(output_dir, clazz))
                    output_tile(sample_bad_imagery, os.path.join(output_dir, clazz))
                    output_tile(sample_empty, os.path.join(output_dir, clazz))

                    solutions_file.write(sample_built + ',built\n')
                    solutions_file.write(sample_bad_imagery + ',bad_imagery\n')
                    solutions_file.write(sample_empty + ',empty\n')
                    solutions_file.flush()
                else:
                    output_tile(sample_built, os.path.join(output_dir, clazz, 'built'))
                    output_tile(sample_bad_imagery, os.path.join(output_dir, clazz, 'bad_imagery'))
                    output_tile(sample_empty, os.path.join(output_dir, clazz, 'empty'))

            sys.stdout.write('\rTiles picked: {} in each of {}'.format(allocator, tile_classes))
            # The progress line has no trailing newline, so flush explicitly or it may not appear until the end.
            sys.stdout.flush()

        sys.stdout.write('\n')
def main():
    """Generate a three-class tile dataset ("built", "bad_imagery", "empty"), one MapSwipe project at a time.

    Parses the command line, then for each requested project classifies the project's not-yet-seen tiles, shuffles
    them reproducibly, and repeatedly picks one tile of each class. Every trio is written to the subset ("train",
    "valid" or "test") chosen by a per-project ProportionalAllocator configured with proportions 80/10/10. Train and
    valid tiles go into per-class sub-directories; test tiles go into the test directory (or "test/test" when
    --inner-test-dir-for-keras is given) and their labels are appended to "test/solutions.csv".

    Processing stops once --max-size tile groups have been written across all projects. If the output directory
    already exists the user is asked whether to delete it; the program exits unless the answer compares equal to
    'yes'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('project_ids', metavar='<project_id>', type=int, nargs='+',
                        help='Project IDs to use to generate the dataset.')
    parser.add_argument('--bing-maps-key', '-k', metavar='<bing_maps_api_key>', required=True,
                        help='Bing Maps API key to use to download map tiles.')
    parser.add_argument('--output-dir', '-o', metavar='<output_directory>', default='dataset',
                        help='Output directory to generate dataset in. Default: "dataset".')
    parser.add_argument('--seed', '-s', metavar='<random_seed>', default=0, type=int,
                        help='The random seed to use when picking tiles, this allows generated datasets to be '
                             'reproducible. Default: 0.')
    # NOTE(review): the code below applies this cap to the number of tile groups written across all projects and
    # subsets (total_tile_groups_written), which is not what the help text describes.
    parser.add_argument('--max-size', '-n', metavar='<class_size>', default=sys.maxsize, type=int,
                        help='The maximum total number of items per class to output for each output subset ("train", '
                             '"valid", etc.)')
    parser.add_argument('--inner-test-dir-for-keras', action='store_true',
                        help='Create an extra directory inside the test directory (useful when working with Keras)')

    args = parser.parse_args()

    output_dir = args.output_dir
    if os.path.exists(output_dir):
        # NOTE(review): this relies on query_yes_no returning the string 'yes'; if the helper returns a bool the
        # directory is never deleted -- confirm against its definition.
        if query_yes_no('Directory {} already exists. Delete?'.format(output_dir), default='no') == 'yes':
            shutil.rmtree(output_dir)
        else:
            # sys.exit() rather than the interactive-only exit() helper injected by the site module.
            sys.exit()

    # Build the relative path with os.path.join instead of hard-coding a '/' separator.
    if args.inner_test_dir_for_keras:
        inner_test_dir = os.path.join('test', 'test')
    else:
        inner_test_dir = 'test'

    bing_maps_client = bing_maps.BingMapsClient(args.bing_maps_key)

    # Minimum number of votes a tile needs to qualify for the corresponding class.
    built_floor = 1
    bad_imagery_floor = 1

    classes_and_proportions = {'train': 80, 'valid': 10, 'test': 10}

    tile_classes = ['built', 'bad_imagery', 'empty']
    for x in itertools.product(['train', 'valid'], tile_classes):
        os.makedirs(os.path.join(output_dir, *x))

    os.makedirs(os.path.join(output_dir, inner_test_dir))

    # We have to store all of the tiles in a set to stop us from selecting the same tile twice if it appears in
    # multiple projects (sometimes the boundaries overlap a little)
    all_tiles = set()

    total_tile_groups_written = 0

    # solutions.csv always lives in the outer "test" directory, even when the tiles go to "test/test".
    with open(os.path.join(output_dir, 'test', 'solutions.csv'), 'w') as solutions_file:
        for project_id in args.project_ids:
            if total_tile_groups_written >= args.max_size:
                break

            # A fresh allocator per project: the proportions are applied to each project separately.
            allocator = ProportionalAllocator(classes_and_proportions)

            print('Selecting tiles from project (#{})... '.format(project_id))
            fresh_project_tiles = set(
                mapswipe.get_all_tile_quadkeys(project_id)) - all_tiles

            built_tiles = set()
            bad_imagery_tiles = set()
            empty_tiles = set()

            with mapswipe.get_project_details_file(project_id, verbose=True) as project_details_file:
                project_details = json.load(project_details_file)

            annotated_tiles = set()
            # Normally, we'd just iterate through project details and do our stuff. But project_details is a big
            # blob, so instead we dismantle it as we go, in the hope that we'll lower our overall memory usage.
            while project_details:
                task = project_details.pop()
                quadkey = bing_maps.tile_to_quadkey(
                    (int(task['task_x']), int(task['task_y'])), int(task['task_z']))
                # Skip tasks outside this project's tile list and tiles already claimed by an earlier project.
                if quadkey not in fresh_project_tiles:
                    continue

                annotated_tiles.add(quadkey)

                # "built": at least built_floor yes votes and no maybe / bad-imagery votes.
                # "bad_imagery": at least bad_imagery_floor bad-imagery votes and no yes / maybe votes.
                # Any other task is only recorded as annotated, which keeps it out of the "empty" class.
                if task['yes_count'] >= built_floor and task['maybe_count'] == 0 and task['bad_imagery_count'] == 0:
                    built_tiles.add(quadkey)
                elif task['yes_count'] == 0 and task['maybe_count'] == 0 and task['bad_imagery_count'] >= bad_imagery_floor:
                    bad_imagery_tiles.add(quadkey)

            # Fresh tiles that have no entry in the project details are treated as empty.
            empty_tiles |= (fresh_project_tiles - annotated_tiles)

            all_tiles |= fresh_project_tiles

            # We allow the user to set a random seed for the shuffling, so this means that it's possible to
            # reproduce a dataset. Re-seeding for every project makes each project's shuffles start from the same
            # generator state.
            random.seed(args.seed)

            # The data structures are sets, so we sort them into lists to get a stable order before shuffling (so
            # that you can generate the same dataset with just a random seed). Obviously this goes out the window
            # if the ground truth data changes at MapSwipe's end. The shuffles must stay in this order because they
            # all draw from the same seeded generator.
            built_tiles = sorted(built_tiles)
            random.shuffle(built_tiles)

            bad_imagery_tiles = sorted(bad_imagery_tiles)
            random.shuffle(bad_imagery_tiles)

            empty_tiles = sorted(empty_tiles)
            random.shuffle(empty_tiles)

            # Presumably pick_from removes the tile it tries from the list and returns None when the tile cannot be
            # used (otherwise this loop could not terminate) -- confirm against its definition.
            while built_tiles and bad_imagery_tiles and empty_tiles and total_tile_groups_written < args.max_size:
                sample_built = pick_from(built_tiles, bing_maps_client)
                sample_bad_imagery = pick_from(
                    bad_imagery_tiles, bing_maps_client)
                sample_empty = pick_from(empty_tiles, bing_maps_client)

                # Only complete trios are written, which keeps the three classes the same size.
                if sample_built is not None and sample_bad_imagery is not None and sample_empty is not None:
                    clazz = allocator.allocate()

                    total_tile_groups_written += 1
                    if clazz == 'test':
                        # Test tiles are not sorted into class directories; their labels go to solutions.csv.
                        test_dir = os.path.join(output_dir, inner_test_dir)
                        output_tile(sample_built, test_dir)
                        output_tile(sample_bad_imagery, test_dir)
                        output_tile(sample_empty, test_dir)

                        solutions_file.write(sample_built + ',built\n')
                        solutions_file.write(
                            sample_bad_imagery + ',bad_imagery\n')
                        solutions_file.write(sample_empty + ',empty\n')
                        solutions_file.flush()
                    else:
                        output_tile(sample_built, os.path.join(
                            output_dir, clazz, 'built'))
                        output_tile(sample_bad_imagery, os.path.join(
                            output_dir, clazz, 'bad_imagery'))
                        output_tile(sample_empty, os.path.join(
                            output_dir, clazz, 'empty'))

                # The figures shown are for the current project only, since the allocator is per project.
                sys.stdout.write('\r\tTiles picked: {} in each of {}. Total: {}'.format(
                    allocator, tile_classes, allocator.total * 3))
                # The progress line has no trailing newline, so flush explicitly or it may not appear until the end.
                sys.stdout.flush()

            sys.stdout.write('\n')