Example #1
0
def ingest_spacenet(data_dir, cities, width, height, overwrite=False,
                    train_fraction=0.9, percent_blank=0.5, annot_save=None):
    """
    Ingests the SpaceNet dataset. Performs the following ops:
    0. Converts every *.tif image and its geojson annotation of each city
       (via convert_image_annot) into a png image and a json annotation.
    1. Keeps only the examples accepted by is_eligible_example.
    2. Shuffles the kept examples and splits them into train and validation sets.
    3. Writes the manifest files.
    4. Writes the SSD configuration files and the top-level configuration file.
    5. Optionally pickles the collected per-city data.

    Arguments:
        data_dir (string): root folder holding one sub-folder per city. Manifest and
                           configuration files are written here.
        cities (list of string): city folder names to process. 'AOI_1_Rio' uses a
                                 different folder layout than the other cities.
        width (int): target image width.
        height (int): target image height.
        overwrite (bool): re-convert files and re-write manifests even if they exist.
        train_fraction (float): fraction of the eligible examples used for training.
        percent_blank (float): threshold handed to is_eligible_example.
        annot_save (string): if not None, path where the per-city data dict is pickled.
    """

    warnings.showwarning = _warning  # monkeypatch to not emit code
    hw = '{}x{}'.format(height, width)
    ext = '.png'
    data = {}

    train_manifest = os.path.join(data_dir, 'train_{}.csv'.format(hw))
    val_manifest = os.path.join(data_dir, 'val_{}.csv'.format(hw))

    if os.path.exists(train_manifest) and os.path.exists(val_manifest) and not overwrite:
        print("Manifest files already found, skipping ingest.")
        print("Use --overwrite flag to force re-ingest.")
        return

    for city in cities:

        if city == 'AOI_1_Rio':  # Rio has different dataset structure
            img_folder = os.path.join(data_dir, city, 'processedData', '3band')
            annot_folder = os.path.join(data_dir, city, 'processedData', 'vectorData', 'geoJson')

            target_img_folder = os.path.join(data_dir, city, 'processedData',
                                             '3band-{}'.format(hw))
            target_annot_folder = os.path.join(data_dir, city, 'processedData',
                                               'json-{}'.format(hw))
            test_img_folder = os.path.join(data_dir, city,
                                           'processedData', '3band-{}-gt'.format(hw))

            # helper function for converting image files to their corresponding annotation file
            # e.g. 3band_013022223133_Public_img3593.tif -> 013022223133_Public_img3593_Geo.geojson
            def img_to_annot(x):
                return x.replace('3band_', '').replace('.tif', '_Geo.geojson')

        else:
            prefix = 'RGB-PanSharpen'
            img_folder = os.path.join(data_dir, city, prefix)
            annot_folder = os.path.join(data_dir, city, 'geojson', 'buildings')

            # create data folders to save converted images and annotations
            target_img_folder = os.path.join(data_dir, city, '{}-{}'.format(prefix, hw))
            target_annot_folder = os.path.join(data_dir, city, 'json-{}'.format(hw))
            test_img_folder = os.path.join(data_dir, city, '{}-{}-gt'.format(prefix, hw))

            # helper function for converting image files to their corresponding annotation file
            # e.g. RGB-PanSharpen_AOI_2_Vegas_img9.tif -> buildings_AOI_2_Vegas_img9.geojson
            def img_to_annot(x):
                return x.replace(prefix, 'buildings').replace('.tif', '.geojson')

        print('Processing {}'.format(city))

        make_dir(target_img_folder)
        make_dir(target_annot_folder)
        make_dir(test_img_folder)

        images = glob.glob(os.path.join(img_folder, "*.tif"))
        assert len(images) > 0, 'No Images found in {}'.format(img_folder)

        data[city] = {'manifest': [], 'annotation': [],
                      'img_folder': img_folder,
                      'annot_folder': annot_folder}

        for image in tqdm(images):

            img_file = os.path.basename(image)
            annot_file = img_to_annot(img_file)
            annot = os.path.join(annot_folder, annot_file)
            assert os.path.exists(annot), "{} not found.".format(annot)

            # target image has extension=ext, and target_annot has extension JSON
            target_image = os.path.join(target_img_folder, os.path.splitext(img_file)[0] + ext)
            target_annot = os.path.join(target_annot_folder,
                                        os.path.splitext(annot_file)[0] + '.json')

            if not os.path.exists(target_image) or not os.path.exists(target_annot) or overwrite:
                # convert to the requested size so the files match the 'hw' tag used in
                # the folder, manifest and config names
                annotation = convert_image_annot(image_path=image, annot_path=annot,
                                                 target_image=target_image,
                                                 target_annot=target_annot,
                                                 width=width, height=height, box_shrink=0.8,
                                                 debug_dir=test_img_folder)
            else:
                warnings.warn(
                    'File for {} already exists, skipping processing.Use --overwrite to force.'.
                    format(city), Warning)
                # reuse the annotation written by an earlier run
                with open(target_annot) as annot_fp:
                    annotation = json.load(annot_fp)

            # filter on percent_blank, as well as presence of any objects
            if is_eligible_example(annotation, percent_blank):
                data[city]['annotation'].append(annotation)
                data[city]['manifest'].append((target_image, target_annot))

    # write manifest files

    # build manifest list from each city's manifest
    manifest = []
    for city in cities:
        manifest.extend(data[city]['manifest'])

    ntrain = int(np.round(len(manifest) * train_fraction))

    # fixed seed keeps the train/val split reproducible between runs
    np.random.seed(0)
    np.random.shuffle(manifest)

    util.create_manifest(train_manifest, manifest[:ntrain], data_dir)
    util.create_manifest(val_manifest, manifest[ntrain:], data_dir)

    # write SSD CONFIG
    ssd_config = get_ssd_config((height, width))
    ssd_config_path = os.path.join(data_dir, 'spacenet_ssd_{}.cfg'.format(hw))
    util.write_ssd_config(ssd_config, ssd_config_path, True)

    # write SSD VAL CONFIG
    ssd_config_val = get_ssd_config((height, width), True)
    ssd_config_path_val = os.path.join(data_dir, 'spacenet_ssd_{}_val.cfg'.format(hw))
    util.write_ssd_config(ssd_config_val, ssd_config_path_val, True)

    config_path = os.path.join(data_dir, 'spacenet_{}.cfg'.format(hw))

    config = {'manifest': '[train:{}, val:{}]'.format(train_manifest, val_manifest),
              'manifest_root': data_dir,
              'epochs': 230,
              'height': height,
              'width': width,
              'ssd_config': '[train:{}, val:{}]'.format(ssd_config_path, ssd_config_path_val)
              }

    util.write_config(config, config_path)

    # write annotation pickle
    if annot_save is not None:
        # pickle streams are bytes, so the file has to be opened in binary mode
        with open(annot_save, 'wb') as annot_fp:
            pickle.dump(data, annot_fp)
Example #2
0
def ingest_kitti(input_dir, out_dir, img_reshape=(300, 994),
                 train_percent=90, overwrite=False, skip_unzip=False):
    """
    Ingests the KITTI dataset. Performs the following ops:
    0. Unzips the files into output directory (unless skip_unzip is set).
    1. Passes every training image through util.resize_image.
    2. Converts annotations to json format.
    3. Splits the training data into train and validation sets.
    4. Writes the manifest files.
    5. Writes the SSD configuration files and the top-level configuration file.

    Arguments:
        input_dir (string): path to folder with KITTI zip files.
        out_dir (string): path to unzip KITTI data
        img_reshape (tuple of int): (height, width) recorded in the file names and
                                    configuration files. The default of (300, 994)
                                    maintains the KITTI image aspect ratio.
        train_percent (float): percent of data to use for training.
        overwrite (bool): overwrite existing files
        skip_unzip (bool): do not unzip the archives; the data is expected to be
                           present under out_dir/kitti already.
    """

    assert img_reshape is not None, "Target image reshape required."
    hw = '{}x{}'.format(img_reshape[0], img_reshape[1])

    zip_files = ['data_object_image_2.zip', 'data_object_label_2.zip']

    root_dir = os.path.join(out_dir, 'kitti')
    train_manifest = os.path.join(root_dir, 'train_{}.csv'.format(hw))
    val_manifest = os.path.join(root_dir, 'val_{}.csv'.format(hw))

    if os.path.exists(train_manifest) and os.path.exists(val_manifest) and not overwrite:
        print("Manifest files already found, skipping ingest.")
        print("Use --overwrite flag to force re-ingest.")
        return

    util.make_dir(root_dir)

    if skip_unzip is False:
        util.unzip_files(zip_files, input_dir, root_dir)

    img_folder = os.path.join(root_dir, 'training', 'image_2')
    annot_folder = os.path.join(root_dir, 'training', 'label_2')

    target_img_folder = os.path.join(root_dir, 'training', 'image_2-converted')
    target_annot_folder = os.path.join(root_dir, 'training', 'label_2-json')

    # a tag is the image file name without folder and extension; the image and its
    # annotation share it
    tags = [os.path.basename(os.path.splitext(path)[0])
            for path in glob.glob(os.path.join(img_folder, '*.png'))]
    assert len(tags) > 0, "No images found in {}".format(img_folder)

    util.make_dir(target_img_folder)
    util.make_dir(target_annot_folder)

    manifest = []

    for tag in tqdm(tags):

        image = os.path.join(img_folder, tag + '.png')
        annot = os.path.join(annot_folder, tag + '.txt')
        assert os.path.exists(image), "{} not found.".format(image)
        assert os.path.exists(annot), "{} not found.".format(annot)

        target_image = os.path.join(target_img_folder, tag + '.png')
        target_annot = os.path.join(target_annot_folder, tag + '.json')

        # NOTE(review): both helpers receive img_reshape=None, so the requested size is
        # only recorded in the configuration files below - presumably the actual
        # resizing is left to the data loader; confirm.
        convert_annot_to_json(annot, image, target_annot, difficult=True, img_reshape=None)
        util.resize_image(image, target_image, img_reshape=None)

        manifest.append((target_image, target_annot))

    # shuffle files and split into training and validation set.
    # fixed seed keeps the split reproducible between runs
    np.random.seed(0)
    np.random.shuffle(manifest)

    # int() keeps the slice index valid when train_percent is given as a float
    train_count = int((len(manifest) * train_percent) // 100)
    train = manifest[:train_count]
    val = manifest[train_count:]

    util.create_manifest(train_manifest, train, root_dir)
    util.create_manifest(val_manifest, val, root_dir)

    # write SSD CONFIG
    ssd_config = get_ssd_config(img_reshape)
    ssd_config_path = os.path.join(root_dir, 'kitti_ssd_{}.cfg'.format(hw))
    util.write_ssd_config(ssd_config, ssd_config_path, True)

    # write SSD VAL CONFIG
    ssd_config_val = get_ssd_config(img_reshape, True)
    ssd_config_path_val = os.path.join(root_dir, 'kitti_ssd_{}_val.cfg'.format(hw))
    util.write_ssd_config(ssd_config_val, ssd_config_path_val, True)

    config_path = os.path.join(root_dir, 'kitti_{}.cfg'.format(hw))
    config = {'manifest': '[train:{}, val:{}]'.format(train_manifest, val_manifest),
              'manifest_root': root_dir,
              'epochs': 100,
              'height': img_reshape[0],
              'width': img_reshape[1],
              'ssd_config': '[train:{}, val:{}]'.format(ssd_config_path, ssd_config_path_val)
              }

    util.write_config(config, config_path)
Example #3
0
def ingest_pascal(data_dir,
                  out_dir,
                  img_reshape=(300, 300),
                  overwrite=False,
                  skip_untar=False):
    """
    Ingests the PASCAL VOC 2007 and 2012 datasets. Performs the following ops:
    0. Extracts the tar files into the output directory (unless skip_untar is set).
    1. Reads the trainval/test index files of each year that are present.
    2. Converts the XML annotations to json and passes every indexed image
       through util.resize_image.
    3. Writes the manifest files: 'trainval' entries become the training
       manifest, 'test' entries become the validation manifest.
    4. Writes the SSD configuration files and the top-level configuration file.

    Arguments:
        data_dir (string): path to folder with the PASCAL VOC tar files.
        out_dir (string): path to extract the data to; everything is written
                          below out_dir/VOCdevkit.
        img_reshape (tuple of int): (height, width) recorded in the file names
                                    and configuration files.
        overwrite (bool): re-run the ingest even if the manifest files exist.
        skip_untar (bool): do not extract the tar files.
    """

    assert img_reshape is not None, "Target image reshape required."
    hw = '{}x{}'.format(img_reshape[0], img_reshape[1])

    datasets = ['VOC2007', 'VOC2012']
    tar_files = {
        'VOC2007': ['VOCtrainval_06-Nov-2007.tar', 'VOCtest_06-Nov-2007.tar'],
        'VOC2012': ['VOCtrainval_11-May-2012.tar']
    }

    index_name = {'trainval': 'trainval.txt', 'test': 'test.txt'}
    manifest = {'trainval': [], 'test': []}

    root_dir = os.path.join(out_dir, 'VOCdevkit')

    train_manifest = os.path.join(root_dir, 'train_{}.csv'.format(hw))
    val_manifest = os.path.join(root_dir, 'val_{}.csv'.format(hw))

    if os.path.exists(train_manifest) and os.path.exists(
            val_manifest) and not overwrite:
        print("Manifest files already found, skipping ingest.")
        print("Use --overwrite flag to force re-ingest.")
        return

    for year in datasets:
        tags = {'trainval': [], 'test': []}

        # define paths
        if skip_untar is False:
            tarfiles = [os.path.join(data_dir, tar) for tar in tar_files[year]]
            extract_tarfiles(tarfiles, out_dir)

        # read the index files and build a list of tags to process
        # in PASCALVOC, each tag (e.g. '000032') refers to an image (000032.jpg)
        # and an annotation XML file (000032.xml)
        for split, index_basename in index_name.items():
            index_file = os.path.join(root_dir, year, 'ImageSets', 'Main',
                                      index_basename)
            # an index file that is missing for this year is simply skipped
            if os.path.exists(index_file):
                tag_list = get_tag_list(index_file)
                tags[split].extend(tag_list)
                print('Found {} images in {}'.format(len(tag_list),
                                                     index_file))

        img_folder = os.path.join(root_dir, year, 'JPEGImages')
        annot_folder = os.path.join(root_dir, year, 'Annotations')

        # create data folders to save converted images and annotations
        target_img_folder = os.path.join(root_dir, year,
                                         'JPEGImages-converted')
        target_annot_folder = os.path.join(root_dir, year, 'Annotations-json')

        print('Processing {}'.format(year))

        util.make_dir(target_img_folder)
        util.make_dir(target_annot_folder)

        # process all the tags in our index files.
        all_tags = tags['trainval'] + tags['test']

        # sets give constant-time membership tests inside the per-tag loop
        trainval_tags = set(tags['trainval'])
        test_tags = set(tags['test'])

        for tag in tqdm(all_tags):

            image = os.path.join(img_folder, tag + '.jpg')
            annot = os.path.join(annot_folder, tag + '.xml')
            assert os.path.exists(image), "{} not found.".format(image)
            assert os.path.exists(annot), "{} not found.".format(annot)

            target_image = os.path.join(target_img_folder, tag + '.jpg')
            target_annot = os.path.join(target_annot_folder, tag + '.json')

            # convert the annotations to json, including difficult objects
            convert_xml_to_json(annot,
                                target_annot,
                                difficult=True,
                                img_reshape=None)
            util.resize_image(image, target_image, img_reshape=None)

            # trainval takes precedence if a tag is listed in both index files
            if tag in trainval_tags:
                manifest['trainval'].append((target_image, target_annot))
            elif tag in test_tags:
                manifest['test'].append((target_image, target_annot))

    # fixed seed keeps the training order reproducible between runs
    np.random.seed(0)
    np.random.shuffle(manifest['trainval'])

    util.create_manifest(train_manifest, manifest['trainval'], root_dir)
    util.create_manifest(val_manifest, manifest['test'], root_dir)

    # write SSD CONFIG
    ssd_config = get_ssd_config(img_reshape)
    ssd_config_path = os.path.join(root_dir, 'pascalvoc_ssd_{}.cfg'.format(hw))
    util.write_ssd_config(ssd_config, ssd_config_path, True)

    # write SSD VAL CONFIG
    ssd_config_val = get_ssd_config(img_reshape, True)
    ssd_config_path_val = os.path.join(root_dir,
                                       'pascalvoc_ssd_{}_val.cfg'.format(hw))
    util.write_ssd_config(ssd_config_val, ssd_config_path_val, True)

    config_path = os.path.join(root_dir, 'pascalvoc_{}.cfg'.format(hw))

    config = {
        'manifest':
        '[train:{}, val:{}]'.format(train_manifest, val_manifest),
        'manifest_root':
        root_dir,
        'epochs':
        230,
        'height':
        img_reshape[0],
        'width':
        img_reshape[1],
        'ssd_config':
        '[train:{}, val:{}]'.format(ssd_config_path, ssd_config_path_val)
    }

    util.write_config(config, config_path)
Example #4
0
def ingest_pascal(data_dir, out_dir, img_reshape=(300, 300), overwrite=False, skip_untar=False):
    """
    Ingests the PASCAL VOC 2007 and 2012 datasets. Performs the following ops:
    0. Extracts the tar files into the output directory (unless skip_untar is set).
    1. Reads the trainval/test index files of each year that are present.
    2. Converts the XML annotations to json and passes every indexed image
       through util.resize_image.
    3. Writes the manifest files: 'trainval' entries become the training
       manifest, 'test' entries become the validation manifest.
    4. Writes the SSD configuration files and the top-level configuration file.

    Arguments:
        data_dir (string): path to folder with the PASCAL VOC tar files.
        out_dir (string): path to extract the data to; everything is written
                          below out_dir/VOCdevkit.
        img_reshape (tuple of int): (height, width) recorded in the file names
                                    and configuration files.
        overwrite (bool): re-run the ingest even if the manifest files exist.
        skip_untar (bool): do not extract the tar files.
    """

    assert img_reshape is not None, "Target image reshape required."
    hw = '{}x{}'.format(img_reshape[0], img_reshape[1])

    datasets = ['VOC2007', 'VOC2012']
    tar_files = {'VOC2007': ['VOCtrainval_06-Nov-2007.tar', 'VOCtest_06-Nov-2007.tar'],
                 'VOC2012': ['VOCtrainval_11-May-2012.tar']}

    index_name = {'trainval': 'trainval.txt', 'test': 'test.txt'}
    manifest = {'trainval': [], 'test': []}

    root_dir = os.path.join(out_dir, 'VOCdevkit')

    train_manifest = os.path.join(root_dir, 'train_{}.csv'.format(hw))
    val_manifest = os.path.join(root_dir, 'val_{}.csv'.format(hw))

    if os.path.exists(train_manifest) and os.path.exists(val_manifest) and not overwrite:
        print("Manifest files already found, skipping ingest.")
        print("Use --overwrite flag to force re-ingest.")
        return

    for year in datasets:
        tags = {'trainval': [], 'test': []}

        # define paths
        if skip_untar is False:
            tarfiles = [os.path.join(data_dir, tar) for tar in tar_files[year]]
            extract_tarfiles(tarfiles, out_dir)

        # read the index files and build a list of tags to process
        # in PASCALVOC, each tag (e.g. '000032') refers to an image (000032.jpg)
        # and an annotation XML file (000032.xml)
        for split, index_basename in index_name.items():
            index_file = os.path.join(root_dir, year, 'ImageSets', 'Main', index_basename)
            # an index file that is missing for this year is simply skipped
            if os.path.exists(index_file):
                tag_list = get_tag_list(index_file)
                tags[split].extend(tag_list)
                print('Found {} images in {}'.format(len(tag_list), index_file))

        img_folder = os.path.join(root_dir, year, 'JPEGImages')
        annot_folder = os.path.join(root_dir, year, 'Annotations')

        # create data folders to save converted images and annotations
        target_img_folder = os.path.join(root_dir, year, 'JPEGImages-converted')
        target_annot_folder = os.path.join(root_dir, year, 'Annotations-json')

        print('Processing {}'.format(year))

        util.make_dir(target_img_folder)
        util.make_dir(target_annot_folder)

        all_tags = tags['trainval'] + tags['test']  # process all the tags in our index files.

        # sets give constant-time membership tests inside the per-tag loop
        trainval_tags = set(tags['trainval'])
        test_tags = set(tags['test'])

        for tag in tqdm(all_tags):

            image = os.path.join(img_folder, tag + '.jpg')
            annot = os.path.join(annot_folder, tag + '.xml')
            assert os.path.exists(image), "{} not found.".format(image)
            assert os.path.exists(annot), "{} not found.".format(annot)

            target_image = os.path.join(target_img_folder, tag + '.jpg')
            target_annot = os.path.join(target_annot_folder, tag + '.json')

            # convert the annotations to json, including difficult objects
            convert_xml_to_json(annot, target_annot, difficult=True, img_reshape=None)
            util.resize_image(image, target_image, img_reshape=None)

            # trainval takes precedence if a tag is listed in both index files
            if tag in trainval_tags:
                manifest['trainval'].append((target_image, target_annot))
            elif tag in test_tags:
                manifest['test'].append((target_image, target_annot))

    # fixed seed keeps the training order reproducible between runs
    np.random.seed(0)
    np.random.shuffle(manifest['trainval'])

    util.create_manifest(train_manifest, manifest['trainval'], root_dir)
    util.create_manifest(val_manifest, manifest['test'], root_dir)

    # write SSD CONFIG
    ssd_config = get_ssd_config(img_reshape)
    ssd_config_path = os.path.join(root_dir, 'pascalvoc_ssd_{}.cfg'.format(hw))
    util.write_ssd_config(ssd_config, ssd_config_path, True)

    # write SSD VAL CONFIG
    ssd_config_val = get_ssd_config(img_reshape, True)
    ssd_config_path_val = os.path.join(root_dir, 'pascalvoc_ssd_{}_val.cfg'.format(hw))
    util.write_ssd_config(ssd_config_val, ssd_config_path_val, True)

    config_path = os.path.join(root_dir, 'pascalvoc_{}.cfg'.format(hw))

    config = {'manifest': '[train:{}, val:{}]'.format(train_manifest, val_manifest),
              'manifest_root': root_dir,
              'epochs': 230,
              'height': img_reshape[0],
              'width': img_reshape[1],
              'ssd_config': '[train:{}, val:{}]'.format(ssd_config_path, ssd_config_path_val)
              }

    util.write_config(config, config_path)