def ingest_spacenet(data_dir, cities, width, height, overwrite=False,
                    train_fraction=0.9, percent_blank=0.5, annot_save=None):
    """
    Ingest the SpaceNet building-footprint dataset.

    For each city: converts the source TIFF images to PNG at (height, width),
    converts the geojson building annotations to JSON, filters examples via
    is_eligible_example(percent_blank), then writes train/val manifests and
    SSD configuration files under data_dir.

    Arguments:
        data_dir (string): root folder holding the per-city SpaceNet data.
        cities (list of string): city folder names to process (e.g. 'AOI_1_Rio').
        width (int): target image width.
        height (int): target image height.
        overwrite (bool): force re-ingest even if manifests already exist.
        train_fraction (float): fraction of eligible examples used for training.
        percent_blank (float): blank-area threshold passed to is_eligible_example.
        annot_save (string or None): optional path to pickle the collected data.
    """
    warnings.showwarning = _warning  # monkeypatch to not emit code context in warnings

    hw = '{}x{}'.format(height, width)
    ext = '.png'
    data = {}

    train_manifest = os.path.join(data_dir, 'train_{}.csv'.format(hw))
    val_manifest = os.path.join(data_dir, 'val_{}.csv'.format(hw))

    if os.path.exists(train_manifest) and os.path.exists(val_manifest) and not overwrite:
        print("Manifest files already found, skipping ingest.")
        print("Use --overwrite flag to force re-ingest.")
        return

    for city in cities:
        if city == 'AOI_1_Rio':  # Rio has different dataset structure
            img_folder = os.path.join(data_dir, city, 'processedData', '3band')
            annot_folder = os.path.join(data_dir, city, 'processedData',
                                        'vectorData', 'geoJson')

            target_img_folder = os.path.join(data_dir, city, 'processedData',
                                             '3band-{}'.format(hw))
            target_annot_folder = os.path.join(data_dir, city, 'processedData',
                                               'json-{}'.format(hw))
            test_img_folder = os.path.join(data_dir, city, 'processedData',
                                           '3band-{}-gt'.format(hw))

            # helper function for converting image files to their corresponding annotation file
            # e.g. 3band_013022223133_Public_img3593.tif -> 013022223133_Public_img3593_Geo.geojson
            def img_to_annot(x):
                # BUG FIX: original was missing `return`, so this produced None
                # and os.path.join(annot_folder, None) below would raise.
                return x.replace('3band_', '').replace('.tif', '_Geo.geojson')
        else:
            prefix = 'RGB-PanSharpen'
            img_folder = os.path.join(data_dir, city, prefix)
            annot_folder = os.path.join(data_dir, city, 'geojson', 'buildings')

            # create data folders to save converted images and annotations
            target_img_folder = os.path.join(data_dir, city, '{}-{}'.format(prefix, hw))
            target_annot_folder = os.path.join(data_dir, city, 'json-{}'.format(hw))
            test_img_folder = os.path.join(data_dir, city, '{}-{}-gt'.format(prefix, hw))

            # helper function for converting image files to their corresponding annotation file
            # e.g. RGB-PanSharpen_AOI_2_Vegas_img9.tif -> buildings_AOI_2_Vegas_img9.geojson
            def img_to_annot(x):
                # BUG FIX: original was missing `return` (see Rio branch above).
                return x.replace(prefix, 'buildings').replace('.tif', '.geojson')

        print('Processing {}'.format(city))
        # NOTE(review): other ingest functions in this module call util.make_dir;
        # confirm this bare make_dir is the same helper.
        make_dir(target_img_folder)
        make_dir(target_annot_folder)
        make_dir(test_img_folder)

        images = glob.glob(os.path.join(img_folder, "*.tif"))
        assert len(images) > 0, 'No Images found in {}'.format(img_folder)

        data[city] = {'manifest': [], 'annotation': [],
                      'img_folder': img_folder, 'annot_folder': annot_folder}

        for image in tqdm(images):
            img_file = os.path.basename(image)
            annot_file = img_to_annot(img_file)
            annot = os.path.join(annot_folder, annot_file)
            assert os.path.exists(annot)

            # target image has extension=ext, and target_annot has extension JSON
            target_image = os.path.join(target_img_folder,
                                        os.path.splitext(img_file)[0] + ext)
            target_annot = os.path.join(target_annot_folder,
                                        os.path.splitext(annot_file)[0] + '.json')

            if not os.path.exists(target_image) or not os.path.exists(target_annot) or overwrite:
                # BUG FIX: honor the requested width/height instead of the
                # hard-coded 512x512, which contradicted the '{h}x{w}'-named
                # output folders, manifests, and config written below.
                annotation = convert_image_annot(image_path=image, annot_path=annot,
                                                 target_image=target_image,
                                                 target_annot=target_annot,
                                                 width=width, height=height,
                                                 box_shrink=0.8,
                                                 debug_dir=test_img_folder)
            else:
                warnings.warn(
                    'File for {} already exists, skipping processing. Use --overwrite to force.'.
                    format(city), Warning)
                # use a context manager so the file handle is not leaked
                with open(target_annot) as fid:
                    annotation = json.load(fid)

            # filter on percent_blank, as well as presence of any objects
            if is_eligible_example(annotation, percent_blank):
                data[city]['annotation'].append(annotation)
                data[city]['manifest'].append((target_image, target_annot))

    # write manifest files
    # build manifest list from each city's manifest
    manifest = []
    for city in cities:
        manifest.extend(data[city]['manifest'])

    ntrain = int(np.round(len(manifest) * train_fraction))

    # fixed seed so the train/val split is reproducible across runs
    np.random.seed(0)
    np.random.shuffle(manifest)

    util.create_manifest(train_manifest, manifest[:ntrain], data_dir)
    util.create_manifest(val_manifest, manifest[ntrain:], data_dir)

    # write SSD CONFIG
    ssd_config = get_ssd_config((height, width))
    ssd_config_path = os.path.join(data_dir, 'spacenet_ssd_{}.cfg'.format(hw))
    util.write_ssd_config(ssd_config, ssd_config_path, True)

    # write SSD VAL CONFIG
    ssd_config_val = get_ssd_config((height, width), True)
    ssd_config_path_val = os.path.join(data_dir, 'spacenet_ssd_{}_val.cfg'.format(hw))
    util.write_ssd_config(ssd_config_val, ssd_config_path_val, True)

    config_path = os.path.join(data_dir, 'spacenet_{}.cfg'.format(hw))
    config = {'manifest': '[train:{}, val:{}]'.format(train_manifest, val_manifest),
              'manifest_root': data_dir,
              'epochs': 230,
              'height': height,
              'width': width,
              'ssd_config': '[train:{}, val:{}]'.format(ssd_config_path, ssd_config_path_val)
              }
    util.write_config(config, config_path)

    # write annotation pickle
    if annot_save is not None:
        # BUG FIX: pickle requires a binary-mode file ('wb', not 'w');
        # also close the handle via a context manager.
        with open(annot_save, 'wb') as fid:
            pickle.dump(data, fid)
def ingest_kitti(input_dir, out_dir, img_reshape=(300, 994), train_percent=90,
                 overwrite=False, skip_unzip=False):
    """
    Ingests the KITTI dataset. Performs the following ops:
    0. Unzips the files into output directory.
    1. Reshapes image to lower resolution (default reshape of 300x994 maintains KITTI image AR)
    2. Convert annotations to json format
    3. Split the training data into train and validation sets
    4. Write manifest file
    5. Write configuration file

    Arguments:
        input_dir (string): path to folder with KITTI zip files.
        out_dir (string): path to unzip KITTI data
        img_reshape (tuple of int): size to reshape image (default = (300, 994))
        train_percent (float): percent of data to use for training.
        overwrite (bool): overwrite existing files
        skip_unzip (bool): skip unzipping the archive files.
    """
    assert img_reshape is not None, "Target image reshape required."
    hw = '{}x{}'.format(img_reshape[0], img_reshape[1])

    zip_files = ['data_object_image_2.zip', 'data_object_label_2.zip']

    root_dir = os.path.join(out_dir, 'kitti')
    train_manifest = os.path.join(root_dir, 'train_{}.csv'.format(hw))
    val_manifest = os.path.join(root_dir, 'val_{}.csv'.format(hw))

    if os.path.exists(train_manifest) and os.path.exists(val_manifest) and not overwrite:
        print("Manifest files already found, skipping ingest.")
        print("Use --overwrite flag to force re-ingest.")
        return

    util.make_dir(root_dir)
    # (removed a dead `tags = {'trainval': [], 'test': []}` assignment that was
    # unconditionally overwritten by the glob below)

    if skip_unzip is False:
        util.unzip_files(zip_files, input_dir, root_dir)

    img_folder = os.path.join(root_dir, 'training', 'image_2')
    annot_folder = os.path.join(root_dir, 'training', 'label_2')
    target_img_folder = os.path.join(root_dir, 'training', 'image_2-converted')
    target_annot_folder = os.path.join(root_dir, 'training', 'label_2-json')

    tags = glob.glob(os.path.join(img_folder, '*.png'))
    tags = [os.path.basename(os.path.splitext(tag)[0]) for tag in tags]
    assert len(tags) > 0, "No images found in {}".format(img_folder)

    util.make_dir(target_img_folder)
    util.make_dir(target_annot_folder)

    manifest = []
    for tag in tqdm(tags):
        image = os.path.join(img_folder, tag + '.png')
        annot = os.path.join(annot_folder, tag + '.txt')
        assert os.path.exists(image), "{} not found.".format(image)
        assert os.path.exists(annot), "{} not found.".format(annot)

        target_image = os.path.join(target_img_folder, tag + '.png')
        target_annot = os.path.join(target_annot_folder, tag + '.json')

        # NOTE(review): img_reshape=None here leaves images/annotations at
        # native size despite the reshape-required assert and hw-named outputs
        # above — confirm whether the loader resizes downstream or whether
        # `img_reshape` should be forwarded instead.
        convert_annot_to_json(annot, image, target_annot, difficult=True,
                              img_reshape=None)
        util.resize_image(image, target_image, img_reshape=None)

        manifest.append((target_image, target_annot))

    # shuffle files and split into training and validation set.
    np.random.seed(0)  # fixed seed for a reproducible split
    np.random.shuffle(manifest)

    train_count = (len(manifest) * train_percent) // 100
    train = manifest[:train_count]
    val = manifest[train_count:]

    util.create_manifest(train_manifest, train, root_dir)
    util.create_manifest(val_manifest, val, root_dir)

    # write SSD CONFIG
    ssd_config = get_ssd_config(img_reshape)
    ssd_config_path = os.path.join(root_dir, 'kitti_ssd_{}.cfg'.format(hw))
    util.write_ssd_config(ssd_config, ssd_config_path, True)

    # write SSD VAL CONFIG
    ssd_config_val = get_ssd_config(img_reshape, True)
    ssd_config_path_val = os.path.join(root_dir, 'kitti_ssd_{}_val.cfg'.format(hw))
    util.write_ssd_config(ssd_config_val, ssd_config_path_val, True)

    config_path = os.path.join(root_dir, 'kitti_{}.cfg'.format(hw))
    config = {'manifest': '[train:{}, val:{}]'.format(train_manifest, val_manifest),
              'manifest_root': root_dir,
              'epochs': 100,
              'height': img_reshape[0],
              # BUG FIX: width was written as img_reshape[0] (the height);
              # use index 1, consistent with ingest_pascal below.
              'width': img_reshape[1],
              'ssd_config': '[train:{}, val:{}]'.format(ssd_config_path, ssd_config_path_val)
              }
    util.write_config(config, config_path)
def ingest_pascal(data_dir, out_dir, img_reshape=(300, 300), overwrite=False, skip_untar=False):
    """
    Ingest PASCAL VOC 2007 + 2012: extract the tar archives, convert the XML
    annotations to JSON, convert the JPEG images, then write train/val
    manifests and SSD configuration files under <out_dir>/VOCdevkit.

    Arguments:
        data_dir (string): folder containing the VOC tar files.
        out_dir (string): folder to extract the dataset into.
        img_reshape (tuple of int): target (height, width); used in output names.
        overwrite (bool): force re-ingest even if manifests already exist.
        skip_untar (bool): skip extracting the tar files.
    """
    assert img_reshape is not None, "Target image reshape required."
    hw = '{}x{}'.format(img_reshape[0], img_reshape[1])

    datasets = ['VOC2007', 'VOC2012']
    tar_files = {'VOC2007': ['VOCtrainval_06-Nov-2007.tar', 'VOCtest_06-Nov-2007.tar'],
                 'VOC2012': ['VOCtrainval_11-May-2012.tar']}
    index_name = {'trainval': 'trainval.txt', 'test': 'test.txt'}
    manifest = {'trainval': [], 'test': []}

    root_dir = os.path.join(out_dir, 'VOCdevkit')
    train_manifest = os.path.join(root_dir, 'train_{}.csv'.format(hw))
    val_manifest = os.path.join(root_dir, 'val_{}.csv'.format(hw))

    manifests_exist = os.path.exists(train_manifest) and os.path.exists(val_manifest)
    if manifests_exist and not overwrite:
        print("Manifest files already found, skipping ingest.")
        print("Use --overwrite flag to force re-ingest.")
        return

    for year in datasets:
        tags = {'trainval': [], 'test': []}

        # extract this year's archives unless told otherwise
        if not skip_untar:
            archives = [os.path.join(data_dir, tar) for tar in tar_files[year]]
            extract_tarfiles(archives, out_dir)

        # read the index files and build a list of tags to process; each tag
        # (e.g. '000032') refers to an image (000032.jpg) and an annotation
        # XML file (000032.xml)
        for subset in index_name.keys():
            index_file = os.path.join(root_dir, year, 'ImageSets', 'Main',
                                      index_name[subset])
            if os.path.exists(index_file):
                found = get_tag_list(index_file)
                tags[subset].extend(found)
                print('Found {} images in {}'.format(len(found), index_file))

        img_folder = os.path.join(root_dir, year, 'JPEGImages')
        annot_folder = os.path.join(root_dir, year, 'Annotations')

        # create data folders to save converted images and annotations
        target_img_folder = os.path.join(root_dir, year, 'JPEGImages-converted')
        target_annot_folder = os.path.join(root_dir, year, 'Annotations-json')

        print('Processing {}'.format(year))
        util.make_dir(target_img_folder)
        util.make_dir(target_annot_folder)

        # process every tag found in the index files
        for tag in tqdm(tags['trainval'] + tags['test']):
            image = os.path.join(img_folder, tag + '.jpg')
            annot = os.path.join(annot_folder, tag + '.xml')
            assert os.path.exists(image)
            assert os.path.exists(annot)

            target_image = os.path.join(target_img_folder, tag + '.jpg')
            target_annot = os.path.join(target_annot_folder, tag + '.json')

            # convert the annotations to json, including difficult objects
            # NOTE(review): img_reshape=None keeps native image size despite
            # the hw-named outputs — confirm the loader resizes downstream.
            convert_xml_to_json(annot, target_annot, difficult=True, img_reshape=None)
            util.resize_image(image, target_image, img_reshape=None)

            if tag in tags['trainval']:
                manifest['trainval'].append((target_image, target_annot))
            elif tag in tags['test']:
                manifest['test'].append((target_image, target_annot))

    # shuffle the training set with a fixed seed for reproducibility
    np.random.seed(0)
    np.random.shuffle(manifest['trainval'])

    util.create_manifest(train_manifest, manifest['trainval'], root_dir)
    util.create_manifest(val_manifest, manifest['test'], root_dir)

    # write SSD CONFIG
    ssd_config = get_ssd_config(img_reshape)
    ssd_config_path = os.path.join(root_dir, 'pascalvoc_ssd_{}.cfg'.format(hw))
    util.write_ssd_config(ssd_config, ssd_config_path, True)

    # write SSD VAL CONFIG
    ssd_config_val = get_ssd_config(img_reshape, True)
    ssd_config_path_val = os.path.join(root_dir, 'pascalvoc_ssd_{}_val.cfg'.format(hw))
    util.write_ssd_config(ssd_config_val, ssd_config_path_val, True)

    config_path = os.path.join(root_dir, 'pascalvoc_{}.cfg'.format(hw))
    config = {'manifest': '[train:{}, val:{}]'.format(train_manifest, val_manifest),
              'manifest_root': root_dir,
              'epochs': 230,
              'height': img_reshape[0],
              'width': img_reshape[1],
              'ssd_config': '[train:{}, val:{}]'.format(ssd_config_path, ssd_config_path_val)
              }
    util.write_config(config, config_path)
def ingest_pascal(data_dir, out_dir, img_reshape=(300, 300), overwrite=False, skip_untar=False):
    """
    Ingest PASCAL VOC 2007 and 2012 into SSD-ready form: untar the archives,
    convert XML annotations to JSON and images to the converted folder, then
    emit train/val manifests plus SSD config files under <out_dir>/VOCdevkit.

    NOTE(review): this is a second definition of ingest_pascal in this module;
    it shadows the earlier identical one — consider removing the duplicate.

    Arguments:
        data_dir (string): folder containing the VOC tar files.
        out_dir (string): folder to extract the dataset into.
        img_reshape (tuple of int): target (height, width); used in output names.
        overwrite (bool): force re-ingest even if manifests already exist.
        skip_untar (bool): skip extracting the tar files.
    """
    assert img_reshape is not None, "Target image reshape required."
    hw = '{}x{}'.format(img_reshape[0], img_reshape[1])

    tar_files = {'VOC2007': ['VOCtrainval_06-Nov-2007.tar', 'VOCtest_06-Nov-2007.tar'],
                 'VOC2012': ['VOCtrainval_11-May-2012.tar']}
    datasets = ['VOC2007', 'VOC2012']
    index_name = {'trainval': 'trainval.txt', 'test': 'test.txt'}
    manifest = {'trainval': [], 'test': []}

    root_dir = os.path.join(out_dir, 'VOCdevkit')
    train_manifest = os.path.join(root_dir, 'train_{}.csv'.format(hw))
    val_manifest = os.path.join(root_dir, 'val_{}.csv'.format(hw))

    if not overwrite and os.path.exists(train_manifest) and os.path.exists(val_manifest):
        print("Manifest files already found, skipping ingest.")
        print("Use --overwrite flag to force re-ingest.")
        return

    for year in datasets:
        tags = {'trainval': [], 'test': []}

        if skip_untar is False:
            extract_tarfiles([os.path.join(data_dir, t) for t in tar_files[year]],
                             out_dir)

        # build the tag lists from the index files; a tag such as '000032'
        # names both an image (000032.jpg) and an annotation (000032.xml)
        for subset in index_name.keys():
            index_file = os.path.join(root_dir, year, 'ImageSets', 'Main',
                                      index_name[subset])
            if os.path.exists(index_file):
                tag_list = get_tag_list(index_file)
                tags[subset].extend(tag_list)
                print('Found {} images in {}'.format(len(tag_list), index_file))

        img_folder = os.path.join(root_dir, year, 'JPEGImages')
        annot_folder = os.path.join(root_dir, year, 'Annotations')

        # output folders for the converted images and JSON annotations
        target_img_folder = os.path.join(root_dir, year, 'JPEGImages-converted')
        target_annot_folder = os.path.join(root_dir, year, 'Annotations-json')

        print('Processing {}'.format(year))
        util.make_dir(target_img_folder)
        util.make_dir(target_annot_folder)

        all_tags = tags['trainval'] + tags['test']
        for tag in tqdm(all_tags):
            image = os.path.join(img_folder, tag + '.jpg')
            annot = os.path.join(annot_folder, tag + '.xml')
            assert os.path.exists(image)
            assert os.path.exists(annot)

            target_image = os.path.join(target_img_folder, tag + '.jpg')
            target_annot = os.path.join(target_annot_folder, tag + '.json')

            # convert annotations to json, keeping difficult objects;
            # img_reshape=None leaves the native size (resize happens later)
            convert_xml_to_json(annot, target_annot, difficult=True, img_reshape=None)
            util.resize_image(image, target_image, img_reshape=None)

            if tag in tags['trainval']:
                manifest['trainval'].append((target_image, target_annot))
            elif tag in tags['test']:
                manifest['test'].append((target_image, target_annot))

    # deterministic shuffle of the training examples
    np.random.seed(0)
    np.random.shuffle(manifest['trainval'])

    util.create_manifest(train_manifest, manifest['trainval'], root_dir)
    util.create_manifest(val_manifest, manifest['test'], root_dir)

    # write SSD CONFIG
    ssd_config = get_ssd_config(img_reshape)
    ssd_config_path = os.path.join(root_dir, 'pascalvoc_ssd_{}.cfg'.format(hw))
    util.write_ssd_config(ssd_config, ssd_config_path, True)

    # write SSD VAL CONFIG
    ssd_config_val = get_ssd_config(img_reshape, True)
    ssd_config_path_val = os.path.join(root_dir, 'pascalvoc_ssd_{}_val.cfg'.format(hw))
    util.write_ssd_config(ssd_config_val, ssd_config_path_val, True)

    config_path = os.path.join(root_dir, 'pascalvoc_{}.cfg'.format(hw))
    config = {'manifest': '[train:{}, val:{}]'.format(train_manifest, val_manifest),
              'manifest_root': root_dir,
              'epochs': 230,
              'height': img_reshape[0],
              'width': img_reshape[1],
              'ssd_config': '[train:{}, val:{}]'.format(ssd_config_path, ssd_config_path_val)
              }
    util.write_config(config, config_path)