def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'dataset_name',
        help='A short string representing the dataset to be used as a partition key in MegaDB')
    parser.add_argument(
        '--image_db',
        help='Path to the json containing the image DB in CCT format')
    parser.add_argument(
        '--bbox_db',
        help='Path to the json containing the bbox DB in CCT format')
    parser.add_argument(
        '--docs',
        help='Embedded CCT format json to use instead of image_db or bbox_db')
    parser.add_argument(
        '--partial_mega_db',
        required=True,
        help='Path to store the resulting json')
    args = parser.parse_args()

    assert len(args.dataset_name) > 0, 'dataset_name cannot be an empty string'

    if args.image_db:
        assert os.path.exists(args.image_db), 'image_db file path provided does not point to a file'
    if args.bbox_db:
        assert os.path.exists(args.bbox_db), 'bbox_db file path provided does not point to a file'

    # use the pre-embedded docs if provided; otherwise embed the image and bbox DBs
    if args.docs:
        with open(args.docs) as f:
            docs = json.load(f)
    else:
        docs = make_cct_embedded(args.image_db, args.bbox_db)

    sequences = process_sequences(docs, args.dataset_name)

    sequences_schema_check.sequences_schema_check(sequences)

    write_json(args.partial_mega_db, sequences)
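# Example invocation of the conversion step above (a sketch: the script file
# name cct_to_megadb.py is an assumption based on the helpers it calls, and
# the dataset name and paths are illustrative):
#
#   python cct_to_megadb.py caltech \
#       --image_db caltech_images.json \
#       --bbox_db caltech_bboxes.json \
#       --partial_mega_db caltech_partial_mega_db.json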
def megadb_to_cct(dataset_name, mega_db, output_path, bbox_only):
    mega_db = [i for i in mega_db if i['dataset'] == dataset_name]
    assert len(mega_db) > 0, 'There are no entries from the dataset {}'.format(dataset_name)

    for i in mega_db:
        del i['dataset']  # all remaining fields will be added to the CCT database

    print('Number of entries belonging to dataset {}: {}'.format(dataset_name, len(mega_db)))

    cct_images, cct_annotations = break_into_images_annotations(mega_db, bbox_only)

    # consolidate categories
    category_names = set()
    for anno in cct_annotations:
        category_names.add(anno['category_name'])

    cat_name_to_id = {
        'empty': 0  # always set empty to 0, even for datasets without 'empty'-labeled images
    }
    if bbox_only:
        cat_name_to_id['animal'] = 1
        cat_name_to_id['person'] = 2
        cat_name_to_id['group'] = 3
        cat_name_to_id['vehicle'] = 4

    for cat in category_names:
        if cat not in cat_name_to_id:
            cat_name_to_id[cat] = len(cat_name_to_id)

    for anno in cct_annotations:
        anno['category_id'] = cat_name_to_id[anno['category_name']]
        del anno['category_name']

    cct_categories = []
    for name, num_id in cat_name_to_id.items():
        cct_categories.append({
            'id': num_id,
            'name': name
        })

    print('Final CCT DB has {} image entries and {} annotation entries.'.format(
        len(cct_images), len(cct_annotations)))

    cct_db = {
        'info': {
            'version': str(datetime.now()),
            'date_created': str(datetime.today().date()),
            'description': ''  # to be filled by main()
        },
        'images': cct_images,
        'categories': cct_categories,
        'annotations': cct_annotations
    }
    cct_db = CameraTrapJsonUtils.order_db_keys(cct_db)

    cct_db['info']['description'] = 'COCO Camera Traps database converted from sequences in dataset {}'.format(
        dataset_name)

    print('Writing to output file...')
    write_json(output_path, cct_db)
    print('Done!')
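# Minimal usage sketch for megadb_to_cct (file and dataset names are
# illustrative; the sequence list is assumed to come from an earlier step).
# Note that with bbox_only=True, ids 0-4 are pinned to the standard
# empty/animal/person/group/vehicle categories, and any remaining category
# names are then numbered in (arbitrary) set-iteration order:
#
#   with open('caltech_sequences.json') as f:
#       mega_db = json.load(f)
#   megadb_to_cct('caltech', mega_db, 'caltech_cct.json', bbox_only=True)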
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'dataset_name',
        help='The name of the dataset; only entries from this dataset will be used')
    parser.add_argument(
        'mega_db_seqs',
        help='A json containing a list of sequence objects')
    parser.add_argument(
        'out_file',
        help='Path to store the resulting json to input to megadb_to_cct.py')
    parser.add_argument(
        '--ncores',
        type=int,
        default=None,
        help='Number of cores to use when downloading images to read their dimensions')
    args = parser.parse_args()

    assert len(args.dataset_name) > 0, 'dataset_name cannot be an empty string'
    assert os.path.exists(args.mega_db_seqs), 'File at the mega_db_seqs path does not exist'
    assert args.out_file.endswith('.json'), 'out_file path needs to end in .json'
    assert args.out_file != args.mega_db_seqs, 'out_file must be different from mega_db_seqs'
    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    print('Loading entries...')
    with open(args.mega_db_seqs) as f:
        mega_db_entries = json.load(f)
    print('Number of entries in the mega_db: {}'.format(len(mega_db_entries)))

    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    start_time = time.time()
    updated_seqs = get_image_dims(mega_db_entries, args.dataset_name, datasets_table, args.ncores)
    write_json(args.out_file, updated_seqs)

    elapsed = time.time() - start_time
    print('Time elapsed: {}'.format(humanfriendly.format_timespan(elapsed)))
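# The asserts above require Cosmos DB credentials in the environment before
# this script runs (a sketch: the script file name get_image_dims.py is an
# assumption based on the helper it calls, and all values are placeholders):
#
#   export COSMOS_ENDPOINT='https://<account>.documents.azure.com:443/'
#   export COSMOS_KEY='<key>'
#   python get_image_dims.py caltech caltech_sequences.json \
#       caltech_sequences_with_dims.json --ncores 8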