def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'dataset_name',
        help='The name of the dataset; only entries from this dataset will be used')
    parser.add_argument(
        'mega_db_seqs',
        help='A json containing a list of sequence objects')
    parser.add_argument(
        'out_file',
        help='Path to store the resulting json to input to megadb_to_cct.py')
    parser.add_argument(
        '--ncores', type=int, default=None,
        help='Number of cores to use when downloading images to read their dimensions')
    args = parser.parse_args()

    assert len(args.dataset_name) > 0, 'dataset_name cannot be an empty string'
    assert os.path.exists(args.mega_db_seqs), 'File at mega_db_seqs path does not exist'
    assert args.out_file.endswith('.json'), 'out_file path needs to end in .json'
    assert args.out_file != args.mega_db_seqs
    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    print('Loading entries...')
    with open(args.mega_db_seqs) as f:
        mega_db_entries = json.load(f)
    print('Number of entries in the mega_db: {}'.format(len(mega_db_entries)))

    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    start_time = time.time()

    updated_seqs = get_image_dims(mega_db_entries, args.dataset_name,
                                  datasets_table, args.ncores)
    write_json(args.out_file, updated_seqs)

    elapsed = time.time() - start_time
    print('Time elapsed: {}'.format(humanfriendly.format_timespan(elapsed)))
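# get_image_dims and write_json are defined elsewhere in this script. For reference,
# a minimal write_json could look like the sketch below; this is illustrative only and
# may differ from the actual helper used above.
def _write_json_sketch(path, content):
    # Serialize the updated sequence list to the output path.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(content, f, indent=1)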
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'megadb_entries', type=str,
        help='Path to a json list of MegaDB entries')
    parser.add_argument(
        'output_dir', action='store', type=str,
        help='Output directory for html and rendered images')
    parser.add_argument(
        '--trim_to_images_bboxes_labeled', action='store_true',
        help='Only include images that have been sent for bbox labeling (but may be '
             'actually empty). Turn this on if QAing annotations.')
    parser.add_argument(
        '--num_to_visualize', action='store', type=int, default=200,
        help='Number of images to visualize (all conformant images in a sequence are '
             'shown, so may be a few more than specified). Sequences are shuffled. '
             'Defaults to 200. Use -1 to visualize all.')
    parser.add_argument(
        '--pathsep_replacement', action='store', type=str, default='~',
        help='Replace path separators in relative filenames with another character (default ~)')
    parser.add_argument(
        '-w', '--output_image_width', type=int, default=700,
        help=('an integer indicating the desired width in pixels of the output annotated images. '
              'Use -1 to not resize.'))

    if len(sys.argv[1:]) == 0:
        parser.print_help()
        parser.exit()

    args = parser.parse_args()

    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, 'rendered_images'), exist_ok=True)

    print('Connecting to MegaDB to get the datasets table...')
    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    print('Loading the MegaDB entries...')
    with open(args.megadb_entries) as f:
        sequences = json.load(f)
    print('Total number of sequences: {}'.format(len(sequences)))

    # print('Checking that the MegaDB entries conform to the schema...')
    # sequences_schema_check.sequences_schema_check(sequences)

    shuffle(sequences)
    visualize_sequences(datasets_table, sequences, args)
def get_example_images(megadb_utils: MegadbUtils, dataset_name: str,
                       class_name: str) -> List[Optional[str]]:
    """Gets SAS URLs for images of a particular class from a given dataset."""
    datasets_table = megadb_utils.get_datasets_table()

    # this query should be fairly fast, ~1 sec
    query_both_levels = f'''
    SELECT TOP {NUMBER_SEQUENCES_TO_QUERY} VALUE seq
    FROM seq
    WHERE ARRAY_CONTAINS(seq.class, "{class_name}")
        OR (SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_CONTAINS(im.class, "{class_name}")) > 0
    '''
    sequences = megadb_utils.query_sequences_table(
        query_both_levels, partition_key=dataset_name)

    num_samples = min(len(sequences), NUMBER_EXAMPLES_PER_SPECIES)
    sample_seqs = sample(sequences, num_samples)

    image_urls: List[Optional[str]] = []
    for seq in sample_seqs:
        sample_image = sample(seq['images'], 1)[0]  # sample 1 img per sequence
        img_path = MegadbUtils.get_full_path(
            datasets_table, dataset_name, sample_image['file'])
        img_path = urllib.parse.quote_plus(img_path)
        dataset_info = datasets_table[dataset_name]
        img_url = 'https://{}.blob.core.windows.net/{}/{}{}'.format(
            dataset_info["storage_account"], dataset_info["container"],
            img_path, dataset_info["container_sas_key"])
        image_urls.append(img_url)

    num_missing = NUMBER_EXAMPLES_PER_SPECIES - len(image_urls)
    if num_missing > 0:
        image_urls.extend([None] * num_missing)

    assert len(image_urls) == NUMBER_EXAMPLES_PER_SPECIES
    return image_urls
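# A minimal usage sketch for get_example_images. The dataset and class names below are
# placeholders, and COSMOS_ENDPOINT / COSMOS_KEY must be set in the environment for
# MegadbUtils to connect.
def _example_images_demo():
    megadb_utils = MegadbUtils()
    urls = get_example_images(megadb_utils, dataset_name='caltech', class_name='deer')
    # The returned list is padded with None when fewer than
    # NUMBER_EXAMPLES_PER_SPECIES examples are found.
    print([u for u in urls if u is not None])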
def query_species_by_dataset(megadb_utils: MegadbUtils, output_dir: str) -> None:
    """For each dataset, creates a JSON file specifying species counts.

    Skips a dataset if a JSON file for it already exists.
    """
    # which datasets are already processed?
    queried_datasets = set(
        i.split('.json')[0] for i in os.listdir(output_dir)
        if i.endswith('.json'))

    datasets_table = megadb_utils.get_datasets_table()
    dataset_names = [i for i in datasets_table if i not in queried_datasets]

    print(f'{len(queried_datasets)} datasets already queried. Querying species '
          f'in {len(dataset_names)} datasets...')

    for dataset_name in dataset_names:
        print(f'Querying dataset {dataset_name}...')

        # sequence-level query should be fairly fast, ~1 sec
        query_seq_level = '''
        SELECT VALUE seq.class
        FROM seq
        WHERE ARRAY_LENGTH(seq.class) > 0
            AND NOT ARRAY_CONTAINS(seq.class, "empty")
            AND NOT ARRAY_CONTAINS(seq.class, "__label_unavailable")
        '''
        results = megadb_utils.query_sequences_table(
            query_seq_level, partition_key=dataset_name)

        counter = Counter()
        for i in results:
            counter.update(i)

        # cases when the class field is on the image level (images in a sequence
        # that had different class labels; the 'caltech' dataset is like this)
        # this query may take a long time, >1hr
        query_image_level = '''
        SELECT VALUE seq.images
        FROM sequences seq
        WHERE (
            SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_LENGTH(im.class) > 0
        ) > 0
        '''
        start = datetime.now()
        results_im = megadb_utils.query_sequences_table(
            query_image_level, partition_key=dataset_name)
        elapsed = (datetime.now() - start).seconds
        print(f'- image-level query took {elapsed}s')

        for seq_images in results_im:
            for im in seq_images:
                assert 'class' in im
                counter.update(im['class'])

        with open(os.path.join(output_dir, f'{dataset_name}.json'), 'w') as f:
            json.dump(counter, f, indent=2)
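# A hypothetical follow-up helper (not part of the original script) showing how the
# per-dataset JSON files written by query_species_by_dataset could be merged into a
# single overall species count; it assumes output_dir contains only those files.
def combine_species_counts(output_dir: str) -> Counter:
    combined: Counter = Counter()
    for fn in os.listdir(output_dir):
        if not fn.endswith('.json'):
            continue
        with open(os.path.join(output_dir, fn)) as f:
            # each file maps class name -> count for one dataset
            combined.update(json.load(f))
    return combined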
def main():
    parser = argparse.ArgumentParser(
        description='Script to download image files')
    parser.add_argument('file_list')
    parser.add_argument(
        'store_dir',
        help='Path to a directory to store the downloaded files')
    parser.add_argument('--single_thread', action='store_true')
    parser.add_argument('--only_new_images', action='store_true')
    args = parser.parse_args()

    os.makedirs(args.store_dir, exist_ok=True)

    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()
    print('Obtained the datasets table. Loading the file_list now...')

    with open(args.file_list) as f:
        file_list = json.load(f)

    existing = os.listdir(args.store_dir)
    existing = set([i.split('.jpg')[0] for i in existing])

    file_list_to_download = [
        i for i in file_list if i['download_id'] not in existing
    ]

    if args.only_new_images:
        print('Only going to download new images.')
        file_list_to_download = [
            i for i in file_list_to_download if 'new_entry' in i
        ]

    # if need to re-download a dataset's images in case of corruption
    # file_list_to_download = [i for i in file_list_to_download if i['dataset'] == 'rspb_gola']

    print('file_list has {} items, still need to download {} items'.format(
        len(file_list), len(file_list_to_download)))

    urls = [
        construct_url(item, datasets_table) for item in file_list_to_download
    ]
    local_paths = [
        os.path.join(args.store_dir, '{}.jpg'.format(item['download_id']))
        for item in file_list_to_download
    ]
    origin_and_dest = zip(urls, local_paths)

    start_time = time.time()

    if args.single_thread:
        print('Starting to download, single threaded...')
        for url, local_path in origin_and_dest:
            download_file(url, local_path)
    else:
        print('Starting to download, using ThreadPool...')
        pool = ThreadPool()
        list(pool.starmap(download_file, origin_and_dest))

    elapsed = time.time() - start_time
    print('Time spent on download: {}'.format(
        humanfriendly.format_timespan(elapsed)))
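# construct_url and download_file are defined elsewhere in this script. As a point of
# reference only, a minimal download helper (not the actual implementation) could look
# like the sketch below, assuming the requests library is available.
def _download_file_sketch(url, local_path):
    import requests  # assumed dependency for this illustrative sketch
    if os.path.exists(local_path):
        return  # skip files that were already downloaded
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    with open(local_path, 'wb') as f:
        f.write(resp.content)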
def visualize_incoming_annotations(args):
    print('Connecting to MegaDB to get the datasets table...')
    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    print('Loading the MegaDB entries...')
    with open(args.megadb_entries) as f:
        sequences = json.load(f)
    print(f'Total number of sequences: {len(sequences)}')
    dataset_seq_images = defaultdict(dict)
    for seq in sequences:
        dataset_seq_images[seq['dataset']][seq['seq_id']] = seq['images']

    print('Loading incoming annotation entries...')
    incoming = IndexedJsonDb(args.incoming_annotation)
    print(f'Number of images in this annotation file: '
          f'{len(incoming.image_id_to_image)}')

    if args.num_to_visualize != -1 and args.num_to_visualize <= len(
            incoming.image_id_to_image):
        incoming_id_to_anno = sample(
            list(incoming.image_id_to_annotations.items()),
            args.num_to_visualize)
    else:
        incoming_id_to_anno = incoming.image_id_to_annotations.items()

    # The file_name field in the incoming json looks like
    # alka_squirrels.seq2020_05_07_25C.frame119221.jpg
    # we need to use the dataset, sequence and frame info to find the actual path
    # in blob storage using the sequences

    images_html = []
    for image_id, annotations in tqdm(incoming_id_to_anno):
        if args.trim_to_images_bboxes_labeled and annotations[0]['category_id'] == 5:
            # category_id 5 is No Object Visible
            continue

        anno_file_name = incoming.image_id_to_image[image_id]['file_name']
        parts = anno_file_name.split('.')
        dataset_name = parts[0]
        seq_id = parts[1].split('seq')[1]
        frame_num = int(parts[2].split('frame')[1])

        im_rel_path = get_image_rel_path(dataset_seq_images, dataset_name,
                                         seq_id, frame_num)
        if im_rel_path is None:
            print(f'Not found in megadb entries: dataset {dataset_name},'
                  f' seq_id {seq_id}, frame_num {frame_num}')
            continue

        im_full_path = megadb_utils.get_full_path(datasets_table, dataset_name,
                                                  im_rel_path)

        # download the image
        container_client = megadb_utils.get_storage_client(
            datasets_table, dataset_name)
        downloader = container_client.download_blob(im_full_path)
        image_file = io.BytesIO()
        blob_props = downloader.download_to_stream(image_file)
        image = vis_utils.open_image(image_file)

        boxes = [anno['bbox'] for anno in annotations]
        classes = [anno['category_id'] for anno in annotations]

        vis_utils.render_iMerit_boxes(boxes, classes, image,
                                      label_map=incoming.cat_id_to_name)

        file_name = '{}_gtbbox.jpg'.format(
            os.path.splitext(anno_file_name)[0].replace('/', '~'))
        image = vis_utils.resize_image(image, args.output_image_width)
        image.save(os.path.join(args.output_dir, 'rendered_images', file_name))

        images_html.append({
            'filename': '{}/{}'.format('rendered_images', file_name),
            'title': '{}, number of boxes: {}'.format(
                anno_file_name, len([b for b in boxes if len(b) > 0])),
            'textStyle': 'font-family:verdana,arial,calibri;font-size:80%;'
                         'text-align:left;margin-top:20;margin-bottom:5'
        })

    # Write to HTML
    images_html = sorted(images_html, key=lambda x: x['filename'])
    write_html_image_list(
        filename=os.path.join(args.output_dir, 'index.html'),
        images=images_html,
        options={
            'headerHtml': '<h1>Sample annotations from {}</h1>'.format(
                args.incoming_annotation)
        })

    print('Visualized {} images.'.format(len(images_html)))