def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_dir', required=True,
        help='Path to directory where the JSONs containing species counts for each dataset live')
    parser.add_argument(
        '--query_species', action='store_true',
        help=('If flagged, query what species are present in a dataset. '
              'Otherwise, create a spreadsheet for labeling the taxonomy'))
    args = parser.parse_args()

    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    os.makedirs(args.output_dir, exist_ok=True)

    megadb_utils = MegadbUtils()

    if args.query_species:
        query_species_by_dataset(megadb_utils, args.output_dir)
    else:
        make_spreadsheet(megadb_utils, args.output_dir)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'dataset_name',
        help='The name of the dataset; only entries from this dataset will be used')
    parser.add_argument(
        'mega_db_seqs',
        help='Path to a JSON file containing a list of sequence objects')
    parser.add_argument(
        'out_file',
        help='Path to store the resulting JSON to input to megadb_to_cct.py')
    parser.add_argument(
        '--ncores', type=int, default=None,
        help='Number of cores to use when downloading images to read their dimensions')
    args = parser.parse_args()

    assert len(args.dataset_name) > 0, 'dataset_name cannot be an empty string'
    assert os.path.exists(args.mega_db_seqs), 'File at mega_db_seqs path does not exist'
    assert args.out_file.endswith('.json'), 'out_file path needs to end in .json'
    assert args.out_file != args.mega_db_seqs
    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    print('Loading entries...')
    with open(args.mega_db_seqs) as f:
        mega_db_entries = json.load(f)
    print('Number of entries in the mega_db: {}'.format(len(mega_db_entries)))

    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    start_time = time.time()

    updated_seqs = get_image_dims(mega_db_entries, args.dataset_name,
                                  datasets_table, args.ncores)
    write_json(args.out_file, updated_seqs)

    elapsed = time.time() - start_time
    print('Time elapsed: {}'.format(humanfriendly.format_timespan(elapsed)))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'megadb_entries', type=str,
        help='Path to a JSON list of MegaDB entries')
    parser.add_argument(
        'output_dir', action='store', type=str,
        help='Output directory for HTML and rendered images')
    parser.add_argument(
        '--trim_to_images_bboxes_labeled', action='store_true',
        help=('Only include images that have been sent for bbox labeling (but may '
              'actually be empty). Turn this on if QAing annotations.'))
    parser.add_argument(
        '--num_to_visualize', action='store', type=int, default=200,
        help=('Number of images to visualize (all conformant images in a sequence are '
              'shown, so the total may be a few more than specified). Sequences are '
              'shuffled. Defaults to 200. Use -1 to visualize all.'))
    parser.add_argument(
        '--pathsep_replacement', action='store', type=str, default='~',
        help='Replace path separators in relative filenames with another character (default ~)')
    parser.add_argument(
        '-w', '--output_image_width', type=int, default=700,
        help=('An integer indicating the desired width in pixels of the output annotated '
              'images. Use -1 to not resize.'))

    if len(sys.argv[1:]) == 0:
        parser.print_help()
        parser.exit()

    args = parser.parse_args()

    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, 'rendered_images'), exist_ok=True)

    print('Connecting to MegaDB to get the datasets table...')
    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    print('Loading the MegaDB entries...')
    with open(args.megadb_entries) as f:
        sequences = json.load(f)
    print('Total number of sequences: {}'.format(len(sequences)))

    # print('Checking that the MegaDB entries conform to the schema...')
    # sequences_schema_check.sequences_schema_check(sequences)

    shuffle(sequences)
    visualize_sequences(datasets_table, sequences, args)
def get_example_images(megadb_utils: MegadbUtils, dataset_name: str,
                       class_name: str) -> List[Optional[str]]:
    """Gets SAS URLs for images of a particular class from a given dataset."""
    datasets_table = megadb_utils.get_datasets_table()

    # this query should be fairly fast, ~1 sec
    query_both_levels = f'''
    SELECT TOP {NUMBER_SEQUENCES_TO_QUERY} VALUE seq
    FROM seq
    WHERE ARRAY_CONTAINS(seq.class, "{class_name}")
        OR (SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_CONTAINS(im.class, "{class_name}")) > 0
    '''
    sequences = megadb_utils.query_sequences_table(
        query_both_levels, partition_key=dataset_name)

    num_samples = min(len(sequences), NUMBER_EXAMPLES_PER_SPECIES)
    sample_seqs = sample(sequences, num_samples)

    image_urls: List[Optional[str]] = []
    for seq in sample_seqs:
        sample_image = sample(seq['images'], 1)[0]  # sample 1 img per sequence

        img_path = MegadbUtils.get_full_path(
            datasets_table, dataset_name, sample_image['file'])
        img_path = urllib.parse.quote_plus(img_path)

        dataset_info = datasets_table[dataset_name]
        img_url = 'https://{}.blob.core.windows.net/{}/{}{}'.format(
            dataset_info["storage_account"], dataset_info["container"],
            img_path, dataset_info["container_sas_key"])
        image_urls.append(img_url)

    num_missing = NUMBER_EXAMPLES_PER_SPECIES - len(image_urls)
    if num_missing > 0:
        image_urls.extend([None] * num_missing)

    assert len(image_urls) == NUMBER_EXAMPLES_PER_SPECIES
    return image_urls
def get_image_dims(mega_db_seqs, dataset_name, datasets_table, n_cores):
    images_to_get_dims_for = []
    for seq in tqdm(mega_db_seqs):
        assert 'seq_id' in seq and 'images' in seq
        for i in seq['images']:
            assert 'file' in i

        for im in seq['images']:
            if 'bbox' in im and len(im['bbox']) > 1:
                if 'id' not in im:
                    im['id'] = str(uuid.uuid1())
                images_to_get_dims_for.append(im)

    print('Getting the dimensions for {} images'.format(
        len(images_to_get_dims_for)))

    blob_service = MegadbUtils.get_blob_service(datasets_table, dataset_name)
    path_prefix = datasets_table[dataset_name]['path_prefix']
    container_name = datasets_table[dataset_name]['container']

    if n_cores:
        print('Using threads to download images')
        pool = workerpool(n_cores)
        updated_im_objects = pool.map(
            partial(_get_image_dims, blob_service, path_prefix, container_name),
            images_to_get_dims_for)
        print('pool.map has returned')
    else:
        print('Downloading images sequentially')
        updated_im_objects = []
        for image_obj in tqdm(images_to_get_dims_for):
            updated_im_objects.append(
                _get_image_dims(blob_service, path_prefix, container_name,
                                image_obj))
    print('Successfully updated {} images.'.format(len(updated_im_objects)))

    updated_im_objects = {i['id']: i for i in updated_im_objects}

    # update the sequences
    print('Updating the sequence objects...')
    for seq in tqdm(mega_db_seqs):
        updated_images = []
        for im in seq['images']:
            if 'bbox' in im and im['id'] in updated_im_objects:
                updated_images.append(updated_im_objects[im['id']])
            else:
                updated_images.append(im)
        seq['images'] = updated_images

    return mega_db_seqs
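# _get_image_dims is referenced above but not defined in this snippet. Below is
# a minimal sketch of what such a helper might look like, assuming blob_service
# is an azure.storage.blob.BlobServiceClient, that images live under the
# dataset's path_prefix in the given container, and that downstream code (e.g.
# megadb_to_cct.py) expects 'width' and 'height' fields on each image object.
# The real helper may differ; treat this as an illustration only.
import io

from PIL import Image


def _get_image_dims(blob_service, path_prefix, container_name, image_obj):
    blob_path = ('{}/{}'.format(path_prefix, image_obj['file'])
                 if path_prefix else image_obj['file'])
    container_client = blob_service.get_container_client(container_name)
    image_bytes = container_client.download_blob(blob_path).readall()
    image = Image.open(io.BytesIO(image_bytes))
    image_obj['width'], image_obj['height'] = image.size  # PIL size is (w, h)
    return image_obj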
def get_example_images(megadb_utils, dataset_name, class_name):
    datasets_table = megadb_utils.get_datasets_table()

    query_both_levels = '''
    SELECT TOP {} VALUE seq
    FROM seq
    WHERE ARRAY_CONTAINS(seq.class, "{}")
        OR (SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_CONTAINS(im.class, "{}")) > 0
    '''.format(NUMBER_SEQUENCES_TO_QUERY, class_name, class_name)

    sequences = megadb_utils.query_sequences_table(query_both_levels,
                                                   partition_key=dataset_name)

    # sample up to NUMBER_EXAMPLES_PER_SPECIES sequences
    sample_seqs = sample(sequences,
                         min(len(sequences), NUMBER_EXAMPLES_PER_SPECIES))

    image_urls = []
    for seq in sample_seqs:
        sample_image = sample(seq['images'], 1)[0]  # sample one image from each sequence

        img_path = sample_image['file']
        img_path = MegadbUtils.get_full_path(datasets_table, dataset_name, img_path)
        img_path = urllib.parse.quote_plus(img_path)

        dataset_info = datasets_table[dataset_name]
        img_url = 'https://{}.blob.core.windows.net/{}/{}{}'.format(
            dataset_info["storage_account"], dataset_info["container"],
            img_path, dataset_info["container_sas_key"])
        image_urls.append(img_url)

    if len(image_urls) < NUMBER_EXAMPLES_PER_SPECIES:
        image_urls.extend([None] * (NUMBER_EXAMPLES_PER_SPECIES - len(image_urls)))

    assert len(image_urls) == NUMBER_EXAMPLES_PER_SPECIES
    return image_urls
def main():
    parser = argparse.ArgumentParser(
        description='Script to split downloaded image files into train/val/test folders')
    parser.add_argument('file_list')
    parser.add_argument(
        '--origin_dir', required=True,
        help='Path to a directory storing the downloaded files')
    parser.add_argument(
        '--dest_dir', required=True,
        help='Path to a directory where the train/val/test folders are or will be created')
    args = parser.parse_args()

    megadb_utils = MegadbUtils()
    splits_table = megadb_utils.get_splits_table()
    print('Obtained the splits table.')

    assert os.path.exists(args.file_list)
    assert os.path.exists(args.origin_dir)
    os.makedirs(args.dest_dir, exist_ok=True)

    dest_folders = {
        Splits.TRAIN: os.path.join(args.dest_dir, Splits.TRAIN),
        Splits.VAL: os.path.join(args.dest_dir, Splits.VAL),
        Splits.TEST: os.path.join(args.dest_dir, Splits.TEST)
    }
    os.makedirs(dest_folders[Splits.TRAIN], exist_ok=True)
    os.makedirs(dest_folders[Splits.VAL], exist_ok=True)
    os.makedirs(dest_folders[Splits.TEST], exist_ok=True)

    print('Loading file_list...')
    with open(args.file_list) as f:
        file_list = json.load(f)

    counter = defaultdict(lambda: defaultdict(int))
    count = 0
    start_time = time.time()

    for entry in tqdm(file_list):
        which_split = look_up_split(splits_table, entry)

        download_id = entry['download_id'] + '.jpg'
        origin_path = os.path.join(args.origin_dir, download_id)
        if not os.path.exists(origin_path):
            # print('Image not found in origin dir at {}'.format(origin_path))
            continue

        dest_path = os.path.join(args.dest_dir, which_split, download_id)
        move(origin_path, dest_path)

        count += 1
        counter[entry['dataset']][which_split] += 1
        if count % 10000 == 0:
            print(counter)
            print()

    elapsed = time.time() - start_time
    print('Time spent on moving files: {}'.format(humanfriendly.format_timespan(elapsed)))
    print(counter)
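# look_up_split is used above but not shown here. The sketch below is an
# assumption about how it might work: it supposes the splits table maps each
# dataset name to sets of location IDs for the val and test splits, with all
# other locations falling back to train. The actual table schema may differ.
def look_up_split(splits_table, entry):
    dataset_splits = splits_table.get(entry['dataset'], {})
    location = entry.get('location')
    if location in dataset_splits.get(Splits.VAL, set()):
        return Splits.VAL
    if location in dataset_splits.get(Splits.TEST, set()):
        return Splits.TEST
    return Splits.TRAIN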
def visualize_sequences(datasets_table, sequences, args):
    num_images = 0

    images_html = []
    rendering_info = []

    for seq in sequences:
        if 'images' not in seq:
            continue

        # dataset and seq_id are required fields
        dataset_name = seq['dataset']
        seq_id = seq['seq_id']

        # sort the images in the sequence
        images_in_seq = sorted(seq['images'], key=lambda x: x['frame_num']) if len(seq['images']) > 1 else seq['images']

        for im in images_in_seq:
            if args.trim_to_images_bboxes_labeled and 'bbox' not in im:
                continue

            num_images += 1

            blob_path = MegadbUtils.get_full_path(datasets_table, dataset_name, im['file'])
            frame_num = im.get('frame_num', -1)
            im_class = im.get('class', None)
            if im_class is None:  # if no class label on the image, show the class label on the sequence
                im_class = seq.get('class', [])

            rendering = {}
            rendering['blob_service'] = MegadbUtils.get_blob_service(datasets_table, dataset_name)
            rendering['container_name'] = datasets_table[dataset_name]['container']
            rendering['blob_path'] = blob_path
            rendering['bbox'] = im.get('bbox', [])

            annotated_img_name = 'anno_' + blob_path.replace('/', args.pathsep_replacement).replace('\\', args.pathsep_replacement)
            rendering['annotated_img_name'] = annotated_img_name

            rendering_info.append(rendering)

            images_html.append({
                'filename': 'rendered_images/{}'.format(annotated_img_name),
                'title': 'Seq ID: {}. Frame number: {}<br/> Image file: {}<br/> number of boxes: {}, image class labels: {}'.format(
                    seq_id, frame_num, blob_path, len(rendering['bbox']), im_class),
                'textStyle': 'font-family:verdana,arial,calibri;font-size:80%;text-align:left;margin-top:20;margin-bottom:5'
            })

        if num_images >= args.num_to_visualize:
            print('num_images visualized is {}'.format(num_images))
            break

    # pool = ThreadPool()
    render_image_info_partial = partial(render_image_info, args=args)
    # print('len of rendering_info', len(rendering_info))
    # tqdm(pool.imap_unordered(render_image_info_partial, rendering_info), total=len(rendering_info))

    for rendering in tqdm(rendering_info):
        render_image_info_partial(rendering)

    print('Making HTML...')
    html_path = os.path.join(args.output_dir, 'index.html')
    # options = write_html_image_list()
    # options['headerHtml']
    write_html_image_list(
        filename=html_path,
        images=images_html
    )
def query_species_by_dataset(megadb_utils: MegadbUtils, output_dir: str) -> None:
    """For each dataset, creates a JSON file specifying species counts.

    Skips a dataset if a JSON file for it already exists.
    """
    # which datasets are already processed?
    queried_datasets = set(
        i.split('.json')[0] for i in os.listdir(output_dir)
        if i.endswith('.json'))

    datasets_table = megadb_utils.get_datasets_table()
    dataset_names = [i for i in datasets_table if i not in queried_datasets]

    print(f'{len(queried_datasets)} datasets already queried. Querying species '
          f'in {len(dataset_names)} datasets...')

    for dataset_name in dataset_names:
        print(f'Querying dataset {dataset_name}...')

        # sequence-level query should be fairly fast, ~1 sec
        query_seq_level = '''
        SELECT VALUE seq.class
        FROM seq
        WHERE ARRAY_LENGTH(seq.class) > 0
            AND NOT ARRAY_CONTAINS(seq.class, "empty")
            AND NOT ARRAY_CONTAINS(seq.class, "__label_unavailable")
        '''
        results = megadb_utils.query_sequences_table(
            query_seq_level, partition_key=dataset_name)

        counter = Counter()
        for i in results:
            counter.update(i)

        # handle cases where the class field is at the image level (images in a
        # sequence can have different class labels; the 'caltech' dataset is
        # like this). This query may take a long time, >1hr.
        query_image_level = '''
        SELECT VALUE seq.images
        FROM sequences seq
        WHERE (
            SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_LENGTH(im.class) > 0
        ) > 0
        '''
        start = datetime.now()
        results_im = megadb_utils.query_sequences_table(
            query_image_level, partition_key=dataset_name)
        elapsed = (datetime.now() - start).seconds
        print(f'- image-level query took {elapsed}s')

        for seq_images in results_im:
            for im in seq_images:
                assert 'class' in im
                counter.update(im['class'])

        with open(os.path.join(output_dir, f'{dataset_name}.json'), 'w') as f:
            json.dump(counter, f, indent=2)
# e.g. {'name': '@top_n', 'value': 100} - see query_and_upsert_examples/query_for_data.ipynb
query_parameters = None

save_every = 50000
assert save_every > 0

# Set to False if you do not want all results stored in a single JSON.
consolidate_results = True


#%% Script

time_stamp = datetime.utcnow().strftime('%Y%m%d%H%M%S')

db_utils = MegadbUtils()  # reads the CosmosDB endpoint and key from the environment

# execute the query
start_time = time.time()

result_iterable = db_utils.query_sequences_table(query=query,
                                                 partition_key=partition_key,
                                                 parameters=query_parameters)

# loop through and save the results
results = []
item_count = 0
part_count = 0
part_paths = []

for item in result_iterable:
def analyze_images(url_or_path: str,
                   json_keys: Optional[Sequence[str]] = None,
                   account: Optional[str] = None,
                   container: Optional[str] = None,
                   sas_token: Optional[str] = None) -> None:
    """
    Args:
        url_or_path: str, URL or local path to a file containing a list of
            image paths. Each image path is either <blob_name> if account and
            container are given, or <dataset>/<blob_name> if account and
            container are None. The file can either be a newline-delimited
            list of image paths or a JSON file containing image paths.
        json_keys: optional list of str, only relevant if url_or_path is a
            JSON file. If json_keys=None, then the JSON file at url_or_path is
            assumed to be a JSON list of image paths. If json_keys is not None,
            then the JSON file should be a dict whose values corresponding to
            json_keys are lists of image paths.
        account: str, name of Azure Blob Storage account
        container: str, name of Azure Blob Storage container
        sas_token: str, optional SAS token (without leading '?') if the
            container is not publicly accessible
    """
    datasets_table = None
    if (account is None) or (container is None):
        assert account is None
        assert container is None
        assert sas_token is None
        datasets_table = MegadbUtils().get_datasets_table()

    is_json = ('.json' in url_or_path)
    if url_or_path.startswith(('http://', 'https://')):
        r = requests.get(url_or_path)
        if is_json:
            img_paths = r.json()
        else:
            img_paths = r.text.splitlines()
    else:
        with open(url_or_path, 'r') as f:
            if is_json:
                img_paths = json.load(f)
            else:
                # strip trailing newlines, mirroring the URL branch above
                img_paths = f.read().splitlines()

    if is_json and json_keys is not None:
        img_paths_json = img_paths
        img_paths = []
        for k in json_keys:
            img_paths += img_paths_json[k]

    mapping: Dict[str, List[str]] = {
        status: []
        for status in ['good', 'nonexistant', 'non_image', 'truncated', 'bad']
    }

    pool = futures.ThreadPoolExecutor(max_workers=100)
    # lock before changing ImageFile.LOAD_TRUNCATED_IMAGES
    truncated_images_lock = threading.Lock()

    futures_list = []
    for img_path in tqdm(img_paths):
        future = pool.submit(
            check_image_condition, img_path, truncated_images_lock,
            account, container, sas_token, datasets_table)
        futures_list.append(future)

    total = len(futures_list)
    for future in tqdm(futures.as_completed(futures_list), total=total):
        img_file, status = future.result()
        mapping[status].append(img_file)

    for status, img_list in mapping.items():
        print(f'{status}: {len(img_list)}')
        pprint(sorted(img_list))
def main():
    parser = argparse.ArgumentParser(
        description='Script to download image files')
    parser.add_argument('file_list')
    parser.add_argument(
        'store_dir',
        help='Path to a directory to store the downloaded files')
    parser.add_argument('--single_thread', action='store_true')
    parser.add_argument('--only_new_images', action='store_true')
    args = parser.parse_args()

    os.makedirs(args.store_dir, exist_ok=True)

    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()
    print('Obtained the datasets table. Loading the file_list now...')

    with open(args.file_list) as f:
        file_list = json.load(f)

    existing = os.listdir(args.store_dir)
    existing = set(i.split('.jpg')[0] for i in existing)

    file_list_to_download = [
        i for i in file_list if i['download_id'] not in existing
    ]

    if args.only_new_images:
        print('Only going to download new images.')
        file_list_to_download = [
            i for i in file_list_to_download if 'new_entry' in i
        ]

    # if we need to re-download a dataset's images in case of corruption
    # file_list_to_download = [i for i in file_list_to_download if i['dataset'] == 'rspb_gola']

    print('file_list has {} items, still need to download {} items'.format(
        len(file_list), len(file_list_to_download)))

    urls = [
        construct_url(item, datasets_table) for item in file_list_to_download
    ]
    local_paths = [
        os.path.join(args.store_dir, '{}.jpg'.format(item['download_id']))
        for item in file_list_to_download
    ]
    origin_and_dest = zip(urls, local_paths)

    start_time = time.time()

    if args.single_thread:
        print('Starting to download, single threaded...')
        for url, local_path in origin_and_dest:
            download_file(url, local_path)
    else:
        print('Starting to download, using ThreadPool...')
        pool = ThreadPool()
        list(pool.starmap(download_file, origin_and_dest))

    elapsed = time.time() - start_time
    print('Time spent on download: {}'.format(
        humanfriendly.format_timespan(elapsed)))
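# download_file and construct_url are called above but not defined in this
# snippet. A minimal sketch of download_file follows, assuming construct_url
# yields a fully SAS-signed blob URL that can be fetched over plain HTTPS;
# retries and error handling are omitted.
import requests


def download_file(url, local_path):
    # stream the blob to disk to avoid holding large images in memory
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(local_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)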
def main(filetype: str,
         file_list_path: str,
         store_dir: str,
         save_key: Optional[str],
         json_key: Optional[str],
         only_new_images: bool,
         threads: int,
         check_existing_dir: Optional[str]) -> None:
    # input validation
    assert filetype in ['json', 'txt']
    if check_existing_dir is not None:
        assert os.path.isdir(check_existing_dir)
        assert check_existing_dir != store_dir

    if os.path.exists(store_dir):
        assert os.path.isdir(store_dir)
        print('Searching for existing files')
        # existing files, with paths relative to <store_dir>
        existing = set(
            os.path.relpath(os.path.join(dirpath, f), store_dir)
            for dirpath, _, filenames in os.walk(store_dir)
            for f in filenames)
    else:
        print('Creating directory at:', store_dir)
        os.makedirs(store_dir)
        existing = set()

    print('Loading datasets table from MegaDB')
    datasets_table = MegadbUtils().get_datasets_table()

    # parse JSON or TXT file
    print('Processing file list')
    if filetype == 'json':
        filename_to_url, count = process_json(
            file_list_path, save_key, json_key, only_new_images,
            existing=existing, datasets_table=datasets_table)
    else:
        filename_to_url, count = process_txt(
            file_list_path, existing=existing, datasets_table=datasets_table)
    print(f'file_list has {count} items, still need to download '
          f'{len(filename_to_url)} items')

    print('Submitting URLs to download')
    pool = futures.ThreadPoolExecutor(max_workers=threads)
    future_to_filename = {}
    for filename, url in tqdm(filename_to_url.items()):
        future = pool.submit(download_file, url, filename, store_dir,
                             check_existing_dir)
        future_to_filename[future] = filename

    print('Fetching results')
    total = len(future_to_filename)
    failed_filenames = []
    for future in tqdm(futures.as_completed(future_to_filename), total=total):
        filename = future_to_filename[future]
        try:
            future.result()
        except Exception as e:  # pylint: disable=broad-except
            exception_type = type(e).__name__
            tqdm.write(f'{filename} - generated {exception_type}: {e}')
            failed_filenames.append(filename)

    if len(failed_filenames) > 0:
        print(f'{len(failed_filenames)} failed to download. Writing log...')
        date = datetime.now().strftime('%Y%m%d_%H%M%S')  # ex: '20200722_110816'
        with open(f'download_images_failed_{date}.json', 'w') as f:
            json.dump(failed_filenames, f, indent=1)
    else:
        print('Success!')
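# The download_file used here takes a different signature from the one above.
# A minimal sketch under the assumption that check_existing_dir holds
# previously downloaded copies that can be reused instead of re-downloading;
# the real implementation may behave differently.
import os
from shutil import copyfile

import requests


def download_file(url, filename, store_dir, check_existing_dir=None):
    dest_path = os.path.join(store_dir, filename)
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)

    if check_existing_dir is not None:
        existing_path = os.path.join(check_existing_dir, filename)
        if os.path.exists(existing_path):
            copyfile(existing_path, dest_path)  # reuse the existing copy
            return

    response = requests.get(url)
    response.raise_for_status()
    with open(dest_path, 'wb') as f:
        f.write(response.content)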
def visualize_incoming_annotations(args):
    print('Connecting to MegaDB to get the datasets table...')
    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    print('Loading the MegaDB entries...')
    with open(args.megadb_entries) as f:
        sequences = json.load(f)
    print(f'Total number of sequences: {len(sequences)}')
    dataset_seq_images = defaultdict(dict)
    for seq in sequences:
        dataset_seq_images[seq['dataset']][seq['seq_id']] = seq['images']

    print('Loading incoming annotation entries...')
    incoming = IndexedJsonDb(args.incoming_annotation)
    print(f'Number of images in this annotation file: '
          f'{len(incoming.image_id_to_image)}')

    if args.num_to_visualize != -1 and args.num_to_visualize <= len(
            incoming.image_id_to_image):
        incoming_id_to_anno = sample(
            list(incoming.image_id_to_annotations.items()),
            args.num_to_visualize)
    else:
        incoming_id_to_anno = incoming.image_id_to_annotations.items()

    # The file_name field in the incoming JSON looks like
    # alka_squirrels.seq2020_05_07_25C.frame119221.jpg
    # We need to use the dataset, sequence and frame info to find the actual
    # path in blob storage using the sequences.
    images_html = []
    for image_id, annotations in tqdm(incoming_id_to_anno):
        if args.trim_to_images_bboxes_labeled and annotations[0]['category_id'] == 5:
            # category_id 5 is No Object Visible
            continue

        anno_file_name = incoming.image_id_to_image[image_id]['file_name']
        parts = anno_file_name.split('.')
        dataset_name = parts[0]
        seq_id = parts[1].split('seq')[1]
        frame_num = int(parts[2].split('frame')[1])

        im_rel_path = get_image_rel_path(dataset_seq_images, dataset_name,
                                         seq_id, frame_num)
        if im_rel_path is None:
            print(f'Not found in megadb entries: dataset {dataset_name},'
                  f' seq_id {seq_id}, frame_num {frame_num}')
            continue

        im_full_path = megadb_utils.get_full_path(datasets_table, dataset_name,
                                                  im_rel_path)

        # download the image
        container_client = megadb_utils.get_storage_client(datasets_table,
                                                           dataset_name)
        downloader = container_client.download_blob(im_full_path)
        image_file = io.BytesIO()
        downloader.download_to_stream(image_file)
        image = vis_utils.open_image(image_file)

        boxes = [anno['bbox'] for anno in annotations]
        classes = [anno['category_id'] for anno in annotations]

        vis_utils.render_iMerit_boxes(boxes, classes, image,
                                      label_map=incoming.cat_id_to_name)

        file_name = '{}_gtbbox.jpg'.format(
            os.path.splitext(anno_file_name)[0].replace('/', '~'))
        image = vis_utils.resize_image(image, args.output_image_width)
        image.save(os.path.join(args.output_dir, 'rendered_images', file_name))

        images_html.append({
            'filename': '{}/{}'.format('rendered_images', file_name),
            'title': '{}, number of boxes: {}'.format(
                anno_file_name, len([b for b in boxes if len(b) > 0])),
            'textStyle': 'font-family:verdana,arial,calibri;font-size:80%;text-align:left;margin-top:20;margin-bottom:5'
        })

    # Write to HTML
    images_html = sorted(images_html, key=lambda x: x['filename'])
    write_html_image_list(
        filename=os.path.join(args.output_dir, 'index.html'),
        images=images_html,
        options={
            'headerHtml': '<h1>Sample annotations from {}</h1>'.format(
                args.incoming_annotation)
        })

    print('Visualized {} images.'.format(len(images_html)))