Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_dir',
        required=True,
        help=
        'Path to the directory where the JSONs containing species counts for each dataset live'
    )
    parser.add_argument(
        '--query_species',
        action='store_true',
        help=('If flagged, query what species are present in a dataset. '
              'Otherwise, create a spreadsheet for labeling the taxonomy'))
    args = parser.parse_args()

    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    os.makedirs(args.output_dir, exist_ok=True)

    megadb_utils = MegadbUtils()

    if args.query_species:
        query_species_by_dataset(megadb_utils, args.output_dir)

    make_spreadsheet(megadb_utils, args.output_dir)
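Several of these examples assert that the Cosmos DB credentials are present in the environment. A minimal sketch of preparing them from Python before constructing MegadbUtils; the values shown are placeholders, and MegadbUtils is assumed to read these variables when it connects:

import os

# Placeholders only; substitute the real Cosmos DB endpoint and key.
os.environ.setdefault('COSMOS_ENDPOINT', 'https://<account>.documents.azure.com:443/')
os.environ.setdefault('COSMOS_KEY', '<read-only-or-primary-key>')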
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'dataset_name',
        help=
        'The name of the dataset; only entries from this dataset will be used')
    parser.add_argument('mega_db_seqs',
                        help='A json containing a list of sequence objects')
    parser.add_argument(
        'out_file',
        help='Path to store the resulting json to input to megadb_to_cct.py')
    parser.add_argument(
        '--ncores',
        type=int,
        default=None,
        help=
        'Number of cores to use when downloading images to read their dimensions'
    )
    args = parser.parse_args()

    assert len(args.dataset_name) > 0, 'dataset_name cannot be an empty string'
    assert os.path.exists(
        args.mega_db_seqs), 'File at mega_db_seqs path does not exist'
    assert args.out_file.endswith(
        '.json'), 'out_file path needs to end in .json'
    assert args.out_file != args.mega_db_seqs
    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    print('Loading entries...')
    with open(args.mega_db_seqs) as f:
        mega_db_entries = json.load(f)
    print('Number of entries in the mega_db: {}'.format(len(mega_db_entries)))

    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    start_time = time.time()

    updated_seqs = get_image_dims(mega_db_entries, args.dataset_name,
                                  datasets_table, args.ncores)
    write_json(args.out_file, updated_seqs)

    elapsed = time.time() - start_time
    print('Time elapsed: {}'.format(humanfriendly.format_timespan(elapsed)))
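The write_json helper called above is not part of this listing. A plausible minimal implementation, assuming it simply serializes the updated sequences to disk, might be:

import json

def write_json(path, content, indent=1):
    # Hypothetical helper: dump the updated sequence objects to a JSON file.
    with open(path, 'w') as f:
        json.dump(content, f, indent=indent)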
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('megadb_entries', type=str, help='Path to a json list of MegaDB entries')
    parser.add_argument('output_dir', action='store', type=str,
                        help='Output directory for html and rendered images')
    parser.add_argument('--trim_to_images_bboxes_labeled', action='store_true',
                        help='Only include images that have been sent for bbox labeling (but may be actually empty). Turn this on if QAing annotations.')
    parser.add_argument('--num_to_visualize', action='store', type=int, default=200,
                        help='Number of images to visualize (all conformant images in a sequence are shown, so may be a few more than specified). Sequences are shuffled. Defaults to 200. Use -1 to visualize all.')

    parser.add_argument('--pathsep_replacement', action='store', type=str, default='~',
                        help='Replace path separators in relative filenames with another character (default ~)')
    parser.add_argument('-w', '--output_image_width', type=int,
                        help=('an integer indicating the desired width in pixels of the output annotated images. '
                              'Use -1 to not resize.'),
                        default=700)

    if len(sys.argv[1:]) == 0:
        parser.print_help()
        parser.exit()

    args = parser.parse_args()

    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, 'rendered_images'), exist_ok=True)

    print('Connecting to MegaDB to get the datasets table...')
    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    print('Loading the MegaDB entries...')
    with open(args.megadb_entries) as f:
        sequences = json.load(f)
    print('Total number of sequences: {}'.format(len(sequences)))

    # print('Checking that the MegaDB entries conform to the schema...')
    # sequences_schema_check.sequences_schema_check(sequences)

    shuffle(sequences)
    visualize_sequences(datasets_table, sequences, args)
Example #4
def get_example_images(megadb_utils: MegadbUtils, dataset_name: str,
                       class_name: str) -> List[Optional[str]]:
    """Gets SAS URLs for images of a particular class from a given dataset."""
    datasets_table = megadb_utils.get_datasets_table()

    # this query should be fairly fast, ~1 sec
    query_both_levels = f'''
    SELECT TOP {NUMBER_SEQUENCES_TO_QUERY} VALUE seq
    FROM seq
    WHERE ARRAY_CONTAINS(seq.class, "{class_name}")
        OR (SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_CONTAINS(im.class, "{class_name}")) > 0
    '''
    sequences = megadb_utils.query_sequences_table(
        query_both_levels, partition_key=dataset_name)

    num_samples = min(len(sequences), NUMBER_EXAMPLES_PER_SPECIES)
    sample_seqs = sample(sequences, num_samples)

    image_urls: List[Optional[str]] = []
    for seq in sample_seqs:
        sample_image = sample(seq['images'], 1)[0]  # sample 1 img per sequence
        img_path = MegadbUtils.get_full_path(
            datasets_table, dataset_name, sample_image['file'])
        img_path = urllib.parse.quote_plus(img_path)

        dataset_info = datasets_table[dataset_name]
        img_url = 'https://{}.blob.core.windows.net/{}/{}{}'.format(
            dataset_info["storage_account"],
            dataset_info["container"],
            img_path,
            dataset_info["container_sas_key"])
        image_urls.append(img_url)

    num_missing = NUMBER_EXAMPLES_PER_SPECIES - len(image_urls)
    if num_missing > 0:
        image_urls.extend([None] * num_missing)
    assert len(image_urls) == NUMBER_EXAMPLES_PER_SPECIES
    return image_urls
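A hedged usage sketch for the function above; the dataset and class names are placeholders, and NUMBER_EXAMPLES_PER_SPECIES is defined elsewhere in the module:

megadb_utils = MegadbUtils()  # needs COSMOS_ENDPOINT and COSMOS_KEY in the environment
urls = get_example_images(megadb_utils, dataset_name='caltech', class_name='bobcat')
# the list is padded with None up to NUMBER_EXAMPLES_PER_SPECIES entries
for url in urls:
    if url is not None:
        print(url)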
Example #5
def get_image_dims(mega_db_seqs, dataset_name, datasets_table, n_cores):

    images_to_get_dims_for = []

    for seq in tqdm(mega_db_seqs):
        assert 'seq_id' in seq and 'images' in seq
        for i in seq['images']:
            assert 'file' in i

        for im in seq['images']:
            if 'bbox' in im and len(im['bbox']) > 1:
                if 'id' not in im:
                    im['id'] = str(uuid.uuid1())
                images_to_get_dims_for.append(im)

    print('Getting the dimensions for {} images'.format(
        len(images_to_get_dims_for)))

    blob_service = MegadbUtils.get_blob_service(datasets_table, dataset_name)
    path_prefix = datasets_table[dataset_name]['path_prefix']
    container_name = datasets_table[dataset_name]['container']

    if n_cores:
        print('Using threads to download images')
        pool = workerpool(n_cores)
        updated_im_objects = pool.map(
            partial(_get_image_dims, blob_service, path_prefix,
                    container_name), images_to_get_dims_for)
        print('pool.map has returned')
    else:
        print('Downloading images sequentially')
        updated_im_objects = []
        for image_obj in tqdm(images_to_get_dims_for):
            updated_im_objects.append(
                _get_image_dims(blob_service, path_prefix, container_name,
                                image_obj))
    print('Successfully updated {} images.'.format(len(updated_im_objects)))
    updated_im_objects = {i['id']: i for i in updated_im_objects}

    # update the sequences
    print('Updating the sequence objects...')
    for seq in tqdm(mega_db_seqs):
        updated_images = []
        for im in seq['images']:
            if 'bbox' in im and im['id'] in updated_im_objects:
                updated_images.append(updated_im_objects[im['id']])
            else:
                updated_images.append(im)
        seq['images'] = updated_images

    return mega_db_seqs
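The _get_image_dims worker used in the threaded branch above is not shown in this listing. A minimal sketch of what it might look like, assuming a legacy BlockBlobService-style client and Pillow for reading dimensions:

import os
from io import BytesIO
from PIL import Image

def _get_image_dims(blob_service, path_prefix, container_name, im):
    # Hypothetical worker: download one blob and record its pixel dimensions.
    blob_path = os.path.join(path_prefix, im['file']) if path_prefix else im['file']
    stream = BytesIO()
    blob_service.get_blob_to_stream(container_name, blob_path, stream)  # assumes azure-storage-blob v2-style API
    stream.seek(0)
    im['width'], im['height'] = Image.open(stream).size
    return im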
Example #6
def get_example_images(megadb_utils, dataset_name, class_name):
    datasets_table = megadb_utils.get_datasets_table()

    query_both_levels = '''
    SELECT TOP {} VALUE seq
    FROM seq
    WHERE ARRAY_CONTAINS(seq.class, "{}") OR (SELECT VALUE COUNT(im) FROM im IN seq.images WHERE ARRAY_CONTAINS(im.class, "{}")) > 0
    '''.format(NUMBER_SEQUENCES_TO_QUERY, class_name, class_name)

    sequences = megadb_utils.query_sequences_table(query_both_levels,
                                                   partition_key=dataset_name)
    sample_seqs = sample(
        sequences,
        min(len(sequences),
            NUMBER_EXAMPLES_PER_SPECIES))  # sample up to NUMBER_EXAMPLES_PER_SPECIES sequences

    image_urls = []
    for i, seq in enumerate(sample_seqs):
        sample_image = sample(seq['images'],
                              1)[0]  # sample one image from each sequence
        img_path = sample_image['file']

        img_path = MegadbUtils.get_full_path(datasets_table, dataset_name,
                                             img_path)
        img_path = urllib.parse.quote_plus(img_path)

        dataset_info = datasets_table[dataset_name]
        img_url = 'https://{}.blob.core.windows.net/{}/{}{}'.format(
            dataset_info["storage_account"], dataset_info["container"],
            img_path, dataset_info["container_sas_key"])
        image_urls.append(img_url)

    if len(image_urls) < NUMBER_EXAMPLES_PER_SPECIES:
        image_urls.extend([None] *
                          (NUMBER_EXAMPLES_PER_SPECIES - len(image_urls)))
    assert len(image_urls) == NUMBER_EXAMPLES_PER_SPECIES
    return image_urls
Example #7
def main():
    parser = argparse.ArgumentParser(
        description='Script to split downloaded image files into train/val/test folders'
    )
    parser.add_argument(
        'file_list'
    )
    parser.add_argument(
        '--origin_dir',
        required=True,
        help='Path to a directory storing the downloaded files'
    )
    parser.add_argument(
        '--dest_dir',
        required=True,
        help='Path to a directory where the train/val/test folders are or will be created'
    )

    args = parser.parse_args()

    megadb_utils = MegadbUtils()
    splits_table = megadb_utils.get_splits_table()
    print('Obtained the splits table.')

    assert os.path.exists(args.file_list)
    assert os.path.exists(args.origin_dir)

    os.makedirs(args.dest_dir, exist_ok=True)

    dest_folders = {
        Splits.TRAIN: os.path.join(args.dest_dir, Splits.TRAIN),
        Splits.VAL: os.path.join(args.dest_dir, Splits.VAL),
        Splits.TEST: os.path.join(args.dest_dir, Splits.TEST)
    }

    os.makedirs(dest_folders[Splits.TRAIN], exist_ok=True)
    os.makedirs(dest_folders[Splits.VAL], exist_ok=True)
    os.makedirs(dest_folders[Splits.TEST], exist_ok=True)

    print('Loading file_list...')
    with open(args.file_list) as f:
        file_list = json.load(f)

    counter = defaultdict(lambda: defaultdict(int))
    count = 0

    start_time = time.time()
    for entry in tqdm(file_list):
        which_split = look_up_split(splits_table, entry)

        download_id = entry['download_id'] + '.jpg'
        origin_path = os.path.join(args.origin_dir, download_id)
        if not os.path.exists(origin_path):
            # print('Image not found in origin dir at {}'.format(origin_path))
            continue
        dest_path = os.path.join(args.dest_dir, which_split, download_id)
        dest = move(origin_path, dest_path)

        count += 1
        counter[entry['dataset']][which_split] += 1
        if count % 10000 == 0:
            print(counter)
            print()

    elapsed = time.time() - start_time
    print('Time spent on moving files: {}'.format(humanfriendly.format_timespan(elapsed)))
    print(counter)
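The Splits constants referenced above are defined elsewhere; presumably they are simple folder-name constants along these lines (values assumed):

class Splits:
    # Assumed constants matching the train/val/test folders created above.
    TRAIN = 'train'
    VAL = 'val'
    TEST = 'test'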
Example #8
def visualize_sequences(datasets_table, sequences, args):
    num_images = 0

    images_html = []
    rendering_info = []

    for seq in sequences:
        if 'images' not in seq:
            continue

        # dataset and seq_id are required fields
        dataset_name = seq['dataset']
        seq_id = seq['seq_id']

        # sort the images in the sequence by frame number
        if len(seq['images']) > 1:
            images_in_seq = sorted(seq['images'], key=lambda x: x['frame_num'])
        else:
            images_in_seq = seq['images']

        for im in images_in_seq:
            if args.trim_to_images_bboxes_labeled and 'bbox' not in im:
                continue

            num_images += 1

            blob_path = MegadbUtils.get_full_path(datasets_table, dataset_name, im['file'])
            frame_num = im.get('frame_num', -1)
            im_class = im.get('class', None)
            if im_class is None:  # if no class label on the image, show the class label on the sequence
                im_class = seq.get('class', [])

            rendering = {}
            rendering['blob_service'] = MegadbUtils.get_blob_service(datasets_table, dataset_name)
            rendering['container_name'] = datasets_table[dataset_name]['container']
            rendering['blob_path'] = blob_path
            rendering['bbox'] = im.get('bbox', [])

            annotated_img_name = 'anno_' + blob_path.replace('/', args.pathsep_replacement).replace('\\', args.pathsep_replacement)
            rendering['annotated_img_name'] = annotated_img_name

            rendering_info.append(rendering)

            images_html.append({
                'filename': 'rendered_images/{}'.format(annotated_img_name),
                'title': 'Seq ID: {}. Frame number: {}<br/> Image file: {}<br/> number of boxes: {}, image class labels: {}'.format(seq_id, frame_num, blob_path, len(rendering['bbox']), im_class),
                'textStyle': 'font-family:verdana,arial,calibri;font-size:80%;text-align:left;margin-top:20;margin-bottom:5'
            })

        if num_images >= args.num_to_visualize:
            print('num_images visualized is {}'.format(num_images))
            break

    # pool = ThreadPool()
    render_image_info_partial = partial(render_image_info, args=args)
    # print('len of rendering_info', len(rendering_info))
    # tqdm(pool.imap_unordered(render_image_info_partial, rendering_info), total=len(rendering_info))

    for rendering in tqdm(rendering_info):
        render_image_info_partial(rendering)

    print('Making HTML...')

    html_path = os.path.join(args.output_dir, 'index.html')
    # options = write_html_image_list()
    # options['headerHtml']
    write_html_image_list(
        filename=html_path,
        images=images_html
    )
Example #9
def query_species_by_dataset(megadb_utils: MegadbUtils,
                             output_dir: str) -> None:
    """For each dataset, creates a JSON file specifying species counts.

    Skips dataset if a JSON file for it already exists.
    """
    # which datasets are already processed?
    queried_datasets = set(
        i.split('.json')[0] for i in os.listdir(output_dir)
        if i.endswith('.json'))

    datasets_table = megadb_utils.get_datasets_table()
    dataset_names = [i for i in datasets_table if i not in queried_datasets]

    print(
        f'{len(queried_datasets)} datasets already queried. Querying species '
        f'in {len(dataset_names)} datasets...')

    for dataset_name in dataset_names:
        print(f'Querying dataset {dataset_name}...')

        # sequence-level query should be fairly fast, ~1 sec
        query_seq_level = '''
        SELECT VALUE seq.class
        FROM seq
        WHERE ARRAY_LENGTH(seq.class) > 0
            AND NOT ARRAY_CONTAINS(seq.class, "empty")
            AND NOT ARRAY_CONTAINS(seq.class, "__label_unavailable")
        '''
        results = megadb_utils.query_sequences_table(
            query_seq_level, partition_key=dataset_name)

        counter = Counter()
        for i in results:
            counter.update(i)

        # handle cases where the class field is at the image level (images in a
        # sequence may have different class labels; the 'caltech' dataset is like this)
        # this query may take a long time, >1hr
        query_image_level = '''
        SELECT VALUE seq.images
        FROM sequences seq
        WHERE (
            SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_LENGTH(im.class) > 0
        ) > 0
        '''

        start = datetime.now()
        results_im = megadb_utils.query_sequences_table(
            query_image_level, partition_key=dataset_name)
        elapsed = (datetime.now() - start).seconds
        print(f'- image-level query took {elapsed}s')

        for seq_images in results_im:
            for im in seq_images:
                assert 'class' in im
                counter.update(im['class'])

        with open(os.path.join(output_dir, f'{dataset_name}.json'), 'w') as f:
            json.dump(counter, f, indent=2)
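A hedged sketch of reading the per-dataset JSONs written above back into one combined species count; it assumes only the layout produced by this function (one dumped Counter per dataset):

import json
import os
from collections import Counter

def load_species_counts(output_dir):
    # Combine the per-dataset counters written by query_species_by_dataset.
    combined = Counter()
    for fn in os.listdir(output_dir):
        if fn.endswith('.json'):
            with open(os.path.join(output_dir, fn)) as f:
                combined.update(json.load(f))
    return combined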
Example #10
# e.g. {'name': '@top_n', 'value': 100} - see query_and_upsert_examples/query_for_data.ipynb
query_parameters = None

save_every = 50000
assert save_every > 0

# Set to False if you do not want all results stored in a single JSON.
consolidate_results = True


#%% Script

time_stamp = datetime.utcnow().strftime('%Y%m%d%H%M%S')

db_utils = MegadbUtils()  # read the CosmosDB endpoint and key from the environment

# execute the query
start_time = time.time()

result_iterable = db_utils.query_sequences_table(query=query,
                                                 partition_key=partition_key,
                                                 parameters=query_parameters)

# loop through and save the results
results = []
item_count = 0
part_count = 0
part_paths = []

for item in result_iterable:
Example #11
def analyze_images(url_or_path: str, json_keys: Optional[Sequence[str]] = None,
                   account: Optional[str] = None,
                   container: Optional[str] = None,
                   sas_token: Optional[str] = None) -> None:
    """
    Args:
        url_or_path: str, URL or local path to a file containing a list
            of image paths. Each image path is either <blob_name> if account and
            container are given, or <dataset>/<blob_name> if account and
            container are None. File can either be a list of image paths, or a
            JSON file containing image paths.
        json_keys: optional list of str, only relevant if url_or_path is a JSON
            file. If json_keys=None, then the JSON file at url_or_path is
            assumed to be a JSON list of image paths. If json_keys is not None,
            then the JSON file should be a dict, whose values corresponding to
            json_keys are lists of image paths.
        account: str, name of Azure Blob Storage account
        container: str, name of Azure Blob Storage container
        sas_token: str, optional SAS token (without leading '?') if the
            container is not publicly accessible
    """
    datasets_table = None
    if (account is None) or (container is None):
        assert account is None
        assert container is None
        assert sas_token is None
        datasets_table = MegadbUtils().get_datasets_table()

    is_json = ('.json' in url_or_path)
    if url_or_path.startswith(('http://', 'https://')):
        r = requests.get(url_or_path)
        if is_json:
            img_paths = r.json()
        else:
            img_paths = r.text.splitlines()
    else:
        with open(url_or_path, 'r') as f:
            if is_json:
                img_paths = json.load(f)
            else:
                img_paths = f.read().splitlines()

    if is_json and json_keys is not None:
        img_paths_json = img_paths
        img_paths = []
        for k in json_keys:
            img_paths += img_paths_json[k]

    mapping: Dict[str, List[str]] = {
        status: []
        for status in ['good', 'nonexistant', 'non_image', 'truncated', 'bad']
    }

    pool = futures.ThreadPoolExecutor(max_workers=100)

    # lock before changing ImageFile.LOAD_TRUNCATED_IMAGES
    truncated_images_lock = threading.Lock()

    futures_list = []
    for img_path in tqdm(img_paths):
        future = pool.submit(
            check_image_condition, img_path, truncated_images_lock, account,
            container, sas_token, datasets_table)
        futures_list.append(future)

    total = len(futures_list)
    for future in tqdm(futures.as_completed(futures_list), total=total):
        img_file, status = future.result()
        mapping[status].append(img_file)

    for status, img_list in mapping.items():
        print(f'{status}: {len(img_list)}')
        pprint(sorted(img_list))
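A usage sketch based on the docstring above; the file names, account and container are placeholders:

# Explicit account/container for a public container (sas_token only needed if private).
analyze_images('image_list.txt', account='mystorageaccount', container='images')

# A JSON dict keyed by split, with <dataset>/<blob_name> paths resolved via MegaDB.
analyze_images('splits.json', json_keys=['train', 'val'])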
Example #12
def main():
    parser = argparse.ArgumentParser(
        description='Script to download image files')
    parser.add_argument('file_list')
    parser.add_argument(
        'store_dir', help='Path to a directory to store the downloaded files')
    parser.add_argument('--single_thread', action='store_true')
    parser.add_argument('--only_new_images', action='store_true')

    args = parser.parse_args()

    os.makedirs(args.store_dir, exist_ok=True)

    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()
    print('Obtained the datasets table. Loading the file_list now...')

    with open(args.file_list) as f:
        file_list = json.load(f)

    existing = {i.split('.jpg')[0] for i in os.listdir(args.store_dir)}

    file_list_to_download = [
        i for i in file_list if i['download_id'] not in existing
    ]

    if args.only_new_images:
        print('Only going to download new images.')
        file_list_to_download = [
            i for i in file_list_to_download if 'new_entry' in i
        ]

    # if need to re-download a dataset's images in case of corruption
    # file_list_to_download = [i for i in file_list_to_download if i['dataset'] == 'rspb_gola']

    print('file_list has {} items, still need to download {} items'.format(
        len(file_list), len(file_list_to_download)))

    urls = [
        construct_url(item, datasets_table) for item in file_list_to_download
    ]
    local_paths = [
        os.path.join(args.store_dir, '{}.jpg'.format(item['download_id']))
        for item in file_list_to_download
    ]

    origin_and_dest = zip(urls, local_paths)

    start_time = time.time()
    if args.single_thread:
        print('Starting to download, single threaded...')
        for url, local_path in origin_and_dest:
            download_file(url, local_path)
    else:
        print('Starting to download, using ThreadPool...')
        pool = ThreadPool()
        list(pool.starmap(download_file, origin_and_dest))

    elapsed = time.time() - start_time
    print('Time spent on download: {}'.format(
        humanfriendly.format_timespan(elapsed)))
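construct_url is not included in this listing. Given the SAS-URL pattern used in the get_example_images examples above, it presumably looks something like this, assuming each file_list item carries 'dataset' and 'file' fields:

import urllib.parse

def construct_url(item, datasets_table):
    # Hypothetical helper mirroring the blob-URL pattern used elsewhere in these examples.
    dataset_info = datasets_table[item['dataset']]
    img_path = MegadbUtils.get_full_path(datasets_table, item['dataset'], item['file'])
    img_path = urllib.parse.quote_plus(img_path)
    return 'https://{}.blob.core.windows.net/{}/{}{}'.format(
        dataset_info['storage_account'], dataset_info['container'],
        img_path, dataset_info['container_sas_key'])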
Example #13
def main(filetype: str, file_list_path: str, store_dir: str,
         save_key: Optional[str], json_key: Optional[str],
         only_new_images: bool, threads: int,
         check_existing_dir: Optional[str]) -> None:
    # input validation
    assert filetype in ['json', 'txt']
    if check_existing_dir is not None:
        assert os.path.isdir(check_existing_dir)
        assert check_existing_dir != store_dir

    if os.path.exists(store_dir):
        assert os.path.isdir(store_dir)
        print('Searching for existing files')
        # existing files, with paths relative to <store_dir>
        existing = set(
            os.path.relpath(os.path.join(dirpath, f), store_dir)
            for dirpath, _, filenames in os.walk(store_dir) for f in filenames)
    else:
        print('Creating directory at:', store_dir)
        os.makedirs(store_dir)
        existing = set()

    print('Loading datasets table from MegaDB')
    datasets_table = MegadbUtils().get_datasets_table()

    # parse JSON or TXT file
    print('Processing file list')
    if filetype == 'json':
        filename_to_url, count = process_json(file_list_path,
                                              save_key,
                                              json_key,
                                              only_new_images,
                                              existing=existing,
                                              datasets_table=datasets_table)
    else:
        filename_to_url, count = process_txt(file_list_path,
                                             existing=existing,
                                             datasets_table=datasets_table)

    print(f'file_list has {count} items, still need to download '
          f'{len(filename_to_url)} items')

    print('Submitting URLs to download')
    pool = futures.ThreadPoolExecutor(max_workers=threads)
    future_to_filename = {}
    for filename, url in tqdm(filename_to_url.items()):
        future = pool.submit(download_file, url, filename, store_dir,
                             check_existing_dir)
        future_to_filename[future] = filename

    print('Fetching results')
    total = len(future_to_filename)
    failed_filenames = []
    for future in tqdm(futures.as_completed(future_to_filename), total=total):
        filename = future_to_filename[future]
        try:
            future.result()
        except Exception as e:  # pylint: disable=broad-except
            exception_type = type(e).__name__
            tqdm.write(f'{filename} - generated {exception_type}: {e}')
            failed_filenames.append(filename)

    if len(failed_filenames) > 0:
        print(f'{len(failed_filenames)} failed to download. Writing log...')
        date = datetime.now().strftime(
            '%Y%m%d_%H%M%S')  # ex: '20200722_110816'
        with open(f'download_images_failed_{date}.json', 'w') as f:
            json.dump(failed_filenames, f, indent=1)
    else:
        print('Success!')
Example #14
def visualize_incoming_annotations(args):
    print('Connecting to MegaDB to get the datasets table...')
    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    print('Loading the MegaDB entries...')
    with open(args.megadb_entries) as f:
        sequences = json.load(f)
    print(f'Total number of sequences: {len(sequences)}')
    dataset_seq_images = defaultdict(dict)
    for seq in sequences:
        dataset_seq_images[seq['dataset']][seq['seq_id']] = seq['images']

    print('Loading incoming annotation entries...')
    incoming = IndexedJsonDb(args.incoming_annotation)
    print(
        f'Number of images in this annotation file: {len(incoming.image_id_to_image)}'
    )

    if args.num_to_visualize != -1 and args.num_to_visualize <= len(
            incoming.image_id_to_image):
        incoming_id_to_anno = sample(
            list(incoming.image_id_to_annotations.items()),
            args.num_to_visualize)
    else:
        incoming_id_to_anno = incoming.image_id_to_annotations.items()

    # The file_name field in the incoming json looks like
    # alka_squirrels.seq2020_05_07_25C.frame119221.jpg; we need to use the
    # dataset, sequence and frame info from it to find the actual image path in
    # blob storage via the MegaDB sequences
    images_html = []
    for image_id, annotations in tqdm(incoming_id_to_anno):
        if args.trim_to_images_bboxes_labeled and annotations[0][
                'category_id'] == 5:
            # category_id 5 is No Object Visible
            continue

        anno_file_name = incoming.image_id_to_image[image_id]['file_name']
        parts = anno_file_name.split('.')
        dataset_name = parts[0]
        seq_id = parts[1].split('seq')[1]
        frame_num = int(parts[2].split('frame')[1])

        im_rel_path = get_image_rel_path(dataset_seq_images, dataset_name,
                                         seq_id, frame_num)
        if im_rel_path is None:
            print(f'Not found in megadb entries: dataset {dataset_name},'
                  f' seq_id {seq_id}, frame_num {frame_num}')
            continue

        im_full_path = megadb_utils.get_full_path(datasets_table, dataset_name,
                                                  im_rel_path)

        # download the image
        container_client = megadb_utils.get_storage_client(
            datasets_table, dataset_name)
        downloader = container_client.download_blob(im_full_path)
        image_file = io.BytesIO()
        blob_props = downloader.download_to_stream(image_file)
        image = vis_utils.open_image(image_file)

        boxes = [anno['bbox'] for anno in annotations]
        classes = [anno['category_id'] for anno in annotations]

        vis_utils.render_iMerit_boxes(boxes,
                                      classes,
                                      image,
                                      label_map=incoming.cat_id_to_name)

        file_name = '{}_gtbbox.jpg'.format(
            os.path.splitext(anno_file_name)[0].replace('/', '~'))
        image = vis_utils.resize_image(image, args.output_image_width)
        image.save(os.path.join(args.output_dir, 'rendered_images', file_name))

        images_html.append({
            'filename':
            '{}/{}'.format('rendered_images', file_name),
            'title':
            '{}, number of boxes: {}'.format(
                anno_file_name, len([b for b in boxes if len(b) > 0])),
            'textStyle':
            'font-family:verdana,arial,calibri;font-size:80%;text-align:left;margin-top:20;margin-bottom:5'
        })

    # Write to HTML
    images_html = sorted(images_html, key=lambda x: x['filename'])
    write_html_image_list(filename=os.path.join(args.output_dir, 'index.html'),
                          images=images_html,
                          options={
                              'headerHtml':
                              '<h1>Sample annotations from {}</h1>'.format(
                                  args.incoming_annotation)
                          })

    print('Visualized {} images.'.format(len(images_html)))
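get_image_rel_path is not shown in this listing. Based on the dataset_seq_images lookup built above (dataset -> seq_id -> list of image objects with 'frame_num' and 'file'), a minimal sketch might be:

def get_image_rel_path(dataset_seq_images, dataset_name, seq_id, frame_num):
    # Hypothetical helper: find the image entry matching this frame in the MegaDB sequence.
    images = dataset_seq_images.get(dataset_name, {}).get(seq_id)
    if images is None:
        return None
    for im in images:
        if im.get('frame_num') == frame_num:
            return im['file']
    return None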