def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'dataset_name',
        help='The name of the dataset; only entries from this dataset will be used')
    parser.add_argument(
        'mega_db_seqs',
        help='A json containing a list of sequence objects')
    parser.add_argument(
        'out_file',
        help='Path to store the resulting json to input to megadb_to_cct.py')
    parser.add_argument(
        '--ncores', type=int, default=None,
        help='Number of cores to use when downloading images to read their dimensions')
    args = parser.parse_args()

    assert len(args.dataset_name) > 0, 'dataset_name cannot be an empty string'
    assert os.path.exists(
        args.mega_db_seqs), 'File at mega_db_seqs path does not exist'
    assert args.out_file.endswith(
        '.json'), 'out_file path needs to end in .json'
    assert args.out_file != args.mega_db_seqs
    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    print('Loading entries...')
    with open(args.mega_db_seqs) as f:
        mega_db_entries = json.load(f)
    print('Number of entries in the mega_db: {}'.format(len(mega_db_entries)))

    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    start_time = time.time()

    updated_seqs = get_image_dims(mega_db_entries, args.dataset_name,
                                  datasets_table, args.ncores)
    write_json(args.out_file, updated_seqs)

    elapsed = time.time() - start_time
    print('Time elapsed: {}'.format(humanfriendly.format_timespan(elapsed)))
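
# The write_json helper called above is not defined on this page. A minimal
# sketch of what it presumably does (an assumption, not the original
# implementation; the indent level is illustrative):
def write_json(path, content, indent=1):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(content, f, indent=indent)
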
Example 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('megadb_entries', type=str, help='Path to a json list of MegaDB entries')
    parser.add_argument('output_dir', action='store', type=str,
                        help='Output directory for html and rendered images')
    parser.add_argument('--trim_to_images_bboxes_labeled', action='store_true',
                        help='Only include images that have been sent for bbox labeling (but may be actually empty). Turn this on if QAing annotations.')
    parser.add_argument('--num_to_visualize', action='store', type=int, default=200,
                        help='Number of images to visualize (all comformant images in a sequence are shown, so may be a few more than specified). Sequences are shuffled. Defaults to 200. Use -1 to visualize all.')

    parser.add_argument('--pathsep_replacement', action='store', type=str, default='~',
                        help='Replace path separators in relative filenames with another character (default ~)')
    parser.add_argument('-w', '--output_image_width', type=int,
                        help=('An integer indicating the desired width in pixels of the output annotated images. '
                              'Use -1 to not resize.'),
                        default=700)

    if len(sys.argv[1:]) == 0:
        parser.print_help()
        parser.exit()

    args = parser.parse_args()

    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(os.path.join(args.output_dir, 'rendered_images'), exist_ok=True)

    print('Connecting to MegaDB to get the datasets table...')
    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    print('Loading the MegaDB entries...')
    with open(args.megadb_entries) as f:
        sequences = json.load(f)
    print('Total number of sequences: {}'.format(len(sequences)))

    # print('Checking that the MegaDB entries conform to the schema...')
    # sequences_schema_check.sequences_schema_check(sequences)

    shuffle(sequences)
    visualize_sequences(datasets_table, sequences, args)
Example 3
def get_example_images(megadb_utils: MegadbUtils, dataset_name: str,
                       class_name: str) -> List[Optional[str]]:
    """Gets SAS URLs for images of a particular class from a given dataset."""
    datasets_table = megadb_utils.get_datasets_table()

    # this query should be fairly fast, ~1 sec
    query_both_levels = f'''
    SELECT TOP {NUMBER_SEQUENCES_TO_QUERY} VALUE seq
    FROM seq
    WHERE ARRAY_CONTAINS(seq.class, "{class_name}")
        OR (SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_CONTAINS(im.class, "{class_name}")) > 0
    '''
    sequences = megadb_utils.query_sequences_table(
        query_both_levels, partition_key=dataset_name)

    num_samples = min(len(sequences), NUMBER_EXAMPLES_PER_SPECIES)
    sample_seqs = sample(sequences, num_samples)

    image_urls: List[Optional[str]] = []
    for seq in sample_seqs:
        sample_image = sample(seq['images'], 1)[0]  # sample 1 img per sequence
        img_path = MegadbUtils.get_full_path(
            datasets_table, dataset_name, sample_image['file'])
        img_path = urllib.parse.quote_plus(img_path)

        dataset_info = datasets_table[dataset_name]
        img_url = 'https://{}.blob.core.windows.net/{}/{}{}'.format(
            dataset_info["storage_account"],
            dataset_info["container"],
            img_path,
            dataset_info["container_sas_key"])
        image_urls.append(img_url)

    num_missing = NUMBER_EXAMPLES_PER_SPECIES - len(image_urls)
    if num_missing > 0:
        image_urls.extend([None] * num_missing)
    assert len(image_urls) == NUMBER_EXAMPLES_PER_SPECIES
    return image_urls
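
# A hedged usage sketch for the function above. The dataset and class names
# here are made up for illustration, and NUMBER_SEQUENCES_TO_QUERY /
# NUMBER_EXAMPLES_PER_SPECIES are module-level constants assumed to be defined
# elsewhere in the original script:
if __name__ == '__main__':
    utils = MegadbUtils()  # requires COSMOS_ENDPOINT and COSMOS_KEY to be set
    example_urls = get_example_images(utils, dataset_name='caltech',
                                      class_name='empty')
    for url in example_urls:
        print(url)  # entries are None when fewer examples were found than requested
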
Example 4
def query_species_by_dataset(megadb_utils: MegadbUtils,
                             output_dir: str) -> None:
    """For each dataset, creates a JSON file specifying species counts.

    Skips dataset if a JSON file for it already exists.
    """
    # which datasets are already processed?
    queried_datasets = set(
        i.split('.json')[0] for i in os.listdir(output_dir)
        if i.endswith('.json'))

    datasets_table = megadb_utils.get_datasets_table()
    dataset_names = [i for i in datasets_table if i not in queried_datasets]

    print(
        f'{len(queried_datasets)} datasets already queried. Querying species '
        f'in {len(dataset_names)} datasets...')

    for dataset_name in dataset_names:
        print(f'Querying dataset {dataset_name}...')

        # sequence-level query should be fairly fast, ~1 sec
        query_seq_level = '''
        SELECT VALUE seq.class
        FROM seq
        WHERE ARRAY_LENGTH(seq.class) > 0
            AND NOT ARRAY_CONTAINS(seq.class, "empty")
            AND NOT ARRAY_CONTAINS(seq.class, "__label_unavailable")
        '''
        results = megadb_utils.query_sequences_table(
            query_seq_level, partition_key=dataset_name)

        counter = Counter()
        for i in results:
            counter.update(i)

        # cases when the class field is on the image level (images in a sequence
        # that had different class labels, 'caltech' dataset is like this)
        # this query may take a long time, >1hr
        query_image_level = '''
        SELECT VALUE seq.images
        FROM sequences seq
        WHERE (
            SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_LENGTH(im.class) > 0
        ) > 0
        '''

        start = datetime.now()
        results_im = megadb_utils.query_sequences_table(
            query_image_level, partition_key=dataset_name)
        elapsed = (datetime.now() - start).seconds
        print(f'- image-level query took {elapsed}s')

        for seq_images in results_im:
            for im in seq_images:
                assert 'class' in im
                counter.update(im['class'])

        with open(os.path.join(output_dir, f'{dataset_name}.json'), 'w') as f:
            json.dump(counter, f, indent=2)
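
# A hedged usage sketch (the output directory name is illustrative; as in the
# other scripts on this page, MegadbUtils() assumes COSMOS_ENDPOINT and
# COSMOS_KEY are set in the environment):
if __name__ == '__main__':
    out_dir = 'species_by_dataset'
    os.makedirs(out_dir, exist_ok=True)  # the function lists this directory, so it must exist
    query_species_by_dataset(MegadbUtils(), output_dir=out_dir)
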
Example 5
def main():
    parser = argparse.ArgumentParser(
        description='Script to download image files')
    parser.add_argument('file_list')
    parser.add_argument(
        'store_dir', help='Path to a directory to store the downloaded files')
    parser.add_argument('--single_thread', action='store_true')
    parser.add_argument('--only_new_images', action='store_true')

    args = parser.parse_args()

    os.makedirs(args.store_dir, exist_ok=True)

    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()
    print('Obtained the datasets table. Loading the file_list now...')

    with open(args.file_list) as f:
        file_list = json.load(f)

    existing = os.listdir(args.store_dir)
    existing = set([i.split('.jpg')[0] for i in existing])

    file_list_to_download = [
        i for i in file_list if i['download_id'] not in existing
    ]

    if args.only_new_images:
        print('Only going to download new images.')
        file_list_to_download = [
            i for i in file_list_to_download if 'new_entry' in i
        ]

    # if need to re-download a dataset's images in case of corruption
    # file_list_to_download = [i for i in file_list_to_download if i['dataset'] == 'rspb_gola']

    print('file_list has {} items, still need to download {} items'.format(
        len(file_list), len(file_list_to_download)))

    urls = [
        construct_url(item, datasets_table) for item in file_list_to_download
    ]
    local_paths = [
        os.path.join(args.store_dir, '{}.jpg'.format(item['download_id']))
        for item in file_list_to_download
    ]

    origin_and_dest = zip(urls, local_paths)

    start_time = time.time()
    if args.single_thread:
        print('Starting to download, single threaded...')
        for url, local_path in origin_and_dest:
            download_file(url, local_path)
    else:
        print('Starting to download, using ThreadPool...')
        pool = ThreadPool()
        list(pool.starmap(download_file, origin_and_dest))

    elapsed = time.time() - start_time
    print('Time spent on download: {}'.format(
        humanfriendly.format_timespan(elapsed)))
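
# construct_url and download_file are defined elsewhere in the original
# script. Below is a minimal sketch of construct_url, assuming each file_list
# item carries 'dataset' and 'file' keys (an assumption) and that the blob URL
# follows the same pattern used in get_example_images in Example 3:
def construct_url(item, datasets_table):
    dataset_name = item['dataset']
    dataset_info = datasets_table[dataset_name]
    blob_path = MegadbUtils.get_full_path(datasets_table, dataset_name,
                                          item['file'])
    return 'https://{}.blob.core.windows.net/{}/{}{}'.format(
        dataset_info['storage_account'],
        dataset_info['container'],
        urllib.parse.quote_plus(blob_path),  # URL-encoded as in Example 3
        dataset_info['container_sas_key'])
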
def visualize_incoming_annotations(args):
    print('Connecting to MegaDB to get the datasets table...')
    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    print('Loading the MegaDB entries...')
    with open(args.megadb_entries) as f:
        sequences = json.load(f)
    print(f'Total number of sequences: {len(sequences)}')
    dataset_seq_images = defaultdict(dict)
    for seq in sequences:
        dataset_seq_images[seq['dataset']][seq['seq_id']] = seq['images']

    print('Loading incoming annotation entries...')
    incoming = IndexedJsonDb(args.incoming_annotation)
    print(
        f'Number of images in this annotation file: {len(incoming.image_id_to_image)}'
    )

    if args.num_to_visualize != -1 and args.num_to_visualize <= len(
            incoming.image_id_to_image):
        incoming_id_to_anno = sample(
            list(incoming.image_id_to_annotations.items()),
            args.num_to_visualize)
    else:
        incoming_id_to_anno = incoming.image_id_to_annotations.items()

    # The file_name field in the incoming json looks like alka_squirrels.seq2020_05_07_25C.frame119221.jpg
    # we need to use the dataset, sequence and frame info to find the actual path in blob storage
    # using the sequences
    images_html = []
    for image_id, annotations in tqdm(incoming_id_to_anno):
        if args.trim_to_images_bboxes_labeled and annotations[0]['category_id'] == 5:
            # category_id 5 is No Object Visible
            continue

        anno_file_name = incoming.image_id_to_image[image_id]['file_name']
        parts = anno_file_name.split('.')
        dataset_name = parts[0]
        seq_id = parts[1].split('seq')[1]
        frame_num = int(parts[2].split('frame')[1])

        im_rel_path = get_image_rel_path(dataset_seq_images, dataset_name,
                                         seq_id, frame_num)
        if im_rel_path is None:
            print(f'Not found in megadb entries: dataset {dataset_name},'
                  f' seq_id {seq_id}, frame_num {frame_num}')
            continue

        im_full_path = megadb_utils.get_full_path(datasets_table, dataset_name,
                                                  im_rel_path)

        # download the image
        container_client = megadb_utils.get_storage_client(
            datasets_table, dataset_name)
        downloader = container_client.download_blob(im_full_path)
        image_file = io.BytesIO()
        blob_props = downloader.download_to_stream(image_file)
        image = vis_utils.open_image(image_file)

        boxes = [anno['bbox'] for anno in annotations]
        classes = [anno['category_id'] for anno in annotations]

        vis_utils.render_iMerit_boxes(boxes,
                                      classes,
                                      image,
                                      label_map=incoming.cat_id_to_name)

        file_name = '{}_gtbbox.jpg'.format(
            os.path.splitext(anno_file_name)[0].replace('/', '~'))
        image = vis_utils.resize_image(image, args.output_image_width)
        image.save(os.path.join(args.output_dir, 'rendered_images', file_name))

        images_html.append({
            'filename': 'rendered_images/{}'.format(file_name),
            'title': '{}, number of boxes: {}'.format(
                anno_file_name, len([b for b in boxes if len(b) > 0])),
            'textStyle': ('font-family:verdana,arial,calibri;font-size:80%;'
                          'text-align:left;margin-top:20;margin-bottom:5')
        })

    # Write to HTML
    images_html = sorted(images_html, key=lambda x: x['filename'])
    write_html_image_list(
        filename=os.path.join(args.output_dir, 'index.html'),
        images=images_html,
        options={
            'headerHtml': '<h1>Sample annotations from {}</h1>'.format(
                args.incoming_annotation)
        })

    print('Visualized {} images.'.format(len(images_html)))
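
# A hedged sketch of a command-line entry point for the function above; the
# original one is not shown on this page. Argument names and defaults mirror
# the attributes accessed in the function body and the parser in Example 2:
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('incoming_annotation', help='Path to the incoming annotation json')
    parser.add_argument('megadb_entries', help='Path to a json list of MegaDB entries')
    parser.add_argument('output_dir', help='Output directory for html and rendered images')
    parser.add_argument('--trim_to_images_bboxes_labeled', action='store_true')
    parser.add_argument('--num_to_visualize', type=int, default=200)
    parser.add_argument('--output_image_width', type=int, default=700)
    args = parser.parse_args()
    os.makedirs(os.path.join(args.output_dir, 'rendered_images'), exist_ok=True)
    visualize_incoming_annotations(args)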