Example #1
def construct_url(img_path: str,
                  datasets_table: Mapping[str, Any],
                  dataset_name: Optional[str] = None) -> str:
    """Builds Azure SAS storage URL.

    Args:
        img_path: str, either <dataset_name>/<blob> (set dataset_name=None)
            or <img_file> without the 'path_prefix' from datasets_table
        datasets_table: dict, from MegaDB
        dataset_name: optional str

    Returns: str, URL with SAS token
    """
    if dataset_name is None:
        dataset_name, blob = img_path.split('/', maxsplit=1)
    else:
        blob = img_path
        path_prefix = datasets_table[dataset_name].get('path_prefix', '')
        if len(path_prefix) > 0:
            blob = path_prefix + '/' + blob

    sas_token = datasets_table[dataset_name]['container_sas_key']
    if sas_token[0] == '?':
        sas_token = sas_token[1:]

    url = build_azure_storage_uri(
        account=datasets_table[dataset_name]['storage_account'],
        container=datasets_table[dataset_name]['container'],
        blob=blob,
        sas_token=sas_token)
    return url
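A minimal usage sketch for construct_url. The dataset entry below is hypothetical and only illustrates the fields the function reads (storage_account, container, container_sas_key, path_prefix); it assumes build_azure_storage_uri from sas_blob_utils is in scope, as in the example above.

demo_datasets_table = {
    'demo_dataset': {
        'storage_account': 'demoaccount',
        'container': 'demo-container',
        'container_sas_key': '?sv=2020-01-01&sig=FAKE',
        'path_prefix': 'images'
    }
}

# img_path starts with the dataset name, so dataset_name stays None
url1 = construct_url('demo_dataset/images/cam01/0001.jpg', demo_datasets_table)

# img_path is relative to 'path_prefix', so dataset_name is passed explicitly;
# this produces the same URL as the call above
url2 = construct_url('cam01/0001.jpg', demo_datasets_table,
                     dataset_name='demo_dataset')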
Example #2
def get_image_sas_uris(img_paths: Iterable[str]) -> List[str]:
    """Converts a image paths to Azure Blob Storage blob URIs with SAS tokens.

    Args:
        img_paths: list of str, <dataset-name>/<image-filename>

    Returns:
        image_sas_uris: list of str, image blob URIs with SAS tokens, ready to
            pass to the batch detection API
    """
    # we need the datasets table for getting SAS keys
    datasets_table = megadb_utils.MegadbUtils().get_datasets_table()

    image_sas_uris = []
    for img_path in img_paths:
        dataset, img_file = img_path.split('/', maxsplit=1)

        # strip leading '?' from SAS token
        sas_token = datasets_table[dataset]['container_sas_key']
        if sas_token[0] == '?':
            sas_token = sas_token[1:]

        image_sas_uri = sas_blob_utils.build_azure_storage_uri(
            account=datasets_table[dataset]['storage_account'],
            container=datasets_table[dataset]['container'],
            blob=img_file,
            sas_token=sas_token)
        image_sas_uris.append(image_sas_uri)
    return image_sas_uris
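A hedged usage sketch for get_image_sas_uris; it requires MegaDB access through megadb_utils, and the paths below are hypothetical.

img_paths = ['demo_dataset/cam01/0001.jpg',
             'demo_dataset/cam01/0002.jpg']
image_sas_uris = get_image_sas_uris(img_paths)
# one SAS URI per input path, in the same order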
def upload_file_to_blob(account_name: str, container_name: str,
                        local_path: str, blob_name: str,
                        sas_token: str) -> str:
    """Uploads a local file to Azure Blob Storage and returns the uploaded
    blob URI with SAS token."""
    container_uri = sas_blob_utils.build_azure_storage_uri(
        account=account_name, container=container_name, sas_token=sas_token)
    with open(local_path, 'rb') as data:
        return sas_blob_utils.upload_blob(container_uri=container_uri,
                                          blob_name=blob_name,
                                          data=data)
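A usage sketch for upload_file_to_blob with placeholder values; the SAS token must grant write access to the container.

uploaded_uri = upload_file_to_blob(
    account_name='demoaccount',
    container_name='uploads',
    local_path='/tmp/photo.jpg',
    blob_name='photos/photo.jpg',
    sas_token='sv=2020-01-01&sig=FAKE')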
Example #4
def enumerate_blobs_to_file(
        output_file: Optional[str],
        account_name: str,
        container_name: str,
        sas_token: Optional[str] = None,
        blob_prefix: Optional[str] = None,
        blob_suffix: Optional[Union[str, Tuple[str, ...]]] = None,
        rsearch: Optional[str] = None,
        limit: Optional[int] = None
        ) -> List[str]:
    """
    Enumerates blobs in a container, and writes the blob names to an output
    file.

    Args:
        output_file: optional str, path to save the list of blob names in the
            container. If it ends in '.json', writes a JSON string; otherwise,
            writes a newline-delimited list. Can be None, in which case this is
            just a convenient wrapper for blob enumeration.
        account_name: str, Azure Storage account name
        container_name: str, Azure Blob Storage container name
        sas_token: optional str, container SAS token, leading ? will be removed if present.
        blob_prefix: optional str, returned results will only contain blob names
            with this prefix
        blob_suffix: optional str or tuple of str, returned results will only
            contain blob names with this/these suffix(es). The blob names will
            be lowercased first before comparing with the suffix(es).
        rsearch: optional str, returned results will only contain blob names
            that match this regex. Can also be a list of regexes, in which case
            blobs matching *any* of the regexes will be returned.
        limit: optional int, maximum number of blob names to list;
            if None, returns all blob names

    Returns: list of str, sorted blob names, of length limit or shorter.
    """
    if sas_token is not None and len(sas_token) > 9 and sas_token[0] == '?':
        sas_token = sas_token[1:]
        
    container_uri = sas_blob_utils.build_azure_storage_uri(
        account=account_name, container=container_name, sas_token=sas_token)
    matched_blobs = sas_blob_utils.list_blobs_in_container(
        container_uri=container_uri, blob_prefix=blob_prefix,
        blob_suffix=blob_suffix, rsearch=rsearch, limit=limit)
    if output_file is not None:
        write_list_to_file(output_file, matched_blobs)
    return matched_blobs
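A hedged usage sketch; the account, container, and SAS token below are placeholders. Because output_file ends in '.json', the matched blob names are written as a JSON list.

matched_blobs = enumerate_blobs_to_file(
    output_file='blobs_in_container.json',
    account_name='demoaccount',
    container_name='demo-container',
    sas_token='?sv=2020-01-01&sig=FAKE',  # leading '?' is stripped
    blob_prefix='cam01/',
    blob_suffix=('.jpg', '.jpeg'),
    limit=10000)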
def enumerate_blobs_to_file(output_file: str,
                            account_name: str,
                            container_name: str,
                            sas_token: Optional[str] = None,
                            blob_prefix: Optional[str] = None,
                            blob_suffix: Optional[Union[str,
                                                        Tuple[str, ...]]] = None,
                            rsearch: Optional[str] = None,
                            limit: Optional[int] = None) -> List[str]:
    """Enumerates blobs in a container, and writes the blob names to an output
    file.

    Args:
        output_file: str, path to save list of files in container
            If ends in '.json', writes a JSON string. Otherwise, writes a
            newline-delimited list
        account_name: str, Azure Storage account name
        container_name: str, Azure Blob Storage container name
        sas_token: optional str, container SAS token, does not start with '?'
        blob_prefix: optional str, returned results will only contain blob names
            with this prefix
        blob_suffix: optional str or tuple of str, returned results will only
            contain blob names with this/these suffix(es). The blob names will
            be lowercased first before comparing with the suffix(es).
        rsearch: optional str, returned results will only contain blob names
            that match this Python regex pattern at any point in the blob name.
            Use '^' character to only match from the beginning of the blob name.
        limit: optional int, maximum number of blob names to list;
            if None, returns all blob names

    Returns: list of str, sorted blob names, of length limit or shorter.
    """
    container_uri = sas_blob_utils.build_azure_storage_uri(
        account=account_name, container=container_name, sas_token=sas_token)
    matched_blobs = sas_blob_utils.list_blobs_in_container(
        container_uri=container_uri,
        blob_prefix=blob_prefix,
        blob_suffix=blob_suffix,
        rsearch=rsearch,
        limit=limit)
    write_list_to_file(output_file, matched_blobs)
    return matched_blobs
Example #6
def construct_url(img_path: str,
                  datasets_table: Mapping[str, Any],
                  dataset_name: Optional[str] = None) -> str:
    """Builds Azure SAS storage URL.

    Args:
        img_path: str, either <dataset_name>/<blob> (set dataset_name=None)
            or <img_file> without the 'path_prefix' from datasets_table
        datasets_table: dict, from MegaDB
        dataset_name: optional str

    Returns: str, URL with SAS token
    """
    if dataset_name is None:
        dataset_name, blob = img_path.split('/', maxsplit=1)
    else:
        blob = img_path
        path_prefix = datasets_table[dataset_name].get('path_prefix', '')
        if len(path_prefix) > 0:
            blob = path_prefix + '/' + blob

    sas_token = datasets_table[dataset_name]['container_sas_key']
    if sas_token[0] == '?':
        sas_token = sas_token[1:]

    url = build_azure_storage_uri(
        account=datasets_table[dataset_name]['storage_account'],
        container=datasets_table[dataset_name]['container'],
        blob=blob,
        sas_token=sas_token)

    # wiitigers Unicode issue - no good mapping from DB file names to file names
    # in the blob URL; DB names contain U+F028, which gets double-encoded in the URL
    if dataset_name == 'wiitigers' and '\uf028' in img_path:
        url = url.replace('%C3%AF%E2%82%AC%C2%A8', '%EF%80%A8')

    return url
def create_batch_job(job_id: str, body: dict):
    """
    This is the target to be run in a thread to submit a batch processing job and monitor progress
    """
    job_status_table = JobStatusTable()
    try:
        log.info(f'server_job, create_batch_job, job_id {job_id}, {body}')

        input_container_sas = body.get('input_container_sas', None)

        use_url = body.get('use_url', False)

        images_requested_json_sas = body.get('images_requested_json_sas', None)

        image_path_prefix = body.get('image_path_prefix', None)

        first_n = body.get('first_n', None)
        first_n = int(first_n) if first_n else None

        sample_n = body.get('sample_n', None)
        sample_n = int(sample_n) if sample_n else None

        model_version = body.get('model_version', '')
        if model_version == '':
            model_version = api_config.DEFAULT_MD_VERSION

        # request_name and request_submission_timestamp are for appending to
        # output file names
        job_name = body.get('request_name', '')  # in earlier versions we used "request" to mean a "job"
        job_submission_timestamp = get_utc_time()

        # image_paths can be a list of strings (Azure blob names or public URLs)
        # or a list of length-2 lists where each is a [image_id, metadata] pair

        # Case 1: listing all images in the container
        # - not possible to have attached metadata if listing images in a blob
        if images_requested_json_sas is None:
            log.info('server_job, create_batch_job, listing all images to process.')

            # list all images to process
            image_paths = sas_blob_utils.list_blobs_in_container(
                container_uri=input_container_sas,
                blob_prefix=image_path_prefix,  # check will be case-sensitive
                blob_suffix=api_config.IMAGE_SUFFIXES_ACCEPTED,  # check will be case-insensitive
                limit=api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB + 1
                # + 1 so if the number of images listed > MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB
                # we will know and not proceed
            )

        # Case 2: user supplied a list of images to process; can include metadata
        else:
            log.info('server_job, create_batch_job, using provided list of images.')
            output_stream, blob_properties = sas_blob_utils.download_blob_to_stream(images_requested_json_sas)
            image_paths = json.load(output_stream)
            log.info('server_job, create_batch_job, length of image_paths provided by the user: {}'.format(
                len(image_paths)))
            if len(image_paths) == 0:
                job_status = get_job_status(
                    'completed', '0 images found in provided list of images.')
                job_status_table.update_job_status(job_id, job_status)
                return

            error, metadata_available = validate_provided_image_paths(image_paths)
            if error is not None:
                msg = 'image paths provided in the json are not valid: {}'.format(error)
                raise ValueError(msg)

            # filter down to those conforming to the provided prefix and accepted suffixes (image file types)
            valid_image_paths = []
            for p in image_paths:
                locator = p[0] if metadata_available else p

                # prefix is case-sensitive; suffix is not
                if image_path_prefix is not None and not locator.startswith(image_path_prefix):
                    continue

                # Although urlparse(p).path preserves the extension on local paths, it will not work for
                # blob file names that contain "#", which will be treated as the start of a query string.
                # If the URL is generated via Azure Blob Storage, the "#" char will be properly encoded.
                path = urllib.parse.urlparse(locator).path if use_url else locator

                if path.lower().endswith(api_config.IMAGE_SUFFIXES_ACCEPTED):
                    valid_image_paths.append(p)
            image_paths = valid_image_paths
            log.info(('server_job, create_batch_job, length of image_paths provided by user, '
                      f'after filtering to jpg: {len(image_paths)}'))

        # apply the first_n and sample_n filters
        if first_n:
            assert first_n > 0, 'parameter first_n is 0.'
            # OK if first_n > total number of images
            image_paths = image_paths[:first_n]

        if sample_n:
            assert sample_n > 0, 'parameter sample_n is 0.'
            if sample_n > len(image_paths):
                msg = ('parameter sample_n specifies more images than '
                       'available (after filtering by other provided params).')
                raise ValueError(msg)

            # sample by shuffling image paths and take the first sample_n images
            log.info(f'First path before shuffling: {image_paths[0]}')
            shuffle(image_paths)
            log.info(f'First path after shuffling: {image_paths[0]}')
            image_paths = image_paths[:sample_n]

        num_images = len(image_paths)
        log.info(f'server_job, create_batch_job, num_images after applying all filters: {num_images}')

        if num_images < 1:
            job_status = get_job_status('completed', (
                'Zero images found in container or in provided list of images '
                'after filtering with the provided parameters.'))
            job_status_table.update_job_status(job_id, job_status)
            return
        if num_images > api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB:
            job_status = get_job_status(
                'failed',
                (f'The number of images ({num_images}) requested for processing exceeds the maximum '
                 f'accepted {api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB} in one call'))
            job_status_table.update_job_status(job_id, job_status)
            return

        # upload the image list to the container, which is also mounted on all nodes
        # all sharding and scoring use the uploaded list
        images_list_str_as_bytes = bytes(json.dumps(image_paths, ensure_ascii=False), encoding='utf-8')

        container_url = sas_blob_utils.build_azure_storage_uri(account=api_config.STORAGE_ACCOUNT_NAME,
                                                               container=api_config.STORAGE_CONTAINER_API)
        with ContainerClient.from_container_url(container_url,
                                                credential=api_config.STORAGE_ACCOUNT_KEY) as api_container_client:
            _ = api_container_client.upload_blob(
                name=f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/{job_id}_images.json',
                data=images_list_str_as_bytes)

        job_status = get_job_status('created', f'{num_images} images listed; submitting the job...')
        job_status_table.update_job_status(job_id, job_status)

    except Exception as e:
        job_status = get_job_status('failed', f'Error occurred while preparing the Batch job: {e}')
        job_status_table.update_job_status(job_id, job_status)
        log.error(f'server_job, create_batch_job, Error occurred while preparing the Batch job: {e}')
        return  # do not start monitoring

    try:
        batch_job_manager = BatchJobManager()

        model_rel_path = api_config.MD_VERSIONS_TO_REL_PATH[model_version]
        batch_job_manager.create_job(job_id,
                                     model_rel_path,
                                     input_container_sas,
                                     use_url)

        num_tasks, task_ids_failed_to_submit = batch_job_manager.submit_tasks(job_id, num_images)

        # now request_status moves from created to running
        job_status = get_job_status('running',
                                    (f'Submitted {num_images} images to cluster in {num_tasks} shards. '
                                     f'Number of shards failed to be submitted: {len(task_ids_failed_to_submit)}'))

        # an extra field to allow the monitoring thread to restart after an API restart: total number of tasks
        job_status['num_tasks'] = num_tasks
        # also record the number of images to process for reporting
        job_status['num_images'] = num_images

        job_status_table.update_job_status(job_id, job_status)
    except Exception as e:
        job_status = get_job_status('problem', f'Please contact us. Error occurred while submitting the Batch job: {e}')
        job_status_table.update_job_status(job_id, job_status)
        log.error(f'server_job, create_batch_job, Error occurred while submitting the Batch job: {e}')
        return

    # start the monitor thread with the same name
    try:
        thread = threading.Thread(
            target=monitor_batch_job,
            name=f'job_{job_id}',
            kwargs={
                'job_id': job_id,
                'num_tasks': num_tasks,
                'model_version': model_version,
                'job_name': job_name,
                'job_submission_timestamp': job_submission_timestamp
            }
        )
        thread.start()
    except Exception as e:
        job_status = get_job_status('problem', f'Error occurred while starting the monitoring thread: {e}')
        job_status_table.update_job_status(job_id, job_status)
        log.error(f'server_job, create_batch_job, Error occurred while starting the monitoring thread: {e}')
        return
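For reference, a hypothetical request body showing the fields that create_batch_job reads from body; every value below is a placeholder.

example_body = {
    'input_container_sas': 'https://demoaccount.blob.core.windows.net/images?sv=2020-01-01&sig=FAKE',
    'use_url': False,
    'images_requested_json_sas': None,  # None: list every image in the container
    'image_path_prefix': 'cam01/',
    'first_n': None,
    'sample_n': 2000,
    'model_version': '',                # '' falls back to api_config.DEFAULT_MD_VERSION
    'request_name': 'demo_survey'
}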
def aggregate_results(job_id: str,
                      model_version: str,
                      job_name: str,
                      job_submission_timestamp: str) -> str:
    log.info(f'server_job, aggregate_results starting, job_id: {job_id}')

    container_url = sas_blob_utils.build_azure_storage_uri(account=api_config.STORAGE_ACCOUNT_NAME,
                                                           container=api_config.STORAGE_CONTAINER_API)
    # when people download this, the timestamp will have : replaced by _
    output_file_path = f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/{job_id}_detections_{job_name}_{job_submission_timestamp}.json'

    with ContainerClient.from_container_url(container_url,
                                            credential=api_config.STORAGE_ACCOUNT_KEY) as container_client:
        # check if the result blob has already been written (could be another instance of the API / worker thread)
        # and if so, skip aggregating and uploading the results, and just generate the SAS URL, which
        # could be needed still if the previous request_status was `problem`.
        blob_client = container_client.get_blob_client(output_file_path)
        if blob_client.exists():
            log.warning(f'The output file already exists, likely because another monitoring thread already wrote it.')
        else:
            task_outputs_dir = f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_outputs/'
            generator = container_client.list_blobs(name_starts_with=task_outputs_dir)

            blobs = [i for i in generator if i.name.endswith('.json')]

            all_results = []
            for blob_props in tqdm(blobs):
                with container_client.get_blob_client(blob_props) as blob_client:
                    stream = io.BytesIO()
                    blob_client.download_blob().readinto(stream)
                    stream.seek(0)
                    task_results = json.load(stream)
                    all_results.extend(task_results)

            api_output = {
                'info': {
                    'detector': f'megadetector_v{model_version}',
                    'detection_completion_time': get_utc_time(),
                    'format_version': api_config.OUTPUT_FORMAT_VERSION
                },
                'detection_categories': api_config.DETECTOR_LABEL_MAP,
                'images': all_results
            }

            # upload the output JSON to the Job folder
            api_output_as_bytes = bytes(json.dumps(api_output, ensure_ascii=False, indent=1), encoding='utf-8')
            _ = container_client.upload_blob(name=output_file_path, data=api_output_as_bytes)

    output_sas = generate_blob_sas(
        account_name=api_config.STORAGE_ACCOUNT_NAME,
        container_name=api_config.STORAGE_CONTAINER_API,
        blob_name=output_file_path,
        account_key=api_config.STORAGE_ACCOUNT_KEY,
        permission=BlobSasPermissions(read=True, write=False),
        expiry=datetime.utcnow() + timedelta(days=api_config.OUTPUT_SAS_EXPIRATION_DAYS)
    )
    output_sas_url = sas_blob_utils.build_azure_storage_uri(
        account=api_config.STORAGE_ACCOUNT_NAME,
        container=api_config.STORAGE_CONTAINER_API,
        blob=output_file_path,
        sas_token=output_sas
    )
    log.info(f'server_job, aggregate_results done, job_id: {job_id}')
    log.info(f'output_sas_url: {output_sas_url}')
    return output_sas_url
#
# Also available at the /supported_model_versions and /default_model_version
# endpoints
#
# Unless you have any specific reason to set this to a non-default value, leave
# it at the default, which as of 2020.04.28 is MegaDetector 4.1
#
# additional_task_args = {"model_version":"4_prelim"}
#

#%% Derived variables, path setup

assert len(folder_names) != 0

read_only_sas_url = sas_blob_utils.build_azure_storage_uri(
    account=storage_account_name,
    container=container_name,
    sas_token=read_only_sas_token)
write_sas_url = sas_blob_utils.build_azure_storage_uri(
    account=storage_account_name,
    container=container_name,
    sas_token=read_write_sas_token)

# local folders
filename_base = os.path.join(base_output_folder_name, base_task_name)
raw_api_output_folder = os.path.join(filename_base, 'raw_api_outputs')
combined_api_output_folder = os.path.join(filename_base,
                                          'combined_api_outputs')
postprocessing_output_folder = os.path.join(filename_base, 'postprocessing')

os.makedirs(filename_base, exist_ok=True)
os.makedirs(raw_api_output_folder, exist_ok=True)
output_files = {}

pbar = tqdm(js.items())
for img_path, img_info in pbar:
    save_path = os.path.join(images_dir, img_path)
    if os.path.exists(save_path):
        continue

    ds, img_file = img_path.split('/', maxsplit=1)
    if ds not in output_files:
        output_path = os.path.join(output_dir, f'{ds}_images.txt')
        output_files[ds] = open(output_path, 'w')

        dataset_info = datasets_table[ds]
        account = dataset_info['storage_account']
        container = dataset_info['container']

        if 'public' in datasets_table[ds]['access']:
            url = sas_blob_utils.build_azure_storage_uri(account, container)
        else:
            url = sas_blob_utils.build_azure_storage_uri(
                account,
                container,
                sas_token=dataset_info['container_sas_key'][1:])
        pbar.write(f'"{url}"')

    output_files[ds].write(img_file + '\n')

for f in output_files.values():
    f.close()
Example #11
def aggregate_results(job_id, model_version, job_name,
                      job_submission_timestamp):
    log.info(f'server_job, aggregate_results starting, job_id: {job_id}')

    task_outputs_dir = f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_outputs/'

    container_url = sas_blob_utils.build_azure_storage_uri(
        account=api_config.STORAGE_ACCOUNT_NAME,
        container=api_config.STORAGE_CONTAINER_API)

    all_results = []

    with ContainerClient.from_container_url(
            container_url,
            credential=api_config.STORAGE_ACCOUNT_KEY) as container_client:
        generator = container_client.list_blobs(
            name_starts_with=task_outputs_dir)

        blobs = [i for i in generator if i.name.endswith('.json')]

        for blob_props in tqdm(blobs):
            with container_client.get_blob_client(blob_props) as blob_client:
                stream = io.BytesIO()
                blob_client.download_blob().readinto(stream)
                stream.seek(0)
                task_results = json.load(stream)
                all_results.extend(task_results)

        api_output = {
            'info': {
                'detector': f'megadetector_v{model_version}',
                'detection_completion_time': get_utc_time(),
                'format_version': api_config.OUTPUT_FORMAT_VERSION
            },
            'detection_categories': api_config.DETECTOR_LABEL_MAP,
            'images': all_results
        }

        # upload the output JSON to the Job folder
        api_output_as_bytes = bytes(json.dumps(api_output,
                                               ensure_ascii=False,
                                               indent=1),
                                    encoding='utf-8')
        output_file_path = f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/{job_id}_detections_{job_name}_{job_submission_timestamp}.json'
        _ = container_client.upload_blob(name=output_file_path,
                                         data=api_output_as_bytes)

    output_sas = generate_blob_sas(
        account_name=api_config.STORAGE_ACCOUNT_NAME,
        container_name=api_config.STORAGE_CONTAINER_API,
        blob_name=output_file_path,
        account_key=api_config.STORAGE_ACCOUNT_KEY,
        permission=BlobSasPermissions(read=True, write=False),
        expiry=datetime.utcnow() +
        timedelta(days=api_config.OUTPUT_SAS_EXPIRATION_DAYS))
    output_sas_url = sas_blob_utils.build_azure_storage_uri(
        account=api_config.STORAGE_ACCOUNT_NAME,
        container=api_config.STORAGE_CONTAINER_API,
        blob=output_file_path,
        sas_token=output_sas)
    log.info(f'server_job, aggregate_results done, job_id: {job_id}')
    log.info(f'output_sas_url: {output_sas_url}')
    return output_sas_url
def download_and_crop(
        queried_images_json: Mapping[str, Mapping[str, Any]],
        detection_cache: Mapping[str, Mapping[str, Mapping[str, Any]]],
        detection_categories: Mapping[str, str],
        detector_version: str,
        cropped_images_dir: str,
        confidence_threshold: float,
        save_full_images: bool,
        square_crops: bool,
        check_crops_valid: bool,
        images_dir: Optional[str] = None,
        threads: int = 1,
        images_missing_detections: Optional[Iterable[str]] = None
        ) -> Tuple[List[str], int, int]:
    """
    Saves crops to a file with the same name as the original image with an
    additional suffix appended, starting with 3 underscores:
    - if image has ground truth bboxes: "___cropXX.jpg", where "XX" indicates
        the bounding box index
    - if image has bboxes from MegaDetector: "___cropXX_mdvY.Y.jpg", where
        "Y.Y" indicates the MegaDetector version
    See module docstring for more info and examples.

    Note: this function is very similar to the "download_and_crop()" function in
        crop_detections.py. The main difference is that this function uses
        MegaDB to look up Azure Storage container information for images based
        on the dataset, whereas the crop_detections.py version has no concept
        of a "dataset" and "ground-truth" bounding boxes from MegaDB.

    Args:
        queried_images_json: dict, represents JSON output of json_validator.py,
            all images in queried_images_json are assumed to have either ground
            truth or cached detected bounding boxes unless
            images_missing_detections is given
        detection_cache: dict, dataset_name => {img_path => detection_dict}
        detection_categories: dict, maps detection category ID (str) to
            category name
        detector_version: str, detector version string, e.g., '4.1'
        cropped_images_dir: str, path to folder where cropped images are saved
        confidence_threshold: float, only crop bounding boxes above this value
        save_full_images: bool, whether to save downloaded images to images_dir,
            images_dir must be given and must exist if save_full_images=True
        square_crops: bool, whether to crop bounding boxes as squares
        check_crops_valid: bool, whether to load each crop to ensure the file is
            valid (i.e., not truncated)
        images_dir: optional str, path to folder where full images are saved
        threads: int, number of threads to use for downloading images
        images_missing_detections: optional list of str, image files to skip
            because they have no ground truth or cached detected bounding boxes

    Returns: tuple of (images_failed, total_downloads, total_new_crops), where
        images_failed is a list of str, images with bounding boxes that failed
        to download or crop properly, and total_downloads and total_new_crops
        count downloaded images and newly saved crops
    """
    # error checking before we download and crop any images
    valid_img_paths = set(queried_images_json.keys())
    if images_missing_detections is not None:
        valid_img_paths -= set(images_missing_detections)
    for img_path in valid_img_paths:
        info_dict = queried_images_json[img_path]
        ds, img_file = img_path.split('/', maxsplit=1)
        assert ds == info_dict['dataset']

        if 'bbox' in info_dict:  # ground-truth bounding boxes
            pass
        elif img_file in detection_cache[ds]:  # detected bounding boxes
            bbox_dicts = detection_cache[ds][img_file]['detections']
            assert all('conf' in bbox_dict for bbox_dict in bbox_dicts)
            # convert from category ID to category name
            for d in bbox_dicts:
                d['category'] = detection_categories[d['category']]
        else:
            raise ValueError(f'{img_path} has no ground truth bounding boxes '
                             'and was not found in the detection cache. Please '
                             'include it in images_missing_detections.')

    # we need the datasets table for getting SAS keys
    datasets_table = megadb_utils.MegadbUtils().get_datasets_table()
    container_clients = {}  # dataset name => ContainerClient

    pool = futures.ThreadPoolExecutor(max_workers=threads)
    future_to_img_path = {}
    images_failed_download = []

    print(f'Getting bbox info for {len(valid_img_paths)} images...')
    for img_path in tqdm(sorted(valid_img_paths)):
        # we already did all error checking above, so we don't do any here
        info_dict = queried_images_json[img_path]
        ds, img_file = img_path.split('/', maxsplit=1)

        # get ContainerClient
        if ds not in container_clients:
            sas_token = datasets_table[ds]['container_sas_key']
            if sas_token[0] == '?':
                sas_token = sas_token[1:]
            url = sas_blob_utils.build_azure_storage_uri(
                account=datasets_table[ds]['storage_account'],
                container=datasets_table[ds]['container'],
                sas_token=sas_token)
            container_clients[ds] = ContainerClient.from_container_url(url)
        container_client = container_clients[ds]

        # get bounding boxes
        # we must include the dataset <ds> in <crop_path_template> because
        #    '{img_path}' actually gets populated with <img_file> in
        #    load_and_crop()
        is_ground_truth = ('bbox' in info_dict)
        if is_ground_truth:  # ground-truth bounding boxes
            bbox_dicts = info_dict['bbox']
            crop_path_template = os.path.join(
                cropped_images_dir, ds, '{img_path}___crop{n:>02d}.jpg')
        else:  # detected bounding boxes
            bbox_dicts = detection_cache[ds][img_file]['detections']
            crop_path_template = os.path.join(
                cropped_images_dir, ds,
                '{img_path}___crop{n:>02d}_' + f'mdv{detector_version}.jpg')

        ds_dir = None if images_dir is None else os.path.join(images_dir, ds)

        # get the image, either from disk or from Blob Storage
        future = pool.submit(
            load_and_crop, img_file, ds_dir, container_client, bbox_dicts,
            confidence_threshold, crop_path_template, save_full_images,
            square_crops, check_crops_valid)
        future_to_img_path[future] = img_path

    total = len(future_to_img_path)
    total_downloads = 0
    total_new_crops = 0
    print(f'Reading/downloading {total} images and cropping...')
    for future in tqdm(futures.as_completed(future_to_img_path), total=total):
        img_path = future_to_img_path[future]
        try:
            did_download, num_new_crops = future.result()
            total_downloads += did_download
            total_new_crops += num_new_crops
        except Exception as e:  # pylint: disable=broad-except
            exception_type = type(e).__name__
            tqdm.write(f'{img_path} - generated {exception_type}: {e}')
            images_failed_download.append(img_path)

    pool.shutdown()
    for container_client in container_clients.values():
        # inelegant way to close the container_clients
        with container_client:
            pass

    print(f'Downloaded {total_downloads} images.')
    print(f'Made {total_new_crops} new crops.')
    return images_failed_download, total_downloads, total_new_crops
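A sketch of the shapes download_and_crop expects for detection_cache and detection_categories, based on the Args description and the category-ID-to-name conversion above; the keys and values here are hypothetical.

detection_cache = {
    'demo_dataset': {
        'cam01/0001.jpg': {
            'detections': [
                # each detection dict has at least 'category' and 'conf';
                # the box coordinates key is assumed here
                {'category': '1', 'conf': 0.97, 'bbox': [0.1, 0.2, 0.3, 0.4]}
            ]
        }
    }
}
detection_categories = {'1': 'animal', '2': 'person', '3': 'vehicle'}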
def submit_batch_detection_api(images_to_detect: Iterable[str],
                               task_lists_dir: str,
                               detector_version: str,
                               account: str,
                               container: str,
                               sas_token: str,
                               caller: str,
                               batch_detection_api_url: str,
                               resume_file_path: str
                               ) -> Dict[str, List[Task]]:
    """
    Args:
        images_to_detect: list of str, image paths with the format
            <dataset-name>/<image-filename>
        task_lists_dir: str, path to local directory for saving JSON files
            each containing a list of image URLs corresponding to an API task
        detector_version: str, MegaDetector version string, e.g., '4.1',
            see {batch_detection_api_url}/supported_model_versions
        account: str, Azure Storage account name
        container: str, Azure Blob Storage container name, where the task lists
            will be uploaded
        sas_token: str, SAS token with write permissions for the container
        caller: str, allow-listed caller
        batch_detection_api_url: str, URL to batch detection API
        resume_file_path: str, path to save resume file

    Returns: dict, maps str dataset name to list of Task objects
    """
    filtered_images_to_detect = [
        x for x in images_to_detect if path_utils.is_image_file(x)]
    not_images = set(images_to_detect) - set(filtered_images_to_detect)
    if len(not_images) == 0:
        print('Good! All image files have valid file extensions.')
    else:
        print(f'Skipping {len(not_images)} files with non-image extensions:')
        pprint.pprint(sorted(not_images))
    images_to_detect = filtered_images_to_detect

    datasets_table = megadb_utils.MegadbUtils().get_datasets_table()

    images_by_dataset = split_images_list_by_dataset(images_to_detect)
    tasks_by_dataset = {}
    for dataset, image_paths in images_by_dataset.items():
        # get SAS URL for images container
        images_sas_token = datasets_table[dataset]['container_sas_key']
        if images_sas_token[0] == '?':
            images_sas_token = images_sas_token[1:]
        images_container_url = sas_blob_utils.build_azure_storage_uri(
            account=datasets_table[dataset]['storage_account'],
            container=datasets_table[dataset]['container'],
            sas_token=images_sas_token)

        # strip image paths of dataset name
        image_blob_names = [path[path.find('/') + 1:] for path in image_paths]

        tasks_by_dataset[dataset] = submit_batch_detection_api_by_dataset(
            dataset=dataset,
            image_blob_names=image_blob_names,
            images_container_url=images_container_url,
            task_lists_dir=task_lists_dir,
            detector_version=detector_version,
            account=account, container=container, sas_token=sas_token,
            caller=caller, batch_detection_api_url=batch_detection_api_url)

    # save list of dataset names and task IDs for resuming
    resume_json = [
        {
            'dataset': dataset,
            'task_name': task.name,
            'task_id': task.id,
            'local_images_list_path': task.local_images_list_path
        }
        for dataset in tasks_by_dataset
        for task in tasks_by_dataset[dataset]
    ]
    with open(resume_file_path, 'w') as f:
        json.dump(resume_json, f, indent=1)
    return tasks_by_dataset
Example #14
    def submit_tasks(self, job_id: str, num_images: int) -> Tuple[int, list]:
        """
        Shard the images and submit each shard as a Task under the Job pointed to by this job_id
        Args:
            job_id: ID of the Batch Job to submit the tasks to
            num_images: total number of images to be processed in this Job

        Returns:
            num_tasks: total number of Tasks that should be in this Job
            task_ids_failed_to_submit: which Tasks from the above failed to be submitted
        """
        log.info('BatchJobManager, submit_tasks')

        # cannot execute the scoring script that is in the mounted directory; it has to be copied to the cwd
        # no luck passing the command-line arguments via a formatted string - set them as env vars instead
        score_command = '/bin/bash -c \"cp $AZ_BATCH_NODE_MOUNTS_DIR/batch-api/scripts/score.py . && python score.py\" '

        num_images_per_task = api_config.NUM_IMAGES_PER_TASK

        # form shards of images and assign each shard to a Task
        num_tasks = math.ceil(num_images / num_images_per_task)

        # for persisting stdout and stderr
        permissions = ContainerSasPermissions(read=True, write=True, list=True)
        access_duration_hrs = api_config.MONITOR_PERIOD_MINUTES * api_config.MAX_MONITOR_CYCLES / 60
        container_sas_token = generate_container_sas(
            account_name=api_config.STORAGE_ACCOUNT_NAME,
            container_name=api_config.STORAGE_CONTAINER_API,
            account_key=api_config.STORAGE_ACCOUNT_KEY,
            permission=permissions,
            expiry=datetime.utcnow() + timedelta(hours=access_duration_hrs))
        container_sas_url = sas_blob_utils.build_azure_storage_uri(
            account=api_config.STORAGE_ACCOUNT_NAME,
            container=api_config.STORAGE_CONTAINER_API,
            sas_token=container_sas_token)

        tasks = []
        for task_id in range(num_tasks):
            begin_index = task_id * num_images_per_task
            end_index = begin_index + num_images_per_task

            # persist stdout and stderr (will be removed when node removed)
            # paths are relative to the Task working directory
            stderr_destination = OutputFileDestination(
                container=OutputFileBlobContainerDestination(
                    container_url=container_sas_url,
                    path=
                    f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_logs/job_{job_id}_task_{task_id}_stderr.txt'
                ))
            stdout_destination = OutputFileDestination(
                container=OutputFileBlobContainerDestination(
                    container_url=container_sas_url,
                    path=
                    f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/task_logs/job_{job_id}_task_{task_id}_stdout.txt'
                ))
            std_err_and_out = [
                OutputFile(
                    file_pattern=
                    '../stderr.txt',  # stderr.txt is at the same level as wd
                    destination=stderr_destination,
                    upload_options=OutputFileUploadOptions(
                        upload_condition=OutputFileUploadCondition.
                        task_completion)
                    # can also just upload on failure
                ),
                OutputFile(file_pattern='../stdout.txt',
                           destination=stdout_destination,
                           upload_options=OutputFileUploadOptions(
                               upload_condition=OutputFileUploadCondition.
                               task_completion))
            ]

            task = TaskAddParameter(
                id=str(task_id),
                command_line=score_command,
                container_settings=TaskContainerSettings(
                    image_name=api_config.CONTAINER_IMAGE_NAME,
                    working_directory='taskWorkingDirectory'),
                environment_settings=[
                    EnvironmentSetting(name='TASK_BEGIN_INDEX',
                                       value=begin_index),
                    EnvironmentSetting(name='TASK_END_INDEX', value=end_index),
                ],
                output_files=std_err_and_out)
            tasks.append(task)

        # first try submitting Tasks
        task_ids_failed_to_submit = self._create_tasks(
            job_id, tasks, api_config.NUM_TASKS_PER_SUBMISSION, 1)

        # retry submitting Tasks
        if len(task_ids_failed_to_submit) > 0:
            task_ids_failed_to_submit_set = set(task_ids_failed_to_submit)
            tasks_to_retry = [
                t for t in tasks if t.id in task_ids_failed_to_submit_set
            ]
            task_ids_failed_to_submit = self._create_tasks(
                job_id, tasks_to_retry, api_config.NUM_TASKS_PER_RESUBMISSION,
                2)

            if len(task_ids_failed_to_submit) > 0:
                log.info(
                    'BatchJobManager, submit_tasks, after retry, '
                    f'len of task_ids_failed_to_submit: {len(task_ids_failed_to_submit)}'
                )
            else:
                log.info(
                    'BatchJobManager, submit_tasks, after retry, all Tasks submitted'
                )
        else:
            log.info(
                'BatchJobManager, submit_tasks, all Tasks submitted after first try'
            )

        # Change the Job's on_all_tasks_complete option to 'terminateJob' so the Job's status changes automatically
        # after all submitted tasks are done
        # This is so that we do not take up the quota for active Jobs in the Batch account.
        job_patch_params = JobPatchParameter(
            on_all_tasks_complete=OnAllTasksComplete.terminate_job)
        self.batch_client.job.patch(job_id, job_patch_params)

        return num_tasks, task_ids_failed_to_submit
def check_image_condition(img_path: str,
                          truncated_images_lock: threading.Lock,
                          account: Optional[str] = None,
                          container: Optional[str] = None,
                          sas_token: Optional[str] = None,
                          datasets_table: Optional[Mapping[str, Any]] = None
                          ) -> Tuple[str, str]:
    """
    Args:
        img_path: str, either <blob_name> if datasets_table is None, or
            <dataset>/<blob_name> if datasets_table is given
        truncated_images_lock: threading.Lock, used to serialize changes to
            ImageFile.LOAD_TRUNCATED_IMAGES across threads
        account: str, name of Azure Blob Storage account
        container: str, name of Azure Blob Storage container
        sas_token: str, optional SAS token (without leading '?') if the
            container is not publicly accessible
        datasets_table: dict, maps dataset name to dict of information

    Returns: (img_file, status) tuple, where status is one of
        'nonexistant': blob does not exist in the container
        'non_image': img_file does not have valid file extension
        'good': image exists and is able to be opened without setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
        'truncated': image exists but can only be opened by setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
        'bad': image exists, but cannot be opened even when setting
            ImageFile.LOAD_TRUNCATED_IMAGES=True
    """
    if (account is None) or (container is None) or (datasets_table is not None):
        assert account is None
        assert container is None
        assert sas_token is None
        assert datasets_table is not None

        dataset, img_file = img_path.split('/', maxsplit=1)
        account = datasets_table[dataset]['storage_account']
        container = datasets_table[dataset]['container']
        sas_token = datasets_table[dataset]['container_sas_key']
        if sas_token[0] == '?':  # strip leading '?' from SAS token
            sas_token = sas_token[1:]
    else:
        img_file = img_path

    if not path_utils.is_image_file(img_file):
        return img_file, 'non_image'

    blob_url = sas_blob_utils.build_azure_storage_uri(
        account=account, container=container, sas_token=sas_token,
        blob=img_file)
    blob_exists = sas_blob_utils.check_blob_exists(blob_url)
    if not blob_exists:
        return img_file, 'nonexistant'

    stream, _ = sas_blob_utils.download_blob_to_stream(blob_url)
    stream.seek(0)
    try:
        with truncated_images_lock:
            ImageFile.LOAD_TRUNCATED_IMAGES = False
            with Image.open(stream) as img:
                img.load()
        return img_file, 'good'
    except OSError as e:  # PIL.UnidentifiedImageError is a subclass of OSError
        try:
            stream.seek(0)
            with truncated_images_lock:
                ImageFile.LOAD_TRUNCATED_IMAGES = True
                with Image.open(stream) as img:
                    img.load()
            return img_file, 'truncated'
        except Exception as e:  # pylint: disable=broad-except
            exception_type = type(e).__name__
            tqdm.write(f'Unable to load {img_file}. {exception_type}: {e}.')
            return img_file, 'bad'
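A hedged usage sketch for check_image_condition; the call downloads the blob, so real MegaDB and storage credentials are required, and the image path below is hypothetical.

truncated_images_lock = threading.Lock()
datasets_table = megadb_utils.MegadbUtils().get_datasets_table()
img_file, status = check_image_condition(
    'demo_dataset/cam01/0001.jpg',
    truncated_images_lock,
    datasets_table=datasets_table)
# status is one of 'nonexistant', 'non_image', 'good', 'truncated', 'bad'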