Example #1
    def test_list_blobs_in_container(self):
        blobs_list = list_blobs_in_container(PUBLIC_ZIPPED_CONTAINER_URI,
                                             limit=100)
        expected = sorted([
            'wcs_20200403_bboxes.json.zip', 'wcs_camera_traps.json.zip',
            'wcs_camera_traps_00.zip', 'wcs_camera_traps_01.zip',
            'wcs_camera_traps_02.zip', 'wcs_camera_traps_03.zip',
            'wcs_camera_traps_04.zip', 'wcs_camera_traps_05.zip',
            'wcs_camera_traps_06.zip', 'wcs_specieslist.csv', 'wcs_splits.json'
        ])
        self.assertEqual(blobs_list, expected)

        blobs_list = list_blobs_in_container(PUBLIC_ZIPPED_CONTAINER_URI,
                                             rsearch=r'_\d[0-3]\.zip')
        expected = sorted([
            'wcs_camera_traps_00.zip', 'wcs_camera_traps_01.zip',
            'wcs_camera_traps_02.zip', 'wcs_camera_traps_03.zip'
        ])
        self.assertEqual(blobs_list, expected)
Example #2
    def test_generate_writable_container_sas(self):
        self.needs_cleanup = True
        new_sas_uri = generate_writable_container_sas(
            account_name=PRIVATE_ACCOUNT_NAME,
            account_key=PRIVATE_ACCOUNT_KEY,
            container_name=PRIVATE_CONTAINER_NAME,
            access_duration_hrs=1,
            account_url=PRIVATE_ACCOUNT_URI)
        self.assertTrue(isinstance(new_sas_uri, str))
        self.assertNotEqual(new_sas_uri, '')
        self.assertEqual(len(list_blobs_in_container(new_sas_uri)), 0)
Example #3
    def test_generate_writable_container_sas(self):
        # until the private emulated account is able to work, skip this test
        self.skipTest('skipping private account tests for now')

        self.needs_cleanup = True
        new_sas_uri = generate_writable_container_sas(
            account_name=PRIVATE_ACCOUNT_NAME,
            account_key=PRIVATE_ACCOUNT_KEY,
            container_name=PRIVATE_CONTAINER_NAME,
            access_duration_hrs=1,
            account_url=PRIVATE_ACCOUNT_URI)
        self.assertTrue(isinstance(new_sas_uri, str))
        self.assertNotEqual(new_sas_uri, '')
        self.assertEqual(len(list_blobs_in_container(new_sas_uri)), 0)
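
For orientation, here is a minimal sketch of what a helper like generate_writable_container_sas could look like on top of the azure-storage-blob v12 SDK. This is an assumption for illustration, not the implementation under test; it presumes the target container already exists and that the SAS is signed with the storage account key.

from datetime import datetime, timedelta
from typing import Optional

from azure.storage.blob import ContainerSasPermissions, generate_container_sas


def generate_writable_container_sas(account_name: str,
                                    account_key: str,
                                    container_name: str,
                                    access_duration_hrs: float,
                                    account_url: Optional[str] = None) -> str:
    # Sketch only: build a container URI carrying a read/write/list SAS that
    # expires after access_duration_hrs hours. Assumes the container exists.
    if account_url is None:
        account_url = f'https://{account_name}.blob.core.windows.net'
    sas_token = generate_container_sas(
        account_name=account_name,
        container_name=container_name,
        account_key=account_key,
        permission=ContainerSasPermissions(read=True, write=True, list=True),
        expiry=datetime.utcnow() + timedelta(hours=access_duration_hrs))
    return f'{account_url}/{container_name}?{sas_token}'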
Example #4
def enumerate_blobs_to_file(
        output_file: str,
        account_name: str,
        container_name: str,
        sas_token: Optional[str] = None,
        blob_prefix: Optional[str] = None,
        blob_suffix: Optional[Union[str, Tuple[str]]] = None,
        rsearch: Optional[str] = None,
        limit: Optional[int] = None
        ) -> List[str]:
    """
    Enumerates blobs in a container, and writes the blob names to an output
    file.

    Args:
        output_file: str, path to save list of files in container
            If ends in '.json', writes a JSON string. Otherwise, writes a
            newline-delimited list. Can be None, in which case this is just a 
            convenient wrapper for blob enumeration.
        account_name: str, Azure Storage account name
        container_name: str, Azure Blob Storage container name
        sas_token: optional str, container SAS token; a leading '?' will be
            removed if present
        blob_prefix: optional str, returned results will only contain blob names
            with this prefix
        blob_suffix: optional str or tuple of str, returned results will only
            contain blob names with this/these suffix(es). The blob names will
            be lowercased first before comparing with the suffix(es).
        rsearch: optional str, returned results will only contain blob names
            that match this regex. Can also be a list of regexes, in which case
            blobs matching *any* of the regexes will be returned.
        limit: optional int, maximum number of blob names to list;
            if None, returns all blob names

    Returns: list of str, sorted blob names, of length limit or shorter.
    """
    # strip a leading '?' from the SAS token if present
    if sas_token is not None and len(sas_token) > 9 and sas_token[0] == '?':
        sas_token = sas_token[1:]

    container_uri = sas_blob_utils.build_azure_storage_uri(
        account=account_name, container=container_name, sas_token=sas_token)
    matched_blobs = sas_blob_utils.list_blobs_in_container(
        container_uri=container_uri, blob_prefix=blob_prefix,
        blob_suffix=blob_suffix, rsearch=rsearch, limit=limit)
    if output_file is not None:
        write_list_to_file(output_file, matched_blobs)
    return matched_blobs
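
A hedged usage sketch for this function follows; the account name, container name, SAS token, and file names are placeholders, not values taken from the original code.

# Hypothetical call: list up to 1000 .zip blobs under a prefix and write the
# names, one per line, to a text file (a '.json' path would write JSON instead).
matched = enumerate_blobs_to_file(
    output_file='zipped_blobs.txt',
    account_name='myaccount',                # placeholder account name
    container_name='mycontainer',            # placeholder container name
    sas_token='?sv=2019-12-12&ss=b&...',     # leading '?' is stripped if present
    blob_prefix='wcs_camera_traps_',
    blob_suffix='.zip',
    limit=1000)
print(f'{len(matched)} matching blobs written')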
Example #5
def enumerate_blobs_to_file(output_file: str,
                            account_name: str,
                            container_name: str,
                            sas_token: Optional[str] = None,
                            blob_prefix: Optional[str] = None,
                            blob_suffix: Optional[Union[str,
                                                        Tuple[str]]] = None,
                            rsearch: Optional[str] = None,
                            limit: Optional[int] = None) -> List[str]:
    """Enumerates blobs in a container, and writes the blob names to an output
    file.

    Args:
        output_file: str, path to save list of files in container
            If ends in '.json', writes a JSON string. Otherwise, writes a
            newline-delimited list
        account_name: str, Azure Storage account name
        container_name: str, Azure Blob Storage container name
        sas_token: optional str, container SAS token, does not start with '?'
        blob_prefix: optional str, returned results will only contain blob names
            with this prefix
        blob_suffix: optional str or tuple of str, returned results will only
            contain blob names with this/these suffix(es). The blob names will
            be lowercased first before comparing with the suffix(es).
        rsearch: optional str, returned results will only contain blob names
            that match this Python regex pattern at any point in the blob name.
            Use '^' character to only match from the beginning of the blob name.
        limit: optional int, maximum number of blob names to list;
            if None, returns all blob names

    Returns: list of str, sorted blob names, of length limit or shorter.
    """
    container_uri = sas_blob_utils.build_azure_storage_uri(
        account=account_name, container=container_name, sas_token=sas_token)
    matched_blobs = sas_blob_utils.list_blobs_in_container(
        container_uri=container_uri,
        blob_prefix=blob_prefix,
        blob_suffix=blob_suffix,
        rsearch=rsearch,
        limit=limit)
    write_list_to_file(output_file, matched_blobs)
    return matched_blobs
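
Both versions of enumerate_blobs_to_file delegate output to write_list_to_file, which is not shown in these examples. Below is a minimal sketch of what such a helper could look like, assuming only the behavior the docstring describes: a '.json' path gets a JSON dump, anything else gets newline-delimited names.

import json
from typing import Iterable


def write_list_to_file(output_file: str, strings: Iterable[str]) -> None:
    # Sketch only: JSON output for '.json' paths, newline-delimited otherwise.
    with open(output_file, 'w') as f:
        if output_file.endswith('.json'):
            json.dump(list(strings), f, indent=1)
        else:
            f.write('\n'.join(strings))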
Example #6
def create_batch_job(job_id: str, body: dict):
    """
    This is the target to be run in a thread to submit a batch processing job
    and monitor progress.
    """
    job_status_table = JobStatusTable()
    try:
        log.info(f'server_job, create_batch_job, job_id {job_id}, {body}')

        input_container_sas = body.get('input_container_sas', None)

        use_url = body.get('use_url', False)

        images_requested_json_sas = body.get('images_requested_json_sas', None)

        image_path_prefix = body.get('image_path_prefix', None)

        first_n = body.get('first_n', None)
        first_n = int(first_n) if first_n else None

        sample_n = body.get('sample_n', None)
        sample_n = int(sample_n) if sample_n else None

        model_version = body.get('model_version', '')
        if model_version == '':
            model_version = api_config.DEFAULT_MD_VERSION

        # request_name and request_submission_timestamp are for appending to
        # output file names
        job_name = body.get('request_name', '')  # in earlier versions we used "request" to mean a "job"
        job_submission_timestamp = get_utc_time()

        # image_paths can be a list of strings (Azure blob names or public URLs)
        # or a list of length-2 lists where each is a [image_id, metadata] pair

        # Case 1: listing all images in the container
        # - not possible to have attached metadata if listing images in a blob
        if images_requested_json_sas is None:
            log.info('server_job, create_batch_job, listing all images to process.')

            # list all images to process
            image_paths = sas_blob_utils.list_blobs_in_container(
                container_uri=input_container_sas,
                blob_prefix=image_path_prefix,  # check will be case-sensitive
                blob_suffix=api_config.IMAGE_SUFFIXES_ACCEPTED,  # check will be case-insensitive
                limit=api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB + 1
                # + 1 so if the number of images listed > MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB
                # we will know and not proceed
            )

        # Case 2: user supplied a list of images to process; can include metadata
        else:
            log.info('server_job, create_batch_job, using provided list of images.')
            output_stream, blob_properties = sas_blob_utils.download_blob_to_stream(images_requested_json_sas)
            image_paths = json.load(output_stream)
            log.info('server_job, create_batch_job, length of image_paths provided by the user: {}'.format(
                len(image_paths)))
            if len(image_paths) == 0:
                job_status = get_job_status(
                    'completed', '0 images found in provided list of images.')
                job_status_table.update_job_status(job_id, job_status)
                return

            error, metadata_available = validate_provided_image_paths(image_paths)
            if error is not None:
                msg = 'image paths provided in the json are not valid: {}'.format(error)
                raise ValueError(msg)

            # filter down to those conforming to the provided prefix and accepted suffixes (image file types)
            valid_image_paths = []
            for p in image_paths:
                locator = p[0] if metadata_available else p

                # prefix is case-sensitive; suffix is not
                if image_path_prefix is not None and not locator.startswith(image_path_prefix):
                    continue

                # Although urlparse(p).path preserves the extension on local paths, it will not work for
                # blob file names that contain "#", which urlparse treats as the start of a URL fragment.
                # If the URL is generated via Azure Blob Storage, the "#" char will be properly encoded
                path = urllib.parse.urlparse(locator).path if use_url else locator

                if path.lower().endswith(api_config.IMAGE_SUFFIXES_ACCEPTED):
                    valid_image_paths.append(p)
            image_paths = valid_image_paths
            log.info(('server_job, create_batch_job, length of image_paths provided by user, '
                      f'after filtering by prefix and accepted image types: {len(image_paths)}'))

        # apply the first_n and sample_n filters
        if first_n:
            assert first_n > 0, 'parameter first_n must be a positive integer.'
            # OK if first_n > total number of images
            image_paths = image_paths[:first_n]

        if sample_n:
            assert sample_n > 0, 'parameter sample_n must be a positive integer.'
            if sample_n > len(image_paths):
                msg = ('parameter sample_n specifies more images than '
                       'available (after filtering by other provided params).')
                raise ValueError(msg)

            # sample by shuffling image paths and take the first sample_n images
            log.info('First path before shuffling: %s', image_paths[0])
            shuffle(image_paths)
            log.info('First path after shuffling: %s', image_paths[0])
            image_paths = image_paths[:sample_n]

        num_images = len(image_paths)
        log.info(f'server_job, create_batch_job, num_images after applying all filters: {num_images}')

        if num_images < 1:
            job_status = get_job_status('completed', (
                'Zero images found in container or in provided list of images '
                'after filtering with the provided parameters.'))
            job_status_table.update_job_status(job_id, job_status)
            return
        if num_images > api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB:
            job_status = get_job_status(
                'failed',
                (f'The number of images ({num_images}) requested for processing exceeds the maximum '
                 f'accepted {api_config.MAX_NUMBER_IMAGES_ACCEPTED_PER_JOB} in one call'))
            job_status_table.update_job_status(job_id, job_status)
            return

        # upload the image list to the container, which is also mounted on all nodes
        # all sharding and scoring use the uploaded list
        images_list_str_as_bytes = bytes(json.dumps(image_paths, ensure_ascii=False), encoding='utf-8')

        container_url = sas_blob_utils.build_azure_storage_uri(account=api_config.STORAGE_ACCOUNT_NAME,
                                                               container=api_config.STORAGE_CONTAINER_API)
        with ContainerClient.from_container_url(container_url,
                                                credential=api_config.STORAGE_ACCOUNT_KEY) as api_container_client:
            _ = api_container_client.upload_blob(
                name=f'api_{api_config.API_INSTANCE_NAME}/job_{job_id}/{job_id}_images.json',
                data=images_list_str_as_bytes)

        job_status = get_job_status('created', f'{num_images} images listed; submitting the job...')
        job_status_table.update_job_status(job_id, job_status)

    except Exception as e:
        job_status = get_job_status('failed', f'Error occurred while preparing the Batch job: {e}')
        job_status_table.update_job_status(job_id, job_status)
        log.error(f'server_job, create_batch_job, Error occurred while preparing the Batch job: {e}')
        return  # do not start monitoring

    try:
        batch_job_manager = BatchJobManager()

        model_rel_path = api_config.MD_VERSIONS_TO_REL_PATH[model_version]
        batch_job_manager.create_job(job_id,
                                     model_rel_path,
                                     input_container_sas,
                                     use_url)

        num_tasks, task_ids_failed_to_submit = batch_job_manager.submit_tasks(job_id, num_images)

        # now request_status moves from created to running
        job_status = get_job_status('running',
                                    (f'Submitted {num_images} images to cluster in {num_tasks} shards. '
                                     f'Number of shards failed to be submitted: {len(task_ids_failed_to_submit)}'))

        # an extra field to allow the monitoring thread to restart after an API restart: total number of tasks
        job_status['num_tasks'] = num_tasks
        # also record the number of images to process for reporting
        job_status['num_images'] = num_images

        job_status_table.update_job_status(job_id, job_status)
    except Exception as e:
        job_status = get_job_status('problem', f'Please contact us. Error occurred while submitting the Batch job: {e}')
        job_status_table.update_job_status(job_id, job_status)
        log.error(f'server_job, create_batch_job, Error occurred while submitting the Batch job: {e}')
        return

    # start the monitor thread with the same name
    try:
        thread = threading.Thread(
            target=monitor_batch_job,
            name=f'job_{job_id}',
            kwargs={
                'job_id': job_id,
                'num_tasks': num_tasks,
                'model_version': model_version,
                'job_name': job_name,
                'job_submission_timestamp': job_submission_timestamp
            }
        )
        thread.start()
    except Exception as e:
        job_status = get_job_status('problem', f'Error occurred while starting the monitoring thread: {e}')
        job_status_table.update_job_status(job_id, job_status)
        log.error(f'server_job, create_batch_job, Error occurred while starting the monitoring thread: {e}')
        return
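
create_batch_job also calls validate_provided_image_paths, whose implementation is not included here. The sketch below is an assumption for illustration only: it checks the two input shapes described in the comments above (a list of strings, or a list of [image_id, metadata] pairs) and returns an (error, metadata_available) tuple.

from typing import Any, Optional, Sequence, Tuple


def validate_provided_image_paths(
        image_paths: Sequence[Any]) -> Tuple[Optional[str], bool]:
    # Sketch only: accept either all strings or all [image_id, metadata] pairs.
    error: Optional[str] = None
    metadata_available = False
    all_strings = all(isinstance(p, str) for p in image_paths)
    all_pairs = all(
        isinstance(p, (list, tuple)) and len(p) == 2 and isinstance(p[0], str)
        for p in image_paths)
    if all_pairs and not all_strings:
        metadata_available = True
    elif not all_strings:
        error = ('image paths must be a list of strings or a list of '
                 '[image_id, metadata] pairs')
    return error, metadata_available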
Example #7
    #%%

    if False:

        #%%

        # Use this when you have the task IDs, but no taskgroup objects, typically because you recorded them
        # manually somewhere
        task_ids = []
        task_to_results = {}

        # Enumerate files associated with each task
        for task_id in task_ids:

            # Enumerate files associated with this task
            matched_blobs = sas_blob_utils.list_blobs_in_container(
                container_uri=container_uri, blob_prefix=task_id)
            task_to_results[task_id] = matched_blobs

        #%%

    if False:

        #%%

        # Use this when you don't know the task ID(s), typically because this notebook was closed

        matched_blobs = sas_blob_utils.list_blobs_in_container(
            container_uri=container_uri)
        task_blobs = [s for s in matched_blobs if base_task_name in s]
        task_id = task_blobs[0].split('/')[0]
        matched_blobs = sas_blob_utils.list_blobs_in_container(
            container_uri=container_uri, blob_prefix=task_id)