Example #1
0
def _request_detections(**kwargs: Any) -> None:
    """Validate a detection request, list its images and submit AML jobs.

    Keyword arguments read by this function:
        request_id: ID of the task whose status is updated as the request
            progresses.
        post_body: dict of request parameters. The keys read here are
            'input_container_sas', 'use_url', 'images_requested_json_sas',
            'image_path_prefix', 'first_n', 'sample_n', 'model_version' and
            'request_name'.

    Any exception raised while preparing or submitting the jobs is caught and
    recorded as a 'failed' task status. When submission succeeds, a thread
    running _monitor_detections_request is started; a failure to start it is
    recorded as a 'problem' task status.

    Raises:
        KeyError: if 'request_id' is missing from kwargs (nothing can be
            reported against an unknown task).
    """
    # Resolve the request ID before entering the try block: the failure
    # handler below reports against it, so it must be bound even when the
    # very first validation step raises.
    request_id = kwargs['request_id']

    try:
        body = kwargs.get('post_body')
        if body is None:
            # explicit check instead of assert, which is stripped under -O
            raise ValueError('post_body is missing from the request.')

        input_container_sas = body.get('input_container_sas', None)
        use_url = body.get('use_url', False)
        images_requested_json_sas = body.get('images_requested_json_sas', None)
        image_path_prefix = body.get('image_path_prefix', None)

        # falsy values (None, '', 0) mean the filter was not requested
        first_n = body.get('first_n', None)
        first_n = int(first_n) if first_n else None

        sample_n = body.get('sample_n', None)
        sample_n = int(sample_n) if sample_n else None

        model_version = body.get('model_version', '')
        if model_version == '':
            model_version = api_config.AML_CONFIG['default_model_version']
        model_name = api_config.AML_CONFIG['models'][model_version]

        # request_name and request_submission_timestamp are for appending to
        # output file names
        request_name = body.get('request_name', '')
        request_submission_timestamp = orchestrator.get_utc_timestamp()

        task_status = orchestrator.get_task_status(
            'running', 'Request received.')
        update_task_status(api_task_manager, request_id, task_status)
        print('runserver.py, request_id {}, '.format(request_id),
              'model_version {}, model_name {}, '.format(model_version, model_name),
              'request_name {}, '.format(request_name),
              'submission timestamp is {}'.format(request_submission_timestamp))

        # image_paths can be a list of strings (Azure blob names or public URLs)
        # or a list of length-2 lists where each is a [image_id, metadata] pair

        # Case 1: listing all images in the container
        # - not possible to have attached metadata if listing images in a blob
        if images_requested_json_sas is None:
            metadata_available = False
            task_status = orchestrator.get_task_status(
                'running', 'Listing all images to process.')
            update_task_status(api_task_manager, request_id, task_status)
            print('runserver.py, running - listing all images to process.')

            # Ask for one more than the maximum so that an over-sized
            # container is detected by the size check further down.
            image_paths = SasBlob.list_blobs_in_container(
                api_config.MAX_NUMBER_IMAGES_ACCEPTED + 1,
                sas_uri=input_container_sas,
                blob_prefix=image_path_prefix, blob_suffix='.jpg')

        # Case 2: user supplied a list of images to process; can include metadata
        else:
            print('runserver.py, running - using provided list of images.')
            image_paths_text = SasBlob.download_blob_to_text(
                images_requested_json_sas)
            image_paths = json.loads(image_paths_text)
            print('runserver.py, length of image_paths provided by the user: {}'.format(len(image_paths)))
            if len(image_paths) == 0:
                task_status = orchestrator.get_task_status(
                    'completed', '0 images found in provided list of images.')
                update_task_status(api_task_manager, request_id, task_status)
                return

            error, metadata_available = orchestrator.validate_provided_image_paths(image_paths)
            if error is not None:
                msg = 'image paths provided in the json are not valid: {}'.format(error)
                raise ValueError(msg)

            def locator_of(entry):
                # entries are [image_id, metadata] pairs when metadata is present
                return entry[0] if metadata_available else entry

            # Keep only entries with an accepted image extension; going
            # through urlparse(...).path also preserves the extension on
            # local paths.
            image_paths = [
                p for p in image_paths
                if urllib.parse.urlparse(locator_of(p)).path.lower().endswith(
                    api_config.ACCEPTED_IMAGE_FILE_ENDINGS)
            ]
            print('runserver.py, length of image_paths provided by user, '
                  'after filtering to jpg: {}'.format(len(image_paths)))

            if image_path_prefix is not None:
                image_paths = [
                    p for p in image_paths
                    if locator_of(p).startswith(image_path_prefix)
                ]
                print('runserver.py, length of image_paths provided by user, '
                      'after filtering for image_path_prefix: {}'.format(len(image_paths)))

            if not use_url:
                res = orchestrator.spot_check_blob_paths_exist(
                    image_paths, input_container_sas, metadata_available)
                if res is not None:
                    # one concatenated string (a stray comma previously made
                    # this a tuple)
                    msg = ('path {} provided in list of images to process '
                           'does not exist in the container pointed to by '
                           'data_container_sas.'.format(res))
                    raise LookupError(msg)

        # apply the first_n and sample_n filters
        if first_n is not None:
            if first_n <= 0:
                raise ValueError('parameter first_n is 0.')
            # OK if first_n > total number of images
            image_paths = image_paths[:first_n]

        if sample_n is not None:
            if sample_n <= 0:
                raise ValueError('parameter sample_n is 0.')
            if sample_n > len(image_paths):
                msg = ('parameter sample_n specifies more images than '
                       'available (after filtering by other provided params).')
                raise ValueError(msg)

            # sample by shuffling image paths and take the first sample_n images
            print('First path before shuffling:', image_paths[0])
            shuffle(image_paths)
            print('First path after shuffling:', image_paths[0])
            image_paths = orchestrator.sort_image_paths(
                image_paths[:sample_n], metadata_available)

        num_images = len(image_paths)
        print('runserver.py, num_images after applying all filters: {}'.format(num_images))
        if num_images < 1:
            task_status = orchestrator.get_task_status(
                'completed',
                'Zero images found in container or in provided list of images '
                'after filtering with the provided parameters.')
            update_task_status(api_task_manager, request_id, task_status)
            return
        if num_images > api_config.MAX_NUMBER_IMAGES_ACCEPTED:
            task_status = orchestrator.get_task_status(
                'failed',
                'The number of images ({}) requested for processing exceeds the maximum accepted {} in one call'.format(
                    num_images, api_config.MAX_NUMBER_IMAGES_ACCEPTED))
            update_task_status(api_task_manager, request_id, task_status)
            return

        # finalized image_paths is uploaded to internal_container; all sharding
        # and scoring use the uploaded list
        # the list of images json does not have request_name or timestamp in the
        # file name so that score.py can locate it
        image_paths_string = json.dumps(image_paths, indent=1)
        internal_storage_service.create_blob_from_text(
            internal_container, '{}/{}_images.json'.format(request_id, request_id),
            image_paths_string)

        task_status = orchestrator.get_task_status(
            'running', 'Images listed; processing {} images.'.format(num_images))
        update_task_status(api_task_manager, request_id, task_status)
        print('runserver.py, running - images listed; processing {} images'.format(num_images))

        # set up connection to AML Compute and data stores
        # do this for each request since pipeline step is associated with the
        # data stores
        aml_compute = orchestrator.AMLCompute(
            request_id=request_id, use_url=use_url,
            input_container_sas=input_container_sas,
            internal_datastore=internal_datastore, model_name=model_name)
        print('AMLCompute resource connected successfully.')

        num_images_per_job = api_config.NUM_IMAGES_PER_JOB
        num_jobs = math.ceil(num_images / num_images_per_job)

        # Experiment name must be between 1 and 36 characters long. Its
        # first character has to be alphanumeric, and the rest may contain
        # hyphens and underscores. Hence only (at most) the first 8
        # characters of the first dash-separated segment of the request ID
        # are used; this is the same for every job, so compute it once.
        shortened_request_id = request_id.split('-')[0][:8]

        # maps job_id -> {'begin': ..., 'end': ...}, the half-open slice of
        # the uploaded image list assigned to that job
        list_jobs = {}
        for job_index in range(num_jobs):
            begin = job_index * num_images_per_job
            end = begin + num_images_per_job

            # request ID, job index, total
            job_id = 'r{}_i{}_t{}'.format(shortened_request_id, job_index, num_jobs)
            list_jobs[job_id] = {'begin': begin, 'end': end}

        list_jobs_submitted = aml_compute.submit_jobs(
            list_jobs, api_task_manager, num_images)
        task_status = orchestrator.get_task_status(
            'running',
            'All {} images submitted to cluster for processing.'.format(num_images))
        update_task_status(api_task_manager, request_id, task_status)

    except Exception as e:
        task_status = orchestrator.get_task_status(
            'failed', 'An error occurred while processing the request: {}'.format(e))
        update_task_status(api_task_manager, request_id, task_status)
        print('runserver.py, exception in _request_detections: {}'.format(e))
        return  # do not initiate _monitor_detections_request

    try:
        aml_monitor = orchestrator.AMLMonitor(
            request_id=request_id,
            shortened_request_id=shortened_request_id,
            list_jobs_submitted=list_jobs_submitted,
            request_name=request_name,
            request_submission_timestamp=request_submission_timestamp,
            model_version=model_version)

        # start another thread to monitor the jobs and consolidate the results
        # when they finish
        # HACK: this calls the service's private thread helper directly; the
        # call it replaced was
        # ai4e_service.wrap_async_endpoint(_monitor_detections_request,
        #     trace_name='post:_monitor_detections_request', ...)
        ai4e_service._create_and_execute_thread(
            func=_monitor_detections_request,
            api_path='/request_detections_aml',
            request_id=request_id, aml_monitor=aml_monitor)
    except Exception as e:
        task_status = orchestrator.get_task_status(
            'problem',
            ('An error occurred when starting the status monitoring process. '
             'The images should be submitted for processing though - please '
             'contact us to retrieve your results. Error: {}'.format(e)))
        update_task_status(api_task_manager, request_id, task_status)
        print('runserver.py, exception when starting orchestrator.AMLMonitor: {}'.format(e))
Example #2
0
def _monitor_detections_request(**kwargs: Any) -> None:
    """Poll the submitted AML jobs and aggregate their results when done.

    Keyword arguments read by this function:
        request_id: ID of the task whose status is updated.
        aml_monitor: object providing check_job_status(),
            aggregate_results() and get_total_jobs().

    The loop sleeps api_config.MONITOR_PERIOD_MINUTES between checks.
    Failures of check_job_status() and of aggregate_results() are each
    retried on later cycles until more than api_config.NUM_RETRIES errors of
    that kind have occurred. Monitoring stops when results are aggregated
    (status 'completed'), when api_config.MAX_MONITOR_CYCLES unfinished
    checks have been made (status 'problem'), or when an error escapes the
    retries (status 'problem').

    Raises:
        KeyError: if 'request_id' is missing from kwargs (nothing can be
            reported against an unknown task).
    """
    # Resolve the request ID outside the try block so the failure handler
    # at the bottom can always reference it.
    request_id = kwargs['request_id']

    try:
        aml_monitor = kwargs['aml_monitor']

        max_num_checks = api_config.MAX_MONITOR_CYCLES
        # counts only the cycles in which the status check succeeded and the
        # jobs were still unfinished; retry cycles do not increment it
        num_checks = 0

        # errors encountered during aml_monitor.check_job_status()
        num_errors_job_status = 0

        # errors encountered during aml_monitor.aggregate_results()
        num_errors_aggregation = 0

        print('Monitoring thread with _monitor_detections_request started.')

        while True:
            # time.sleep() blocks the current thread only
            time.sleep(api_config.MONITOR_PERIOD_MINUTES * 60)

            print('runserver.py, _monitor_detections_request, woke up at '
                  '{} for check number {}.'.format(datetime.now(), num_checks))

            # check the status of the jobs, with retries
            try:
                all_jobs_finished, status_tally = aml_monitor.check_job_status()
            except Exception as e:
                num_errors_job_status += 1
                print('runserver.py, _monitor_detections_request, exception in '
                      'aml_monitor.check_job_status(): {}'.format(e))

                if num_errors_job_status <= api_config.NUM_RETRIES:
                    print('Will retry in the next monitoring cycle. Number of '
                          'errors so far: {}'.format(num_errors_job_status))
                    continue

                print('Number of retries reached for '
                      'aml_monitor.check_job_status().')
                # bare raise keeps the original traceback; handled by the
                # outer except below
                raise

            print('all jobs finished? {}'.format(all_jobs_finished))
            for status, count in status_tally.items():
                print('status {}, number of jobs = {}'.format(status, count))

            num_failed = status_tally['Failed']
            # need to periodically check the enumerations are what AML returns
            # - not the same as in their doc
            num_finished = status_tally['Finished'] + num_failed

            # if all jobs finished, aggregate the results and return the URLs
            # to the output files
            if all_jobs_finished:
                task_status = orchestrator.get_task_status(
                    'running',
                    'Model inference finished; now aggregating results.')
                update_task_status(api_task_manager, request_id, task_status)

                # retrieve and join the output CSVs from each job, with retries
                try:
                    output_file_urls = aml_monitor.aggregate_results()
                except Exception as e:
                    num_errors_aggregation += 1
                    print('runserver.py, _monitor_detections_request, '
                          'exception in aml_monitor.aggregate_results(): {}'.format(e))

                    if num_errors_aggregation <= api_config.NUM_RETRIES:
                        print('Will retry during the next monitoring wake-up '
                              'cycle. Number of errors so far: {}'.format(num_errors_aggregation))
                        task_status = orchestrator.get_task_status(
                            'running',
                            'All shards finished but results aggregation '
                            'failed. Will retry in '
                            '{} minutes.'.format(api_config.MONITOR_PERIOD_MINUTES))
                        update_task_status(api_task_manager, request_id,
                                           task_status)
                        continue

                    print('Number of retries reached for '
                          'aml_monitor.aggregate_results().')
                    raise

                # the 'completed' status carries a dict rather than a string
                message = {
                    'num_failed_shards': num_failed,
                    'output_file_urls': output_file_urls
                }
                task_status = orchestrator.get_task_status('completed', message)
                update_task_status(api_task_manager, request_id, task_status)
                break

            # not all jobs are finished, update the status with number of shards
            # finished
            task_status = orchestrator.get_task_status(
                'running',
                'Last status check at {}, '
                '{} out of {} shards '
                'finished processing, {} failed.'.format(
                    orchestrator.get_utc_time(), num_finished, aml_monitor.get_total_jobs(), num_failed
                ))
            update_task_status(api_task_manager, request_id, task_status)

            # not all jobs are finished but the maximum number of checking cycle
            # is reached, stop this thread
            num_checks += 1
            if num_checks >= max_num_checks:
                task_status = orchestrator.get_task_status(
                    'problem',
                    'Request unfinished after {} '
                    'x {} minutes; '
                    'abandoning the monitoring thread. Please contact us to '
                    'retrieve any results.'.format(
                        api_config.MAX_MONITOR_CYCLES, api_config.MONITOR_PERIOD_MINUTES
                    ))
                update_task_status(api_task_manager, request_id, task_status)
                print('runserver.py, _monitor_detections_request, ending!')
                break

    except Exception as e:
        task_status = orchestrator.get_task_status(
            'problem',
            'An error occurred while monitoring the status of this request. '
            'The images should be processing though - please contact us to '
            'retrieve your results. Error: {}'.format(e))
        update_task_status(api_task_manager, request_id, task_status)
        print('runserver.py, exception in _monitor_detections_request(): ', e)
Example #3
0
def _request_detections(**kwargs):
    """Validate a detection request, list its images and submit AML jobs.

    Keyword arguments read by this function:
        request_id: ID of the task whose status is updated as the request
            progresses.
        post_body: dict of request parameters. The keys read here are
            'input_container_sas', 'use_url', 'images_requested_json_sas',
            'image_path_prefix', 'first_n', 'sample_n', 'model_version' and
            'request_name'.

    Any exception raised while preparing or submitting the jobs is caught and
    recorded as a 'failed' task status. When submission succeeds,
    _monitor_detections_request is started through
    ai4e_wrapper.wrap_async_endpoint; a failure to start it is recorded as a
    'problem' task status.

    Raises:
        KeyError: if 'request_id' is missing from kwargs (nothing can be
            reported against an unknown task).
    """
    # Resolve the request ID before entering the try block: the failure
    # handler below reports against it, so it must be bound even when the
    # very first validation step raises.
    request_id = kwargs['request_id']

    try:
        body = kwargs.get('post_body')
        if body is None:
            raise ValueError('post_body is missing from the request.')

        input_container_sas = body.get('input_container_sas', None)
        use_url = body.get('use_url', False)
        images_requested_json_sas = body.get('images_requested_json_sas', None)
        image_path_prefix = body.get('image_path_prefix', None)

        # falsy values (None, '', 0) mean the filter was not requested
        first_n = body.get('first_n', None)
        first_n = int(first_n) if first_n else None

        sample_n = body.get('sample_n', None)
        sample_n = int(sample_n) if sample_n else None

        model_version = body.get('model_version', '')
        if model_version == '':
            model_version = api_config.AML_CONFIG['default_model_version']
        model_name = api_config.AML_CONFIG['models'][model_version]

        # request_name and request_submission_timestamp are for appending to output file names
        request_name = body.get('request_name', '')
        request_submission_timestamp = orchestrator.get_utc_timestamp()

        api_task_manager.UpdateTaskStatus(
            request_id, get_task_status('running', 'Request received.'))
        print((
            'runserver.py, request_id {}, model_version {}, model_name {}, request_name {}, submission timestamp '
            'is {}').format(request_id, model_version, model_name,
                            request_name, request_submission_timestamp))

        # image_paths can be a list of strings (paths on Azure blobs or public URLs), or a list of lists,
        # each of length 2 and is the [image_id, metadata] pair

        # case 1 - listing all images in the container
        if images_requested_json_sas is None:
            # not possible to have attached metadata if listing images in a blob
            metadata_available = False
            api_task_manager.UpdateTaskStatus(
                request_id,
                get_task_status('running', 'Listing all images to process.'))
            print('runserver.py, running - listing all images to process.')

            # Ask for one more than the maximum so that an over-sized
            # container is detected by the size check further down.
            image_paths = SasBlob.list_blobs_in_container(
                api_config.MAX_NUMBER_IMAGES_ACCEPTED + 1,
                sas_uri=input_container_sas,
                blob_prefix=image_path_prefix,
                blob_suffix='.jpg')

        # case 2 - user supplied a list of images to process; can include metadata
        else:
            print('runserver.py, running - using provided list of images.')
            image_paths_text = SasBlob.download_blob_to_text(
                images_requested_json_sas)
            image_paths = json.loads(image_paths_text)
            print(
                'runserver.py, length of image_paths provided by the user: {}'.
                format(len(image_paths)))
            if len(image_paths) == 0:
                api_task_manager.UpdateTaskStatus(
                    request_id,
                    get_task_status(
                        'completed',
                        'Zero images found in provided list of images.'))
                return

            error, metadata_available = orchestrator.validate_provided_image_paths(
                image_paths)
            if error is not None:
                raise ValueError(
                    'image paths provided in the json are not valid: {}'.
                    format(error))

            def locator_of(entry):
                # entries are [image_id, metadata] pairs when metadata is present
                return entry[0] if metadata_available else entry

            # keep only entries with an accepted image file ending
            image_paths = [
                p for p in image_paths
                if locator_of(p).lower().endswith(
                    api_config.ACCEPTED_IMAGE_FILE_ENDINGS)
            ]
            print(
                'runserver.py, length of image_paths provided by the user, after filtering to jpg: {}'
                .format(len(image_paths)))

            if image_path_prefix is not None:
                image_paths = [
                    p for p in image_paths
                    if locator_of(p).startswith(image_path_prefix)
                ]
                print(
                    'runserver.py, length of image_paths provided by the user, after filtering for image_path_prefix: {}'
                    .format(len(image_paths)))

            if not use_url:
                res = orchestrator.spot_check_blob_paths_exist(
                    image_paths, input_container_sas, metadata_available)
                if res is not None:
                    raise LookupError(
                        'path {} provided in list of images to process does not exist in the container pointed to by data_container_sas.'
                        .format(res))

        # apply the first_n and sample_n filters
        if first_n is not None:
            # explicit check instead of assert, which is stripped under -O
            if first_n <= 0:
                raise ValueError('parameter first_n is 0.')
            # will not error if first_n > total number of images
            image_paths = image_paths[:first_n]

        if sample_n is not None:
            if sample_n <= 0:
                raise ValueError('parameter sample_n is 0.')
            if sample_n > len(image_paths):
                raise ValueError(
                    'parameter sample_n specifies more images than available (after filtering by other provided params).'
                )

            # we sample by shuffling the image paths and take the first sample_n images
            print('First path before shuffling:', image_paths[0])
            shuffle(image_paths)
            print('First path after shuffling:', image_paths[0])
            image_paths = orchestrator.sort_image_paths(
                image_paths[:sample_n], metadata_available)

        num_images = len(image_paths)
        print('runserver.py, num_images after applying all filters: {}'.format(
            num_images))
        if num_images < 1:
            api_task_manager.UpdateTaskStatus(
                request_id,
                get_task_status(
                    'completed',
                    'Zero images found in container or in provided list of images after filtering with the provided parameters.'
                ))
            return
        if num_images > api_config.MAX_NUMBER_IMAGES_ACCEPTED:
            api_task_manager.UpdateTaskStatus(
                request_id,
                get_task_status(
                    'failed',
                    'The number of images ({}) requested for processing exceeds the maximum accepted ({}) in one call.'
                    .format(num_images,
                            api_config.MAX_NUMBER_IMAGES_ACCEPTED)))
            return

        # finalized image_paths is uploaded to internal_container; all sharding and scoring use the uploaded list
        # the list of images json does not have request_name or timestamp in the file name so that score.py can locate it
        image_paths_string = json.dumps(image_paths, indent=1)
        internal_storage_service.create_blob_from_text(
            internal_container,
            '{}/{}_images.json'.format(request_id, request_id),
            image_paths_string)

        api_task_manager.UpdateTaskStatus(
            request_id,
            get_task_status(
                'running',
                'Images listed; processing {} images.'.format(num_images)))
        print('runserver.py, running - images listed; processing {} images.'.
              format(num_images))

        # set up connection to AML Compute and data stores
        # do this for each request since pipeline step is associated with the data stores
        aml_compute = orchestrator.AMLCompute(
            request_id=request_id,
            use_url=use_url,
            input_container_sas=input_container_sas,
            internal_datastore=internal_datastore,
            model_name=model_name)
        print('AMLCompute resource connected successfully.')

        num_images_per_job = api_config.NUM_IMAGES_PER_JOB
        num_jobs = math.ceil(num_images / num_images_per_job)

        # maps job_id -> {'begin': ..., 'end': ...}, the half-open slice of
        # the uploaded image list assigned to that job
        list_jobs = {}
        for job_index in range(num_jobs):
            begin = job_index * num_images_per_job
            end = begin + num_images_per_job
            job_id = 'request{}_jobindex{}_total{}'.format(
                request_id, job_index, num_jobs)
            list_jobs[job_id] = {'begin': begin, 'end': end}

        list_jobs_submitted = aml_compute.submit_jobs(list_jobs,
                                                      api_task_manager,
                                                      num_images)
        api_task_manager.UpdateTaskStatus(
            request_id,
            get_task_status(
                'running',
                'All {} images submitted to cluster for processing.'.format(
                    num_images)))

    except Exception as e:
        api_task_manager.UpdateTaskStatus(
            request_id,
            get_task_status(
                'failed',
                'An error occurred while processing the request: {}'.format(
                    e)))
        print('runserver.py, exception in _request_detections: {}'.format(
            str(e)))
        return  # do not initiate _monitor_detections_request

    try:
        aml_monitor = orchestrator.AMLMonitor(
            request_id=request_id,
            list_jobs_submitted=list_jobs_submitted,
            request_name=request_name,
            request_submission_timestamp=request_submission_timestamp,
            model_version=model_version)

        # start another thread to monitor the jobs and consolidate the results when they finish
        ai4e_wrapper.wrap_async_endpoint(_monitor_detections_request,
                                         'post:_monitor_detections_request',
                                         request_id=request_id,
                                         aml_monitor=aml_monitor)
    except Exception as e:
        api_task_manager.UpdateTaskStatus(
            request_id,
            get_task_status('problem', (
                'An error occurred when starting the status monitoring process. '
                'The images should be submitted for processing though - please contact us to retrieve your results. '
                'Error: {}'.format(e))))
        print(
            'runserver.py, exception when starting orchestrator.AMLMonitor: ',
            str(e))