def test_calculate_job_state_2(requireMocking, mocked_10_sample_job):
    """
    schedules job 12345 through the queue and then checks how the job state
    follows the sample states: two failed samples keep it in processing and
    once the remaining samples are exported it reaches aggregating scheduled.
    """
    job_id = "12345"

    def resync():
        # synchronize the job and report the state it ended up in
        sync_job(get_job_config(job_id))
        return get_job_state(job_id)

    _response = schedule_job({'pathParameters': {"job": job_id}}, {})
    watch_job_schedule_queue()
    assert get_job_state(job_id) == SCHEDULED

    # the first failed sample should move the job to processing
    save_sample_state(sample="abc_0", state=FAILED)
    assert resync() == PROCESSING

    # a second failed sample should keep the job in processing
    save_sample_state(sample="abc_1", state=FAILED)
    assert resync() == PROCESSING

    # set all other samples (abc_2 .. abc_9) to exported
    for index in range(2, 10):
        save_sample_state(sample=f"abc_{index}", state=EXPORTED)

    # this should set the job state to aggregation scheduling now
    assert resync() == AGGREGATING_SCHEDULED
def schedule_job_from_queue(event, context):
    """
    listens to the job queue and executes the scheduling for us.

    Every record carries a JSON body whose 'default' entry is itself a JSON
    document. Records without a 'job' entry are skipped. For the others one
    page of up to 25 samples is loaded and scheduled; when more pages remain
    the job is sent to the "jobQueue" again with the pagination value,
    otherwise the job state is set to SCHEDULED.

    :param event: queue event providing a 'Records' list
    :param context: not used
    :return: None
    :raises Exception: any error observed while scheduling is logged and raised again
    """
    for record in event['Records']:
        body = json.loads(json.loads(record['body'])['default'])
        if 'job' not in body:
            continue

        job_id = body['job']
        key = body['key']
        page_start = body.get('paginate', None)
        details = get_job_config(job_id)

        try:
            method = details['method']
            profile = details['profile']
            resource = details['resource']

            samples, next_page = load_job_samples_with_pagination(
                job=job_id, pagination_value=page_start, pagination_size=25)
            schedule_samples_to_queue(job_id, key, method, profile, resource, samples)

            if next_page is not None and len(samples) != 0:
                logger.info(
                    'job was too large, requires resubmission to queue to spread the load out!'
                )
                # send the job to the queue again so the next page gets scheduled
                schedule_to_queue(
                    body={"job": job_id, "key": key, "paginate": next_page},
                    resource=Backend.NO_BACKEND_REQUIRED,
                    service=None,
                    queue_name="jobQueue",
                )
            else:
                logger.info("job was completely scheduled!")
                set_job_state(job=job_id, method=method, profile=profile,
                              state=SCHEDULED, resource=resource)
        except Exception as e:
            logger.info(
                f"observed error in {job_id} with the following details {details}, error was: {e} and received "
                f"body was : {body}")
            traceback.print_exc()
            raise e
def test_calculate_job_state_with_zip_upload(requireMocking, mocked_10_sample_job):
    """
    drives job 12345 to aggregating scheduled by updating its sample states
    and then checks that an upload event for '12345.zip' moves the job to
    aggregated and uploaded.
    """
    job_id = "12345"

    def resync():
        # synchronize the job and report the state it ended up in
        sync_job(get_job_config(job_id))
        return get_job_state(job_id)

    # one exported sample should move the job to processing
    save_sample_state(sample="abc_0", state=EXPORTED)
    assert resync() == PROCESSING

    # a failed sample should keep the job in processing
    save_sample_state(sample="abc_1", state=FAILED)
    assert resync() == PROCESSING

    # set all other samples (abc_2 .. abc_9) to exported
    for index in range(2, 10):
        save_sample_state(sample=f"abc_{index}", state=EXPORTED)

    # this should set the job state to aggregation scheduling now
    assert resync() == AGGREGATING_SCHEDULED

    # trigger an upload to the zip bucket
    upload_event = {'Records': [{'s3': {'object': {'key': '12345.zip'}}}]}
    bucket_zip(upload_event, {})

    # job should now be aggregated
    assert get_job_state(job_id) == AGGREGATED_AND_UPLOADED
def do_sync(event, context):
    """
    synchronizes the actual job

    Two triggers are handled, both may be present in one event:
    - queue trigger ('Records'): every record naming a 'job' has its config
      loaded and synchronized directly.
    - http trigger ('pathParameters'): a sync request for the job is sent to
      the "sample_sync_queue" and an http response is returned.

    :param event: the triggering event
    :param context: not used
    :return: an http response dictionary for the http trigger, otherwise None
    """
    ##
    # sqs trigger
    if 'Records' in event:
        for record in event['Records']:
            logger.info(record)
            payload = json.loads(json.loads(record['body'])['default'])
            if 'job' not in payload:
                continue

            config = get_job_config(payload['job'])
            logger.info("received job to synchronize: {}".format(config))
            sync_job(config)

    ##
    # http trigger
    if 'pathParameters' not in event:
        return None

    job = event['pathParameters']['job']

    import boto3
    client = boto3.client('sqs')
    queue = _get_queue(client, resource=Backend.NO_BACKEND_REQUIRED,
                       queue_name="sample_sync_queue")
    logger.info("sending sync request for job {} to queue {}".format(job, queue))

    # NOTE(review): use_decimal is not a stdlib json.dumps argument;
    # presumably json is simplejson in this module - confirm
    inner_message = json.dumps({'job': job}, use_decimal=True)
    result = client.send_message(
        QueueUrl=queue,
        MessageBody=json.dumps({'default': inner_message}),
    )

    response_body = json.dumps({'result': str(result), 'job': job})
    return {
        'body': response_body,
        'statusCode': 200,
        'isBase64Encoded': False,
        'headers': __HTTP_HEADERS__
    }
def schedule_job(event, context):
    """
    schedules the job for our processing

    The job state is set to SCHEDULING and the job id, together with the
    optional 'x-api-key' header value, is sent to the "jobQueue".

    :param event: http event providing pathParameters['job'] and optionally headers
    :param context: not used
    :return: http response dictionary with status
             404 if no job config was found,
             425 if the job is still in state REGISTERING,
             200 if the job was sent to the queue,
             500 if scheduling raised an error (job state is set to FAILED)
    """

    def reply(status, payload):
        # wraps the payload into the http response structure
        return {
            'body': json.dumps(payload),
            'statusCode': status,
            'isBase64Encoded': False,
            'headers': __HTTP_HEADERS__
        }

    headers = event['headers'] if 'headers' in event else None
    key = headers['x-api-key'] if headers and 'x-api-key' in headers else None

    job_id = event['pathParameters']['job']
    details = get_job_config(job_id)

    if details is None:
        return reply(404, {
            'error': 'this job has not been stored yet!',
            'job': job_id
        })

    if details['state'] == REGISTERING:
        return reply(425, {
            'error': 'job is currently in state registering and waiting for more samples. It cannot be scheduled yet!',
            'job': job_id
        })

    method = details['method']
    profile = details['profile']
    resource = details['resource']

    try:
        # update job state
        set_job_state(job=job_id, method=method, profile=profile,
                      state=SCHEDULING, resource=resource)

        # now send to job sync queue
        schedule_to_queue(
            body={"job": job_id, "key": key},
            resource=Backend.NO_BACKEND_REQUIRED,
            service=None,
            queue_name="jobQueue",
        )

        return reply(200, {'state': str(SCHEDULING), 'job': job_id})
    except Exception as e:
        error_diagnostics = traceback.format_exc()

        # update job state in the system to failed with the related reason
        set_job_state(job=job_id, method=method, profile=profile, state=FAILED,
                      reason=f"{str(e)} = {error_diagnostics}")
        traceback.print_exc()

        return reply(500, {
            'state': str(FAILED),
            'job': job_id,
            'reason': str(e)
        })
def store_job(event, context):
    """
    stores a job in the internal database

    The JSON body of the event is validated against __JOB_SCHEMA__, the
    notifications of the job are registered and the job is stored with the
    state given in the body (ENTERED if none was given).

    :param event: http event, its 'body' is the JSON encoded job definition
    :param context: not used
    :return: http response dictionary with status
             503 if the body did not pass validation,
             200 with the stored state if the job was stored,
             500 if storing raised an error (job state is set to FAILED)
    """

    def reply(status, payload):
        # wraps the payload into the http response structure
        return {
            'body': json.dumps(payload),
            'statusCode': status,
            'isBase64Encoded': False,
            'headers': __HTTP_HEADERS__
        }

    body = json.loads(event['body'])

    try:
        validate(body, __JOB_SCHEMA__)
    except ValidationError as e:
        return reply(503, {'state': str(FAILED), 'reason': str(e)})

    job_id = body['id']
    method = body['method']
    profile = body['profile']

    # NOTE(review): tracking is read here but not used anywhere else in this function
    tracking = body.get('meta', {}).get('tracking', [])

    resource = Backend(body['resource']) if 'resource' in body else DEFAULT_PROCESSING_BACKEND

    # send to processing queue, might timeout web session for very large jobs
    # refactor later accordingly to let it get processed in a lambda itself to avoid this
    try:
        # todo store notification data here for this job...
        register_notifications(job_id, body.get('notify', {}))

        # store actual job in the job table, state ENTERED unless the body provides one
        set_job_state(job=job_id, method=method, profile=profile,
                      state=body.get('state', ENTERED), resource=resource)

        stored = get_job_config(job_id)
        return reply(200, {'state': str(stored['state']), 'job': job_id})
    except Exception as e:
        # update job state in the system to failed with the related reason
        error_diagnostics = traceback.format_exc()
        set_job_state(job=job_id, method=method, profile=profile, state=FAILED,
                      reason=f"{str(e)} = {error_diagnostics}")
        traceback.print_exc()

        return reply(500, {
            'state': str(FAILED),
            'job': job_id,
            'reason': str(e)
        })
def calculate_job_state(job: str) -> Optional[str]:
    """
    this method keeps the stasis tracking table and the job tracking in sync.

    The job state is derived from the states of all samples of the job:
    - every sample FAILED                                  -> FAILED
    - every sample EXPORTED, UPLOADED, EXPORTING or FAILED -> EXPORTED
    - every sample SCHEDULED                               -> SCHEDULED
    - anything else                                        -> PROCESSING

    :param job: id of the job to evaluate
    :return: the current state if its priority is already at or beyond
             AGGREGATING_SCHEDULING (nothing is recalculated then), the newly
             stored state otherwise, or None if the job has no sample states
    :raises Exception: if no samples or no config could be loaded for the job,
                       or if the job is still in state REGISTERING
    """
    from collections import Counter

    # 1. evaluate existing job state
    # to avoid expensive synchronization
    state = get_job_state(job=job)
    s = States()
    logger.info("current job state for {} is {}".format(job, state))

    if state is None:
        logger.info(f"no job state found -> forcing scheduled state for {job}")
        update_job_state(job=job, state=SCHEDULED,
                         reason="job was forced to state scheduled due to no state being available!")
    elif s.priority(state) >= s.priority(AGGREGATING_SCHEDULING):
        logger.info(f"job was already in a finished state {job}, state {state} and so needs no further analysis")
        return state
    else:
        logger.info(f"job {job} was in state {state}, which requires it to get it's final state analyzed")

    # 2. load job definition
    # loading all the samples here still causes timeouts or excessive CPU cost todo find a solution
    job_definition = load_job_samples_with_states(job=job)
    job_config = get_job_config(job=job)

    if job_definition is None or job_config is None or job_config['state'] == REGISTERING:
        raise Exception("we did not find a job definition for {}, Please investigate".format(job))

    # 3. collect the states of all samples and count them once,
    # every check and log statement below works on this single distribution
    states = list(job_definition.values())
    total = len(states)
    distribution = Counter(states)

    try:
        logger.info("received sample states for job are: {}".format(states))

        if not states:
            # bigger issue nothing found to synchronize
            logger.info("no states found!")
            return None

        # ALL ARE FAILED
        if distribution[FAILED] == total:
            update_job_state(job=job_config['id'], state=FAILED,
                             reason="job is in state failed, due to all samples being in state failed")
            logger.info("job is failed, no sample was successful")
            return FAILED

        # ALL ARE EXPORTED, UPLOADED, EXPORTING OR FAILED
        finished = (distribution[EXPORTED] + distribution[UPLOADED]
                    + distribution[EXPORTING] + distribution[FAILED])
        if finished == total:
            update_job_state(job=job_config['id'], state=EXPORTED,
                             reason="job state was set to exported due to all samples having been exported or failed")
            logger.info("job should now be exported")
            return EXPORTED

        # ALL ARE SCHEDULED
        if distribution[SCHEDULED] == total:
            update_job_state(job=job_config['id'], state=SCHEDULED,
                             reason="job is in state scheduled, due to all samples being in state scheduled")
            logger.info("job still in state scheduled")
            return SCHEDULED

        # otherwise we must be processing
        update_job_state(job=job_config['id'], state=PROCESSING, reason="job is in state processing")
        logger.info("job is in state processing right now")
        logger.info(distribution)
        return PROCESSING
    finally:
        # always report the state distribution and the state of every sample,
        # independent of which state was calculated
        logger.info("state distribution for job '{}' with {} samples is: {}".format(job, total, distribution))

        for counter, (sample, tracking_state) in enumerate(job_definition.items()):
            logger.info(f"{counter} - sample {sample} is in state {tracking_state} ")
        logger.info("done")