def cancel_tasks_by_status(session_id, task_state):
    """Request cancellation of every task of a session that is in a given state.

    Args:
        session_id (str): session whose tasks are looked up
        task_state (str): state used to select the tasks

    Returns:
        The ``Items`` entry of the state table lookup, i.e. the rows for which
        a status update to cancelled was issued.

    Raises:
        StateTableException: logged and re-raised when the state table fails
        Exception: any other failure, logged and re-raised
    """
    response = state_table.get_tasks_by_state(session_id, task_state)
    print(f"state_table.get_tasks_by_state: {response}")

    try:
        # The generator keeps the per-row order: read the id, then update it.
        for task_id in (row['task_id'] for row in response['Items']):
            state_table.update_task_status_to_cancelled(task_id)
    except StateTableException as state_error:
        errlog.log(
            "StateTableException error in setting task's status to cancelled {} [{}]"
            .format(state_error, traceback.format_exc()))
        raise state_error
    except Exception as unexpected_error:
        errlog.log(
            "Unexpected error in in setting task's status to cancelled {} [{}]"
            .format(unexpected_error, traceback.format_exc()))
        raise unexpected_error

    return response['Items']
def get_session_id_from_event(event):
    """Extract the session id from a Lambda invocation event.

    Only events carrying ``queryStringParameters`` (the ALB invocation path)
    are supported. The ``submission_content`` query parameter is expected to
    hold URL-safe base64 encoded JSON containing a ``session_id`` key.

    Args:
        event (dict): lambda's invocation event

    Returns:
        The value stored under ``session_id`` in the decoded JSON document.

    Raises:
        Exception: if the ``submission_content`` parameter is missing
        NotImplementedError: if the event has no ``queryStringParameters``
    """
    all_params = event.get('queryStringParameters')
    if all_params is None:
        errlog.log("Uniplemented path, exiting")
        # Raise explicitly: an ``assert False`` is stripped under ``python -O``
        # and the function would then silently return None.
        raise NotImplementedError(
            'Unsupported invocation path: event has no queryStringParameters')

    # If lambda are called through ALB - extracting actual event
    encoded_json_tasks = all_params.get('submission_content')
    if encoded_json_tasks is None:
        raise Exception(
            'Invalid submission format, expect submission_content parameter'
        )

    decoded_json_tasks = base64.urlsafe_b64decode(
        encoded_json_tasks).decode('utf-8')
    return json.loads(decoded_json_tasks)['session_id']
def retreive_expired_tasks(ddb_part_str):
    """Query the ``gsi_ttl_index`` index for processing tasks whose heartbeat lapsed.

    Args:
        ddb_part_str(str): suffix appended to the ``processing`` status to
            form the key value that is queried

    Returns:
        dict: the raw response of ``table.query``

    Raises:
        ClientError: if the DynamoDB query failed (logged, then re-raised)
    """
    try:
        current_epoch_sec = int(time.time())
        in_processing = Key('task_status').eq('processing' + ddb_part_str)
        heartbeat_lapsed = Key('heartbeat_expiration_timestamp').lt(current_epoch_sec)
        return table.query(
            IndexName="gsi_ttl_index",
            KeyConditionExpression=in_processing & heartbeat_lapsed,
            Limit=RETRIEVE_EXPIRED_TASKS_LIMIT,
        )
    except ClientError as query_error:
        errlog.log("Cannot retreive expired tasks : {}".format(query_error))
        raise query_error
def fail_task(task_id, sqs_handler_id, ddb_part_str):
    """Delete the task's queue message, then mark the task as failed.

    Args:
        task_id(str): the id of the task to update
        sqs_handler_id(str): the sqs handler associated to this task
        ddb_part_str(str): suffix appended to the failed status value

    Returns:
        Nothing

    Raises:
        ClientError: if the message cannot be deleted or the DynamoDB table
            cannot be updated (logged, then re-raised)
    """
    try:
        # The message is removed first; the table is only touched afterwards.
        delete_message_from_queue(sqs_handler_id)

        new_values = {
            ':val1': 'None',
            ':val2': TTL_LAMBDA_FAILED_STATUS + ddb_part_str,
        }
        attribute_aliases = {
            "#var_task_owner": "task_owner",
            "#var_task_status": "task_status",
        }
        table.update_item(
            Key={'task_id': task_id},
            UpdateExpression="SET #var_task_owner = :val1, #var_task_status = :val2",
            ExpressionAttributeValues=new_values,
            ExpressionAttributeNames=attribute_aliases,
        )
    except ClientError as client_error:
        errlog.log("Cannot fail task {} : {}".format(task_id, client_error))
        raise client_error
def set_task_inconsistent(task_id, ddb_part_str):
    """Mark a task as inconsistent and reset its owner to ``'None'``.

    The update is conditional on the task currently having no owner.

    Args:
        task_id(str): the id of the task to update
        ddb_part_str(str): suffix appended to the inconsistent status value

    Returns:
        Nothing

    Raises:
        ClientError: if DynamoDB table cannot be updated (logged, re-raised)
    """
    try:
        new_values = {
            ':val1': 'None',
            ':val2': TTL_LAMBDA_INCONSISTENT_STATUS + ddb_part_str,
        }
        attribute_aliases = {
            "#var_task_owner": "task_owner",
            "#var_task_status": "task_status",
        }
        # This condition is probably redundant and should be removed in the future
        owner_is_unset = Key('task_owner').eq('None')
        table.update_item(
            Key={'task_id': task_id},
            UpdateExpression="SET #var_task_owner = :val1, #var_task_status = :val2",
            ExpressionAttributeValues=new_values,
            ExpressionAttributeNames=attribute_aliases,
            ConditionExpression=owner_is_unset,
        )
    except ClientError as client_error:
        errlog.log("Cannot set task to inconsystent {} : {}".format(task_id, client_error))
        raise client_error
def send_messages(self, message_bodies=None, message_attributes=None):
    """Send a single message or a batch of messages to the queue of a priority.

    The target queue is looked up in ``self.priority_to_queue_lookup`` using
    ``message_attributes["priority"]``; priority ``0`` is used when the
    attribute is absent.

    Args:
        message_bodies (list): messages to be sent; defaults to an empty list
        message_attributes (dict): may contain ``"priority"``; defaults to an
            empty dict

    Returns:
        Whatever the selected queue's ``send_messages`` returns.

    Raises:
        TaskQueueException: wraps any error raised while selecting the queue
            or sending the messages
    """
    # None sentinels instead of mutable `[]` / `{}` defaults: a default object
    # is created once and would be shared (and possibly mutated downstream)
    # across every call that omits the argument.
    message_bodies = [] if message_bodies is None else message_bodies
    message_attributes = {} if message_attributes is None else message_attributes

    try:
        if "priority" in message_attributes:
            queue = self.priority_to_queue_lookup[
                message_attributes["priority"]]
        else:
            queue = self.priority_to_queue_lookup[0]

        response = queue.send_messages(message_bodies, message_attributes)
    except Exception as e:
        msg = f"Priority QueueSQS: failed to send {len(message_bodies)} messages [{message_bodies}], Exception: [{e}] [{traceback.format_exc()}]"
        errlog.log(msg)
        raise TaskQueueException(e, msg, traceback.format_exc()) from e

    return response
def __get_queue_object(self, message_handle_id, task_priority=None) -> QueueSQS:
    """Find the queue that owns a message, by its handle or by task priority.

    Args:
        message_handle_id(str): the sqs handler associated with the message
        task_priority(int): priority of the task, used as fallback lookup key

    Returns:
        QueueSQS: the queue registered for the handle or, failing that, for
        the priority

    Raises:
        TaskQueueException: if the handle is unknown and no priority is given
    """
    if message_handle_id in self.msg_handle_to_queue_lookup:
        # <1.> If this object was used to submit the message then we should have
        # a mapping from the handle to the queue object that was used to in-queue this message
        return self.msg_handle_to_queue_lookup[message_handle_id]

    if task_priority is not None:
        # <2.> The message was in-queued by some external object, (this can happen if submit_tasks lambda
        # inserted the message, but now ttl_lambda is modifying the state of that message)
        # thus we should determine the name of the queue by using the priority argument.
        return self.priority_to_queue_lookup[task_priority]

    # f-string (the original literal lacked the prefix, so the placeholders
    # were logged verbatim instead of the actual handle and priority).
    msg = f"PrioritySQS: Can not find QueueSQS by message_handle_id [{message_handle_id}] and priority [{task_priority}]"
    errlog.log(msg)
    raise TaskQueueException(None, msg, traceback.format_exc())
def change_visibility(self, message_handle_id, visibility_timeout_sec, task_priority=None):
    """Change the visibility timeout of a message on the queue that owns it.

    Args:
        message_handle_id(str): the sqs handler associated with the message
        visibility_timeout_sec(int): new visibility timeout, in seconds
        task_priority(int): priority used to locate the queue when the handle
            is not known to this object

    Returns:
        Whatever the underlying queue's ``change_visibility`` returns.

    Raises:
        TaskQueueException: wraps any error raised while locating the queue
            or changing the visibility
    """
    try:
        owning_queue = self.__get_queue_object(message_handle_id, task_priority)
        return owning_queue.change_visibility(message_handle_id, visibility_timeout_sec)
    except Exception as e:
        failure_text = f"PrioritySQS: Failed to change visibility by message_handle_id [{message_handle_id}] priority [{task_priority}] : [{e}] [{traceback.format_exc()}]"
        errlog.log(failure_text)
        raise TaskQueueException(e, failure_text, traceback.format_exc())
def release_task(task_id, retries, ddb_part_str):
    """Put a task back to ``pending`` with no owner and a new retry count.

    Args:
        task_id: id of the task to update
        retries: value stored in the task's ``retries`` attribute
        ddb_part_str: suffix appended to the ``pending`` status value

    Returns:
        Nothing

    Raises:
        ClientError: if the table update failed (logged, then re-raised)
    """
    try:
        new_values = {
            ':val1': 'None',
            ':val2': 'pending' + ddb_part_str,
            ':val3': retries,
        }
        attribute_aliases = {
            "#var_task_owner": "task_owner",
            "#var_task_status": "task_status",
            "#var_retries": "retries",
        }
        table.update_item(
            Key={'task_id': task_id},
            UpdateExpression="SET #var_task_owner = :val1, #var_task_status = :val2, #var_retries = :val3",
            ExpressionAttributeValues=new_values,
            ExpressionAttributeNames=attribute_aliases,
        )
    except ClientError as client_error:
        errlog.log("Cannot release task {} : {}".format(task_id, client_error))
        raise client_error
def update_ttl_if_required(task):
    """Refresh the task's TTL in DynamoDB when the current one is about to lapse.

    A refresh happens when no refresh timestamp was set yet (it is 0) or when
    it would pass before the next status pull. The update is retried for as
    long as DynamoDB answers with a throttling error.

    Args:
        task: task forwarded to ``ddb.update_own_tasks_ttl``

    Returns:
        True when no refresh was needed, otherwise the result flag of the last
        ``ddb.update_own_tasks_ttl`` call.
    """
    # If this is the first time we are resetting ttl value or
    # If the next time we will come to this point ttl ticket will expire
    first_refresh = ttl_gen.get_next_refresh_timestamp() == 0
    if not first_refresh:
        lapses_before_next_visit = (
            ttl_gen.get_next_refresh_timestamp()
            < time.time() + work_proc_status_pull_interval_sec)
        if not lapses_before_next_visit:
            return True

    logging.info("***Updating TTL***")
    # event_counter_post.increment("counter_update_ttl")

    throttling_codes = [
        "ThrottlingException", "ProvisionedThroughputExceededException"
    ]
    attempt = 0
    while True:
        attempt += 1
        started_ms = get_time_now_ms()

        # Note, if we will timeout on DDB update operation and we have to repeat this loop iteration,
        # we will regenerate a new TTL ofset, which is what we want.
        ddb_res, _response, error = ddb.update_own_tasks_ttl(
            status_table_cc, task, SELF_ID,
            ttl_gen.generate_next_ttl().get_next_expiration_timestamp())

        finished_ms = get_time_now_ms()

        # Anything other than a throttled failure ends the retry loop.
        if ddb_res or error.response['Error']['Code'] not in throttling_codes:
            return ddb_res

        errlog.log("Agent TTL@DDB Throttling #{} for {} ms".format(
            attempt, finished_ms - started_ms))
def lambda_handler(event, context):
    """Cancel every session named in the event and report the per-session result.

    Args:
        event (dict): lambda's invocation event
        context: Lambda context object (unused)

    Returns:
        dict: ``statusCode`` 200 with a JSON body mapping each session to the
        result of ``cancel_session``; ``statusCode`` 542 with the error text
        as body when anything fails.
    """
    try:
        sessions_to_cancel = get_session_id_from_event(event)
        per_session_result = {
            session: cancel_session(session) for session in sessions_to_cancel
        }
        return {
            'statusCode': 200,
            'body': json.dumps(per_session_result),
        }
    except Exception as e:
        errlog.log('Lambda cancel_tasks error: {} trace: {}'.format(e, traceback.format_exc()))
        return {
            'statusCode': 542,
            'body': "{}".format(e),
        }
def retreive_retries_and_task_handler_and_priority(task_id):
    """Look up retries, task handler id and priority of a task.

    Args:
        task_id(str): the id of the expired task

    Returns:
        tuple: ``(retries, task_handler_id, task_priority)`` as read with
        ``.get`` from the task record (missing entries come back as None)

    Raises:
        ClientError: if the state table lookup failed (logged, re-raised)
    """
    wanted_fields = ('retries', 'task_handler_id', 'task_priority')
    try:
        task_record = state_table.get_task_by_id(task_id)
        # CHeck if 1 and only 1
        return tuple(task_record.get(field) for field in wanted_fields)
    except ClientError as client_error:
        errlog.log("Cannot retreive retries and handler for task {} : {}".format(task_id, client_error))
        raise client_error
def change_visibility(self, message_handle_id, visibility_timeout_sec, task_priority=None) -> None:
    """Set a new visibility timeout on a message identified by its handle id.

    Args:
        message_handle_id(str): the sqs handler associated with the message
        visibility_timeout_sec(int): new visibility timeout, in seconds
        task_priority(int): <Interface argument, not used in this class>

    Returns:
        None

    Raises:
        TaskQueueException: wraps any error raised by the SQS client call
    """
    try:
        change_call = self.sqs_client.change_message_visibility
        change_call(
            QueueUrl=self.sqs_queue.url,
            ReceiptHandle=message_handle_id,
            VisibilityTimeout=visibility_timeout_sec,
        )
    except Exception as e:
        failure_text = f"QueueSQS: Cannot reset VTO for message handle id {message_handle_id}, Exception: [{e}] [{traceback.format_exc()}]"
        errlog.log(failure_text)
        raise TaskQueueException(e, failure_text, traceback.format_exc())
def __init__(self, endpoint_url, queue_name, region):
    """Bind this object to a named SQS queue.

    Stores the endpoint and queue name, resolves the queue through an SQS
    resource and creates an SQS client for the same region and endpoint.

    Args:
        endpoint_url: endpoint passed to boto3 for both resource and client
        queue_name: name of the queue to resolve
        region: region passed to boto3 for both resource and client

    Raises:
        TaskQueueException: wraps any error raised while resolving the queue
            or creating the client
    """
    logging.info(
        f"Initializing QueueSQS: {endpoint_url} {queue_name} {region}")

    self.endpoint_url = endpoint_url
    self.queue_name = queue_name

    connection = {"region_name": region, "endpoint_url": endpoint_url}
    try:
        self.sqs_queue = boto3.resource("sqs", **connection).get_queue_by_name(
            QueueName=queue_name)
        self.sqs_client = boto3.client("sqs", **connection)
    except Exception as e:
        failure_text = f"QueueSQS: cannot initialize queue_name [{queue_name}], endpoint_url [{endpoint_url}] region [{region}] : {e} [{traceback.format_exc()}]"
        errlog.log(failure_text)
        raise TaskQueueException(e, failure_text, traceback.format_exc())
def receive_message(self, wait_time_sec=10):
    """Receive at most one message from the task queue.

    Args:
        wait_time_sec - how long the receive call may wait for a message

    Returns:
        empty dictionary if no message was read from the queue, otherwise a
        dictionary with the message ``body`` and, under ``properties``, its
        ``message_handle_id`` (the receipt handle)

    Raises:
        TaskQueueException: wraps any error raised by the receive call
    """
    try:
        received = self.sqs_queue.receive_messages(
            MaxNumberOfMessages=1, WaitTimeSeconds=wait_time_sec)
    except Exception as e:
        failure_text = f"QueueSQS: failed to receive a task from SQS queue, Exception: [{e}] [{traceback.format_exc()}]"
        errlog.log(failure_text)
        raise TaskQueueException(e, failure_text, traceback.format_exc())

    if len(received) > 0:
        first = received[0]
        return {
            "body": first.body,
            "properties": {"message_handle_id": first.receipt_handle},
        }

    return {}
def reset_task_msg_vto(handler_id, task_priority):
    """Make a message re-appear in the tasks queue by zeroing its visibility timeout.

    Args:
        handler_id: reference of the message/task.
        task_priority: priority of the task, forwarded to the queue so it can
            identify which queue to use (if applicable)

    Returns:
        Nothing

    Raises:
        ClientError: logged and re-raised when raised by the queue call
    """
    immediately_visible_sec = 0
    try:
        queue.change_visibility(handler_id, immediately_visible_sec, task_priority)
    except ClientError as client_error:
        errlog.log("Cannot reset VTO for message {} : {}".format(handler_id, client_error))
        raise client_error
def reset_sqs_vto(handler_id):
    """Set the visibility timeout of an SQS message to 0.

    Args:
        handler_id: receipt handle of the message

    Returns:
        Nothing

    Raises:
        ClientError: logged and re-raised when the SQS call fails
    """
    request = {
        'QueueUrl': queue.url,
        'ReceiptHandle': handler_id,
        'VisibilityTimeout': 0,
    } if False else None
    try:
        sqs_cli.change_message_visibility(
            QueueUrl=queue.url,
            ReceiptHandle=handler_id,
            VisibilityTimeout=0,
        )
    except ClientError as client_error:
        errlog.log("Cannot reset VTO for message {} : {}".format(handler_id, client_error))
        raise client_error
def set_task_inconsistent(task_id):
    """Ask the state table to mark a task as inconsistent.

    Args:
        task_id(str): the id of the task to update

    Returns:
        Nothing

    Raises:
        ClientError: logged and re-raised when the state table update fails
    """
    try:
        state_table.update_task_status_to_inconsistent(task_id)
    except ClientError as client_error:
        log_line = "Cannot set task to inconsistent {} : {}".format(task_id, client_error)
        errlog.log(log_line)
        raise client_error
def delete_message_from_queue(task_handler_id, task_priority):
    """Remove a message from the task queue.

    Args:
        task_handler_id(str): the task handler associated with the message
            to be deleted
        task_priority(int): priority of the task, forwarded to the queue

    Returns:
        Nothing

    Raises:
        ClientError: logged and re-raised when raised by the queue call
    """
    try:
        queue.delete_message(task_handler_id, task_priority)
    except ClientError as client_error:
        log_line = "Cannot delete message {} : {}".format(task_handler_id, client_error)
        errlog.log(log_line)
        raise client_error
def delete_message_from_queue(sqs_handler_id):
    """Remove a message from the SQS queue by its receipt handle.

    Args:
        sqs_handler_id(str): the sqs handler associated with the message to
            be deleted

    Returns:
        Nothing

    Raises:
        ClientError: logged and re-raised when the SQS call fails
    """
    try:
        sqs_cli.delete_message(QueueUrl=queue.url, ReceiptHandle=sqs_handler_id)
    except ClientError as client_error:
        log_line = "Cannot delete message {} : {}".format(sqs_handler_id, client_error)
        errlog.log(log_line)
        raise client_error
def send_messages(self, message_bodies=None, message_attributes=None):
    """Send a single message or a batch of messages into the SQS queue.

    Args:
        message_bodies (list): entries handed to the queue's ``send_messages``
            as ``Entries``; defaults to an empty list
        message_attributes (dict): unused parameter for single SQS task queue

    Returns:
        response from SQS

    Raises:
        TaskQueueException: wraps any error raised by the send call
    """
    # None sentinels instead of mutable `[]` / `{}` defaults: the default list
    # is handed to the SQS call, so a shared instance could leak state between
    # calls that omit the argument.
    message_bodies = [] if message_bodies is None else message_bodies

    try:
        return self.sqs_queue.send_messages(Entries=message_bodies)
    except Exception as e:
        msg = f"QueueSQS: failed to send {len(message_bodies)} messages [{message_bodies}], Exception: [{e}] [{traceback.format_exc()}]"
        errlog.log(msg)
        raise TaskQueueException(e, msg, traceback.format_exc()) from e
def delete_message(self, message_handle_id, task_priority=None) -> None:
    """Delete a message from the queue by its handle id.

    Often this function is called when message is successfully consumed.

    Args:
        message_handle_id(str): the sqs handler associated with the message
            to be deleted
        task_priority(int): <Interface argument, not used in this class>

    Returns:
        None

    Raises:
        TaskQueueException: wraps any error raised by the SQS client call
    """
    try:
        delete_call = self.sqs_client.delete_message
        delete_call(QueueUrl=self.sqs_queue.url,
                    ReceiptHandle=message_handle_id)
    except Exception as e:
        failure_text = f"QueueSQS: Cannot delete message by handle id {message_handle_id}, Exception: [{e}] [{traceback.format_exc()}]"
        errlog.log(failure_text)
        raise TaskQueueException(e, failure_text, traceback.format_exc())
def lambda_handler(event, context):
    """Return the statuses of the tasks of the session named in the event.

    Args:
        event (dict): lambda's invocation event
        context: Lambda context object (unused)

    Returns:
        dict: ``statusCode`` 200 with the JSON-encoded statuses as body, or
        ``statusCode`` 542 with an error message as body on failure.
    """
    try:
        requested_session = get_session_id_from_event(event)
        statuses = get_tasks_statuses_in_session(requested_session)
        book_keeping(statuses)
        return {'statusCode': 200, 'body': json.dumps(statuses)}
    except ClientError as e:
        errlog.log('Lambda get_result error: {} trace: {}'.format(
            e.response['Error']['Message'], traceback.format_exc()))
        return {'statusCode': 542, 'body': e.response['Error']['Message']}
    except Exception as e:
        errlog.log('Lambda get_result error: {} trace: {}'.format(
            e, traceback.format_exc()))
        return {'statusCode': 542, 'body': "{}".format(e)}
def retreive_retries_and_sqs_handler(task_id):
    """Look up the retry count and SQS handler of a task.

    Only the first item of the query result is read.

    Args:
        task_id(str): the id of the expired task

    Returns:
        tuple: ``(retries, sqs_handler_id)`` of the first returned item

    Raises:
        ClientError: if DynamoDB query failed (logged, then re-raised)
    """
    try:
        query_result = table.query(
            KeyConditionExpression=Key('task_id').eq(task_id)
        )
        # CHeck if 1 and only 1
        first_item = query_result.get('Items')[0]
        return first_item.get('retries'), first_item.get('sqs_handler_id')
    except ClientError as client_error:
        errlog.log("Cannot retreive retries and handler for task {} : {}".format(task_id, client_error))
        raise client_error
def fail_task(task_id, task_handler_id, task_priority):
    """Delete the task's queue message, then mark the task as failed.

    Args:
        task_id(str): the id of the task to update
        task_handler_id(str): the task handler associated to this task
        task_priority(int): the priority of the task.

    Returns:
        Nothing

    Raises:
        ClientError: logged and re-raised when the message deletion or the
            state table update fails
    """
    try:
        # Order matters: the message is removed before the state is updated.
        delete_message_from_queue(task_handler_id, task_priority)
        state_table.update_task_status_to_failed(task_id)
    except ClientError as client_error:
        log_line = "Cannot fail task {} : {}".format(task_id, client_error)
        errlog.log(log_line)
        raise client_error
def acquire_task(task_id, current_owner, current_heartbeat_timestamp, ddb_part_str):
    """Conditionally take ownership of a processing task.

    The task is handed to ``TTL_LAMBDA_ID`` with the temporary status and a
    heartbeat timestamp of 0, but only if its status, owner and heartbeat
    timestamp still match the values supplied by the caller.

    Args:
        task_id: id of the task to acquire
        current_owner: owner the task is expected to have
        current_heartbeat_timestamp: heartbeat expiration timestamp the task
            is expected to have
        ddb_part_str: suffix appended to the status values

    Returns:
        bool: True when the update went through, False when it raised a
        ClientError (which is logged, not propagated)
    """
    try:
        new_values = {
            ':val1': TTL_LAMBDA_ID,
            ':val2': TTL_LAMBDA_TMP_STATUS + ddb_part_str,
            ':val3': 0,
        }
        attribute_aliases = {
            "#var_task_owner": "task_owner",
            "#var_task_status": "task_status",
            "#var_hb_timestamp": "heartbeat_expiration_timestamp",
        }
        unchanged_since_read = (
            Attr('task_status').eq('processing' + ddb_part_str)
            & Attr('task_owner').eq(current_owner)
            & Attr('heartbeat_expiration_timestamp').eq(current_heartbeat_timestamp)
        )
        table.update_item(
            Key={'task_id': task_id},
            UpdateExpression="SET #var_task_owner = :val1, #var_task_status = :val2, #var_hb_timestamp = :val3",
            ExpressionAttributeValues=new_values,
            ExpressionAttributeNames=attribute_aliases,
            ConditionExpression=unchanged_since_read,
        )
    except ClientError as client_error:
        errlog.log("Cannot acquire task TTL Checker {} : {}".format(task_id, client_error))
        return False
    else:
        return True
def delete_message(self, message_handle_id, task_priority=None):
    """Delete a message from the queue that owns it.

    The queue is located by the message handle or, failing that, by the task
    priority. Often this function is called when message is successfully
    consumed.

    Args:
        message_handle_id(str): the sqs handler associated with the message
            to be deleted
        task_priority(int): priority used to locate the queue when the handle
            is not known to this object

    Returns:
        Whatever the underlying queue's ``delete_message`` returns.

    Raises:
        TaskQueueException: wraps any error raised while locating the queue
            or deleting the message
    """
    try:
        owning_queue = self.__get_queue_object(message_handle_id, task_priority)
        return owning_queue.delete_message(message_handle_id)
    except Exception as e:
        failure_text = f"PrioritySQS: Failed to delete msg by handle_id [{message_handle_id}] priority [{task_priority}] : [{e}] [{traceback.format_exc()}]"
        errlog.log(failure_text)
        raise TaskQueueException(e, failure_text, traceback.format_exc())
def lambda_handler(event, context):
    """Handler called by AWS Lambda runtime to submit a list of tasks.

    The submission is URL-safe base64 encoded JSON. It is read from the
    ``submission_content`` query parameter when the event carries
    ``queryStringParameters``, otherwise from ``event['body']``. Every task
    of the decoded ``tasks_list`` is written to DynamoDB and then sent to SQS.

    Args:
        event (dict): the invocation event (replaced by the decoded submission
            once it has been extracted)
        context: Lambda context object (not used in this function)

    Returns:
        dict: ``statusCode`` 200 with a JSON body holding ``session_id`` and
        ``task_ids``; ``statusCode`` 543 with an error message as body when
        processing fails.

    Raises:
        Exception: if the submission content is missing. Errors raised while
            extracting/decoding the submission happen before the ``try``
            block and therefore propagate instead of producing a 543.
    """
    # If lambda are called through ALB - extracting actual event
    if event.get('queryStringParameters') is not None:
        all_params = event.get('queryStringParameters')
        if task_input_passed_via_external_storage == '1':
            # Here the parameter is used as the key under which the payload
            # is fetched from stdin_iom, not as the payload itself.
            session_id = all_params.get('submission_content')
            encoded_json_tasks = stdin_iom.get_payload_to_utf8_string(session_id)
        else:
            encoded_json_tasks = all_params.get('submission_content')
        if encoded_json_tasks is None:
            raise Exception('Invalid submission format, expect submission_content parameter')
        decoded_json_tasks = base64.urlsafe_b64decode(encoded_json_tasks).decode('utf-8')
        event = json.loads(decoded_json_tasks)
    else:
        encoded_json_tasks = event['body']
        decoded_json_tasks = base64.urlsafe_b64decode(encoded_json_tasks).decode('utf-8')
        event = json.loads(decoded_json_tasks)

    try:
        invocation_tstmp = get_time_now_ms()

        print(event)

        # Session ID that will be used for all tasks in this event.
        if event["session_id"] == "None":
            # Generate new session id if no session is passed
            # TODO: We are not currently supporting this option, consider for removal and replace with assertion
            session_id = get_safe_session_id()
        else:
            session_id = event["session_id"]
            verify_passed_sessionid_is_unique(session_id)

        # Always the session id as submitted, even when a new one was generated above.
        parent_session_id = event["session_id"]

        lambda_response = {
            "session_id": session_id,
            "task_ids": []
        }

        sqs_batch_entries = []
        last_submitted_task_ref = None

        tasks_list = event['tasks_list']['tasks']
        ddb_batch_size = 500
        # NOTE(review): ddb_batch_write_times and backoff_count are never
        # updated in this function, so the statistics derived from them below
        # always see an empty list and 0.
        ddb_batch_write_times = []
        backoff_count = 0

        # Split the submission into chunks of at most ddb_batch_size tasks.
        tasks_batches = [tasks_list[x:x + ddb_batch_size] for x in range(0, len(tasks_list), ddb_batch_size)]

        for bid, ddb_batch in enumerate(tasks_batches):
            # <1.> Batch write to dynamoDB
            with table.batch_writer() as batch:  # batch_writer is flushed when exiting this block
                for i, task_definition in enumerate(ddb_batch):
                    # tdef = json.loads(task_definition["stdin"])
                    # print(tdef["parent_session_id"])

                    # Task ids are the session id plus the task's index in the whole submission.
                    task_id = session_id + "_" + str(bid * ddb_batch_size + i)

                    time_now_ms = get_time_now_ms()
                    task_json = {
                        'session_id': session_id,
                        'task_id': task_id,
                        'parent_session_id': parent_session_id,
                        'submission_timestamp': time_now_ms,
                        'task_completion_timestamp': 0,
                        'task_status': make_partition_key_4_state("pending", session_id),
                        'task_owner': "None",
                        'retries': 0,
                        'task_definition': task_definition,
                        'sqs_handler_id': "None",
                        'heartbeat_expiration_timestamp': 0
                    }

                    write_to_dynamodb(task_json, batch)
                    # TODO: res not in use

                    # The SQS copy additionally carries the stats of the event.
                    # NOTE: event["stats"] is assigned by reference, so all
                    # tasks share (and overwrite) the same stats dictionary.
                    task_json_4_sqs: dict = copy.deepcopy(task_json)

                    task_json_4_sqs["stats"] = event["stats"]
                    task_json_4_sqs["stats"]["stage2_sbmtlmba_01_invocation_tstmp"]["tstmp"] = invocation_tstmp
                    task_json_4_sqs["stats"]["stage2_sbmtlmba_02_before_batch_write_tstmp"]["tstmp"] = get_time_now_ms()

                    # task_json["scheduler_data"] = event["scheduler_data"]

                    sqs_batch_entries.append({
                        'Id': task_id,  # use to return send result for this message
                        'MessageBody': json.dumps(task_json_4_sqs)
                    }
                    )

                    last_submitted_task_ref = task_json_4_sqs

        # <2.> Batch submit tasks to SQS
        # Performance critical code
        sqs_max_batch_size = 10
        sqs_batch_chunks = [sqs_batch_entries[x:x + sqs_max_batch_size] for x in range(0, len(sqs_batch_entries), sqs_max_batch_size)]
        for chunk in sqs_batch_chunks:
            write_to_sqs(chunk)

        # <3.> Non performance critical code, statistics and book-keeping.
        event_counter = EventsCounter(["count_submitted_tasks", "count_ddb_batch_backoffs", "count_ddb_batch_write_max", "count_ddb_batch_write_min", "count_ddb_batch_write_avg"])
        event_counter.increment("count_submitted_tasks", len(sqs_batch_entries))

        # NOTE(review): last_submitted_task_ref is still None when tasks_list
        # is empty; the subscript below then raises TypeError, which the
        # generic handler at the end turns into a 543 response.
        last_submitted_task_ref['stats']['stage2_sbmtlmba_03_invocation_over_tstmp'] = {"label": "dynamo_db_submit_ms", "tstmp": get_time_now_ms()}

        event_counter.increment("count_ddb_batch_backoffs", backoff_count)

        if len(ddb_batch_write_times) > 0:
            event_counter.increment("count_ddb_batch_write_max", max(ddb_batch_write_times))
            event_counter.increment("count_ddb_batch_write_min", min(ddb_batch_write_times))
            event_counter.increment("count_ddb_batch_write_avg", sum(ddb_batch_write_times) * 1.0 / len(ddb_batch_write_times))

        print("BKF: [{}] LEN: {} LIST: {}".format(backoff_count, len(ddb_batch_write_times), ddb_batch_write_times))

        perf_tracker.add_metric_sample(
            last_submitted_task_ref['stats'],
            event_counter=event_counter,
            from_event="stage1_grid_api_01_task_creation_tstmp",
            to_event="stage2_sbmtlmba_03_invocation_over_tstmp",
            # event_time=(datetime.datetime.fromtimestamp(invocation_tstmp/1000.0)).isoformat()
        )
        perf_tracker.submit_measurements()

        # <4.> Assemble the response
        for sqs_msg in sqs_batch_entries:
            lambda_response["task_ids"].append(sqs_msg['Id'])

        return {
            'statusCode': 200,
            'body': json.dumps(lambda_response)
        }
    except ClientError as e:
        # The error code is logged, the error message is returned to the caller.
        errlog.log("ClientError in Submit Tasks {} {}"
                   .format(e.response['Error']['Code'], traceback.format_exc()))
        return {
            'statusCode': 543,
            'body': e.response['Error']['Message']
        }
    except Exception as e:
        errlog.log("Exception in Submit Tasks {} [{}]"
                   .format(e, traceback.format_exc()))
        return {
            'statusCode': 543,
            'body': "{}".format(e)
        }
def process_subprocess_completion(perf_tracker, task, sqs_msg, fname_stdout, stdout=None):
    """Store the output of a completed task and mark the task as finished.

    The output is stored through ``stdout_iom``, the task is marked finished
    in DynamoDB (retried while DynamoDB throttles), the SQS message is deleted
    when that update succeeded, and agent measurements are submitted.

    Args:
        perf_tracker (utils.performance_tracker.PerformanceTracker): endpoint for sending metrics
        task (dict): the task that went to completion
        sqs_msg (Message): the SQS message associated to the completed task
        fname_stdout (file): the file where stdout was redirected; used only
            when ``stdout`` is None
        stdout (str): the stdout of the execution; when given it is stored
            base64 encoded instead of the file content

    Returns:
        Nothing
    """
    task["stats"]["stage4_agent_01_user_code_finished_tstmp"][
        "tstmp"] = get_time_now_ms()

    # <1.> Store stdout/stderr into persistent storage
    if stdout is not None:
        b64output = base64.b64encode(stdout.encode("utf-8"))
        stdout_iom.put_output_from_bytes(task["task_id"], data=b64output)
    else:
        stdout_iom.put_output_from_file(task["task_id"], file_name=fname_stdout)
        # logging.info("\n===========STDOUT: ================")
        # logging.info(open(fname_stdout, "r").read())

        # ret = stdout_iom.put_error_from_file(task["task_id"], file_name=fname_stderr)

        # logging.info("\n===========STDERR: ================")
        # logging.info(open(fname_stderr, "r").read())

    task["stats"]["stage4_agent_02_S3_stdout_delivered_tstmp"][
        "tstmp"] = get_time_now_ms()

    # <2.> Mark the task as finished; repeat immediately (no back-off) for as
    # long as the update fails with a throttling error.
    count = 0
    while True:
        count += 1
        time_start_ms = get_time_now_ms()
        ddb_res, response, error = ddb.dynamodb_update_task_status_to_finished(
            status_table_cc, task, SELF_ID)
        time_end_ms = get_time_now_ms()

        if not ddb_res and error.response['Error']['Code'] in [
                "ThrottlingException", "ProvisionedThroughputExceededException"
        ]:
            errlog.log("Agent FINISHED@DDB #{} Throttling for {} ms".format(
                count, time_end_ms - time_start_ms))
            continue
        else:
            # Success, or a failure that is not throttling: stop retrying.
            break

    if not ddb_res:
        # We can get here if task has been taken over by the watchdog lambda
        # in this case we ignore results and proceed to the next task.
        event_counter_post.increment("ddb_set_task_finished_failed")
        logging.info("Could not set completion time to Finish")
    else:
        event_counter_post.increment("ddb_set_task_finished_succeeded")
        logging.info(
            "We have succesfully marked task as completed in dynamodb."
            " Deleting message from the SQS... for task [{}] {}".format(
                task["task_id"], response))
        # The message is only deleted once the task is recorded as finished.
        sqs_msg.delete()

    # <3.> Book-keeping: time elapsed since AGENT_EXEC_TIMESTAMP_MS, and the
    # identity of this agent.
    logging.info("Exec time1: {} {}".format(
        get_time_now_ms() - AGENT_EXEC_TIMESTAMP_MS, AGENT_EXEC_TIMESTAMP_MS))
    event_counter_post.increment("agent_total_time_ms",
                                 get_time_now_ms() - AGENT_EXEC_TIMESTAMP_MS)
    event_counter_post.set("str_pod_id", SELF_ID)

    submit_post_agent_measurements(task, perf_tracker)
def try_to_acquire_a_task():
    """Poll one message from the tasks queue and try to claim its task.

    At most one message is received (waiting up to 10 seconds). The task in
    the message body is then claimed through ``ddb.claim_task_to_yourself``.
    When the claim fails, the message is deleted if the task turns out to be
    cancelled; otherwise the function sleeps 1 to 3 seconds before returning.
    On success the visibility timeout of the message is changed and
    acquisition timestamps are recorded in the task's stats.

    Side effects:
        Sets the module-level ``AGENT_EXEC_TIMESTAMP_MS`` whenever a message
        was received.

    Returns:
        A tuple ``(message, task)`` containing the SQS message and the task
        definition, or ``(None, None)`` when no message was received or the
        task could not be claimed.

    Raises:
        Exception: any error raised while claiming the task is logged and
            re-raised
    """
    global AGENT_EXEC_TIMESTAMP_MS
    logging.info("waiting for SQS message")
    messages = tasks_queue.receive_messages(MaxNumberOfMessages=1,
                                            WaitTimeSeconds=10)

    # Taken right after the receive call returns; stored in the task stats on success.
    task_pick_up_from_sqs_ms = get_time_now_ms()

    logging.info("try_to_acquire_a_task, message: {}".format(messages))
    # print(len(messages))

    if len(messages) == 0:
        event_counter_pre.increment("agent_no_messages_in_tasks_queue")
        return None, None

    message = messages[0]
    AGENT_EXEC_TIMESTAMP_MS = get_time_now_ms()

    task = json.loads(message.body)
    logging.info("try_to_acquire_a_task, task: {}".format(task))

    # Since we read this message from the queue, now we need to associate an
    # sqs handler with this message, to be able to delete it later
    task["sqs_handle_id"] = message.receipt_handle
    try:
        result, response, error = ddb.claim_task_to_yourself(
            status_table, task, SELF_ID,
            ttl_gen.generate_next_ttl().get_next_expiration_timestamp())

        logging.info("DDB claim_task_to_yourself result: {} {}".format(
            result, response))

        if not result:
            event_counter_pre.increment("agent_failed_to_claim_ddb_task")

            if is_task_has_been_cancelled(task["task_id"]):
                logging.info(
                    "Task [{}] has been already cancelled, skipping".format(
                        task['task_id']))

                # A cancelled task will not be run: drop its message.
                message.delete()
                return None, None
            else:
                # Claim failed for another reason: the message is left
                # untouched and the function returns after a random pause.
                time.sleep(random.randint(1, 3))
                return None, None

    except Exception as error_acquiring:
        # NOTE(review): despite the log text, nothing in this handler releases
        # the message; the error is only logged and re-raised.
        errlog.log(
            "Releasing msg after failed try_to_acquire_a_task {} [{}]".format(
                error_acquiring, traceback.format_exc()))
        raise error_acquiring
        # if e.response['Error']['Code'] == 'ResourceNotFoundException':

    # If we have succesfully ackquired a message we should change its visibility timeout
    message.change_visibility(
        VisibilityTimeout=agent_sqs_visibility_timeout_sec)

    task["stats"]["stage3_agent_01_task_acquired_sqs_tstmp"][
        "tstmp"] = task_pick_up_from_sqs_ms
    task["stats"]["stage3_agent_02_task_acquired_ddb_tstmp"][
        "tstmp"] = get_time_now_ms()
    event_counter_pre.increment("agent_successful_acquire_a_task")

    return message, task