Python log Examples, utils.grid_error_logger.log Python Examples

Example #1

0

Show file

File: cancel_tasks.py Project: kirillsc/aws-htc-grid

def cancel_tasks_by_status(session_id, task_state):
    """
    Mark every task of a session that is currently in ``task_state`` as cancelled.

    Args:
        session_id: identifier of the session, forwarded to the state table lookup
        task_state: state used to select the tasks of that session

    Returns:
        the ``Items`` entry of the state table lookup response, i.e. the rows
        whose ``task_id`` was submitted for cancellation

    Raises:
        StateTableException: logged and re-raised when the state table reports it
        Exception: any other error raised while cancelling is logged and re-raised
    """

    lookup = state_table.get_tasks_by_state(session_id, task_state)
    print(f"state_table.get_tasks_by_state: {lookup}")

    try:
        for task_row in lookup['Items']:
            state_table.update_task_status_to_cancelled(task_row['task_id'])

    except StateTableException as state_err:
        trace = traceback.format_exc()
        errlog.log(
            "StateTableException error in setting task's status to cancelled {} [{}]"
            .format(state_err, trace))
        raise state_err
    except Exception as other_err:
        trace = traceback.format_exc()
        errlog.log(
            "Unexpected error in in setting task's status to cancelled {} [{}]"
            .format(other_err, trace))
        raise other_err

    return lookup['Items']

Example #2

0

Show file

File: get_results.py Project: kirillsc/aws-htc-grid

def get_session_id_from_event(event):
    """
    Extract the session id from the lambda invocation event.

    The event is expected to carry a ``queryStringParameters`` entry whose
    ``submission_content`` value is a URL-safe base64 encoded JSON document
    containing a ``session_id`` key.

    Args:
        event (dict): lambda's invocation event

    Returns:
        str: session id encoded in the event

    Raises:
        Exception: if ``submission_content`` is missing from the query string
        AssertionError: if the event has no ``queryStringParameters``
            (this path is not implemented)
    """

    all_params = event.get('queryStringParameters')

    if all_params is None:
        errlog.log("Uniplemented path, exiting")
        # Raised explicitly rather than via ``assert (False)``: assert statements
        # are removed under ``python -O``, which would let this function fall
        # through and silently return None.
        raise AssertionError("Uniplemented path, exiting")

    # If lambda are called through ALB - extracting actual event
    encoded_json_tasks = all_params.get('submission_content')
    if encoded_json_tasks is None:
        raise Exception(
            'Invalid submission format, expect submission_content parameter'
        )

    decoded_json_tasks = base64.urlsafe_b64decode(
        encoded_json_tasks).decode('utf-8')
    event = json.loads(decoded_json_tasks)

    return event['session_id']

Example #3

0

Show file

def retreive_expired_tasks(ddb_part_str):
    """Query the ``gsi_ttl_index`` index for processing tasks whose heartbeat expired

    Args:
      ddb_part_str(str): suffix appended to 'processing' to build the task_status key

    Returns:
      the raw response of the table query, limited to
      RETRIEVE_EXPIRED_TASKS_LIMIT entries

    Raises:
      ClientError: if the query failed (logged before being re-raised)

    """
    try:
        current_ts = int(time.time())
        status_is_processing = Key('task_status').eq('processing' + ddb_part_str)
        heartbeat_is_expired = Key('heartbeat_expiration_timestamp').lt(current_ts)
        return table.query(
            IndexName="gsi_ttl_index",
            KeyConditionExpression=status_is_processing & heartbeat_is_expired,
            Limit=RETRIEVE_EXPIRED_TASKS_LIMIT
        )
    except ClientError as err:
        errlog.log("Cannot retreive expired tasks : {}".format(err))
        raise err

Example #4

0

Show file

def fail_task(task_id, sqs_handler_id, ddb_part_str):
    """Delete the task's queue message, then flag the task as failed in the table

    Args:
      task_id(str): the id of the task to update
      sqs_handler_id(str): the sqs handler associated to this task
      ddb_part_str(str): suffix appended to the failed status value

    Returns:
      Nothing

    Raises:
      ClientError: logged and re-raised if the queue or the table call fails

    """
    attribute_names = {
        "#var_task_owner": "task_owner",
        "#var_task_status": "task_status"
    }
    try:
        # The queue message goes first; the table is only touched afterwards.
        delete_message_from_queue(sqs_handler_id)

        attribute_values = {
            ':val1': 'None',
            ':val2': TTL_LAMBDA_FAILED_STATUS + ddb_part_str
        }
        table.update_item(
            Key={'task_id': task_id},
            UpdateExpression="SET #var_task_owner = :val1, #var_task_status = :val2",
            ExpressionAttributeValues=attribute_values,
            ExpressionAttributeNames=attribute_names
        )
    except ClientError as err:
        errlog.log("Cannot fail task {} : {}".format(task_id, err))
        raise err

Example #5

0

Show file

def set_task_inconsistent(task_id, ddb_part_str):
    """Flag the task as inconsistent in the table and reset its owner to 'None'

    Args:
      task_id(str): the id of the task to update
      ddb_part_str(str): suffix appended to the inconsistent status value

    Returns:
      Nothing

    Raises:
      ClientError: logged and re-raised if the table update is rejected

    """
    attribute_names = {
        "#var_task_owner": "task_owner",
        "#var_task_status": "task_status"
    }
    try:
        attribute_values = {
            ':val1': 'None',
            ':val2': TTL_LAMBDA_INCONSISTENT_STATUS + ddb_part_str
        }
        # This condition is probably redundant and should be removed in the future
        owner_is_none = Key('task_owner').eq('None')
        table.update_item(
            Key={'task_id': task_id},
            UpdateExpression="SET #var_task_owner = :val1, #var_task_status = :val2",
            ExpressionAttributeValues=attribute_values,
            ExpressionAttributeNames=attribute_names,
            ConditionExpression=owner_is_none
        )
    except ClientError as err:
        errlog.log("Cannot set task to inconsystent {} : {}".format(task_id, err))
        raise err

Example #6

0

Show file

    def send_messages(self, message_bodies=None, message_attributes=None):
        """
        Sends a single message or a batch of messages into the queue selected
        by the message priority

        Args:
            message_bodies - list of messages to be send (an empty list when omitted)
            message_attributes - dictionary that may contain a "priority" key;
                when the key is absent the queue registered under priority 0 is used

        Returns:
            response of the selected queue's send_messages call

        Raises:
            TaskQueueException: if the queue lookup or the send fails

        """
        # None sentinels replace the former mutable default arguments ([] / {}),
        # which were shared between calls and handed to the underlying queue.
        if message_bodies is None:
            message_bodies = []
        if message_attributes is None:
            message_attributes = {}

        try:
            if "priority" in message_attributes:
                queue = self.priority_to_queue_lookup[
                    message_attributes["priority"]]
            else:
                queue = self.priority_to_queue_lookup[0]

            response = queue.send_messages(message_bodies, message_attributes)

        except Exception as e:

            msg = f"Priority QueueSQS: failed to send {len(message_bodies)} messages [{message_bodies}], Exception: [{e}] [{traceback.format_exc()}]"
            errlog.log(msg)
            raise TaskQueueException(e, msg, traceback.format_exc())

        return response

Example #7

0

Show file

    def __get_queue_object(self,
                           message_handle_id,
                           task_priority=None) -> QueueSQS:
        """This function finds a corresponding queue by message_handle_id or task_priority

        The handle lookup takes precedence; task_priority is only consulted when
        the handle is not present in msg_handle_to_queue_lookup.

        Args:
            message_handle_id(str): the sqs handler associated of the message to be deleted
            task_priority(int): priority of the task

        Returns:
            QueueSQS

        Raises:
            TaskQueueException: if the handle is unknown and no priority was given

        """
        if message_handle_id in self.msg_handle_to_queue_lookup:
            # <1.> If this object was used to submit the message then we should have
            # a mapping from the handle to the queue object that was used to in-queue this message
            return self.msg_handle_to_queue_lookup[message_handle_id]

        if task_priority is not None:
            # <2.> The message was in-queued by some external object, (this can happen if submit_tasks lambda
            # inserted the message, but now ttl_lambda is modifying the state of that message)
            # thus we should determine the name of the queue by using the priority argument.
            return self.priority_to_queue_lookup[task_priority]

        # The f-prefix was missing here, so the placeholders were logged
        # literally instead of showing the offending handle and priority.
        msg = f"PrioritySQS: Can not find QueueSQS by message_handle_id [{message_handle_id}] and priority [{task_priority}]"
        errlog.log(msg)
        raise TaskQueueException(None, msg, traceback.format_exc())

Example #8

0

Show file

    def change_visibility(self,
                          message_handle_id,
                          visibility_timeout_sec,
                          task_priority=None):
        """Changes visibility timeout of the message on the queue it belongs to

        Args:
        message_handle_id(str): the sqs handler associated of the message to be updated
        visibility_timeout_sec: new visibility timeout, forwarded to the queue
        task_priority(int): used to locate the queue when the handle is unknown

        Returns: whatever the selected queue's change_visibility returns

        Raises:
        TaskQueueException: if the queue lookup or the update fails

        """

        try:
            target_queue = self.__get_queue_object(message_handle_id, task_priority)
            return target_queue.change_visibility(message_handle_id,
                                                  visibility_timeout_sec)

        except Exception as err:
            msg = f"PrioritySQS: Failed to change visibility by message_handle_id [{message_handle_id}] priority [{task_priority}] : [{err}] [{traceback.format_exc()}]"
            errlog.log(msg)
            raise TaskQueueException(err, msg, traceback.format_exc())

Example #9

0

Show file

def release_task(task_id, retries, ddb_part_str):
    """Put the task back to the pending state with no owner

    Args:
      task_id: the id of the task to update
      retries: value stored in the task's ``retries`` attribute
      ddb_part_str: suffix appended to the 'pending' status value

    Returns:
      Nothing

    Raises:
      ClientError: logged and re-raised if the table update fails

    """
    attribute_names = {
        "#var_task_owner": "task_owner",
        "#var_task_status": "task_status",
        "#var_retries": "retries"
    }
    try:
        attribute_values = {
            ':val1': 'None',
            ':val2': 'pending' + ddb_part_str,
            ':val3': retries
        }
        table.update_item(
            Key={'task_id': task_id},
            UpdateExpression="SET #var_task_owner = :val1, #var_task_status = :val2, #var_retries = :val3",
            ExpressionAttributeValues=attribute_values,
            ExpressionAttributeNames=attribute_names
        )
    except ClientError as err:
        errlog.log("Cannot release task {} : {}".format(task_id, err))
        raise err

Example #10

0

Show file

File: agent.py Project: dholczer/aws-htc-grid

def update_ttl_if_required(task):
    """Refresh the TTL of the task when it was never set or is about to lapse

    The update is repeated for as long as it fails with a throttling error code.

    Args:
        task: the task forwarded to ``ddb.update_own_tasks_ttl``

    Returns:
        True when no refresh was needed, otherwise the result flag of the
        last update attempt
    """
    # If this is the first time we are resetting ttl value or
    # If the next time we will come to this point ttl ticket will expire
    refresh_due = (
        ttl_gen.get_next_refresh_timestamp() == 0
        or ttl_gen.get_next_refresh_timestamp()
        < time.time() + work_proc_status_pull_interval_sec
    )
    if not refresh_due:
        return True

    logging.info("***Updating TTL***")
    # event_counter_post.increment("counter_update_ttl")

    throttling_codes = [
        "ThrottlingException",
        "ProvisionedThroughputExceededException"
    ]

    attempt = 0
    while True:
        attempt += 1
        started_ms = get_time_now_ms()

        # Note, if we will timeout on DDB update operation and we have to repeat this loop iteration,
        # we will regenerate a new TTL ofset, which is what we want.
        updated, response, error = ddb.update_own_tasks_ttl(
            status_table_cc, task, SELF_ID,
            ttl_gen.generate_next_ttl().get_next_expiration_timestamp())

        finished_ms = get_time_now_ms()

        # Anything other than a throttled failure ends the loop.
        if updated or error.response['Error']['Code'] not in throttling_codes:
            return updated

        errlog.log("Agent TTL@DDB Throttling #{} for {} ms".format(
            attempt, finished_ms - started_ms))

Example #11

0

Show file

File: cancel_tasks.py Project: dholczer/aws-htc-grid

def lambda_handler(event, context):
    """Cancel every session listed in the event and report the per-session result

    Args:
        event: lambda's invocation event, handed to get_session_id_from_event
        context: lambda context (not used)

    Returns:
        dict: statusCode 200 with a JSON body mapping each session id to the
        result of cancel_session, or statusCode 542 with the error text
    """
    try:
        per_session_result = {
            session_id: cancel_session(session_id)
            for session_id in get_session_id_from_event(event)
        }
        return {
            'statusCode': 200,
            'body': json.dumps(per_session_result)
        }

    except Exception as err:
        errlog.log('Lambda cancel_tasks error: {} trace: {}'.format(err, traceback.format_exc()))
        return {
            'statusCode': 542,
            'body': "{}".format(err)
        }

Example #12

0

Show file

File: ttl_checker.py Project: kirillsc/aws-htc-grid

def retreive_retries_and_task_handler_and_priority(task_id):
    """Look up a task in the state table and return three of its attributes:
    (i) ``retries``, (ii) ``task_handler_id`` and (iii) ``task_priority``.

    Args:
      task_id(str): the id of the expired task

    Returns:
      tuple: (retries, task_handler_id, task_priority)

    Raises:
      ClientError: logged and re-raised if the lookup failed

    """
    try:
        task_row = state_table.get_task_by_id(task_id)
        # CHeck if 1 and only 1
        retries = task_row.get('retries')
        handler_id = task_row.get('task_handler_id')
        priority = task_row.get('task_priority')
        return retries, handler_id, priority

    except ClientError as err:
        errlog.log("Cannot retreive retries and handler for task {} : {}".format(task_id, err))
        raise err

Example #13

0

Show file

File: task_queue_sqs.py Project: kirillsc/aws-htc-grid

    def change_visibility(self,
                          message_handle_id,
                          visibility_timeout_sec,
                          task_priority=None) -> None:
        """Sets a new visibility timeout on the message identified by its handle id

        Args:
            message_handle_id(str): receipt handle of the message to update
            visibility_timeout_sec: value passed as VisibilityTimeout
            task_priority(int): <Interface argument, not used in this class>

        Returns: None

        Raises:
            TaskQueueException: wraps any error raised by the SQS client call
        """
        request = {
            "QueueUrl": self.sqs_queue.url,
            "ReceiptHandle": message_handle_id,
            "VisibilityTimeout": visibility_timeout_sec,
        }

        try:
            self.sqs_client.change_message_visibility(**request)
        except Exception as err:
            msg = f"QueueSQS: Cannot reset VTO for message handle id {message_handle_id}, Exception: [{err}] [{traceback.format_exc()}]"
            errlog.log(msg)
            raise TaskQueueException(err, msg, traceback.format_exc())

Example #14

0

Show file

File: task_queue_sqs.py Project: kirillsc/aws-htc-grid

    def __init__(self, endpoint_url, queue_name, region):
        """Connect to the named SQS queue

        Stores the endpoint and queue name, resolves the queue through a boto3
        SQS resource and creates a boto3 SQS client for the same region/endpoint.

        Args:
            endpoint_url: endpoint passed to boto3 for both resource and client
            queue_name: name of the queue to look up
            region: region name passed to boto3

        Raises:
            TaskQueueException: wraps any error raised while connecting
        """
        logging.info(
            f"Initializing QueueSQS: {endpoint_url} {queue_name} {region}")

        self.endpoint_url = endpoint_url
        self.queue_name = queue_name

        connection_args = {"region_name": region, "endpoint_url": endpoint_url}

        try:
            resource = boto3.resource("sqs", **connection_args)
            self.sqs_queue = resource.get_queue_by_name(QueueName=queue_name)
            self.sqs_client = boto3.client("sqs", **connection_args)

        except Exception as err:
            msg = f"QueueSQS: cannot initialize queue_name [{queue_name}], endpoint_url [{endpoint_url}] region [{region}] : {err} [{traceback.format_exc()}]"
            errlog.log(msg)
            raise TaskQueueException(err, msg, traceback.format_exc())

Example #15

0

Show file

File: task_queue_sqs.py Project: kirillsc/aws-htc-grid

    def receive_message(self, wait_time_sec=10):
        """
        Reads at most one message from the task queue

        Args:
            wait_time_sec - value passed as WaitTimeSeconds to the queue

        Returns:
            empty dictionary if no message was read from the queue, otherwise
            a dictionary with the message "body" and, under "properties",
            its receipt handle as "message_handle_id"

        Raises:
            TaskQueueException: wraps any error raised while receiving

        """
        try:
            received = self.sqs_queue.receive_messages(
                MaxNumberOfMessages=1, WaitTimeSeconds=wait_time_sec)

        except Exception as err:
            msg = f"QueueSQS: failed to receive a task from SQS queue, Exception: [{err}] [{traceback.format_exc()}]"
            errlog.log(msg)
            raise TaskQueueException(err, msg, traceback.format_exc())

        if len(received) > 0:
            head = received[0]
            return {
                "body": head.body,
                "properties": {
                    "message_handle_id": head.receipt_handle
                },
            }

        return {}

Example #16

0

Show file

File: ttl_checker.py Project: kirillsc/aws-htc-grid

def reset_task_msg_vto(handler_id, task_priority):
    """Sets the visibility timeout of the task's message to 0 so that it
    re-appears in the tasks queue.

    Args:
      handler_id: reference of the message/task.
      task_priority: priority of the task. Identifies which queue to use (if applicable)

    Returns: Nothing

    Raises:
      ClientError: logged and re-raised if raised by the queue call

    """
    try:
        # A visibility timeout of 0 seconds is what is passed to the queue.
        queue.change_visibility(handler_id, 0, task_priority)
    except ClientError as err:
        errlog.log("Cannot reset VTO for message {} : {}".format(handler_id, err))
        raise err

Example #17

0

Show file

def reset_sqs_vto(handler_id):
    """Sets the visibility timeout of an SQS message to 0

    Args:
      handler_id: receipt handle of the message

    Returns:
      Nothing

    Raises:
      ClientError: logged and re-raised if the SQS call fails

    """
    request = {
        "QueueUrl": queue.url,
        "ReceiptHandle": handler_id,
        "VisibilityTimeout": 0,
    }
    try:
        sqs_cli.change_message_visibility(**request)
    except ClientError as err:
        errlog.log("Cannot reset VTO for message {} : {}".format(handler_id, err))
        raise err

Example #18

0

Show file

File: ttl_checker.py Project: kirillsc/aws-htc-grid

def set_task_inconsistent(task_id):
    """Asks the state table to move the task to the inconsistent status

    Args:
      task_id(str): the id of the task to update

    Returns:
      Nothing

    Raises:
      ClientError: logged and re-raised if the state table update fails

    """
    try:
        state_table.update_task_status_to_inconsistent(task_id)
    except ClientError as err:
        failure = "Cannot set task to inconsistent {} : {}".format(task_id, err)
        errlog.log(failure)
        raise err

Example #19

0

Show file

File: ttl_checker.py Project: kirillsc/aws-htc-grid

def delete_message_from_queue(task_handler_id, task_priority):
    """Removes the message identified by its handler from the task queue

    Args:
      task_handler_id(str): the task handler associated of the message to be deleted
      task_priority(int): priority of the task, forwarded to the queue

    Returns:
      Nothing

    Raises:
      ClientError: logged and re-raised if raised by the queue call

    """
    try:
        queue.delete_message(task_handler_id, task_priority)
    except ClientError as err:
        failure = "Cannot delete message {} : {}".format(task_handler_id, err)
        errlog.log(failure)
        raise err

Example #20

0

Show file

def delete_message_from_queue(sqs_handler_id):
    """Removes the message identified by its receipt handle from the SQS queue

    Args:
      sqs_handler_id(str): the sqs handler associated of the message to be deleted

    Returns:
      Nothing

    Raises:
      ClientError: logged and re-raised if the SQS call fails

    """
    request = {
        "QueueUrl": queue.url,
        "ReceiptHandle": sqs_handler_id,
    }
    try:
        sqs_cli.delete_message(**request)
    except ClientError as err:
        errlog.log("Cannot delete message {} : {}".format(sqs_handler_id, err))
        raise err

Example #21

0

Show file

File: task_queue_sqs.py Project: kirillsc/aws-htc-grid

    def send_messages(self, message_bodies=None, message_attributes=None):
        """
        Sends a single message or a batch of messages into SQS queue

        Args:
            message_bodies - list of messages to be send, passed as ``Entries``
                (an empty list when omitted)
            message_attributes - unused parameter for singe SQS task queue

        Returns:
            response from SQS

        Raises:
            TaskQueueException: wraps any error raised while sending

        """
        # A None sentinel replaces the former mutable default argument ([]),
        # which was a single list shared between calls and handed to boto3.
        if message_bodies is None:
            message_bodies = []

        try:

            return self.sqs_queue.send_messages(Entries=message_bodies)

        except Exception as e:
            msg = f"QueueSQS: failed to send {len(message_bodies)} messages [{message_bodies}], Exception: [{e}] [{traceback.format_exc()}]"
            errlog.log(msg)
            raise TaskQueueException(e, msg, traceback.format_exc())

Example #22

0

Show file

File: task_queue_sqs.py Project: kirillsc/aws-htc-grid

    def delete_message(self, message_handle_id, task_priority=None) -> None:
        """Removes the message identified by message_handle_id from the queue.
        Often this function is called when message is successfully consumed.

        Args:
            message_handle_id(str): receipt handle of the message to be deleted
            task_priority(int): <Interface argument, not used in this class>

        Returns: None

        Raises:
            TaskQueueException: wraps any error raised by the SQS client call
        """
        request = {
            "QueueUrl": self.sqs_queue.url,
            "ReceiptHandle": message_handle_id,
        }

        try:
            self.sqs_client.delete_message(**request)
        except Exception as err:
            msg = f"QueueSQS: Cannot delete message by handle id {message_handle_id}, Exception: [{err}] [{traceback.format_exc()}]"
            errlog.log(msg)
            raise TaskQueueException(err, msg, traceback.format_exc())

Example #23

0

Show file

File: get_results.py Project: kirillsc/aws-htc-grid

def lambda_handler(event, context):
    """Report the statuses of the tasks of the session named in the event

    Args:
        event: lambda's invocation event, handed to get_session_id_from_event
        context: lambda context (not used)

    Returns:
        dict: statusCode 200 with the JSON-encoded task statuses, or
        statusCode 542 with the error message when anything fails
    """
    try:
        statuses = get_tasks_statuses_in_session(
            get_session_id_from_event(event))

        book_keeping(statuses)

        return {'statusCode': 200, 'body': json.dumps(statuses)}

    except ClientError as client_err:
        log_line = 'Lambda get_result error: {} trace: {}'.format(
            client_err.response['Error']['Message'], traceback.format_exc())
        errlog.log(log_line)
        return {'statusCode': 542, 'body': client_err.response['Error']['Message']}
    except Exception as err:
        log_line = 'Lambda get_result error: {} trace: {}'.format(
            err, traceback.format_exc())
        errlog.log(log_line)
        return {'statusCode': 542, 'body': "{}".format(err)}

Example #24

0

Show file

def retreive_retries_and_sqs_handler(task_id):
    """Query the table by task id and return the ``retries`` and
    ``sqs_handler_id`` attributes of the first item found

    Args:
      task_id(str): the id of the expired task

    Returns:
      tuple: (retries, sqs_handler_id)

    Raises:
      ClientError: logged and re-raised if the query failed

    """
    try:
        matches = table.query(
            KeyConditionExpression=Key('task_id').eq(task_id)
        )
        # CHeck if 1 and only 1
        first_item = matches.get('Items')[0]
        return first_item.get('retries'), first_item.get('sqs_handler_id')
    except ClientError as err:
        errlog.log("Cannot retreive retries and handler for task {} : {}".format(task_id, err))
        raise err

Example #25

0

Show file

File: ttl_checker.py Project: kirillsc/aws-htc-grid

def fail_task(task_id, task_handler_id, task_priority):
    """Delete the task's message from the queue, then mark the task as failed
    in the state table

    Args:
      task_id(str): the id of the task to update
      task_handler_id(str): the task handler associated to this task
      task_priority(int): the priority of the task.

    Returns:
      Nothing

    Raises:
      ClientError: logged and re-raised if either step raises it

    """
    try:
        # The queue message goes first; the state table is updated afterwards.
        delete_message_from_queue(task_handler_id, task_priority)
        state_table.update_task_status_to_failed(task_id)
    except ClientError as err:
        failure = "Cannot fail task {} : {}".format(task_id, err)
        errlog.log(failure)
        raise err

Example #26

0

Show file

def acquire_task(task_id, current_owner, current_heartbeat_timestamp, ddb_part_str):
    """Conditionally hand the task over to the TTL lambda

    The task gets TTL_LAMBDA_ID as owner, the temporary TTL status and a
    heartbeat timestamp of 0, but only if it is still 'processing', still owned
    by ``current_owner`` and still carries ``current_heartbeat_timestamp``.

    Args:
      task_id: the id of the task to acquire
      current_owner: owner the task is expected to have
      current_heartbeat_timestamp: heartbeat expiration the task is expected to have
      ddb_part_str: suffix appended to the status values

    Returns:
      bool: True if the update went through, False if it raised a ClientError
      (which is logged and not propagated)

    """
    attribute_names = {
        "#var_task_owner": "task_owner",
        "#var_task_status": "task_status",
        "#var_hb_timestamp": "heartbeat_expiration_timestamp"
    }
    try:
        attribute_values = {
            ':val1': TTL_LAMBDA_ID,
            ':val2': TTL_LAMBDA_TMP_STATUS + ddb_part_str,
            ':val3': 0
        }
        still_processing = Attr('task_status').eq('processing' + ddb_part_str)
        same_owner = Attr('task_owner').eq(current_owner)
        same_heartbeat = Attr('heartbeat_expiration_timestamp').eq(current_heartbeat_timestamp)
        table.update_item(
            Key={'task_id': task_id},
            UpdateExpression="SET #var_task_owner = :val1, #var_task_status = :val2, #var_hb_timestamp = :val3",
            ExpressionAttributeValues=attribute_values,
            ExpressionAttributeNames=attribute_names,
            ConditionExpression=still_processing & same_owner & same_heartbeat
        )
    except ClientError as err:
        errlog.log("Cannot acquire task TTL Checker {} : {}".format(task_id, err))
        return False
    else:
        return True

Example #27

0

Show file

    def delete_message(self, message_handle_id, task_priority=None):
        """Deletes the message from the queue it belongs to; the queue is found
        by message_handle_id or, failing that, by task_priority.
        Often this function is called when message is successfully consumed.

        Args:
            message_handle_id(str): the sqs handler associated of the message to be deleted
            task_priority(int): used to locate the queue when the handle is unknown

        Returns: whatever the selected queue's delete_message returns

        Raises:
            TaskQueueException: if the queue lookup or the deletion fails
        """

        try:
            target_queue = self.__get_queue_object(message_handle_id, task_priority)
            return target_queue.delete_message(message_handle_id)

        except Exception as err:
            msg = f"PrioritySQS: Failed to delete msg by handle_id [{message_handle_id}] priority [{task_priority}] : [{err}] [{traceback.format_exc()}]"
            errlog.log(msg)
            raise TaskQueueException(err, msg, traceback.format_exc())

Example #28

0

Show file

def lambda_handler(event, context):
    """Handler called by AWS Lambda runtime to submit a batch of tasks

    The submission payload is a URL-safe base64 encoded JSON document. It is
    read from ``queryStringParameters['submission_content']`` when query string
    parameters are present (optionally resolved through ``stdin_iom`` when
    ``task_input_passed_via_external_storage`` is '1'), otherwise from
    ``event['body']``. Every task of the payload is written to the table through
    a batch writer and then sent to the queue through ``write_to_sqs``.

    Args:
      event (dict): the invocation event carrying the encoded submission
      context: lambda context (not used)

    Returns:
        dict: statusCode 200 and a JSON body with ``session_id`` and ``task_ids``
        on success; statusCode 543 and the error message if anything inside the
        main try block fails

    Raises:
        Exception: if ``submission_content`` is missing. Payload decoding happens
        before the try block, so decoding/parsing errors propagate to the caller
        instead of being turned into a 543 response.

    """
    # If lambda are called through ALB - extracting actual event
    if event.get('queryStringParameters') is not None:
        all_params = event.get('queryStringParameters')
        if task_input_passed_via_external_storage == '1':
            # Here submission_content holds a session id used to fetch the payload.
            session_id = all_params.get('submission_content')
            encoded_json_tasks = stdin_iom.get_payload_to_utf8_string(session_id)
        else:
            encoded_json_tasks = all_params.get('submission_content')
        if encoded_json_tasks is None:
            raise Exception('Invalid submission format, expect submission_content parameter')
        decoded_json_tasks = base64.urlsafe_b64decode(encoded_json_tasks).decode('utf-8')
        event = json.loads(decoded_json_tasks)
    else:
        encoded_json_tasks = event['body']
        decoded_json_tasks = base64.urlsafe_b64decode(encoded_json_tasks).decode('utf-8')
        event = json.loads(decoded_json_tasks)

    try:
        invocation_tstmp = get_time_now_ms()

        print(event)

        # Session ID that will be used for all tasks in this event.
        if event["session_id"] == "None":
            # Generate new session id if no session is passed
            # TODO: We are not currently supporting this option, consider for removal and replace with assertion
            session_id = get_safe_session_id()
        else:
            session_id = event["session_id"]
            verify_passed_sessionid_is_unique(session_id)

        # Note: this is the session id as passed in the event (possibly "None"),
        # not the generated one.
        parent_session_id = event["session_id"]

        lambda_response = {
            "session_id": session_id,
            "task_ids": []
        }

        sqs_batch_entries = []
        last_submitted_task_ref = None

        tasks_list = event['tasks_list']['tasks']
        ddb_batch_size = 500
        # NOTE(review): ddb_batch_write_times and backoff_count are never updated
        # in this function, so the statistics derived from them below stay at
        # their initial values.
        ddb_batch_write_times = []
        backoff_count = 0

        # Split the tasks into consecutive slices of at most ddb_batch_size items.
        tasks_batches = [tasks_list[x:x + ddb_batch_size] for x in range(0, len(tasks_list), ddb_batch_size)]

        for bid, ddb_batch in enumerate(tasks_batches):
            # <1.> Batch write to dynamoDB
            with table.batch_writer() as batch:  # batch_writer is flushed when exiting this block

                for i, task_definition in enumerate(ddb_batch):
                    # tdef = json.loads(task_definition["stdin"])
                    # print(tdef["parent_session_id"])
                    # Task ids are "<session_id>_<index of the task in tasks_list>".
                    task_id = session_id + "_" + str(bid * ddb_batch_size + i)
                    time_now_ms = get_time_now_ms()

                    task_json = {
                        'session_id': session_id,
                        'task_id': task_id,
                        'parent_session_id': parent_session_id,
                        'submission_timestamp': time_now_ms,
                        'task_completion_timestamp': 0,
                        'task_status': make_partition_key_4_state("pending", session_id),
                        'task_owner': "None",
                        'retries': 0,
                        'task_definition': task_definition,
                        'sqs_handler_id': "None",
                        'heartbeat_expiration_timestamp': 0
                    }

                    write_to_dynamodb(task_json, batch)  # TODO: res not in use

                    task_json_4_sqs: dict = copy.deepcopy(task_json)

                    # The stats dict is assigned by reference: every task shares
                    # event["stats"]; each message captures its current content
                    # when it is serialised with json.dumps below.
                    task_json_4_sqs["stats"] = event["stats"]
                    task_json_4_sqs["stats"]["stage2_sbmtlmba_01_invocation_tstmp"]["tstmp"] = invocation_tstmp
                    task_json_4_sqs["stats"]["stage2_sbmtlmba_02_before_batch_write_tstmp"]["tstmp"] = get_time_now_ms()

                    # task_json["scheduler_data"] = event["scheduler_data"]

                    sqs_batch_entries.append({
                        'Id': task_id,  # use to return send result for this message
                        'MessageBody': json.dumps(task_json_4_sqs)
                    }
                    )

                    last_submitted_task_ref = task_json_4_sqs

        # <2.> Batch submit tasks to SQS
        # Performance critical code
        sqs_max_batch_size = 10
        sqs_batch_chunks = [sqs_batch_entries[x:x + sqs_max_batch_size] for x in
                            range(0, len(sqs_batch_entries), sqs_max_batch_size)]
        for chunk in sqs_batch_chunks:
            write_to_sqs(chunk)

        # <3.> Non performance critical code, statistics and book-keeping.
        event_counter = EventsCounter(["count_submitted_tasks", "count_ddb_batch_backoffs", "count_ddb_batch_write_max",
                                       "count_ddb_batch_write_min", "count_ddb_batch_write_avg"])
        event_counter.increment("count_submitted_tasks", len(sqs_batch_entries))

        # NOTE(review): with an empty tasks list last_submitted_task_ref is still
        # None here, so this subscript raises TypeError and the generic handler
        # below answers 543 — confirm whether empty submissions should be accepted.
        last_submitted_task_ref['stats']['stage2_sbmtlmba_03_invocation_over_tstmp'] = {"label": "dynamo_db_submit_ms",
                                                                                        "tstmp": get_time_now_ms()}

        event_counter.increment("count_ddb_batch_backoffs", backoff_count)

        if len(ddb_batch_write_times) > 0:
            event_counter.increment("count_ddb_batch_write_max", max(ddb_batch_write_times))
            event_counter.increment("count_ddb_batch_write_min", min(ddb_batch_write_times))
            event_counter.increment("count_ddb_batch_write_avg",
                                    sum(ddb_batch_write_times) * 1.0 / len(ddb_batch_write_times))

        print("BKF: [{}] LEN: {} LIST: {}".format(backoff_count, len(ddb_batch_write_times), ddb_batch_write_times))

        perf_tracker.add_metric_sample(
            last_submitted_task_ref['stats'],
            event_counter=event_counter,
            from_event="stage1_grid_api_01_task_creation_tstmp",
            to_event="stage2_sbmtlmba_03_invocation_over_tstmp",
            # event_time=(datetime.datetime.fromtimestamp(invocation_tstmp/1000.0)).isoformat()
        )
        perf_tracker.submit_measurements()

        # <4.> Assemble the response
        for sqs_msg in sqs_batch_entries:
            lambda_response["task_ids"].append(sqs_msg['Id'])

        return {
            'statusCode': 200,
            'body': json.dumps(lambda_response)
        }
    except ClientError as e:
        errlog.log("ClientError in Submit Tasks {} {}"
                   .format(e.response['Error']['Code'], traceback.format_exc()))

        return {
            'statusCode': 543,
            'body': e.response['Error']['Message']
        }

    except Exception as e:
        errlog.log("Exception in Submit Tasks {} [{}]"
                   .format(e, traceback.format_exc()))

        return {
            'statusCode': 543,
            'body': "{}".format(e)
        }

Example #29

0

Show file

File: agent.py Project: dholczer/aws-htc-grid

def process_subprocess_completion(perf_tracker,
                                  task,
                                  sqs_msg,
                                  fname_stdout,
                                  stdout=None):
    """
    Finalize a task once its user code has finished executing.

    The function stamps completion timestamps into ``task["stats"]``, stores
    the captured stdout through ``stdout_iom``, marks the task as finished via
    ``ddb.dynamodb_update_task_status_to_finished`` (retrying while the call
    reports throttling), deletes the SQS message on success and finally
    submits the post-execution measurements.

    Args:
        perf_tracker (utils.performance_tracker.PerformanceTracker): endpoint
            for sending metrics, forwarded to submit_post_agent_measurements
        task (dict): the task that went to completion; its "stats" and
            "task_id" entries are read/updated here
        sqs_msg (Message): the SQS message associated with the completed
            task; deleted only when the status update succeeded
        fname_stdout: passed as ``file_name`` when ``stdout`` is None
            (presumably the path of the file stdout was redirected to)
        stdout (str): the stdout of the execution; when given it is UTF-8
            encoded, base64-encoded and stored instead of the file

    Returns:
        Nothing

    """
    user_code_done_ms = get_time_now_ms()
    task["stats"]["stage4_agent_01_user_code_finished_tstmp"]["tstmp"] = \
        user_code_done_ms

    # <1.> Store stdout into persistent storage
    if stdout is None:
        # No in-memory output was handed over: upload the redirect file.
        stdout_iom.put_output_from_file(task["task_id"],
                                        file_name=fname_stdout)
    else:
        encoded_stdout = base64.b64encode(stdout.encode("utf-8"))
        stdout_iom.put_output_from_bytes(task["task_id"], data=encoded_stdout)

    stdout_stored_ms = get_time_now_ms()
    task["stats"]["stage4_agent_02_S3_stdout_delivered_tstmp"]["tstmp"] = \
        stdout_stored_ms

    # <2.> Mark the task as finished, retrying for as long as the failure is
    # a throttling error. Any other outcome (success or a different error)
    # ends the loop.
    throttling_codes = ("ThrottlingException",
                        "ProvisionedThroughputExceededException")
    attempt = 0
    while True:
        attempt += 1
        call_started_ms = get_time_now_ms()
        ddb_res, response, error = ddb.dynamodb_update_task_status_to_finished(
            status_table_cc, task, SELF_ID)
        call_ended_ms = get_time_now_ms()

        if ddb_res or error.response['Error']['Code'] not in throttling_codes:
            break

        errlog.log("Agent FINISHED@DDB #{} Throttling for {} ms".format(
            attempt, call_ended_ms - call_started_ms))

    # <3.> Account for the outcome; the SQS message is removed only when the
    # status update went through.
    if ddb_res:
        event_counter_post.increment("ddb_set_task_finished_succeeded")
        success_note = (
            "We have succesfully marked task as completed in dynamodb."
            " Deleting message from the SQS... for task [{}] {}")
        logging.info(success_note.format(task["task_id"], response))
        sqs_msg.delete()
    else:
        # We can get here if task has been taken over by the watchdog lambda
        # in this case we ignore results and proceed to the next task.
        event_counter_post.increment("ddb_set_task_finished_failed")
        logging.info("Could not set completion time to Finish")

    # <4.> Report total agent time. The clock is sampled separately for the
    # log line and for the counter, so the two values may differ slightly.
    logged_elapsed_ms = get_time_now_ms() - AGENT_EXEC_TIMESTAMP_MS
    logging.info("Exec time1: {} {}".format(logged_elapsed_ms,
                                            AGENT_EXEC_TIMESTAMP_MS))
    counted_elapsed_ms = get_time_now_ms() - AGENT_EXEC_TIMESTAMP_MS
    event_counter_post.increment("agent_total_time_ms", counted_elapsed_ms)
    event_counter_post.set("str_pod_id", SELF_ID)

    submit_post_agent_measurements(task, perf_tracker)

Example #30

0

Show file

File: agent.py Project: dholczer/aws-htc-grid

def try_to_acquire_a_task():
    """
    Poll the tasks queue for a single SQS message and try to claim its task.

    At most one message is requested (waiting up to 10 seconds). The message
    body is parsed as JSON into the task definition, the message receipt
    handle is stored in the task under "sqs_handle_id", and the claim is
    attempted through ``ddb.claim_task_to_yourself``.

    When the claim is not granted, the message is deleted if the task is
    reported as cancelled; otherwise the agent sleeps for a random 1-3
    seconds. In both cases nothing is acquired.

    Side effect: the module-level AGENT_EXEC_TIMESTAMP_MS is reset whenever a
    message has been received.

    Returns:
        A tuple containing the SQS message and the task definition, or
        (None, None) when the queue returned no message or the task could not
        be claimed

    Raises:
        Exception: any error raised while claiming the task is logged and
            re-raised

    """
    global AGENT_EXEC_TIMESTAMP_MS

    logging.info("waiting for SQS message")
    messages = tasks_queue.receive_messages(MaxNumberOfMessages=1,
                                            WaitTimeSeconds=10)
    sqs_pickup_ms = get_time_now_ms()
    logging.info("try_to_acquire_a_task, message: {}".format(messages))

    if len(messages) == 0:
        event_counter_pre.increment("agent_no_messages_in_tasks_queue")
        return None, None

    message = messages[0]
    AGENT_EXEC_TIMESTAMP_MS = get_time_now_ms()

    task = json.loads(message.body)
    logging.info("try_to_acquire_a_task, task: {}".format(task))

    # Keep the receipt handle with the task so that the message can be
    # deleted later on.
    task["sqs_handle_id"] = message.receipt_handle

    try:
        expiration_ts = \
            ttl_gen.generate_next_ttl().get_next_expiration_timestamp()
        claimed, claim_response, _claim_error = ddb.claim_task_to_yourself(
            status_table, task, SELF_ID, expiration_ts)
        logging.info("DDB claim_task_to_yourself result: {} {}".format(
            claimed, claim_response))

        if not claimed:
            event_counter_pre.increment("agent_failed_to_claim_ddb_task")

            if is_task_has_been_cancelled(task["task_id"]):
                logging.info(
                    "Task [{}] has been already cancelled, skipping".format(
                        task["task_id"]))
                message.delete()
            else:
                # Back off for a random 1-3 seconds before giving up on
                # this message.
                time.sleep(random.randint(1, 3))

            return None, None

    except Exception as claim_failure:
        errlog.log(
            "Releasing msg after failed try_to_acquire_a_task {} [{}]".format(
                claim_failure, traceback.format_exc()))
        raise claim_failure

    # The task is ours: adjust the visibility timeout of its message
    message.change_visibility(
        VisibilityTimeout=agent_sqs_visibility_timeout_sec)

    stats = task["stats"]
    stats["stage3_agent_01_task_acquired_sqs_tstmp"]["tstmp"] = sqs_pickup_ms
    ddb_acquired_ms = get_time_now_ms()
    stats["stage3_agent_02_task_acquired_ddb_tstmp"]["tstmp"] = ddb_acquired_ms
    event_counter_pre.increment("agent_successful_acquire_a_task")

    return message, task