def process( self, message ) :
    """
    Handle a message reporting an exception raised during instance termination.

    Per the original author's note: this should never happen, so the instance
    is not terminated here; it is left up so it can get SSH'ed into to
    determine the cause of failure.

    The spot request row is closed (is_open=0, ts_end=now), its state code is
    set to instance_force_termination_exception and the exception text from
    the message is stored on the row. The SQS message is deleted only after
    the row has been saved.

    Any StandardError is logged and swallowed.

    :param message: SQS Message instance

    """
    # Placeholder for the log header in case the message body cannot be parsed;
    # without it the except handler itself would fail on an unbound name and
    # hide the original error.
    spot_request_uuid = 'unknown'
    try:
        spot_request_msg = SpotRequestMsg( raw_json=message.get_body() )
        spot_request_uuid = spot_request_msg.spot_request_uuid
        instance_termination_exception = spot_request_msg.name_value_pairs[ SpotRequestMsg.PAIR_NAME_INSTANCE_TERMINATION_TIME_EXCEPTION ]
        spot_request_item = get_spot_request_item( self.spot_request_table_name, spot_request_uuid,
                                                   region_name=self.region_name, profile_name=self.profile_name )
        logger.error( fmt_request_uuid_msg_hdr( spot_request_uuid ) +
                      'process_pending_termination_exception for spot_request_uuid, instance_termination_exception=' +
                      instance_termination_exception )
        ts_now = int( time.time() )
        spot_request_row_partial_save( self.spot_request_table_name, spot_request_item,
                                       {
                                        TableSpotRequest.is_open:0,
                                        TableSpotRequest.ts_end:ts_now,
                                        TableSpotRequest.spot_request_state_code:SpotRequestStateCode.instance_force_termination_exception,
                                        TableSpotRequest.instance_termination_exception:instance_termination_exception
                                        },
                                       region_name=self.region_name, profile_name=self.profile_name )
        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        msg_hdr = fmt_request_uuid_msg_hdr( spot_request_uuid )
        logger.error( msg_hdr + 'Exiting SpotRequestDispatcher due to exception' )
        logger.error( msg_hdr + str(e) )
        logger.error( msg_hdr + traceback.format_exc() )
def process( self, message ) :
    """
    Handle a message reporting that the batch job (i.e. user script) threw an exception.

    The spot request row is closed (is_open=0), its state code is set to
    instance_complete_exception, and the completion timestamp, exception
    message and exception traceback carried by the message are stored on the
    row. The SQS message is deleted only after the row has been saved.

    Note: this method does not itself resubmit the job.
    TODO: need to check against some "max contiguous errors", i.e. if this
    fails 3x in a row then terminate the request

    Any StandardError is logged and swallowed.

    :param message: SQS Message instance

    """
    # Placeholder for the log header in case the message body cannot be parsed;
    # without it the except handler itself would fail on an unbound name and
    # hide the original error.
    spot_request_uuid = 'unknown'
    try:
        spot_request_msg = SpotRequestMsg( raw_json=message.get_body() )
        spot_request_uuid = spot_request_msg.spot_request_uuid
        spot_master_uuid = spot_request_msg.spot_master_uuid
        logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'process() for spot_master_uuid: ' + spot_master_uuid )
        spot_request_item = get_spot_request_item( self.spot_request_table_name, spot_request_uuid,
                                                   region_name=self.region_name, profile_name=self.profile_name )
        name_value_pairs = spot_request_msg.name_value_pairs
        ts_cmd_complete = name_value_pairs[ SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_COMPLETE_TIMESTAMP ]
        cmd_exception_message = name_value_pairs[ SpotRequestMsg.PAIR_NAME_INSTANCE_BATCH_PROCESS_START_EXCEPTION_MESSAGE ]
        cmd_exception_traceback = name_value_pairs[ SpotRequestMsg.PAIR_NAME_INSTANCE_BATCH_PROCESS_START_EXCEPTION_TRACEBACK ]
        key_value_pairs = {
                            TableSpotRequest.is_open:0,
                            TableSpotRequest.spot_request_state_code:SpotRequestStateCode.instance_complete_exception,
                            TableSpotRequest.ts_cmd_complete:ts_cmd_complete,
                            TableSpotRequest.cmd_exception_message:cmd_exception_message,
                            TableSpotRequest.cmd_exception_traceback:cmd_exception_traceback,
                           }
        spot_request_row_partial_save( self.spot_request_table_name, spot_request_item, key_value_pairs,
                                       region_name=self.region_name, profile_name=self.profile_name )
        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        msg_hdr = fmt_request_uuid_msg_hdr( spot_request_uuid )
        logger.error( msg_hdr + 'Exiting SpotRequestDispatcher due to exception' )
        logger.error( msg_hdr + str(e) )
        logger.error( msg_hdr + traceback.format_exc() )
def process( self, message ) :
    """
    Process a check-status message: dispatch on the spot request's current state.

    Steps:
    1. Load the spot request row and record ts_last_state_check.
    2. Call the handle_state_* method matching the row's state code. State
       codes with no branch below get no handler call.
    3. Unless the state was instance_complete or instance_force_terminated,
       queue another check-status message delayed by 60 seconds.
    4. Delete the SQS message that was just processed.

    Any StandardError is logged and swallowed.

    :param message: SQS Message instance

    """
    # Placeholder for the log header in case the message body cannot be parsed;
    # without it the except handler itself would fail on an unbound name and
    # hide the original error.
    spot_request_uuid = 'unknown'
    try:
        spot_request_msg = SpotRequestMsg( raw_json=message.get_body() )
        spot_request_uuid = spot_request_msg.spot_request_uuid
        spot_master_uuid = spot_request_msg.spot_master_uuid
        spot_request_id = spot_request_msg.spot_request_id
        logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'process_check_status' )
        # Get spot request row from DynamoDB and process based on state
        spot_request_item = get_spot_request_item( self.spot_request_table_name, spot_request_uuid,
                                                   region_name=self.region_name, profile_name=self.profile_name )
        # Capture the state before the row is touched by the timestamp update below
        spot_request_state_code = spot_request_item[TableSpotRequest.spot_request_state_code]
        logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'spot request state=' + spot_request_state_code )
        next_status_msg_delay_secs = 60
        is_send_request_msg_check_status = True

        # Update the LastStateCheck timestamp
        spot_request_row_partial_save( self.spot_request_table_name, spot_request_item,
                                       { TableSpotRequest.ts_last_state_check:int( time.time() ), },
                                       region_name=self.region_name, profile_name=self.profile_name )

        if SpotRequestStateCode.spot_request_in_progress == spot_request_state_code:
            self.handle_state_request_spot_request_in_progress( spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid )
        elif SpotRequestStateCode.instance_starting == spot_request_state_code:
            self.handle_state_request_instance_starting( spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid )
        elif SpotRequestStateCode.instance_running == spot_request_state_code:
            self.handle_state_request_instance_running( spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid )
        elif SpotRequestStateCode.instance_complete == spot_request_state_code:
            self.handle_state_request_instance_complete( spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid )
            # Terminal state - stop polling
            is_send_request_msg_check_status = False
        elif SpotRequestStateCode.instance_state_unknown == spot_request_state_code:
            self.handle_state_request_instance_state_unknown( spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid )
        elif SpotRequestStateCode.constraint_encountered == spot_request_state_code:
            self.handle_state_request_constraint_encountered( spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid )
        elif SpotRequestStateCode.instance_force_termination_pending == spot_request_state_code:
            self.handle_state_instance_force_termination_pending( spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid )
        elif SpotRequestStateCode.instance_force_terminated == spot_request_state_code:
            self.handle_state_request_instance_force_terminated( spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid )
            # Terminal state - stop polling
            is_send_request_msg_check_status = False

        if is_send_request_msg_check_status:
            spot_request_msg_check_status = SpotRequestMsg( spot_request_uuid=spot_request_uuid,
                                                            spot_master_uuid=spot_master_uuid,
                                                            spot_request_msg_type=SpotRequestMsg.TYPE_CHECK_STATUS,
                                                            spot_request_id=spot_request_id )
            message_attributes = create_microsvc_message_attributes( awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageCheckStatus )
            self.spot_request_sqs_message_durable.send_message( spot_request_msg_check_status.to_json(),
                                                                delay_seconds=next_status_msg_delay_secs,
                                                                message_attributes=message_attributes )

        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        msg_hdr = fmt_request_uuid_msg_hdr( spot_request_uuid )
        logger.error( msg_hdr + 'Exiting SpotRequestDispatcher due to exception' )
        logger.error( msg_hdr + str(e) )
        logger.error( msg_hdr + traceback.format_exc() )
def handle_state_request_instance_starting( self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid ):
    """
    Spot request fulfilled, the instance is starting.

    Looks up the instance's state name and status:
    - 'running' / 'ok': save state instance_running together with the public
      IP address, then launch the remote client on the instance.
      If EC2 reports InvalidInstanceID.NotFound while doing so, the state is
      saved as instance_force_terminated; any other EC2ResponseError saves
      instance_state_unknown.
    - 'terminated': save state instance_force_terminated.
    - anything else: no change.

    :param spot_request_msg: not used by this handler
    :param spot_request_item: spot request row; its instance_id is read and the row is updated
    :param spot_request_uuid: used for log headers
    :param spot_master_uuid: used in log messages

    """
    def save_fields( fields ):
        # Persist the given attributes on this spot request's row
        spot_request_row_partial_save( self.spot_request_table_name, spot_request_item, fields,
                                       region_name=self.region_name, profile_name=self.profile_name )

    logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'handle_state_request_instance_starting' )
    # Check if the instance is totally up (running and all status checks ok)
    ec2_conn = awsext.ec2.connect_to_region( region_name=self.region_name, profile_name=self.profile_name )
    instance_id = spot_request_item[ TableSpotRequest.instance_id ]
    state_name, status = ec2_conn.get_instance_state_name_and_status( instance_id )
    logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'spot instance state_name=' + state_name + ', status=' + status )

    if state_name == 'running' and status == 'ok':
        try:
            instance = ec2_conn.get_only_instances(instance_ids=[instance_id])[0]
            save_fields( {
                          TableSpotRequest.spot_request_state_code:SpotRequestStateCode.instance_running,
                          TableSpotRequest.instance_public_ip_address:instance.ip_address,
                          } )
            # Instance is running and ok - run the clientlaunch, which will start the wrapper, run some init cmds,
            # launch user cmd on thread and start sending heartbeats
            launch_remote_client( self.spot_batch_job_parm_table_name, self.spot_rsa_key_table_name, spot_request_item,
                                  region_name=self.region_name, profile_name=self.profile_name)

        except boto.exception.EC2ResponseError as e:
            if e.code == 'InvalidInstanceID.NotFound':
                logger.warning( fmt_request_uuid_msg_hdr( spot_request_uuid ) +
                                'Instance terminated after starting and before init complete, spot_master_uuid=' + spot_master_uuid)
                # looks like spot instance was terminated between the time it was allocated and initialized
                save_fields( { TableSpotRequest.spot_request_state_code:SpotRequestStateCode.instance_force_terminated, } )
            else:
                # e.code is None when the error response carried no parseable code;
                # str() keeps the logging from raising so the state is still saved below
                logger.error( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'Instance unknown state, exception code=' +
                              str( e.code ) + ', spot_master_uuid=' + spot_master_uuid)
                logger.error( traceback.format_exc() )
                save_fields( { TableSpotRequest.spot_request_state_code:SpotRequestStateCode.instance_state_unknown, } )

    elif state_name == 'terminated':
        logger.warning( fmt_request_uuid_msg_hdr( spot_request_uuid ) +
                        'Instance terminated after starting and before init complete, spot_master_uuid=' + spot_master_uuid)
        # looks like spot instance was terminated between the time it was allocated and initialized
        save_fields( { TableSpotRequest.spot_request_state_code:SpotRequestStateCode.instance_force_terminated, } )
def process( self, message ) :
    """
    Start the SpotRequest process.

    1. Create the item in the spot request table (state
       spot_request_in_progress, is_open=1). The put is attempted up to 10
       times, sleeping 6 seconds between attempts;
       DynamoDbPutItemMaxAttemptsExceeded is raised after the last failure.
    2. Queue up a SpotRequestMessageCheckStatus message delayed by 30
       seconds; this will start the state-based process.
    3. Delete the SQS message that was just processed.

    Any StandardError (including the max-attempts error) is logged and swallowed.

    :param message: SQS Message instance

    """
    # Placeholder for the log header in case the message body cannot be parsed;
    # without it the except handler itself would fail on an unbound name and
    # hide the original error.
    spot_request_uuid = 'unknown'
    try:
        spot_request_msg = SpotRequestMsg( raw_json=message.get_body() )
        spot_request_uuid = spot_request_msg.spot_request_uuid
        logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) +
                     'process_spot_request_initiated for spot_master_uuid: ' + spot_request_msg.spot_master_uuid )
        ts_now = int( time.time() )
        dict_create_spot_request_item = {
                                    TableSpotRequest.spot_request_uuid:spot_request_msg.spot_request_uuid,
                                    TableSpotRequest.spot_master_uuid:spot_request_msg.spot_master_uuid,
                                    TableSpotRequest.spot_request_id:spot_request_msg.spot_request_id,
                                    TableSpotRequest.ts_last_state_check:ts_now,
                                    TableSpotRequest.attempt_number:spot_request_msg.name_value_pairs[ SpotRequestMsg.PAIR_NAME_ATTEMPT_NUMBER ],
                                    TableSpotRequest.spot_price:spot_request_msg.name_value_pairs[ SpotRequestMsg.PAIR_NAME_SPOT_PRICE ],
                                    TableSpotRequest.instance_username:spot_request_msg.name_value_pairs[ SpotRequestMsg.PAIR_NAME_INSTANCE_USERNAME ],
                                    TableSpotRequest.is_open:1,
                                    TableSpotRequest.spot_request_state_code:SpotRequestStateCode.spot_request_in_progress,
                                    TableSpotRequest.ts_start:ts_now,
                                   }

        put_attempt_max = 10
        for put_attempt_cnt in range( 1, put_attempt_max + 1 ):
            # A fresh connection is made on every attempt, as in the original loop
            dynamodb_conn = boto.dynamodb2.connect_to_region( self.region_name, profile_name=self.profile_name )
            spot_request_table = Table( self.spot_request_table_name, connection=dynamodb_conn )
            result_spot_request_put = spot_request_table.put_item(data=dict_create_spot_request_item)
            if result_spot_request_put:
                break
            if put_attempt_cnt == put_attempt_max:
                raise awsspotbatch.common.exception.DynamoDbPutItemMaxAttemptsExceeded(
                        'Failed attempt to insert item in: ' + self.spot_request_table_name +
                        ' for spot_request_uuid: ' + spot_request_msg.spot_request_uuid,
                        self.spot_request_table_name )
            time.sleep(6)

        next_status_msg_delay_secs = 30
        spot_request_msg_check_status = SpotRequestMsg( spot_request_uuid=spot_request_msg.spot_request_uuid,
                                                        spot_master_uuid=spot_request_msg.spot_master_uuid,
                                                        spot_request_msg_type=SpotRequestMsg.TYPE_CHECK_STATUS,
                                                        spot_request_id=spot_request_msg.spot_request_id )
        message_attributes = create_microsvc_message_attributes( awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageCheckStatus )
        self.spot_request_sqs_message_durable.send_message( spot_request_msg_check_status.to_json(),
                                                            delay_seconds=next_status_msg_delay_secs,
                                                            message_attributes=message_attributes )
        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        msg_hdr = fmt_request_uuid_msg_hdr( spot_request_uuid )
        logger.error( msg_hdr + 'Exiting SpotRequestDispatcher due to exception' )
        logger.error( msg_hdr + str(e) )
        logger.error( msg_hdr + traceback.format_exc() )
def handle_state_request_constraint_encountered( self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid ):
    """
    Constraint encountered after the spot request was initiated but before it
    was fulfilled, i.e. time limit expired.

    Closes this spot request row (state instance_complete, is_open=0,
    ts_end=now) and sends a TYPE_RESUBMIT_FAILED_REQUEST message to the spot
    master queue so that another spot request gets submitted.

    :param spot_request_msg: supplies the master/request uuids placed in the resubmit message
    :param spot_request_item: spot request row to update
    :param spot_request_uuid: used for the log header
    :param spot_master_uuid: not used by this handler

    """
    logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'handle_state_request_constraint_encountered' )

    # Mark this request as finished
    closed_fields = {
        TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
        TableSpotRequest.is_open: 0,
        TableSpotRequest.ts_end: int( time.time() ),
    }
    spot_request_row_partial_save( self.spot_request_table_name,
                                   spot_request_item,
                                   closed_fields,
                                   region_name=self.region_name,
                                   profile_name=self.profile_name )

    # Ask the master to create a new spot request based on the one that just failed
    resubmit_msg = SpotMasterMsg( spot_master_uuid=spot_request_msg.spot_master_uuid,
                                  spot_master_msg_type=SpotMasterMsg.TYPE_RESUBMIT_FAILED_REQUEST,
                                  spot_request_uuid=spot_request_msg.spot_request_uuid )
    resubmit_attributes = create_microsvc_message_attributes(
        awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest )
    master_queue = SqsMessageDurable( self.spot_master_queue_name, self.region_name, profile_name=self.profile_name )
    master_queue.send_message( resubmit_msg.to_json(), message_attributes=resubmit_attributes )
def handle_state_instance_force_termination_pending( self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid ):
    """
    AWS has started the termination process for this instance, i.e. the price
    has increased. This is the beginning of the two minute warning pending
    forced termination.

    Closes this spot request row (state instance_complete, is_open=0,
    ts_end=now) and sends a TYPE_RESUBMIT_FAILED_REQUEST message to the spot
    master queue so that another spot request gets started. This handler does
    not itself issue an EC2 terminate call.

    :param spot_request_msg: supplies the master/request uuids placed in the resubmit message
    :param spot_request_item: spot request row to update
    :param spot_request_uuid: used for the log header
    :param spot_master_uuid: not used by this handler

    """
    logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'handle_state_instance_force_termination_pending' )

    # Mark this request as finished
    closed_fields = {
        TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
        TableSpotRequest.is_open: 0,
        TableSpotRequest.ts_end: int( time.time() ),
    }
    spot_request_row_partial_save( self.spot_request_table_name,
                                   spot_request_item,
                                   closed_fields,
                                   region_name=self.region_name,
                                   profile_name=self.profile_name )

    # Ask the master to create a new spot request based on the one that just failed
    resubmit_msg = SpotMasterMsg( spot_master_uuid=spot_request_msg.spot_master_uuid,
                                  spot_master_msg_type=SpotMasterMsg.TYPE_RESUBMIT_FAILED_REQUEST,
                                  spot_request_uuid=spot_request_msg.spot_request_uuid )
    resubmit_attributes = create_microsvc_message_attributes(
        awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest )
    master_queue = SqsMessageDurable( self.spot_master_queue_name, self.region_name, profile_name=self.profile_name )
    master_queue.send_message( resubmit_msg.to_json(), message_attributes=resubmit_attributes )
def handle_state_request_spot_request_in_progress( self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid ):
    """
    Check if the spot request has been fulfilled, hit a constraint or is still pending.

    Queries EC2 for the spot instance request. When exactly one request is
    returned:
    - if its status code is 'schedule-expired' or one of
      AwsExtEC2Connection.SPOT_REQUEST_CONSTRAINTS, the row is saved with
      state constraint_encountered and the constraint code;
    - if an instance id has been assigned, the instance is tagged and the row
      is saved with state instance_starting and the instance id.
    Otherwise nothing is changed (still pending).

    An EC2ResponseError is logged and the row is saved with state
    instance_state_unknown.

    :param spot_request_msg: not used by this handler
    :param spot_request_item: spot request row; its spot_request_id is read and the row is updated
    :param spot_request_uuid: used for log headers and instance tags
    :param spot_master_uuid: used for instance tags and log messages

    """
    logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'handle_state_request_spot_request_in_progress' )
    ec2_conn = awsext.ec2.connect_to_region( region_name=self.region_name, profile_name=self.profile_name )
    try:
        spot_request_id = spot_request_item[TableSpotRequest.spot_request_id]
        spot_instance_requests = ec2_conn.get_all_spot_instance_requests( request_ids=[spot_request_id] )
        if spot_instance_requests is not None and len(spot_instance_requests) == 1:
            spot_instance_request = spot_instance_requests[0]
            status_code = spot_instance_request.status.code
            logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'spot_instance_request.status.code=' + status_code +
                         ' for spot_request_id=' + spot_request_id + ', spot_request_uuid=' + spot_request_uuid )
            if status_code == 'schedule-expired' or status_code in awsext.ec2.connection.AwsExtEC2Connection.SPOT_REQUEST_CONSTRAINTS:
                # update status so that a new spot request can occur
                spot_request_row_partial_save( self.spot_request_table_name, spot_request_item,
                                               {
                                                TableSpotRequest.spot_request_state_code:SpotRequestStateCode.constraint_encountered,
                                                TableSpotRequest.constraint_code:status_code,
                                                },
                                               region_name=self.region_name, profile_name=self.profile_name )

            # Instance has been assigned, it's starting up now
            if spot_instance_request.instance_id is not None:
                ec2_conn.create_tags( [spot_instance_request.instance_id],
                                      { 'Name':'spot_request_uuid_'+spot_request_uuid,
                                        'spot_master_uuid':spot_master_uuid,
                                        'spot_request_uuid':spot_request_uuid} )
                spot_request_row_partial_save( self.spot_request_table_name, spot_request_item,
                                               {
                                                TableSpotRequest.spot_request_state_code:SpotRequestStateCode.instance_starting,
                                                TableSpotRequest.instance_id:spot_instance_request.instance_id,
                                                },
                                               region_name=self.region_name, profile_name=self.profile_name )

    except boto.exception.EC2ResponseError as e:
        # e.code is None when the error response carried no parseable code;
        # str() keeps the logging from raising so the state is still saved below
        logger.error( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'Instance unknown state, exception code=' + str( e.code ) +
                      ', spot_request_uuid=' + spot_request_uuid + ', spot_master_uuid=' + spot_master_uuid)
        logger.error( fmt_request_uuid_msg_hdr( spot_request_uuid ) + traceback.format_exc() )
        spot_request_row_partial_save( self.spot_request_table_name, spot_request_item,
                                       { TableSpotRequest.spot_request_state_code:SpotRequestStateCode.instance_state_unknown, },
                                       region_name=self.region_name, profile_name=self.profile_name )
def handle_state_request_instance_running( self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid ):
    """
    Handle the instance_running state; this handler only writes a log line.

    Per the original author's note, at this point
    :class:awsspotbatch.client.clientlaunch is running on the spot instance,
    sending back heartbeats, checking for spot termination and monitoring the
    users' batch job.

    :param spot_request_msg: not used by this handler
    :param spot_request_item: not used by this handler
    :param spot_request_uuid: used for the log header
    :param spot_master_uuid: not used by this handler

    """
    msg_hdr = fmt_request_uuid_msg_hdr( spot_request_uuid )
    logger.info( msg_hdr + 'handle_state_request_instance_running' )
def handle_state_request_instance_state_unknown( self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid ):
    """
    Handle the instance_state_unknown state (catch-all for any future states).

    Logs a warning and closes the spot request row: state instance_complete,
    is_open=0, ts_end=now.

    :param spot_request_msg: not used by this handler
    :param spot_request_item: spot request row to update
    :param spot_request_uuid: used for the log header
    :param spot_master_uuid: not used by this handler

    """
    logger.warning( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'handle_state_request_instance_state_unknown' )

    closed_fields = {
        TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
        TableSpotRequest.is_open: 0,
        TableSpotRequest.ts_end: int( time.time() ),
    }
    spot_request_row_partial_save( self.spot_request_table_name,
                                   spot_request_item,
                                   closed_fields,
                                   region_name=self.region_name,
                                   profile_name=self.profile_name )
def handle_state_request_instance_complete( self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid ):
    """
    User's batch job has completed, start instance termination process.

    Terminates the EC2 instance recorded on the row (if any), then closes the
    spot request row with is_open=0 and ts_end=now. The state code is left
    unchanged.

    :param spot_request_msg: not used by this handler
    :param spot_request_item: spot request row; its instance_id is read and the row is updated
    :param spot_request_uuid: used for the log header
    :param spot_master_uuid: not used by this handler

    """
    logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'handle_state_request_instance_complete' )
    # terminate the instance, if it exists
    instance_id = spot_request_item[TableSpotRequest.instance_id]
    if instance_id is not None:
        ec2_conn = awsext.ec2.connect_to_region( self.region_name, profile_name=self.profile_name )
        ec2_conn.terminate_instances( instance_ids=[ instance_id ] )

    # NOTE(review): the row is closed whether or not an instance id was present - confirm against the original layout
    ts_now = int( time.time() )
    spot_request_row_partial_save( self.spot_request_table_name, spot_request_item,
                                   { TableSpotRequest.is_open:0, TableSpotRequest.ts_end:ts_now },
                                   region_name=self.region_name, profile_name=self.profile_name )