def process(self, message):
        """Record an instance-termination-time exception for a spot request.

        Parses the SQS message, logs the reported exception as an error,
        closes the spot request row (``is_open`` = 0, ``ts_end`` = now, state
        code ``instance_force_termination_exception``, plus the exception
        text) and then deletes the message from the queue.

        This should never happen, so the instance is not terminated here;
        it is left up so it can be SSH'ed into to determine the cause of
        failure.

        Any ``StandardError`` raised along the way is logged and swallowed.
        The message is only deleted after the row update has succeeded.

        :param message: SQS Message instance

        """
        # Bound before the try block so the except handler can always build a
        # log header, even when the message body itself cannot be parsed.
        spot_request_uuid = 'unknown'
        try:
            spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
            spot_request_uuid = spot_request_msg.spot_request_uuid
            instance_termination_exception = spot_request_msg.name_value_pairs[
                SpotRequestMsg.PAIR_NAME_INSTANCE_TERMINATION_TIME_EXCEPTION]
            spot_request_item = get_spot_request_item(
                self.spot_request_table_name, spot_request_uuid,
                region_name=self.region_name, profile_name=self.profile_name)
            logger.error(
                fmt_request_uuid_msg_hdr(spot_request_uuid)
                + 'process_pending_termination_exception for spot_request_uuid, instance_termination_exception='
                + instance_termination_exception)
            ts_now = int(time.time())
            # Close the request and keep the exception text on the row.
            spot_request_row_partial_save(
                self.spot_request_table_name, spot_request_item,
                {
                    TableSpotRequest.is_open: 0,
                    TableSpotRequest.ts_end: ts_now,
                    TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_force_termination_exception,
                    TableSpotRequest.instance_termination_exception: instance_termination_exception,
                },
                region_name=self.region_name, profile_name=self.profile_name)
            self.spot_request_sqs_message_durable.delete_message(message)

        except StandardError as e:
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'Exiting SpotRequestDispatcher due to exception')
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + str(e))
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + traceback.format_exc())
    def process(self, message):
        """Record a batch-process exception reported for a spot request.

        The batch job (i.e. user script) threw an exception.  The completion
        timestamp, exception message and exception traceback carried in the
        message are stored on the spot request row, the row is closed
        (``is_open`` = 0) with state code ``instance_complete_exception``,
        and the SQS message is deleted.

        TODO (from the original notes): try the job again, checking against
        some "max contiguous errors", i.e. if this fails 3x in a row then
        terminate the request.

        Any ``StandardError`` raised along the way is logged and swallowed.
        The message is only deleted after the row update has succeeded.

        :param message: SQS Message instance

        """
        # Bound before the try block so the except handler can always build a
        # log header, even when the message body itself cannot be parsed.
        spot_request_uuid = 'unknown'
        try:
            spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
            spot_request_uuid = spot_request_msg.spot_request_uuid
            spot_master_uuid = spot_request_msg.spot_master_uuid
            logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'process() for spot_master_uuid: ' + spot_master_uuid)
            spot_request_item = get_spot_request_item(
                self.spot_request_table_name, spot_request_uuid,
                region_name=self.region_name, profile_name=self.profile_name)
            name_value_pairs = spot_request_msg.name_value_pairs
            ts_cmd_complete = name_value_pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_COMPLETE_TIMESTAMP]
            cmd_exception_message = name_value_pairs[SpotRequestMsg.PAIR_NAME_INSTANCE_BATCH_PROCESS_START_EXCEPTION_MESSAGE]
            cmd_exception_traceback = name_value_pairs[SpotRequestMsg.PAIR_NAME_INSTANCE_BATCH_PROCESS_START_EXCEPTION_TRACEBACK]
            key_value_pairs = {
                TableSpotRequest.is_open: 0,
                TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete_exception,
                TableSpotRequest.ts_cmd_complete: ts_cmd_complete,
                TableSpotRequest.cmd_exception_message: cmd_exception_message,
                TableSpotRequest.cmd_exception_traceback: cmd_exception_traceback,
            }
            spot_request_row_partial_save(
                self.spot_request_table_name, spot_request_item, key_value_pairs,
                region_name=self.region_name, profile_name=self.profile_name)
            self.spot_request_sqs_message_durable.delete_message(message)

        except StandardError as e:
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'Exiting SpotRequestDispatcher due to exception')
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + str(e))
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + traceback.format_exc())
コード例 #3
0
    def process(self, message):
        """Check the status of a spot request and dispatch on its state.

        Loads the spot request row, stamps ``ts_last_state_check`` with the
        current time, and calls the ``handle_state_*`` method matching the
        row's state code.  A state code with no matching branch is not
        handled.

        Unless the state was ``instance_complete`` or
        ``instance_force_terminated``, another check-status message is queued
        with a 60 second delay, so the request keeps being polled.  The
        incoming SQS message is deleted last.

        Any ``StandardError`` raised along the way is logged and swallowed;
        in that case the incoming message may not have been deleted.

        :param message: SQS Message instance

        """
        # Bound before the try block so the except handler can always build a
        # log header, even when the message body itself cannot be parsed.
        spot_request_uuid = 'unknown'
        try:
            spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
            spot_request_uuid = spot_request_msg.spot_request_uuid
            spot_master_uuid = spot_request_msg.spot_master_uuid
            spot_request_id = spot_request_msg.spot_request_id
            logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'process_check_status')
            # Get spot request row from DynamoDB and process based on state
            spot_request_item = get_spot_request_item(
                self.spot_request_table_name, spot_request_uuid,
                region_name=self.region_name, profile_name=self.profile_name)
            spot_request_state_code = spot_request_item[TableSpotRequest.spot_request_state_code]
            logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'spot request state=' + spot_request_state_code)

            next_status_msg_delay_secs = 60
            is_send_request_msg_check_status = True
            # Update the LastStateCheck timestamp
            spot_request_row_partial_save(
                self.spot_request_table_name, spot_request_item,
                {
                    TableSpotRequest.ts_last_state_check: int(time.time()),
                },
                region_name=self.region_name, profile_name=self.profile_name)

            if SpotRequestStateCode.spot_request_in_progress == spot_request_state_code:
                self.handle_state_request_spot_request_in_progress(spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
            elif SpotRequestStateCode.instance_starting == spot_request_state_code:
                self.handle_state_request_instance_starting(spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
            elif SpotRequestStateCode.instance_running == spot_request_state_code:
                self.handle_state_request_instance_running(spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
            elif SpotRequestStateCode.instance_complete == spot_request_state_code:
                self.handle_state_request_instance_complete(spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
                # Terminal state: stop polling this request.
                is_send_request_msg_check_status = False
            elif SpotRequestStateCode.instance_state_unknown == spot_request_state_code:
                self.handle_state_request_instance_state_unknown(spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
            elif SpotRequestStateCode.constraint_encountered == spot_request_state_code:
                self.handle_state_request_constraint_encountered(spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
            elif SpotRequestStateCode.instance_force_termination_pending == spot_request_state_code:
                self.handle_state_instance_force_termination_pending(spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
            elif SpotRequestStateCode.instance_force_terminated == spot_request_state_code:
                self.handle_state_request_instance_force_terminated(spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
                # Terminal state: stop polling this request.
                is_send_request_msg_check_status = False

            if is_send_request_msg_check_status:
                # Queue the next poll of this spot request.
                spot_request_msg_check_status = SpotRequestMsg(
                    spot_request_uuid=spot_request_uuid,
                    spot_master_uuid=spot_master_uuid,
                    spot_request_msg_type=SpotRequestMsg.TYPE_CHECK_STATUS,
                    spot_request_id=spot_request_id)
                message_attributes = create_microsvc_message_attributes(
                    awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageCheckStatus)
                self.spot_request_sqs_message_durable.send_message(
                    spot_request_msg_check_status.to_json(),
                    delay_seconds=next_status_msg_delay_secs,
                    message_attributes=message_attributes)

            self.spot_request_sqs_message_durable.delete_message(message)

        except StandardError as e:
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'Exiting SpotRequestDispatcher due to exception')
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + str(e))
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + traceback.format_exc())
コード例 #4
0
    def handle_state_request_instance_starting(self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid):
        """Advance a fulfilled spot request once its instance is fully up.

        Looks up the state name and status of the instance recorded on the
        row:

        * ``running`` with status ``ok``: the row moves to
          ``instance_running`` (recording the instance's IP address) and the
          remote client is launched.
        * ``terminated``, or EC2 answering ``InvalidInstanceID.NotFound``
          while the instance is being fetched/launched: the row moves to
          ``instance_force_terminated``.
        * any other ``EC2ResponseError`` in the running/ok branch: the row
          moves to ``instance_state_unknown``.
        * anything else: the row is left unchanged.

        :param spot_request_msg: not used by this handler
        :param spot_request_item: spot request row; ``instance_id`` is read and the row is updated
        :param spot_request_uuid: used for log headers
        :param spot_master_uuid: used in log messages

        """
        logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'handle_state_request_instance_starting')
        # Check if the instance is totally up (running and all status checks ok)
        ec2_conn = awsext.ec2.connect_to_region(region_name=self.region_name, profile_name=self.profile_name)
        instance_id = spot_request_item[TableSpotRequest.instance_id]
        state_name, status = ec2_conn.get_instance_state_name_and_status(instance_id)
        logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'spot instance state_name=' + state_name + ', status=' + status)
        if state_name == 'running' and status == 'ok':
            try:
                instance = ec2_conn.get_only_instances(instance_ids=[instance_id])[0]
                spot_request_row_partial_save(
                    self.spot_request_table_name, spot_request_item,
                    {
                        TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_running,
                        TableSpotRequest.instance_public_ip_address: instance.ip_address,
                    },
                    region_name=self.region_name, profile_name=self.profile_name)
                # Instance is running and ok - run the clientlaunch, which will start the wrapper,
                # run some init cmds, launch user cmd on thread and start sending heartbeats
                launch_remote_client(
                    self.spot_batch_job_parm_table_name,
                    self.spot_rsa_key_table_name, spot_request_item,
                    region_name=self.region_name,
                    profile_name=self.profile_name)

            except boto.exception.EC2ResponseError as e:
                if e.code == 'InvalidInstanceID.NotFound':
                    logger.warning(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'Instance terminated after starting and before init complete, spot_master_uuid=' + spot_master_uuid)
                    # looks like spot instance was terminated between the time it was allocated and initialized
                    spot_request_row_partial_save(
                        self.spot_request_table_name, spot_request_item,
                        {
                            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_force_terminated,
                        },
                        region_name=self.region_name, profile_name=self.profile_name)
                else:
                    # str() guards the concatenation: if the error carries no
                    # code (None), a TypeError here would skip the state save.
                    logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'Instance unknown state, exception code=' + str(e.code) + ', spot_master_uuid=' + spot_master_uuid)
                    logger.error(traceback.format_exc())
                    spot_request_row_partial_save(
                        self.spot_request_table_name, spot_request_item,
                        {
                            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_state_unknown,
                        },
                        region_name=self.region_name, profile_name=self.profile_name)
        elif state_name == 'terminated':
            logger.warning(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'Instance terminated after starting and before init complete, spot_master_uuid=' + spot_master_uuid)
            # looks like spot instance was terminated between the time it was allocated and initialized
            spot_request_row_partial_save(
                self.spot_request_table_name, spot_request_item,
                {
                    TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_force_terminated,
                },
                region_name=self.region_name, profile_name=self.profile_name)
    def process(self, message):
        """Start the SpotRequest process.

        1. Create the item in the spot request table (state
           ``spot_request_in_progress``, ``is_open`` = 1), retrying the put
           up to 10 times with a 6 second pause between attempts.
        2. Queue a check-status message with a 30 second delay; this starts
           the state-based process.
        3. Delete the incoming SQS message.

        Any ``StandardError`` raised along the way, including
        ``DynamoDbPutItemMaxAttemptsExceeded`` if that derives from it, is
        logged and swallowed; in that case the incoming message is not
        deleted.

        :param message: SQS Message instance

        """
        # Bound before the try block so the except handler can always build a
        # log header, even when the message body itself cannot be parsed.
        spot_request_uuid = 'unknown'
        try:
            spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
            spot_request_uuid = spot_request_msg.spot_request_uuid
            logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'process_spot_request_initiated for spot_master_uuid: ' + spot_request_msg.spot_master_uuid)
            ts_now = int(time.time())
            dict_create_spot_request_item = {
                TableSpotRequest.spot_request_uuid: spot_request_msg.spot_request_uuid,
                TableSpotRequest.spot_master_uuid: spot_request_msg.spot_master_uuid,
                TableSpotRequest.spot_request_id: spot_request_msg.spot_request_id,
                TableSpotRequest.ts_last_state_check: ts_now,
                TableSpotRequest.attempt_number: spot_request_msg.name_value_pairs[SpotRequestMsg.PAIR_NAME_ATTEMPT_NUMBER],
                TableSpotRequest.spot_price: spot_request_msg.name_value_pairs[SpotRequestMsg.PAIR_NAME_SPOT_PRICE],
                TableSpotRequest.instance_username: spot_request_msg.name_value_pairs[SpotRequestMsg.PAIR_NAME_INSTANCE_USERNAME],
                TableSpotRequest.is_open: 1,
                TableSpotRequest.spot_request_state_code: SpotRequestStateCode.spot_request_in_progress,
                TableSpotRequest.ts_start: ts_now,
            }
            put_attempt_cnt = 0
            put_attempt_max = 10
            while True:
                # A new connection and Table are created on every attempt.
                dynamodb_conn = boto.dynamodb2.connect_to_region(self.region_name, profile_name=self.profile_name)
                spot_request_table = Table(self.spot_request_table_name, connection=dynamodb_conn)
                result_spot_request_put = spot_request_table.put_item(data=dict_create_spot_request_item)
                if result_spot_request_put:
                    break
                put_attempt_cnt += 1
                if put_attempt_cnt == put_attempt_max:
                    raise awsspotbatch.common.exception.DynamoDbPutItemMaxAttemptsExceeded(
                        'Failed attempt to insert item in: ' + self.spot_request_table_name +
                        ' for spot_request_uuid: ' + spot_request_msg.spot_request_uuid,
                        self.spot_request_table_name)
                time.sleep(6)

            next_status_msg_delay_secs = 30
            spot_request_msg_check_status = SpotRequestMsg(
                spot_request_uuid=spot_request_msg.spot_request_uuid,
                spot_master_uuid=spot_request_msg.spot_master_uuid,
                spot_request_msg_type=SpotRequestMsg.TYPE_CHECK_STATUS,
                spot_request_id=spot_request_msg.spot_request_id)
            message_attributes = create_microsvc_message_attributes(
                awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageCheckStatus)
            self.spot_request_sqs_message_durable.send_message(
                spot_request_msg_check_status.to_json(),
                delay_seconds=next_status_msg_delay_secs,
                message_attributes=message_attributes)
            self.spot_request_sqs_message_durable.delete_message(message)

        except StandardError as e:
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'Exiting SpotRequestDispatcher due to exception')
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + str(e))
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + traceback.format_exc())
コード例 #6
0
    def handle_state_request_constraint_encountered(self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid):
        """Close a spot request that hit a constraint and ask for a resubmit.

        A constraint was encountered after the spot request was initiated but
        before it was fulfilled (i.e. time limit expired).  The row is closed
        (state ``instance_complete``, ``is_open`` = 0, ``ts_end`` = now) and
        a ``TYPE_RESUBMIT_FAILED_REQUEST`` message is sent to the spot master
        queue so another spot request can be submitted.

        :param spot_request_msg: supplies the master and request uuids for the resubmit message
        :param spot_request_item: spot request row to close
        :param spot_request_uuid: used for the log header
        :param spot_master_uuid: not used by this handler

        """
        logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'handle_state_request_constraint_encountered')
        closed_at = int(time.time())
        closing_updates = {
            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
            TableSpotRequest.is_open: 0,
            TableSpotRequest.ts_end: closed_at,
        }
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            closing_updates,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )

        # Create a new spot request based on the spot request that just failed
        resubmit_msg = SpotMasterMsg(
            spot_master_uuid=spot_request_msg.spot_master_uuid,
            spot_master_msg_type=SpotMasterMsg.TYPE_RESUBMIT_FAILED_REQUEST,
            spot_request_uuid=spot_request_msg.spot_request_uuid,
        )
        resubmit_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest)
        master_queue = SqsMessageDurable(self.spot_master_queue_name, self.region_name, profile_name=self.profile_name)
        resubmit_body = resubmit_msg.to_json()
        master_queue.send_message(resubmit_body, message_attributes=resubmit_attributes)
コード例 #7
0
    def handle_state_instance_force_termination_pending(self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid):
        """Close a spot request whose instance is about to be force-terminated.

        AWS has started the termination process for this instance, i.e. the
        price has increased; this is the beginning of the two minute warning
        pending forced termination.  The row is closed (state
        ``instance_complete``, ``is_open`` = 0, ``ts_end`` = now) and a
        ``TYPE_RESUBMIT_FAILED_REQUEST`` message is sent to the spot master
        queue so another spot request can be started.  This handler does not
        itself call EC2 to terminate the instance.

        :param spot_request_msg: supplies the master and request uuids for the resubmit message
        :param spot_request_item: spot request row to close
        :param spot_request_uuid: used for the log header
        :param spot_master_uuid: not used by this handler

        """
        logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'handle_state_instance_force_termination_pending')
        closed_at = int(time.time())
        closing_updates = {
            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
            TableSpotRequest.is_open: 0,
            TableSpotRequest.ts_end: closed_at,
        }
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            closing_updates,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )

        # Create a new spot request based on the spot request that just failed
        resubmit_msg = SpotMasterMsg(
            spot_master_uuid=spot_request_msg.spot_master_uuid,
            spot_master_msg_type=SpotMasterMsg.TYPE_RESUBMIT_FAILED_REQUEST,
            spot_request_uuid=spot_request_msg.spot_request_uuid,
        )
        resubmit_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest)
        master_queue = SqsMessageDurable(self.spot_master_queue_name, self.region_name, profile_name=self.profile_name)
        resubmit_body = resubmit_msg.to_json()
        master_queue.send_message(resubmit_body, message_attributes=resubmit_attributes)
コード例 #8
0
    def handle_state_request_spot_request_in_progress(self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid):
        """Check if the spot request has been fulfilled, hit a constraint or is still pending.

        Fetches the EC2 spot instance request for the row's
        ``spot_request_id``.  Only when exactly one request comes back:

        * if its status code is ``schedule-expired`` or one of
          ``AwsExtEC2Connection.SPOT_REQUEST_CONSTRAINTS``, the row moves to
          ``constraint_encountered`` and the code is stored;
        * if an instance id has been assigned, the instance is tagged and the
          row moves to ``instance_starting`` with the instance id stored.

        The two checks are independent; if both apply, the second save
        happens after the first.  An ``EC2ResponseError`` moves the row to
        ``instance_state_unknown``.

        :param spot_request_msg: not used by this handler
        :param spot_request_item: spot request row; ``spot_request_id`` is read and the row is updated
        :param spot_request_uuid: used for log headers, log messages and instance tags
        :param spot_master_uuid: used for instance tags and log messages

        """
        logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'handle_state_request_spot_request_in_progress')
        ec2_conn = awsext.ec2.connect_to_region(region_name=self.region_name, profile_name=self.profile_name)
        try:
            spot_request_id = spot_request_item[TableSpotRequest.spot_request_id]
            spot_instance_requests = ec2_conn.get_all_spot_instance_requests(request_ids=[spot_request_id])
            if spot_instance_requests is not None and len(spot_instance_requests) == 1:
                spot_instance_request = spot_instance_requests[0]
                status_code = spot_instance_request.status.code
                logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'spot_instance_request.status.code=' + status_code +
                            ' for spot_request_id=' + spot_request_id +
                            ', spot_request_uuid=' + spot_request_uuid)
                if status_code == 'schedule-expired' or status_code in awsext.ec2.connection.AwsExtEC2Connection.SPOT_REQUEST_CONSTRAINTS:
                    # update status so that a new spot request can occur
                    spot_request_row_partial_save(
                        self.spot_request_table_name, spot_request_item,
                        {
                            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.constraint_encountered,
                            TableSpotRequest.constraint_code: status_code,
                        },
                        region_name=self.region_name, profile_name=self.profile_name)
                # Instance has been assigned, it's starting up now
                instance_id = spot_instance_request.instance_id
                if instance_id is not None:
                    ec2_conn.create_tags(
                        [instance_id],
                        {'Name': 'spot_request_uuid_' + spot_request_uuid, 'spot_master_uuid': spot_master_uuid, 'spot_request_uuid': spot_request_uuid})
                    spot_request_row_partial_save(
                        self.spot_request_table_name, spot_request_item,
                        {
                            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_starting,
                            TableSpotRequest.instance_id: instance_id,
                        },
                        region_name=self.region_name, profile_name=self.profile_name)

        except boto.exception.EC2ResponseError as e:
            # str() guards the concatenation: if the error carries no code
            # (None), a TypeError here would skip the state save below.
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'Instance unknown state, exception code=' + str(e.code) + ', spot_request_uuid=' + spot_request_uuid + ', spot_master_uuid=' + spot_master_uuid)
            logger.error(fmt_request_uuid_msg_hdr(spot_request_uuid) + traceback.format_exc())
            spot_request_row_partial_save(
                self.spot_request_table_name, spot_request_item,
                {
                    TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_state_unknown,
                },
                region_name=self.region_name, profile_name=self.profile_name)
コード例 #9
0
    def handle_state_request_instance_running(self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid):
        """Log that the request is in the instance-running state; nothing else.

        Per the original author's note, at this point
        :class:awsspotbatch.client.clientlaunch is running on the spot
        instance, sending back heartbeats, checking for spot termination and
        monitoring the user's batch job, so this handler has no work to do.

        :param spot_request_msg: not used by this handler
        :param spot_request_item: not used by this handler
        :param spot_request_uuid: used for the log header
        :param spot_master_uuid: not used by this handler

        """
        log_header = fmt_request_uuid_msg_hdr(spot_request_uuid)
        logger.info(log_header + 'handle_state_request_instance_running')
コード例 #10
0
    def handle_state_request_instance_state_unknown(self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid):
        """Close a spot request whose instance is in an unknown state.

        Logs a warning and closes the row: state ``instance_complete``,
        ``is_open`` = 0, ``ts_end`` = now.  Intended as the catch-all for any
        future states.

        :param spot_request_msg: not used by this handler
        :param spot_request_item: spot request row to close
        :param spot_request_uuid: used for the log header
        :param spot_master_uuid: not used by this handler

        """
        logger.warning(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'handle_state_request_instance_state_unknown')
        closed_at = int(time.time())
        closing_updates = {
            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
            TableSpotRequest.is_open: 0,
            TableSpotRequest.ts_end: closed_at,
        }
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            closing_updates,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
コード例 #11
0
    def handle_state_request_instance_complete(self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid):
        """Terminate the instance (if any) and close the spot request row.

        The user's batch job has completed.  If the row carries an instance
        id, that instance is terminated through EC2.  The row is then closed
        with ``is_open`` = 0 and ``ts_end`` = now; the state code is left
        unchanged.

        :param spot_request_msg: not used by this handler
        :param spot_request_item: spot request row; ``instance_id`` is read and the row is updated
        :param spot_request_uuid: used for the log header
        :param spot_master_uuid: not used by this handler

        """
        logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'handle_state_request_instance_complete')
        # terminate the instance, if it exists
        instance_id = spot_request_item[TableSpotRequest.instance_id]
        if instance_id is not None:
            ec2_conn = awsext.ec2.connect_to_region(self.region_name, profile_name=self.profile_name)
            ec2_conn.terminate_instances(instance_ids=[instance_id])
        ts_now = int(time.time())
        spot_request_row_partial_save(
            self.spot_request_table_name, spot_request_item,
            {TableSpotRequest.is_open: 0, TableSpotRequest.ts_end: ts_now},
            region_name=self.region_name, profile_name=self.profile_name)