def process(self, message):
    """
    Record that the batch job (i.e. user script) threw an exception.

    The completion timestamp, exception message and exception traceback carried in the
    message are written to the SpotRequestItem, which is marked closed (is_open=0) with
    state instance_complete_exception. The SQS message is deleted once the row is saved.

    NOTE(review): the original docstring said "try it again", but nothing in this
    method re-submits the job - confirm whether a retry happens elsewhere.
    TODO: need to check against some "max contiguous errors", i.e. if this fails
    3x in a row then terminate the request

    Exceptions derived from StandardError are logged and swallowed; the SQS message is
    not deleted in that case.

    :param message: SQS Message instance
    """
    # Bound before the try so the except handler can always build its log header,
    # even when parsing the message body is what failed.
    spot_request_uuid = None
    try:
        spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
        spot_request_uuid = spot_request_msg.spot_request_uuid
        spot_master_uuid = spot_request_msg.spot_master_uuid
        logger.info(
            fmt_request_uuid_msg_hdr(spot_request_uuid)
            + 'process() for spot_master_uuid: ' + spot_master_uuid
        )
        spot_request_item = get_spot_request_item(
            self.spot_request_table_name,
            spot_request_uuid,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )

        name_value_pairs = spot_request_msg.name_value_pairs
        ts_cmd_complete = name_value_pairs[
            SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_COMPLETE_TIMESTAMP]
        cmd_exception_message = name_value_pairs[
            SpotRequestMsg.PAIR_NAME_INSTANCE_BATCH_PROCESS_START_EXCEPTION_MESSAGE]
        cmd_exception_traceback = name_value_pairs[
            SpotRequestMsg.PAIR_NAME_INSTANCE_BATCH_PROCESS_START_EXCEPTION_TRACEBACK]

        key_value_pairs = {
            TableSpotRequest.is_open: 0,
            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete_exception,
            TableSpotRequest.ts_cmd_complete: ts_cmd_complete,
            TableSpotRequest.cmd_exception_message: cmd_exception_message,
            TableSpotRequest.cmd_exception_traceback: cmd_exception_traceback,
        }
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            key_value_pairs,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        # Without a uuid there is nothing to format; log with an empty header
        # rather than raising a second exception from inside the handler.
        if spot_request_uuid is not None:
            msg_hdr = fmt_request_uuid_msg_hdr(spot_request_uuid)
        else:
            msg_hdr = ''
        logger.error(msg_hdr + 'Exiting SpotRequestDispatcher due to exception')
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())
def process(self, message):
    """
    Record an exception reported for instance termination.

    This should never happen, so don't terminate the instance, leave it up so it
    can get SSH'ed into to determine the cause of failure.

    The SpotRequestItem is marked closed (is_open=0) with state
    instance_force_termination_exception, ts_end set to the current time and the
    exception text taken from the message. The SQS message is then deleted.

    Exceptions derived from StandardError are logged and swallowed; the SQS message is
    not deleted in that case.

    :param message: SQS Message instance
    """
    # Bound before the try so the except handler never touches an unbound name
    # when building the message itself is what failed.
    spot_request_uuid = None
    try:
        spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
        spot_request_uuid = spot_request_msg.spot_request_uuid
        instance_termination_exception = spot_request_msg.name_value_pairs[
            SpotRequestMsg.PAIR_NAME_INSTANCE_TERMINATION_TIME_EXCEPTION]
        spot_request_item = get_spot_request_item(
            self.spot_request_table_name,
            spot_request_uuid,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        logger.error(
            fmt_request_uuid_msg_hdr(spot_request_uuid)
            + 'process_pending_termination_exception for spot_request_uuid, instance_termination_exception='
            + instance_termination_exception
        )

        ts_now = int(time.time())
        key_value_pairs = {
            TableSpotRequest.is_open: 0,
            TableSpotRequest.ts_end: ts_now,
            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_force_termination_exception,
            TableSpotRequest.instance_termination_exception: instance_termination_exception,
        }
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            key_value_pairs,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        # Fall back to an empty header when the uuid was never obtained.
        if spot_request_uuid is not None:
            msg_hdr = fmt_request_uuid_msg_hdr(spot_request_uuid)
        else:
            msg_hdr = ''
        logger.error(msg_hdr + 'Exiting SpotRequestDispatcher due to exception')
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())
def process(self, message):
    """
    Check the status of a spot request and dispatch on its current state.

    The SpotRequestItem is read from DynamoDB, its ts_last_state_check is updated to
    the current time, and the handler method matching the item's state code is called.
    Unless the state is instance_complete or instance_force_terminated, a new
    TYPE_CHECK_STATUS message is queued with a 60 second delay so the request is
    checked again. The incoming SQS message is then deleted.

    A state code with no matching branch calls no handler but still re-queues the
    status check.

    Exceptions derived from StandardError are logged and swallowed; the SQS message is
    not deleted in that case.

    :param message: SQS Message instance
    """
    # Bound before the try so the except handler can always build its log header,
    # even when parsing the message body is what failed.
    spot_request_uuid = None
    try:
        spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
        spot_request_uuid = spot_request_msg.spot_request_uuid
        spot_master_uuid = spot_request_msg.spot_master_uuid
        spot_request_id = spot_request_msg.spot_request_id
        logger.info(fmt_request_uuid_msg_hdr(spot_request_uuid) + 'process_check_status')

        # Get spot request row from DynamoDB and process based on state
        spot_request_item = get_spot_request_item(
            self.spot_request_table_name,
            spot_request_uuid,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        spot_request_state_code = spot_request_item[TableSpotRequest.spot_request_state_code]
        logger.info(
            fmt_request_uuid_msg_hdr(spot_request_uuid)
            + 'spot request state=' + spot_request_state_code
        )
        next_status_msg_delay_secs = 60
        is_send_request_msg_check_status = True

        # Update the LastStateCheck timestamp
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            {TableSpotRequest.ts_last_state_check: int(time.time())},
            region_name=self.region_name,
            profile_name=self.profile_name,
        )

        if SpotRequestStateCode.spot_request_in_progress == spot_request_state_code:
            self.handle_state_request_spot_request_in_progress(
                spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
        elif SpotRequestStateCode.instance_starting == spot_request_state_code:
            self.handle_state_request_instance_starting(
                spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
        elif SpotRequestStateCode.instance_running == spot_request_state_code:
            self.handle_state_request_instance_running(
                spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
        elif SpotRequestStateCode.instance_complete == spot_request_state_code:
            self.handle_state_request_instance_complete(
                spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
            # Terminal state: no further status checks are queued
            is_send_request_msg_check_status = False
        elif SpotRequestStateCode.instance_state_unknown == spot_request_state_code:
            self.handle_state_request_instance_state_unknown(
                spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
        elif SpotRequestStateCode.constraint_encountered == spot_request_state_code:
            self.handle_state_request_constraint_encountered(
                spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
        elif SpotRequestStateCode.instance_force_termination_pending == spot_request_state_code:
            self.handle_state_instance_force_termination_pending(
                spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
        elif SpotRequestStateCode.instance_force_terminated == spot_request_state_code:
            self.handle_state_request_instance_force_terminated(
                spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid)
            # Terminal state: no further status checks are queued
            is_send_request_msg_check_status = False

        if is_send_request_msg_check_status:
            # Queue the next delayed status check for this request
            spot_request_msg_check_status = SpotRequestMsg(
                spot_request_uuid=spot_request_uuid,
                spot_master_uuid=spot_master_uuid,
                spot_request_msg_type=SpotRequestMsg.TYPE_CHECK_STATUS,
                spot_request_id=spot_request_id,
            )
            message_attributes = create_microsvc_message_attributes(
                awsspotbatch.common.const.MICROSVC_REQUEST_CLASSNAME_SpotRequestMessageCheckStatus)
            self.spot_request_sqs_message_durable.send_message(
                spot_request_msg_check_status.to_json(),
                delay_seconds=next_status_msg_delay_secs,
                message_attributes=message_attributes,
            )

        # The current message has been handled whether or not a follow-up was queued
        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        # Fall back to an empty header when the uuid was never obtained.
        if spot_request_uuid is not None:
            msg_hdr = fmt_request_uuid_msg_hdr(spot_request_uuid)
        else:
            msg_hdr = ''
        logger.error(msg_hdr + 'Exiting SpotRequestDispatcher due to exception')
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())
def process(self, message):
    """
    Record that a spot request's batch process has completed.

    Spot Request has completed, write completion info to SpotRequestItem in DynamoDB,
    let master know this request has completed so the master can determine if the job
    has completed.

    The item is marked closed (is_open=0) with state instance_complete, the completion
    timestamp and the return code; std out / std err are stored only when non-empty.
    A TYPE_INCR_INSTANCE_SUCCESS_CNT message is then sent to the master queue and the
    incoming SQS message is deleted.

    Exceptions derived from StandardError are logged and swallowed; the SQS message is
    not deleted in that case.

    :param message: SQS Message instance
    """
    # Bound before the try so the except handler never touches an unbound name
    # when parsing the message or fetching the item is what failed.
    spot_request_item = None
    try:
        spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
        spot_request_item = get_spot_request_item(
            self.spot_request_table_name,
            spot_request_msg.spot_request_uuid,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )

        name_value_pairs = spot_request_msg.name_value_pairs
        ts_cmd_complete = name_value_pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_COMPLETE_TIMESTAMP]
        cmd_returncode = name_value_pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_RETURNCODE]
        cmd_std_out = name_value_pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_STD_OUT]
        cmd_std_err = name_value_pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_STD_ERR]

        key_value_pairs = {
            TableSpotRequest.is_open: 0,
            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
            TableSpotRequest.ts_cmd_complete: ts_cmd_complete,
            TableSpotRequest.cmd_returncode: cmd_returncode,
        }
        # Only store the output streams when they are present and non-empty
        if cmd_std_out:
            key_value_pairs[TableSpotRequest.cmd_std_out] = cmd_std_out
        if cmd_std_err:
            key_value_pairs[TableSpotRequest.cmd_std_err] = cmd_std_err

        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            key_value_pairs,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )

        # let the Master increment the completion count to determine if the job is complete
        master_msg_incr_instance_success = SpotMasterMsg(
            spot_master_uuid=spot_request_msg.spot_master_uuid,
            spot_master_msg_type=SpotMasterMsg.TYPE_INCR_INSTANCE_SUCCESS_CNT,
        )
        message_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageIncrSuccessCnt
        )
        spot_master_sqs_message_durable = SqsMessageDurable(
            self.spot_master_queue_name, self.region_name, profile_name=self.profile_name
        )
        spot_master_sqs_message_durable.send_message(
            master_msg_incr_instance_success.to_json(), message_attributes=message_attributes
        )

        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        # Without an item there is nothing to format; log with an empty header
        # rather than raising a second exception from inside the handler.
        if spot_request_item is not None:
            msg_hdr = fmt_request_item_msg_hdr(spot_request_item)
        else:
            msg_hdr = ""
        logger.error(msg_hdr + "Exiting SpotRequestDispatcher due to exception")
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())
def process(self, message):
    """
    Record that a pending termination was detected for the instance.

    AWS is going to terminate the request - update the status in SpotRequestItem.
    A SpotRequestCheckStatus message is processed at a regular interval and will
    detect the status change and process accordingly.

    The item's state is set to instance_force_termination_pending together with the
    detection timestamp from the message, then the SQS message is deleted.

    Exceptions derived from StandardError are logged and swallowed; the SQS message is
    not deleted in that case.

    :param message: SQS Message instance
    """
    # Bound before the try so the except handler never touches an unbound name
    # when parsing the message or fetching the item is what failed.
    spot_request_item = None
    try:
        spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
        spot_request_item = get_spot_request_item(
            self.spot_request_table_name,
            spot_request_msg.spot_request_uuid,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        logger.info(fmt_request_item_msg_hdr(spot_request_item) + 'process_pending_termination_detected')

        ts_pending_termination_detected = spot_request_msg.name_value_pairs[
            SpotRequestMsg.PAIR_NAME_INSTANCE_TS_PENDING_TERMINATION_DETECTED]
        key_value_pairs = {
            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_force_termination_pending,
            TableSpotRequest.ts_pending_termination_detected: ts_pending_termination_detected,
        }
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            key_value_pairs,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        # Fall back to an empty header when the item was never obtained.
        if spot_request_item is not None:
            msg_hdr = fmt_request_item_msg_hdr(spot_request_item)
        else:
            msg_hdr = ''
        logger.error(msg_hdr + 'Exiting SpotRequestDispatcher due to exception')
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())
def process(self, message):
    """
    Heartbeat daemon successfully started, update timestamp in SpotRequestItem.

    The start timestamp from the message is written to both ts_heartbeat_daemon_started
    and ts_heartbeat on the item, then the SQS message is deleted.

    TODO: need to add check to determine the heartbeat daemon has started within a
    reasonable time period after the instance has started. If the heartbeat daemon
    can't start, the users' script will never run.

    Exceptions derived from StandardError are logged and swallowed; the SQS message is
    not deleted in that case.

    :param message: SQS Message instance
    """
    # Bound before the try so the except handler never touches an unbound name
    # when parsing the message or fetching the item is what failed.
    spot_request_item = None
    try:
        spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
        spot_request_item = get_spot_request_item(
            self.spot_request_table_name,
            spot_request_msg.spot_request_uuid,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        logger.info(fmt_request_item_msg_hdr(spot_request_item) + "process_heartbeat_daemon_started")

        ts_heartbeat_daemon_started = spot_request_msg.name_value_pairs[
            SpotRequestMsg.PAIR_NAME_INSTANCE_HEARTBEAT_DAEMON_STARTED_TIMESTAMP
        ]
        # The daemon start also counts as the first heartbeat
        key_value_pairs = {
            TableSpotRequest.ts_heartbeat_daemon_started: ts_heartbeat_daemon_started,
            TableSpotRequest.ts_heartbeat: ts_heartbeat_daemon_started,
        }
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            key_value_pairs,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        # Fall back to an empty header when the item was never obtained.
        if spot_request_item is not None:
            msg_hdr = fmt_request_item_msg_hdr(spot_request_item)
        else:
            msg_hdr = ""
        logger.error(msg_hdr + "Exiting SpotRequestDispatcher due to exception")
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())
def process(self, message):
    """
    Write heartbeat timestamp to SpotRequestItem.

    When the item for the message's spot_request_uuid is found, its ts_heartbeat is
    updated from the message; otherwise a warning is logged. The SQS message is deleted
    in both cases.

    Exceptions derived from StandardError are logged and swallowed; the SQS message is
    not deleted in that case.

    :param message: SQS Message instance
    """
    # Bound before the try so the except handler never touches an unbound name
    # when parsing the message or fetching the item is what failed.
    spot_request_item = None
    try:
        spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
        spot_request_item = get_spot_request_item(
            self.spot_request_table_name,
            spot_request_msg.spot_request_uuid,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )

        if spot_request_item is not None:
            logger.info(fmt_request_item_msg_hdr(spot_request_item) + 'process_instance_heartbeat')
            ts_heartbeat = spot_request_msg.name_value_pairs[
                SpotRequestMsg.PAIR_NAME_INSTANCE_HEARTBEAT_TIMESTAMP]
            spot_request_row_partial_save(
                self.spot_request_table_name,
                spot_request_item,
                {TableSpotRequest.ts_heartbeat: ts_heartbeat},
                region_name=self.region_name,
                profile_name=self.profile_name,
            )
        else:
            # NOTE(review): the header formatter is skipped for a missing item so this
            # branch is reached even if the formatter cannot accept None - confirm.
            logger.info('process_instance_heartbeat')
            logger.warning(
                'Heartbeat not saved, spot_request_uuid not found: '
                + spot_request_msg.spot_request_uuid
            )

        self.spot_request_sqs_message_durable.delete_message(message)

    except StandardError as e:
        # Fall back to an empty header when the item was never obtained.
        if spot_request_item is not None:
            msg_hdr = fmt_request_item_msg_hdr(spot_request_item)
        else:
            msg_hdr = ''
        logger.error(msg_hdr + 'Exiting SpotRequestDispatcher due to exception')
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())