def handle_state_request_constraint_encountered( self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid ):
    """ Close out a spot request that hit a constraint and ask the master to resubmit it

    The constraint was encountered after the spot request was initiated but before the
    request was fulfilled, i.e. the time limit expired.  The spot request row is marked
    closed and a TYPE_RESUBMIT_FAILED_REQUEST message is sent to the spot master queue.

    :param spot_request_msg: supplies spot_master_uuid and spot_request_uuid for the resubmit message
    :param spot_request_item: spot request row that gets the partial save
    :param spot_request_uuid: used for the log message header
    :param spot_master_uuid: not referenced here, the uuid on spot_request_msg is used instead
    """
    msg_hdr = fmt_request_uuid_msg_hdr( spot_request_uuid )
    logger.info( msg_hdr + 'handle_state_request_constraint_encountered' )

    # Mark the request row as closed, stamped with the current time
    closed_row_values = {
        TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
        TableSpotRequest.is_open: 0,
        TableSpotRequest.ts_end: int( time.time() ),
    }
    spot_request_row_partial_save(
        self.spot_request_table_name,
        spot_request_item,
        closed_row_values,
        region_name=self.region_name,
        profile_name=self.profile_name,
    )

    # Create a new spot request based on the spot request that just failed
    resubmit_msg = SpotMasterMsg(
        spot_master_uuid=spot_request_msg.spot_master_uuid,
        spot_master_msg_type=SpotMasterMsg.TYPE_RESUBMIT_FAILED_REQUEST,
        spot_request_uuid=spot_request_msg.spot_request_uuid,
    )
    resubmit_msg_attributes = create_microsvc_message_attributes(
        awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest
    )
    master_queue = SqsMessageDurable( self.spot_master_queue_name, self.region_name, profile_name=self.profile_name )
    master_queue.send_message( resubmit_msg.to_json(), message_attributes=resubmit_msg_attributes )
def handle_state_instance_force_termination_pending( self, spot_request_msg, spot_request_item, spot_request_uuid, spot_master_uuid ):
    """ Close out a spot request whose instance is being force terminated and ask the master to resubmit it

    AWS has started the termination process for this instance, i.e. the price has increased.
    This is the beginning of the two minute warning pending forced termination.  The spot
    request row is marked closed and a TYPE_RESUBMIT_FAILED_REQUEST message is sent to the
    spot master queue so another spot request can be started.

    :param spot_request_msg: supplies spot_master_uuid and spot_request_uuid for the resubmit message
    :param spot_request_item: spot request row that gets the partial save
    :param spot_request_uuid: used for the log message header
    :param spot_master_uuid: not referenced here, the uuid on spot_request_msg is used instead
    """
    logger.info( fmt_request_uuid_msg_hdr( spot_request_uuid ) + 'handle_state_instance_force_termination_pending' )

    # Mark the request row as closed, stamped with the current time
    ts_end = int( time.time() )
    row_updates = {}
    row_updates[ TableSpotRequest.spot_request_state_code ] = SpotRequestStateCode.instance_complete
    row_updates[ TableSpotRequest.is_open ] = 0
    row_updates[ TableSpotRequest.ts_end ] = ts_end
    spot_request_row_partial_save( self.spot_request_table_name, spot_request_item, row_updates,
                                   region_name=self.region_name, profile_name=self.profile_name )

    # Create a new spot request based on the spot request that just failed
    resubmit_json = SpotMasterMsg(
        spot_master_uuid=spot_request_msg.spot_master_uuid,
        spot_master_msg_type=SpotMasterMsg.TYPE_RESUBMIT_FAILED_REQUEST,
        spot_request_uuid=spot_request_msg.spot_request_uuid,
    )
    attrs = create_microsvc_message_attributes(
        awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageResubmitFailedRequest
    )
    sqs_to_master = SqsMessageDurable( self.spot_master_queue_name, self.region_name, profile_name=self.profile_name )
    sqs_to_master.send_message( resubmit_json.to_json(), message_attributes=attrs )
def process(self, message):
    """ Spot Request has completed, write completion info to SpotRequestItem in DynamoDB,
    let master know this request has completed so the master can determine if the job has completed

    Steps:
      1. Parse the message body into a SpotRequestMsg and fetch the matching spot request item
      2. Save the completion timestamp, return code and any non-empty stdout/stderr on the item,
         marking it closed with state instance_complete
      3. Send a TYPE_INCR_INSTANCE_SUCCESS_CNT message to the spot master queue
      4. Delete the processed message from the spot request queue

    Any StandardError is logged and swallowed; in that case the message is not deleted.

    :param message: SQS Message instance
    """
    # Bound before the try so the exception handler can reference it even when the
    # failure happens while parsing the message or fetching the item
    spot_request_item = None
    try:
        spot_request_msg = SpotRequestMsg(raw_json=message.get_body())
        spot_request_item = get_spot_request_item(
            self.spot_request_table_name,
            spot_request_msg.spot_request_uuid,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        name_value_pairs = spot_request_msg.name_value_pairs
        ts_cmd_complete = name_value_pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_COMPLETE_TIMESTAMP]
        cmd_returncode = name_value_pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_RETURNCODE]
        cmd_std_out = name_value_pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_STD_OUT]
        cmd_std_err = name_value_pairs[SpotRequestMsg.PAIR_NAME_BATCH_PROCESS_STD_ERR]
        key_value_pairs = {
            TableSpotRequest.is_open: 0,
            TableSpotRequest.spot_request_state_code: SpotRequestStateCode.instance_complete,
            TableSpotRequest.ts_cmd_complete: ts_cmd_complete,
            TableSpotRequest.cmd_returncode: cmd_returncode,
        }
        # Only store stdout/stderr when there is something to store (skips None and empty)
        if cmd_std_out:
            key_value_pairs[TableSpotRequest.cmd_std_out] = cmd_std_out
        if cmd_std_err:
            key_value_pairs[TableSpotRequest.cmd_std_err] = cmd_std_err
        spot_request_row_partial_save(
            self.spot_request_table_name,
            spot_request_item,
            key_value_pairs,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        # let the Master increment the completion count to determine if the job is complete
        master_msg_incr_instance_success = SpotMasterMsg(
            spot_master_uuid=spot_request_msg.spot_master_uuid,
            spot_master_msg_type=SpotMasterMsg.TYPE_INCR_INSTANCE_SUCCESS_CNT,
        )
        message_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageIncrSuccessCnt
        )
        spot_master_sqs_message_durable = SqsMessageDurable(
            self.spot_master_queue_name, self.region_name, profile_name=self.profile_name
        )
        spot_master_sqs_message_durable.send_message(
            master_msg_incr_instance_success.to_json(), message_attributes=message_attributes
        )
        self.spot_request_sqs_message_durable.delete_message(message)
    except StandardError as e:
        # The item may never have been fetched; fall back to an empty header so the
        # original exception is logged instead of being masked by a failure in here
        if spot_request_item is not None:
            msg_hdr = fmt_request_item_msg_hdr(spot_request_item)
        else:
            msg_hdr = ''
        logger.error(msg_hdr + "Exiting SpotRequestDispatcher due to exception")
        logger.error(msg_hdr + str(e))
        logger.error(msg_hdr + traceback.format_exc())
def process( self, message ) :
    """ Check the state of a Spot Master and run the handler for that state

    Steps:
      1. Parse the message body into a SpotMasterMsg and read the master row from DynamoDB
      2. Record the time of this check in ts_last_state_check, on the item and via a partial save
      3. Call the handler matching the master's current state code
      4. Delete the processed message from the spot master queue
      5. Unless the state is cleanup_complete, queue another TYPE_CHECK_STATUS message,
         delayed by 60 seconds by default or 5 seconds for the states that set a shorter delay

    Any StandardError is logged and swallowed; nothing is re-raised to the caller.

    :param message: SQS Message instance
    """
    # Bound before the try so the exception handler can reference it even when the
    # message body cannot be parsed
    spot_master_uuid = None
    try:
        spot_master_msg = SpotMasterMsg( raw_json=message.get_body() )
        spot_master_uuid = spot_master_msg.spot_master_uuid
        logger.info( fmt_master_uuid_msg_hdr( spot_master_uuid ) + 'process_check_status' )
        # Get master row from DynamoDB and process based on state
        dynamodb_conn = boto.dynamodb2.connect_to_region( self.region_name, profile_name=self.profile_name )
        spot_master_table = Table( self.spot_master_table_name, connection=dynamodb_conn )
        spot_master_item = spot_master_table.get_item( spot_master_uuid=spot_master_uuid )
        master_state_code = spot_master_item[TableSpotMaster.spot_master_state_code]
        logger.info( fmt_master_uuid_msg_hdr( spot_master_uuid ) + 'master state=' + master_state_code )
        next_status_msg_delay_secs = 60
        is_send_master_msg_check_status = True
        # Sample the clock once so the in-memory item and the saved row carry the same timestamp
        ts_last_state_check = int( time.time() )
        spot_master_item[ TableSpotMaster.ts_last_state_check ] = ts_last_state_check
        spot_master_row_partial_save( self.spot_master_table_name, spot_master_item,
                                      {TableSpotMaster.ts_last_state_check:ts_last_state_check},
                                      region_name=self.region_name, profile_name=self.profile_name )
        # Process based on the current Master State
        if master_state_code == SpotMasterStateCode.master_resources_in_progress:
            self.handle_state_master_resources_in_progress( spot_master_item )
            next_status_msg_delay_secs = 5
        elif master_state_code == SpotMasterStateCode.master_role_policy_in_progress:
            self.handle_state_master_role_policy_in_progress( spot_master_item, dynamodb_conn )
            next_status_msg_delay_secs = 5
        elif master_state_code == SpotMasterStateCode.waiting_for_instances_complete:
            self.handle_state_waiting_for_instances_complete( spot_master_item )
        elif master_state_code == SpotMasterStateCode.waiting_for_instances_terminated:
            self.handle_state_waiting_for_instances_terminated( spot_master_item )
        elif master_state_code == SpotMasterStateCode.waiting_for_master_resources_terminated:
            self.handle_state_waiting_for_master_resources_terminated( spot_master_item )
            next_status_msg_delay_secs = 5
        elif master_state_code == SpotMasterStateCode.cleanup_in_progress:
            self.handle_state_cleanup_in_progress( spot_master_item )
        elif master_state_code == SpotMasterStateCode.cleanup_complete:
            self.handle_state_cleanup_complete( spot_master_item )
            # Final state, stop the chain of CheckStatus messages
            is_send_master_msg_check_status = False

        self.spot_master_sqs_message_durable.delete_message(message)

        if is_send_master_msg_check_status:
            spot_master_msg_check_status = SpotMasterMsg( spot_master_uuid=spot_master_uuid,
                                                          spot_master_msg_type=SpotMasterMsg.TYPE_CHECK_STATUS )
            message_attributes = create_microsvc_message_attributes(
                awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageCheckStatus )
            self.spot_master_sqs_message_durable.send_message( spot_master_msg_check_status.to_json(),
                                                               delay_seconds=next_status_msg_delay_secs,
                                                               message_attributes=message_attributes )
    except StandardError as e:
        # The uuid may never have been read; fall back to an empty header so the
        # original exception is logged instead of being masked by a failure in here
        if spot_master_uuid is not None:
            msg_hdr = fmt_master_uuid_msg_hdr( spot_master_uuid )
        else:
            msg_hdr = ''
        logger.error( msg_hdr + str(e) )
        logger.error( msg_hdr + traceback.format_exc() )
def submit_spot_batch_job( argv ):
    """ Submit a user's spot batch job

    Sends one TYPE_SUBMIT_BATCH message to the spot master queue containing the raw
    contents of the two parm files - Batch Job (required) and User Job (optional).

    Command line arguments are read from sys.argv:
      sys.argv[1] - path/name.ext of the log configuration file
      sys.argv[2] - path/name.ext of the Batch Job Parm file
      sys.argv[3] - path/name.ext of the User Job Parm file, only used when exactly 4 args are present

    Exits the process with code 8 when a required argument is missing or on any StandardError.

    :param argv: not referenced, the arguments are taken from sys.argv
    """
    import logging.config

    arg_count = len(sys.argv)
    if arg_count == 1:
        print('ERROR: Missing log configuration file, first argument must be path/name.ext of the log configuration file')
        sys.exit(8)

    logging.config.fileConfig( sys.argv[1], disable_existing_loggers=False)
    logger = logging.getLogger(__name__)

    if arg_count == 2:
        logger.error( 'ERROR: Missing Batch Job Parm file, second argument must be path/name.ext of the log Batch Job Parm file' )
        sys.exit(8)

    try:
        logger.info("Starting")
        batch_parm_path = sys.argv[2]
        user_parm_path = sys.argv[3] if arg_count == 4 else None

        with open( batch_parm_path ) as batch_parm_file:
            raw_batch_job_parm_item = batch_parm_file.read()

        # The User Job Parm file is optional
        raw_user_job_parm_item = None
        if user_parm_path is not None:
            with open( user_parm_path ) as user_parm_file:
                raw_user_job_parm_item = user_parm_file.read()

        # Region and profile for the queue come from the Batch Job Parm file
        batch_job_parm_item = BatchJobParmItem( stringParmFile=raw_batch_job_parm_item )
        master_queue = SqsMessageDurable(
            awsspotbatch.common.const.SPOT_MASTER_QUEUE_NAME,
            batch_job_parm_item.primary_region_name,
            profile_name=batch_job_parm_item.profile_name,
        )

        spot_master_uuid = str(uuid.uuid1())
        logger.info('Submitting test batch message, spot_master_uuid=' + spot_master_uuid )
        submit_msg = SpotMasterMsg(
            spot_master_uuid=spot_master_uuid,
            spot_master_msg_type=SpotMasterMsg.TYPE_SUBMIT_BATCH,
            raw_batch_job_parm_item=raw_batch_job_parm_item,
            raw_user_job_parm_item=raw_user_job_parm_item,
        )
        submit_msg_attributes = create_microsvc_message_attributes(
            awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageSubmitBatch
        )
        master_queue.send_message( submit_msg.to_json(), message_attributes=submit_msg_attributes )
        logger.info( 'Completed Successfully' )

    except StandardError as e:
        logger.error( e )
        logger.error( traceback.format_exc() )
        sys.exit(8)
def send_check_status( self, spot_master_uuid ):
    """ Queue a CheckStatus message for this Master, delivered after a 5 second delay

    This is the first message that will do CheckStatus to check/transition the Master status;
    SpotMasterMessageCheckStatus.process() then continues to queue up another CheckStatus
    message (with a variable message delay based on the state) until the job completes

    :param spot_master_uuid: uuid of the Master to be checked
    """
    check_status_attributes = create_microsvc_message_attributes(
        awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageCheckStatus
    ) if False else None
    check_status_msg = SpotMasterMsg(
        spot_master_uuid=spot_master_uuid,
        spot_master_msg_type=SpotMasterMsg.TYPE_CHECK_STATUS,
    )
    check_status_attributes = create_microsvc_message_attributes(
        awsspotbatch.common.const.MICROSVC_MASTER_CLASSNAME_SpotMasterMessageCheckStatus
    )
    self.spot_master_sqs_message_durable.send_message(
        check_status_msg.to_json(),
        delay_seconds=5,
        message_attributes=check_status_attributes,
    )
def send_test_data( self ):
    """ Send a SubmitBatch message followed by a CheckStatus message to the spot master queue

    Both messages share one newly generated spot_master_uuid and are sent without
    message attributes.  On any StandardError the error and traceback are logged and
    the process exits with code 8.
    """
    try:
        test_master_uuid = str( uuid.uuid1() )
        # Build both messages before the queue wrapper is created
        test_msgs = (
            SpotMasterMsg( test_master_uuid, SpotMasterMsg.TYPE_SUBMIT_BATCH ),
            SpotMasterMsg( test_master_uuid, SpotMasterMsg.TYPE_CHECK_STATUS ),
        )
        master_queue = SqsMessageDurable(
            self.spot_master_queue_name,
            region_name=self.region_name,
            profile_name=self.profile_name,
        )
        for test_msg in test_msgs:
            master_queue.send_message( test_msg.to_json() )
    except StandardError as e:
        logger.error( e )
        logger.error( traceback.format_exc() )
        sys.exit(8)