def get_m_state(self):
    """
    Update the machine state of the current instance by querying the
    cloud middleware for the instance object itself (via the instance
    id) and updating ``self.m_state`` field to match the state returned
    by the cloud middleware. Also, update local ``last_state_update``
    timestamp.

    :rtype: String
    :return: the current state of the instance as obtained from the
             cloud middleware
    """
    self.last_state_update = Time.now()
    # Refresh the cached cloud instance object (deep=True re-queries the
    # cloud middleware rather than using any cached copy)
    self.get_cloud_instance_object(deep=True)
    if self.inst:
        try:
            state = self.inst.state
            log.debug("Requested instance {0} update: old state: {1}; new state: {2}"
                      .format(self.get_desc(), self.m_state, state))
            if state != self.m_state:
                self.m_state = state
                self.last_m_state_change = Time.now()
        except EC2ResponseError as e:
            log.debug("Error updating instance {0} state: {1}".format(
                self.get_id(), e))
            self.m_state = instance_states.ERROR
    # The docstring promises the current state and callers (e.g.
    # maintain(), which does `state = self.get_m_state()`) rely on the
    # return value, so return it explicitly.
    return self.m_state
def get_all_services_status(self, trans):
    """
    Compose and return a JSON string describing the status of all
    platform services plus general cluster details (Galaxy info,
    snapshot progress, startup time, uptime, pending messages).
    """
    manager = self.app.manager
    report = manager.get_all_services_status()
    # report['filesystems'] = manager.get_all_filesystems_status()
    report['galaxy_dns'] = self.get_galaxy_dns(trans)
    report['galaxy_rev'] = manager.get_galaxy_rev()
    report['galaxy_admins'] = manager.get_galaxy_admins()
    snapshot_info = manager.snapshot_status()
    report['snapshot'] = {'status': str(snapshot_info[0]),
                          'progress': str(snapshot_info[1])}
    report['master_is_exec_host'] = manager.master_exec_host
    report['ignore_deps_framework'] = self.app.config.ignore_unsatisfiable_dependencies
    report['messages'] = self.messages_string(self.app.msgs.get_messages())
    report['cluster_startup_time'] = manager.startup_time.strftime("%b %d %Y %H:%M:%S")
    report['cluster_uptime'] = misc.format_time_delta(Time.now() - manager.startup_time)
    # report['dummy'] = str(datetime.now())  # Used for testing only
    return json.dumps(report)
def get_status_array(self):
    """
    Compose this worker's status as a list of fields:
    ``[id, load, time-in-state, nfs_data, nfs_tools, nfs_indices,
    nfs_sge, get_cert, sge_started, worker_status]``.

    The load field is the worker-reported 1/5/15-minute load averages
    normalized by the number of CPUs when available; otherwise a
    descriptive state string ("Starting"/"Running") or the raw load.
    """
    # Default so 'ld' is always bound; the original branch structure did
    # not cover every case (e.g. running + alive + empty load reported),
    # which raised UnboundLocalError when building the return list.
    ld = self.load
    if self.m_state.lower() == "running":
        # For extra states.
        if self.is_alive is not True:
            ld = "Starting"
        elif self.load:
            lds = self.load.split(' ')
            if len(lds) == 3:
                try:
                    # Normalize each load average by the CPU count
                    load1 = float(lds[0]) / self.num_cpus
                    load2 = float(lds[1]) / self.num_cpus
                    load3 = float(lds[2]) / self.num_cpus
                    ld = "%s %s %s" % (load1, load2, load3)
                except Exception as e:
                    # Fall back to the raw value rather than failing
                    log.debug("Problems normalizing load: %s" % e)
                    ld = self.load
            else:
                ld = self.load
    elif self.worker_status == "Ready":
        ld = "Running"
    return [self.id, ld,
            misc.format_seconds(Time.now() - self.last_m_state_change),
            self.nfs_data, self.nfs_tools, self.nfs_indices, self.nfs_sge,
            self.get_cert, self.sge_started, self.worker_status]
def maintain(self):
    """
    Keep this instance functional based on its current state and status.
    Depending on timeouts and reboot history, this may reboot or even
    terminate the instance.
    """
    def seconds_since(timestamp):
        # Elapsed whole seconds between now and the given timestamp
        return (Time.now() - timestamp).seconds

    def reboot_terminate_logic():
        """
        Decide whether to reboot or terminate the instance. CALL THIS
        METHOD CAREFULLY because it defaults to terminating the
        instance!
        """
        if self.reboot_count < self.config.instance_reboot_attempts:
            # Still within the reboot budget: try a reboot first
            self.reboot()
        elif self.terminate_attempt_count >= self.config.instance_terminate_attempts:
            # Reboots and terminate attempts both exhausted: give up
            log.info(
                "Tried terminating instance {0} {1} times but was unsuccessful. Giving up."
                .format(self.inst.id, self.config.instance_terminate_attempts))
            self._remove_instance()
        else:
            # Reboots exhausted: escalate to termination
            log.info(
                "Instance {0} not responding after {1} reboots. Terminating instance."
                .format(self.id, self.reboot_count))
            self.terminate()

    # Refresh the machine state, then resolve based on it
    state = self.get_m_state()
    if state in (instance_states.PENDING, instance_states.SHUTTING_DOWN):
        # Stuck in a transitional state: act once both grace periods expire
        if (seconds_since(self.last_m_state_change) > self.config.instance_state_change_wait and
                seconds_since(self.time_rebooted) > self.config.instance_reboot_timeout):
            log.debug("'Maintaining' instance {0} stuck in '{1}' state.".format(
                self.get_desc(), state))
            reboot_terminate_logic()
    elif state == instance_states.ERROR:
        log.debug("'Maintaining' instance {0} in '{1}' state.".format(
            self.get_desc(), instance_states.ERROR))
        reboot_terminate_logic()
    elif state == instance_states.TERMINATED:
        log.debug("'Maintaining' instance {0} in '{1}' state.".format(
            self.get_desc(), instance_states.TERMINATED))
        self._remove_instance()
    elif state == instance_states.RUNNING:
        log.debug(
            "'Maintaining' instance {0} in '{1}' state (last comm before {2} | "
            "last m_state change before {3} | time_rebooted before {4}".format(
                self.get_desc(), instance_states.RUNNING,
                dt.timedelta(seconds=seconds_since(self.last_comm)),
                dt.timedelta(seconds=seconds_since(self.last_m_state_change)),
                dt.timedelta(seconds=seconds_since(self.time_rebooted))))
        # Nominally running but unresponsive: act only after the comm,
        # state-change, and reboot timeouts have all expired
        if (seconds_since(self.last_comm) > self.config.instance_comm_timeout and
                seconds_since(self.last_m_state_change) > self.config.instance_state_change_wait and
                seconds_since(self.time_rebooted) > self.config.instance_reboot_timeout):
            reboot_terminate_logic()
def maintain(self):
    """
    Based on the state and status of this instance, try to do the right
    thing to keep the instance functional. Note that this may lead to
    terminating the instance.
    """
    def reboot_terminate_logic():
        """
        Make a decision whether to terminate or reboot an instance.
        CALL THIS METHOD CAREFULLY because it defaults to terminating the
        instance!
        """
        # Reboot while still under the reboot-attempt quota
        if self.reboot_count < self.config.instance_reboot_attempts:
            self.reboot()
        # Terminate attempts exhausted too: give up and drop the instance
        elif self.terminate_attempt_count >= self.config.instance_terminate_attempts:
            log.info("Tried terminating instance {0} {1} times but was unsuccessful. Giving up."
                     .format(self.inst.id, self.config.instance_terminate_attempts))
            self._remove_instance()
        # Reboot quota exhausted: escalate to termination
        else:
            log.info("Instance {0} not responding after {1} reboots. Terminating instance."
                     .format(self.id, self.reboot_count))
            self.terminate()

    # Update state then do resolution
    state = self.get_m_state()
    if state == instance_states.PENDING or state == instance_states.SHUTTING_DOWN:
        # Transitional state held too long: act only after both the
        # state-change grace period and the reboot timeout have expired
        if (Time.now() - self.last_m_state_change).seconds > self.config.instance_state_change_wait and \
           (Time.now() - self.time_rebooted).seconds > self.config.instance_reboot_timeout:
            log.debug("'Maintaining' instance {0} stuck in '{1}' state.".format(
                self.get_desc(), state))
            reboot_terminate_logic()
    elif state == instance_states.ERROR:
        log.debug("'Maintaining' instance {0} in '{1}' state.".format(self.get_desc(),
                                                                      instance_states.ERROR))
        reboot_terminate_logic()
    elif state == instance_states.TERMINATED:
        log.debug("'Maintaining' instance {0} in '{1}' state.".format(self.get_desc(),
                                                                      instance_states.TERMINATED))
        self._remove_instance()
    elif state == instance_states.RUNNING:
        log.debug("'Maintaining' instance {0} in '{1}' state (last comm before {2} | "
                  "last m_state change before {3} | time_rebooted before {4}"
                  .format(self.get_desc(), instance_states.RUNNING,
                          dt.timedelta(seconds=(Time.now() - self.last_comm).seconds),
                          dt.timedelta(seconds=(Time.now() - self.last_m_state_change).seconds),
                          dt.timedelta(seconds=(Time.now() - self.time_rebooted).seconds)))
        # Nominally running but not communicating: reboot/terminate only
        # after all three timeouts (comm, state-change, reboot) expire
        if (Time.now() - self.last_comm).seconds > self.config.instance_comm_timeout and \
           (Time.now() - self.last_m_state_change).seconds > self.config.instance_state_change_wait and \
           (Time.now() - self.time_rebooted).seconds > self.config.instance_reboot_timeout:
            reboot_terminate_logic()
def reboot(self, count_reboot=True):
    """
    Reboot this instance. If ``count_reboot`` is set, increment the
    number of reboots for this instance (a threshold in this count leads
    to eventual instance termination, see
    ``self.config.instance_reboot_attempts``).
    """
    if self.inst is not None:
        # Show reboot count only if this reboot counts toward the reboot quota
        s = " (reboot #{0})".format(self.reboot_count + 1)
        log.info("Rebooting instance {0}{1}.".format(self.get_desc(),
                                                     s if count_reboot else ''))
        try:
            self.inst.reboot()
            self.time_rebooted = Time.now()
        except EC2ResponseError as e:
            log.error("Trouble rebooting instance {0}: {1}".format(self.get_desc(), e))
    if count_reboot:
        # Increment regardless of reboot success so maintain()'s
        # reboot_terminate_logic() can eventually escalate to termination;
        # the docstring promises this increment but the original body
        # never performed it, leaving the reboot loop unbounded.
        self.reboot_count += 1
        log.debug("Incremented instance reboot count to {0}".format(self.reboot_count))
def get_all_services_status(self, trans):
    """
    Return a JSON-encoded dictionary with the status of all services and
    assorted cluster-level information (Galaxy details, snapshot
    progress, startup time, uptime, queued messages).
    """
    mgr = self.app.manager
    status_dict = mgr.get_all_services_status()
    # status_dict['filesystems'] = mgr.get_all_filesystems_status()
    status_dict['galaxy_dns'] = self.get_galaxy_dns(trans)
    status_dict['galaxy_rev'] = mgr.get_galaxy_rev()
    status_dict['galaxy_admins'] = mgr.get_galaxy_admins()
    snap = mgr.snapshot_status()
    status_dict['snapshot'] = {'status': str(snap[0]),
                               'progress': str(snap[1])}
    status_dict['master_is_exec_host'] = mgr.master_exec_host
    status_dict['ignore_deps_framework'] = self.app.config.ignore_unsatisfiable_dependencies
    status_dict['messages'] = self.messages_string(self.app.msgs.get_messages())
    startup_time = mgr.startup_time
    status_dict['cluster_startup_time'] = startup_time.strftime("%b %d %Y %H:%M:%S")
    status_dict['cluster_uptime'] = misc.format_time_delta(Time.now() - startup_time)
    # status_dict['dummy'] = str(datetime.now())  # Used for testing only
    return json.dumps(status_dict)
def get_status_dict(self):
    """
    Return a dictionary describing this worker's status: identity
    (id, alias, type, public IP), machine/worker state, NFS mount
    flags, and the reported system load ('ld') normalized by the number
    of CPUs when available.
    """
    toret = {'id': self.id,
             'alias': self.alias,
             'ld': self.load,
             'time_in_state': misc.format_seconds(Time.now() - self.last_m_state_change),
             'nfs_data': self.nfs_data,
             'nfs_tools': self.nfs_tools,
             'nfs_indices': self.nfs_indices,
             'nfs_sge': self.nfs_sge,
             'nfs_tfs': self.nfs_tfs,
             'get_cert': self.get_cert,
             'slurmd_running': self.slurmd_running,
             'worker_status': self.worker_status,
             'instance_state': self.m_state,
             'instance_type': self.type,
             'public_ip': self.public_ip}
    if self.load:
        lds = self.load.split(' ')
        if len(lds) == 3:
            try:
                # Normalize the reported load averages by the CPU count,
                # mirroring get_status_array()
                toret['ld'] = "%s %s %s" % (float(lds[0]) / self.num_cpus,
                                            float(lds[1]) / self.num_cpus,
                                            float(lds[2]) / self.num_cpus)
            except Exception as e:
                # Keep the raw load string rather than letting a malformed
                # value break the whole status report (the sibling
                # get_status_array() guards this same conversion)
                log.debug("Problems normalizing load: %s" % e)
    return toret
def get_status_array(self):
    """
    Compose this worker's status as a list of fields:
    ``[id, load, time-in-state, nfs_data, nfs_tools, nfs_indices,
    nfs_sge, get_cert, sge_started, worker_status]``.

    The load field holds CPU-normalized load averages when available,
    otherwise a descriptive state string or the raw reported load.
    """
    # Default so 'ld' is always bound; the original branches left it
    # undefined in some paths (e.g. running + alive with no load yet),
    # raising UnboundLocalError when the return list was built.
    ld = self.load
    if self.m_state.lower() == "running":
        # For extra states.
        if self.is_alive is not True:
            ld = "Starting"
        elif self.load:
            lds = self.load.split(' ')
            if len(lds) == 3:
                try:
                    # Normalize the 1/5/15-minute averages by CPU count
                    load1 = float(lds[0]) / self.num_cpus
                    load2 = float(lds[1]) / self.num_cpus
                    load3 = float(lds[2]) / self.num_cpus
                    ld = "%s %s %s" % (load1, load2, load3)
                except Exception as e:
                    # Fall back to the raw load value rather than failing
                    log.debug("Problems normalizing load: %s" % e)
                    ld = self.load
            else:
                ld = self.load
    elif self.worker_status == "Ready":
        ld = "Running"
    return [self.id, ld,
            misc.format_seconds(Time.now() - self.last_m_state_change),
            self.nfs_data, self.nfs_tools, self.nfs_indices, self.nfs_sge,
            self.get_cert, self.sge_started, self.worker_status]
def get_status_dict(self):
    """
    Return a dictionary description of this worker's status, including
    identity fields, machine/worker state, NFS mount flags, and the
    system load ('ld'), normalized by CPU count when possible.
    """
    toret = {'id': self.id,
             'alias': self.alias,
             'ld': self.load,
             'time_in_state': misc.format_seconds(Time.now() - self.last_m_state_change),
             'nfs_data': self.nfs_data,
             'nfs_tools': self.nfs_tools,
             'nfs_indices': self.nfs_indices,
             'nfs_sge': self.nfs_sge,
             'nfs_tfs': self.nfs_tfs,
             'get_cert': self.get_cert,
             'slurmd_running': self.slurmd_running,
             'worker_status': self.worker_status,
             'instance_state': self.m_state,
             'instance_type': self.type,
             'public_ip': self.public_ip}
    if self.load:
        lds = self.load.split(' ')
        if len(lds) == 3:
            try:
                # Normalize the load averages by the CPU count, the same
                # way get_status_array() does
                toret['ld'] = "%s %s %s" % (float(lds[0]) / self.num_cpus,
                                            float(lds[1]) / self.num_cpus,
                                            float(lds[2]) / self.num_cpus)
            except Exception as e:
                # Guard the float conversion (consistent with
                # get_status_array); keep the raw load on failure instead
                # of breaking the entire status report
                log.debug("Problems normalizing load: %s" % e)
    return toret
def handle_message(self, msg):
    """
    Process one control message received from this worker instance.

    ``msg`` is a ``' | '``-delimited string whose first field is the
    message type (``ALIVE``, ``GET_MOUNTPOINTS``, ``MOUNT_DONE``,
    ``WORKER_H_CERT``, ``NODE_READY``, ``NODE_STATUS``,
    ``NODE_SHUTTING_DOWN``); remaining fields are type-specific.
    Receiving any message marks the instance alive and refreshes the
    last-communication timestamp. Messages are acted upon only while the
    console monitor has an active connection.
    """
    # log.debug( "Handling message: %s from %s" % ( msg, self.id ) )
    self.is_alive = True
    self.last_comm = Time.now()
    # Transition from states to a particular response.
    if self.app.manager.console_monitor.conn:
        msg_type = msg.split(' | ')[0]
        if msg_type == "ALIVE":
            # Worker announced itself; record its network/hardware details
            self.worker_status = "Starting"
            log.info("Instance %s reported alive" % self.get_desc())
            msp = msg.split(' | ')
            self.private_ip = msp[1]
            self.public_ip = msp[2]
            self.zone = msp[3]
            self.type = msp[4]
            self.ami = msp[5]
            try:
                self.local_hostname = msp[6]
                self.num_cpus = int(msp[7])
                self.total_memory = int(msp[8])
                self.hostname = msp[9]
            except:
                # Older versions of CloudMan did not pass this value so if the master
                # and the worker are running 2 diff versions (can happen after an
                # automatic update), don't crash here.
                self.local_hostname = self.public_ip
            log.debug("INSTANCE_ALIVE private_ip: %s public_ip: %s zone: %s "
                      "type: %s AMI: %s local_hostname: %s, CPUs: %s, hostname: %s"
                      % (self.private_ip, self.public_ip, self.zone, self.type,
                         self.ami, self.local_hostname, self.num_cpus,
                         self.hostname))
            # Add instance IP/name to /etc/hosts
            misc.add_to_etc_hosts(self.private_ip,
                                  [self.alias, self.local_hostname, self.hostname])
            # Instance is alive and responding.
            self.send_mount_points()
        elif msg_type == "GET_MOUNTPOINTS":
            self.send_mount_points()
        elif msg_type == "MOUNT_DONE":
            log.debug("Got MOUNT_DONE message")
            # Update the list of mount points that have mounted
            if len(msg.split(' | ')) > 1:
                msg_body = msg.split(' | ')[1]
                try:
                    body = json.loads(msg_body)
                    mounted_fs = body.get('mounted_fs', {})
                    # Currently, only interested in the transient FS
                    self.nfs_tfs = mounted_fs.get('transient_nfs', 0)
                    log.debug("Got transient_nfs state on {0}: {1}".format(
                        self.alias, self.nfs_tfs))
                except ValueError, vexc:
                    log.warning('ValueError trying to decode msg: {0}'
                                .format(vexc))
            self.app.manager.sync_etc_hosts()
            self.send_master_pubkey()
            # Add hostname to /etc/hosts (for SGE config)
            if self.app.cloud_type in ('openstack', 'eucalyptus'):
                hn2 = ''
                if '.' in self.local_hostname:
                    # Short (unqualified) form of the hostname
                    hn2 = (self.local_hostname).split('.')[0]
                worker_host_line = '{ip} {hn1} {hn2}\n'.format(
                    ip=self.private_ip, hn1=self.local_hostname, hn2=hn2)
                log.debug("worker_host_line: {0}".format(worker_host_line))
                with open('/etc/hosts', 'r+') as f:
                    hosts = f.readlines()
                    # Only append if this exact line is not already present
                    if worker_host_line not in hosts:
                        log.debug("Adding worker {0} to /etc/hosts".format(
                            self.local_hostname))
                        f.write(worker_host_line)
            if self.app.cloud_type == 'opennebula':
                f = open("/etc/hosts", 'a')
                f.write("%s\tworker-%s\n" % (self.private_ip, self.id))
                f.close()
            # log.debug("Update /etc/hosts through master")
            # self.app.manager.update_etc_host()
        elif msg_type == "WORKER_H_CERT":
            log.debug("Got WORKER_H_CERT message")
            self.is_alive = True
            # This is for the case that an existing worker is added to a new master.
            self.app.manager.save_host_cert(msg.split(" | ")[1])
            log.debug("Worker '%s' host certificate received and appended "
                      "to /root/.ssh/known_hosts" % self.id)
            for job_manager_svc in self.app.manager.service_registry.active(
                    service_role=ServiceRole.JOB_MANAGER):
                job_manager_svc.add_node(self)
                # Instruct the worker to start appropriate job manager daemon
                if ServiceRole.SLURMCTLD in job_manager_svc.svc_roles:
                    self.send_start_slurmd()
                else:
                    self.send_start_sge()
            else:
                # NOTE(review): this is a for/else with no `break`, so the
                # warning below fires even when a job manager service WAS
                # found and handled; it looks intended only for the case
                # where the loop found no active service — confirm.
                log.warning('Could not get a handle on job manager service to '
                            'add node {0}'.format(self.get_desc()))
            # If there are any bucket-based FSs, tell the worker to add those
            fss = self.app.manager.get_services(svc_type=ServiceType.FILE_SYSTEM)
            for fs in fss:
                if len(fs.buckets) > 0:
                    for b in fs.buckets:
                        self.send_add_s3fs(b.bucket_name, fs.svc_roles)
            log.info("Waiting on worker instance %s to configure itself." %
                     self.get_desc())
        elif msg_type == "NODE_READY":
            self.worker_status = "Ready"
            log.info("Instance %s ready" % self.get_desc())
            # Make sure the instace is tagged (this is also necessary to do
            # here for OpenStack because it does not allow tags to be added
            # until an instance is 'running')
            self.app.cloud_interface.add_tag(self.inst, 'clusterName',
                                             self.app.config['cluster_name'])
            self.app.cloud_interface.add_tag(self.inst, 'role', 'worker')
            self.app.cloud_interface.add_tag(self.inst, 'alias', self.alias)
            self.app.cloud_interface.add_tag(
                self.inst, 'Name',
                "Worker: {0}".format(self.app.config['cluster_name']))
            self.app.manager.update_condor_host(self.public_ip)
        elif msg_type == "NODE_STATUS":
            # log.debug("Node {0} status message: {1}".format(self.get_desc(), msg))
            # Ignore status updates while the worker is being stopped
            if not self.worker_status == 'Stopping':
                msplit = msg.split(' | ')
                self.nfs_data = msplit[1]
                self.nfs_tools = msplit[2]  # Workers currently do not update this field
                self.nfs_indices = msplit[3]
                self.nfs_sge = msplit[4]
                self.get_cert = msplit[5]
                self.sge_started = msplit[6]
                self.load = msplit[7]
                self.worker_status = msplit[8]
                self.nfs_tfs = msplit[9]
                self.slurmd_running = msplit[10]
            else:
                log.debug("Worker {0} in state Stopping so not updating status"
                          .format(self.get_desc()))
        elif msg_type == 'NODE_SHUTTING_DOWN':
            msplit = msg.split(' | ')
            self.worker_status = msplit[1]
        else:  # Catch-all condition
            log.debug("Unknown Message: %s" % msg)
self.inst = inst # boto object of the instance self.spot_state = None self.private_ip = None self.public_ip = None self.local_hostname = None if inst: try: self.id = str(inst.id) except EC2ResponseError, e: log.error("Error retrieving instance id: %s" % e) else: self.id = None # Machine state as obtained from the cloud middleware (see # instance_states Bunch) self.m_state = m_state self.last_m_state_change = Time.now() # A time stamp when the most recent update of the instance state # (m_state) took place self.last_state_update = Time.now() self.is_alive = False self.num_cpus = 1 self.total_memory = 1 # in bytes self.time_rebooted = TIME_IN_PAST # Initialize to a date in the past self.reboot_count = 0 self.terminate_attempt_count = 0 self.last_comm = TIME_IN_PAST # Initialize to a date in the past self.nfs_data = 0 self.nfs_tools = 0 self.nfs_indices = 0 self.nfs_sge = 0 self.nfs_tfs = 0 # Transient file system, NFS-mounted from the master
def handle_message(self, msg):
    """
    Handle a single control message sent by this worker instance.

    ``msg`` is a ``' | '``-separated string; the first field selects the
    handler (``ALIVE``, ``GET_MOUNTPOINTS``, ``MOUNT_DONE``,
    ``WORKER_H_CERT``, ``NODE_READY``, ``NODE_STATUS``,
    ``NODE_SHUTTING_DOWN``) and the remaining fields carry the payload.
    Any message marks this instance alive and updates the
    last-communication timestamp; handlers run only while the console
    monitor connection is up.
    """
    # log.debug( "Handling message: %s from %s" % ( msg, self.id ) )
    self.is_alive = True
    self.last_comm = Time.now()
    # Transition from states to a particular response.
    if self.app.manager.console_monitor.conn:
        msg_type = msg.split(' | ')[0]
        if msg_type == "ALIVE":
            # Record the worker's reported network and hardware details
            self.worker_status = "Starting"
            log.info("Instance %s reported alive" % self.get_desc())
            msp = msg.split(' | ')
            self.private_ip = msp[1]
            self.public_ip = msp[2]
            self.zone = msp[3]
            self.type = msp[4]
            self.ami = msp[5]
            try:
                self.local_hostname = msp[6]
                self.num_cpus = int(msp[7])
                self.total_memory = int(msp[8])
                self.hostname = msp[9]
            except:
                # Older versions of CloudMan did not pass this value so if the master
                # and the worker are running 2 diff versions (can happen after an
                # automatic update), don't crash here.
                self.local_hostname = self.public_ip
            log.debug("INSTANCE_ALIVE private_ip: %s public_ip: %s zone: %s "
                      "type: %s AMI: %s local_hostname: %s, CPUs: %s, hostname: %s"
                      % (self.private_ip, self.public_ip, self.zone, self.type,
                         self.ami, self.local_hostname, self.num_cpus,
                         self.hostname))
            # Add instance IP/name to /etc/hosts
            misc.add_to_etc_hosts(self.private_ip,
                                  [self.alias, self.local_hostname, self.hostname])
            # Instance is alive and responding.
            self.send_mount_points()
        elif msg_type == "GET_MOUNTPOINTS":
            self.send_mount_points()
        elif msg_type == "MOUNT_DONE":
            log.debug("Got MOUNT_DONE message")
            # Update the list of mount points that have mounted
            if len(msg.split(' | ')) > 1:
                msg_body = msg.split(' | ')[1]
                try:
                    body = json.loads(msg_body)
                    mounted_fs = body.get('mounted_fs', {})
                    # Currently, only interested in the transient FS
                    self.nfs_tfs = mounted_fs.get('transient_nfs', 0)
                    log.debug("Got transient_nfs state on {0}: {1}".format(
                        self.alias, self.nfs_tfs))
                except ValueError, vexc:
                    log.warning('ValueError trying to decode msg: {0}'
                                .format(vexc))
            self.app.manager.sync_etc_hosts()
            self.send_master_pubkey()
            # Add hostname to /etc/hosts (for SGE config)
            if self.app.cloud_type in ('openstack', 'eucalyptus'):
                hn2 = ''
                if '.' in self.local_hostname:
                    # Unqualified (short) hostname
                    hn2 = (self.local_hostname).split('.')[0]
                worker_host_line = '{ip} {hn1} {hn2}\n'.format(
                    ip=self.private_ip, hn1=self.local_hostname, hn2=hn2)
                log.debug("worker_host_line: {0}".format(worker_host_line))
                with open('/etc/hosts', 'r+') as f:
                    hosts = f.readlines()
                    # Append only when the line is not already present
                    if worker_host_line not in hosts:
                        log.debug("Adding worker {0} to /etc/hosts".format(
                            self.local_hostname))
                        f.write(worker_host_line)
            if self.app.cloud_type == 'opennebula':
                f = open("/etc/hosts", 'a')
                f.write("%s\tworker-%s\n" % (self.private_ip, self.id))
                f.close()
            # log.debug("Update /etc/hosts through master")
            # self.app.manager.update_etc_host()
        elif msg_type == "WORKER_H_CERT":
            log.debug("Got WORKER_H_CERT message")
            self.is_alive = True
            # This is for the case that an existing worker is added to a new master.
            self.app.manager.save_host_cert(msg.split(" | ")[1])
            log.debug("Worker '%s' host certificate received and appended "
                      "to /root/.ssh/known_hosts" % self.id)
            for job_manager_svc in self.app.manager.service_registry.active(
                    service_role=ServiceRole.JOB_MANAGER):
                job_manager_svc.add_node(self)
                # Instruct the worker to start appropriate job manager daemon
                if ServiceRole.SLURMCTLD in job_manager_svc.svc_roles:
                    self.send_start_slurmd()
                else:
                    self.send_start_sge()
            else:
                # NOTE(review): for/else without a `break` — this warning
                # executes even when the loop above found and handled a job
                # manager service; likely meant only for the empty-loop
                # case. Confirm intent.
                log.warning('Could not get a handle on job manager service to '
                            'add node {0}'.format(self.get_desc()))
            # If there are any bucket-based FSs, tell the worker to add those
            fss = self.app.manager.get_services(svc_type=ServiceType.FILE_SYSTEM)
            for fs in fss:
                if len(fs.buckets) > 0:
                    for b in fs.buckets:
                        self.send_add_s3fs(b.bucket_name, fs.svc_roles)
            log.info("Waiting on worker instance %s to configure itself." %
                     self.get_desc())
        elif msg_type == "NODE_READY":
            self.worker_status = "Ready"
            log.info("Instance %s ready" % self.get_desc())
            # Make sure the instace is tagged (this is also necessary to do
            # here for OpenStack because it does not allow tags to be added
            # until an instance is 'running')
            self.app.cloud_interface.add_tag(self.inst, 'clusterName',
                                             self.app.config['cluster_name'])
            self.app.cloud_interface.add_tag(self.inst, 'role', 'worker')
            self.app.cloud_interface.add_tag(self.inst, 'alias', self.alias)
            self.app.cloud_interface.add_tag(
                self.inst, 'Name',
                "Worker: {0}".format(self.app.config['cluster_name']))
            self.app.manager.update_condor_host(self.public_ip)
        elif msg_type == "NODE_STATUS":
            # log.debug("Node {0} status message: {1}".format(self.get_desc(), msg))
            # Skip status updates for a worker that is being stopped
            if not self.worker_status == 'Stopping':
                msplit = msg.split(' | ')
                self.nfs_data = msplit[1]
                self.nfs_tools = msplit[2]  # Workers currently do not update this field
                self.nfs_indices = msplit[3]
                self.nfs_sge = msplit[4]
                self.get_cert = msplit[5]
                self.sge_started = msplit[6]
                self.load = msplit[7]
                self.worker_status = msplit[8]
                self.nfs_tfs = msplit[9]
                self.slurmd_running = msplit[10]
            else:
                log.debug("Worker {0} in state Stopping so not updating status"
                          .format(self.get_desc()))
        elif msg_type == 'NODE_SHUTTING_DOWN':
            msplit = msg.split(' | ')
            self.worker_status = msplit[1]
        else:  # Catch-all condition
            log.debug("Unknown Message: %s" % msg)