Example #1
    def get_m_state(self):
        """ Update the machine state of the current instance by querying the
            cloud middleware for the instance object itself (via the instance
            id) and updating self.m_state field to match the state returned by
            the cloud middleware.
            Also, update local last_state_update timestamp.

            :rtype: String
            :return: the current state of the instance as obtained from the
                     cloud middleware
        """
        self.last_state_update = Time.now()
        self.get_cloud_instance_object(deep=True)
        if self.inst:
            try:
                state = self.inst.state
                log.debug("Requested instance {0} update: old state: {1}; new state: {2}"
                          .format(self.get_desc(), self.m_state, state))
                if state != self.m_state:
                    self.m_state = state
                    self.last_m_state_change = Time.now()
            except EC2ResponseError as e:
                log.debug("Error updating instance {0} state: {1}".format(
                    self.get_id(), e))
                self.m_state = instance_states.ERROR
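
Note: these examples assume an instance_states Bunch of state-name constants and a Time helper for timestamps. A minimal, hypothetical sketch of those assumed names (not CloudMan's actual definitions):

    # Hypothetical stand-ins for names the examples assume; values are invented.
    import datetime as dt

    class Bunch(object):
        def __init__(self, **kwargs):
            self.__dict__.update(kwargs)

    instance_states = Bunch(PENDING="pending", SHUTTING_DOWN="shutting-down",
                            TERMINATED="terminated", ERROR="error", RUNNING="running")

    class Time(object):
        @staticmethod
        def now():
            return dt.datetime.utcnow()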
Example #2
 def get_status_array(self):
     if self.m_state.lower() == "running":  # For extra states.
         ld = self.load  # Default so ld is always bound for the return below
         if self.is_alive is not True:
             ld = "Starting"
         elif self.load:
             lds = self.load.split(' ')
             if len(lds) == 3:
                 try:
                     load1 = float(lds[0]) / self.num_cpus
                     load2 = float(lds[1]) / self.num_cpus
                     load3 = float(lds[2]) / self.num_cpus
                     ld = "%s %s %s" % (load1, load2, load3)
                 except Exception as e:
                     log.debug("Problems normalizing load: %s" % e)
                     ld = self.load
             else:
                 ld = self.load
         elif self.worker_status == "Ready":
             ld = "Running"
         return [
             self.id, ld,
             misc.format_seconds(Time.now() - self.last_m_state_change),
             self.nfs_data, self.nfs_tools, self.nfs_indices, self.nfs_sge,
             self.get_cert, self.sge_started, self.worker_status
         ]
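
To make the load normalization above concrete: a load string of "4.0 2.0 1.0" on a 4-CPU worker yields per-CPU values. A standalone illustration (values invented):

    # Illustrative only: normalize a 3-field load string by CPU count.
    load, num_cpus = "4.0 2.0 1.0", 4
    lds = load.split(' ')
    print(" ".join(str(float(l) / num_cpus) for l in lds))  # 1.0 0.5 0.25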
Example #3
    def maintain(self):
        """ Based on the state and status of this instance, try to do the right thing
            to keep the instance functional. Note that this may lead to terminating
            the instance.
        """
        def reboot_terminate_logic():
            """ Make a decision whether to terminate or reboot an instance.
                CALL THIS METHOD CAREFULLY because it defaults to terminating the
                instance!
            """
            if self.reboot_count < self.config.instance_reboot_attempts:
                self.reboot()
            elif self.terminate_attempt_count >= self.config.instance_terminate_attempts:
                log.info("Tried terminating instance {0} {1} times but was unsuccessful. Giving up."
                         .format(self.inst.id, self.config.instance_terminate_attempts))
                self._remove_instance()
            else:
                log.info("Instance {0} not responding after {1} reboots. Terminating instance."
                         .format(self.id, self.reboot_count))
                self.terminate()

        # Update state then do resolution
        state = self.get_m_state()
        if state == instance_states.PENDING or state == instance_states.SHUTTING_DOWN:
            if (Time.now() - self.last_m_state_change).seconds > self.config.instance_state_change_wait and \
               (Time.now() - self.time_rebooted).seconds > self.config.instance_reboot_timeout:
                log.debug("'Maintaining' instance {0} stuck in '{1}' state.".format(
                    self.get_desc(), state))
                reboot_terminate_logic()
        elif state == instance_states.ERROR:
            log.debug("'Maintaining' instance {0} in '{1}' state.".format(self.get_desc(), instance_states.ERROR))
            reboot_terminate_logic()
        elif state == instance_states.TERMINATED:
            log.debug("'Maintaining' instance {0} in '{1}' state.".format(self.get_desc(), instance_states.TERMINATED))
            self._remove_instance()
        elif state == instance_states.RUNNING:
            log.debug("'Maintaining' instance {0} in '{1}' state (last comm before {2} | "
                      "last m_state change before {3} | time_rebooted before {4}"
                      .format(self.get_desc(), instance_states.RUNNING,
                              dt.timedelta(seconds=(Time.now() - self.last_comm).seconds),
                              dt.timedelta(seconds=(Time.now() - self.last_m_state_change).seconds),
                              dt.timedelta(seconds=(Time.now() - self.time_rebooted).seconds)))
            if (Time.now() - self.last_comm).seconds > self.config.instance_comm_timeout and \
               (Time.now() - self.last_m_state_change).seconds > self.config.instance_state_change_wait and \
               (Time.now() - self.time_rebooted).seconds > self.config.instance_reboot_timeout:
                reboot_terminate_logic()
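
maintain() reads its timeout and retry thresholds from self.config; the attribute names below come from the code above, but the values are hypothetical:

    # Hypothetical values; only the attribute names appear in the code above.
    class InstanceConfig(object):
        instance_reboot_attempts = 4      # reboots allowed before terminating
        instance_terminate_attempts = 5   # terminate retries before giving up
        instance_state_change_wait = 400  # seconds to tolerate a stuck state
        instance_reboot_timeout = 300     # seconds to wait after a reboot
        instance_comm_timeout = 180       # seconds of worker silence tolerated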
Example #4
 def reboot(self, count_reboot=True):
     """
     Reboot this instance. If ``count_reboot`` is set, increment the number
     of reboots for this instance (a threshold in this count leads to eventual
     instance termination, see ``self.config.instance_reboot_attempts``).
     """
     if self.inst is not None:
         # Show reboot count only if this reboot counts toward the reboot quota
         s = " (reboot #{0})".format(self.reboot_count + 1)
         log.info("Rebooting instance {0}{1}.".format(self.get_desc(),
                                                      s if count_reboot else ''))
         try:
             self.inst.reboot()
             self.time_rebooted = Time.now()
         except EC2ResponseError as e:
             log.error("Trouble rebooting instance {0}: {1}".format(self.get_desc(), e))
Example #5
File: root.py Project: AAFC-MBB/cloudman
 def get_all_services_status(self, trans):
     status_dict = self.app.manager.get_all_services_status()
     # status_dict['filesystems'] = self.app.manager.get_all_filesystems_status()
     status_dict['galaxy_dns'] = self.get_galaxy_dns(trans)
     status_dict['galaxy_rev'] = self.app.manager.get_galaxy_rev()
     status_dict['galaxy_admins'] = self.app.manager.get_galaxy_admins()
     snap_status = self.app.manager.snapshot_status()
     status_dict['snapshot'] = {'status': str(snap_status[0]),
                                'progress': str(snap_status[1])}
     status_dict['master_is_exec_host'] = self.app.manager.master_exec_host
     status_dict['ignore_deps_framework'] = self.app.config.ignore_unsatisfiable_dependencies
     status_dict['messages'] = self.messages_string(self.app.msgs.get_messages())
     status_dict['cluster_startup_time'] = self.app.manager.startup_time.strftime("%b %d %Y %H:%M:%S")
     cluster_uptime = misc.format_time_delta(Time.now() - self.app.manager.startup_time)
     status_dict['cluster_uptime'] = cluster_uptime
     # status_dict['dummy'] = str(datetime.now()) # Used for testing only
     # print "status_dict: %s" % status_dict
     return json.dumps(status_dict)
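
The method returns a JSON string; a purely illustrative look at its shape (key names are taken from the assignments above, values are invented):

    # Illustrative only; status_json is a hypothetical string as produced above.
    status = json.loads(status_json)
    status['snapshot']        # e.g. {'status': 'None', 'progress': 'None'}
    status['cluster_uptime']  # e.g. '4:13:04'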
Example #6
    def get_status_dict(self):
        toret = {'id': self.id,
                 'alias': self.alias,
                 'ld': self.load,
                 'time_in_state': misc.format_seconds(Time.now() - self.last_m_state_change),
                 'nfs_data': self.nfs_data,
                 'nfs_tools': self.nfs_tools,
                 'nfs_indices': self.nfs_indices,
                 'nfs_sge': self.nfs_sge,
                 'nfs_tfs': self.nfs_tfs,
                 'get_cert': self.get_cert,
                 'slurmd_running': self.slurmd_running,
                 'worker_status': self.worker_status,
                 'instance_state': self.m_state,
                 'instance_type': self.type,
                 'public_ip': self.public_ip}

        if self.load:
            lds = self.load.split(' ')
            if len(lds) == 3:
                toret['ld'] = "%s %s %s" % (float(lds[0]) / self.num_cpus,
                                            float(lds[1]) / self.num_cpus,
                                            float(lds[2]) / self.num_cpus)
        return toret
Example #7
    def handle_message(self, msg):
        # log.debug( "Handling message: %s from %s" % ( msg, self.id ) )
        self.is_alive = True
        self.last_comm = Time.now()
        # Transition from states to a particular response.
        if self.app.manager.console_monitor.conn:
            msg_type = msg.split(' | ')[0]
            if msg_type == "ALIVE":
                self.worker_status = "Starting"
                log.info("Instance %s reported alive" % self.get_desc())
                msp = msg.split(' | ')
                self.private_ip = msp[1]
                self.public_ip = msp[2]
                self.zone = msp[3]
                self.type = msp[4]
                self.ami = msp[5]
                try:
                    self.local_hostname = msp[6]
                    self.num_cpus = int(msp[7])
                    self.total_memory = int(msp[8])
                    self.hostname = msp[9]
                except Exception:
                    # Older versions of CloudMan did not pass this value so if the master
                    # and the worker are running 2 diff versions (can happen after an
                    # automatic update), don't crash here.
                    self.local_hostname = self.public_ip
                log.debug("INSTANCE_ALIVE private_ip: %s public_ip: %s zone: %s "
                          "type: %s AMI: %s local_hostname: %s, CPUs: %s, hostname: %s"
                          % (self.private_ip, self.public_ip, self.zone,
                             self.type, self.ami, self.local_hostname,
                             self.num_cpus, self.hostname))
                # Add instance IP/name to /etc/hosts
                misc.add_to_etc_hosts(self.private_ip, [self.alias, self.local_hostname,
                                      self.hostname])
                # Instance is alive and responding.
                self.send_mount_points()
            elif msg_type == "GET_MOUNTPOINTS":
                self.send_mount_points()
            elif msg_type == "MOUNT_DONE":
                log.debug("Got MOUNT_DONE message")
                # Update the list of mount points that have mounted
                if len(msg.split(' | ')) > 1:
                    msg_body = msg.split(' | ')[1]
                    try:
                        body = json.loads(msg_body)
                        mounted_fs = body.get('mounted_fs', {})
                        # Currently, only interested in the transient FS
                        self.nfs_tfs = mounted_fs.get('transient_nfs', 0)
                        log.debug("Got transient_nfs state on {0}: {1}".format(
                                  self.alias, self.nfs_tfs))
                    except ValueError as vexc:
                        log.warning('ValueError trying to decode msg: {0}'
                                    .format(vexc))
                self.app.manager.sync_etc_hosts()
                self.send_master_pubkey()
                # Add hostname to /etc/hosts (for SGE config)
                if self.app.cloud_type in ('openstack', 'eucalyptus'):
                    hn2 = ''
                    if '.' in self.local_hostname:
                        hn2 = (self.local_hostname).split('.')[0]
                    worker_host_line = '{ip} {hn1} {hn2}\n'.format(ip=self.private_ip,
                                                                   hn1=self.local_hostname,
                                                                   hn2=hn2)
                    log.debug("worker_host_line: {0}".format(worker_host_line))
                    with open('/etc/hosts', 'r+') as f:
                        hosts = f.readlines()
                        if worker_host_line not in hosts:
                            log.debug("Adding worker {0} to /etc/hosts".format(
                                self.local_hostname))
                            f.write(worker_host_line)

                if self.app.cloud_type == 'opennebula':
                    f = open("/etc/hosts", 'a')
                    f.write("%s\tworker-%s\n" % (self.private_ip, self.id))
                    f.close()
                # log.debug("Update /etc/hosts through master")
                # self.app.manager.update_etc_host()
            elif msg_type == "WORKER_H_CERT":
                log.debug("Got WORKER_H_CERT message")
                self.is_alive = True  # This is for the case that an existing worker is added to a new master.
                self.app.manager.save_host_cert(msg.split(" | ")[1])
                log.debug("Worker '%s' host certificate received and appended "
                          "to /root/.ssh/known_hosts" % self.id)
                # A for/else here would warn even after nodes were added
                # successfully; warn only when no job manager service is active.
                job_manager_svcs = list(self.app.manager.service_registry.active(
                    service_role=ServiceRole.JOB_MANAGER))
                for job_manager_svc in job_manager_svcs:
                    job_manager_svc.add_node(self)
                    # Instruct the worker to start the appropriate job manager daemon
                    if ServiceRole.SLURMCTLD in job_manager_svc.svc_roles:
                        self.send_start_slurmd()
                    else:
                        self.send_start_sge()
                if not job_manager_svcs:
                    log.warning('Could not get a handle on job manager service to '
                                'add node {0}'.format(self.get_desc()))
                # If there are any bucket-based FSs, tell the worker to add those
                fss = self.app.manager.get_services(svc_type=ServiceType.FILE_SYSTEM)
                for fs in fss:
                    if len(fs.buckets) > 0:
                        for b in fs.buckets:
                            self.send_add_s3fs(b.bucket_name, fs.svc_roles)
                log.info("Waiting on worker instance %s to configure itself." % self.get_desc())
            elif msg_type == "NODE_READY":
                self.worker_status = "Ready"
                log.info("Instance %s ready" % self.get_desc())
                # Make sure the instance is tagged (this is also necessary to do
                # here for OpenStack because it does not allow tags to be added
                # until an instance is 'running')
                self.app.cloud_interface.add_tag(self.inst, 'clusterName', self.app.config['cluster_name'])
                self.app.cloud_interface.add_tag(self.inst, 'role', 'worker')
                self.app.cloud_interface.add_tag(self.inst, 'alias', self.alias)
                self.app.cloud_interface.add_tag(
                    self.inst, 'Name', "Worker: {0}".format(self.app.config['cluster_name']))

                self.app.manager.update_condor_host(self.public_ip)
            elif msg_type == "NODE_STATUS":
                # log.debug("Node {0} status message: {1}".format(self.get_desc(), msg))
                if self.worker_status != 'Stopping':
                    msplit = msg.split(' | ')
                    self.nfs_data = msplit[1]
                    self.nfs_tools = msplit[2]  # Workers currently do not update this field
                    self.nfs_indices = msplit[3]
                    self.nfs_sge = msplit[4]
                    self.get_cert = msplit[5]
                    self.sge_started = msplit[6]
                    self.load = msplit[7]
                    self.worker_status = msplit[8]
                    self.nfs_tfs = msplit[9]
                    self.slurmd_running = msplit[10]
                else:
                    log.debug("Worker {0} in state Stopping so not updating status"
                              .format(self.get_desc()))
            elif msg_type == 'NODE_SHUTTING_DOWN':
                msplit = msg.split(' | ')
                self.worker_status = msplit[1]
            else:  # Catch-all condition
                log.debug("Unknown Message: %s" % msg)
Example #8
 self.inst = inst  # boto object of the instance
 self.spot_state = None
 self.private_ip = None
 self.public_ip = None
 self.local_hostname = None
 if inst:
     try:
         self.id = str(inst.id)
     except EC2ResponseError as e:
         log.error("Error retrieving instance id: %s" % e)
 else:
     self.id = None
 # Machine state as obtained from the cloud middleware (see
 # instance_states Bunch)
 self.m_state = m_state
 self.last_m_state_change = Time.now()
 # A time stamp when the most recent update of the instance state
 # (m_state) took place
 self.last_state_update = Time.now()
 self.is_alive = False
 self.num_cpus = 1
 self.total_memory = 1  # in bytes
 self.time_rebooted = TIME_IN_PAST  # Initialize to a date in the past
 self.reboot_count = 0
 self.terminate_attempt_count = 0
 self.last_comm = TIME_IN_PAST  # Initialize to a date in the past
 self.nfs_data = 0
 self.nfs_tools = 0
 self.nfs_indices = 0
 self.nfs_sge = 0
 self.nfs_tfs = 0  # Transient file system, NFS-mounted from the master
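
TIME_IN_PAST only needs to predate any real timestamp so the timeout comparisons in maintain() fire immediately for a fresh instance; a hypothetical stand-in:

    # Hypothetical; any datetime safely in the past works.
    import datetime as dt
    TIME_IN_PAST = dt.datetime(2000, 1, 1)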