Ejemplo n.º 1
0
    def restore(self):
        """Use the Thruk Rest API to enable notifications for this host.

        The call is skipped when no Thruk URL is configured or when no
        Nagios host name is available on this instance, because there is
        then nothing meaningful to send to Thruk.

        Returns:
            True if Thruk reported status 200, False if the call was
            skipped or Thruk reported any other status.
        """
        if self.thruk_url is None:
            return False

        # The name is stored on the instance by an earlier lookup; it may be
        # absent if that lookup never ran or failed.
        nagios_hostname = getattr(self, 'nagios_hostname', None)
        if nagios_hostname is None:
            log.fatal(
                '[{}] No Nagios name known, cannot re-enable '
                'notifications'.format(self.host))
            return False

        response = thruk_set_notifications(self.thruk_url, self.thruk_username,
                                           self.thruk_password,
                                           nagios_hostname, True)

        if response.status != 200:
            log.fatal('[{}] Failed to re-enable notifications for {}'.format(
                self.host, nagios_hostname))
            return False

        return True
Ejemplo n.º 2
0
def discover(job):
    """Collect the hosts produced by every discoverer listed in a job.

    Each entry under the job's ``hosts`` key is split into a discoverer
    name and its arguments. Entries naming an unknown discoverer are
    logged and skipped; results of the others are merged into one dict,
    later entries overriding earlier ones on key collisions.
    """
    found = {}

    for entry in job.get('hosts', []):
        name, args = str_or_dict(entry)
        discoverer_cls = discoverers.get(name)

        if discoverer_cls is not None:
            found.update(discoverer_cls(args).discover())
        else:
            log.fatal(
                '[amaltheia] Unknown host discoverer {}'.format(name))

    return found
Ejemplo n.º 3
0
    def update(self):
        """Reboot the host over SSH, optionally waiting for it to come back.

        Runs ``sudo reboot`` through ``ssh_cmd``. When ``self.wait`` is
        falsy the method returns True right away. Otherwise it keeps calling
        ``ssh_try_connect`` until a call reports success or
        ``self.wait_timeout`` seconds have elapsed, and returns the
        connection result (False when no attempt succeeded).
        """
        ssh_cmd(self.host, self.host_args, 'sudo reboot')

        if not self.wait:
            log.debug('[{}] Not waiting for reboot'.format(self.host))
            return True

        deadline = datetime.now() + timedelta(seconds=self.wait_timeout)
        reachable = False

        # wait_check_interval is handed to each attempt as its timeout.
        while not (reachable or datetime.now() > deadline):
            log.debug('[{}] Waiting for reboot...'.format(self.host))
            reachable = ssh_try_connect(
                self.host, self.host_args, timeout=self.wait_check_interval)

        if reachable:
            return reachable

        log.fatal('[{}] Timeout waiting for reboot'.format(self.host))
        return reachable
Ejemplo n.º 4
0
    def evacuate(self):
        """Use the Thruk Rest API to disable notifications for this host.

        The Nagios host name is looked up through ``thruk_get_host`` and
        stored on ``self.nagios_hostname`` for later use.

        Returns:
            True if Thruk reported status 200 for the request; False if no
            Thruk URL is configured, the Nagios host name could not be
            retrieved, or Thruk reported any other status.
        """
        if self.thruk_url is None:
            return False

        try:
            self.nagios_hostname = thruk_get_host(self.thruk_url,
                                                  self.thruk_username,
                                                  self.thruk_password,
                                                  self.host)
        except (json.JSONDecodeError, ValueError, KeyError, TypeError):
            log.fatal('[{}] Failed to retrieve Nagios name'.format(self.host))
            # Without a freshly resolved name the request below would use a
            # missing or stale ``nagios_hostname``; stop here instead.
            return False

        response = thruk_set_notifications(self.thruk_url, self.thruk_username,
                                           self.thruk_password,
                                           self.nagios_hostname, False)

        if response.status != 200:
            log.fatal('[{}] Failed to disable notifications for {}'.format(
                self.host, self.nagios_hostname))
            return False

        return True
Ejemplo n.º 5
0
    def evacuate(self):
        """Disable nova-compute on this host and migrate all instances away.

        Steps:
          1. Disable the nova-compute service for the host.
          2. Schedule live migration for the host's servers.
          3. Schedule regular migration as well, since live migration may
             be refused for stopped servers.
          4. Poll the hypervisor's server list every 5 seconds until it is
             empty or the timeout expires.

        Service arguments read from ``self.service_args``:
          * ``skip-evacuate``: when truthy, do nothing and report success.
          * ``timeout``: seconds allowed per server (default 40); the total
            wait is this value times the number of servers initially found.

        Returns:
            True if evacuation was skipped or every server left the host
            in time; False if any migration was not accepted or servers
            remained on the host when the timeout expired.
        """
        if self.service_args.get('skip-evacuate'):
            return True

        quoted_host = quote(self.host)

        # Disable nova-compute
        openstack_cmd(
            'openstack compute service set {} nova-compute --disable'.format(
                quoted_host))

        # Retrieve list of VMs, indexable by their Instance ID
        server_list = openstack_cmd_table('nova hypervisor-servers {}'.format(
            quoted_host))
        servers = {s['ID']: s for s in server_list}

        # Schedule live migration for running VMs
        result = openstack_cmd_table('nova host-evacuate-live {}'.format(
            quoted_host))

        for server in result:
            # setdefault: tolerate a UUID that was not in the initial listing
            entry = servers.setdefault(server['Server UUID'], {})

            if server['Live Migration Accepted'] == 'True':
                entry['status'] = 'OK'
            else:
                entry.update({
                    'status': 'NOTOK',
                    'error': server['Error Message']
                })

        # Errors with live migration may occur for VMs that are stopped.
        # Migrate them as well
        result = openstack_cmd_table('nova host-servers-migrate {}'.format(
            quoted_host))

        for server in result:
            entry = servers.setdefault(server['Server UUID'], {})

            if server['Migration Accepted'] == 'True':
                entry['status'] = 'OK'
                # A server whose live migration was already accepted has no
                # 'error' key, so remove it only if present.
                entry.pop('error', None)
            elif entry.get('status', '') != 'OK':
                entry.update({
                    'status': 'NOTOK',
                    'error': server['Error Message']
                })

        # Servers that neither command reported on carry no status at all;
        # count them as failures rather than raising KeyError.
        errors = {k: v for k, v in servers.items() if v.get('status') != 'OK'}
        if errors:
            log.fatal('[{}] {}'.format(self.host, errors))
            return False

        # Wait for migrations to complete
        try:
            timeout_per_server = int(self.service_args.get('timeout', 40))
        except (ValueError, TypeError):
            log.debug('[{}] Defaulting to 40 seconds timeout'.format(
                self.host))

            timeout_per_server = 40

        timeout = len(server_list) * timeout_per_server
        while server_list and timeout > 0:
            timeout -= 5
            sleep(5)

            server_list = openstack_cmd_table(
                'nova hypervisor-servers {}'.format(quoted_host))

            log.debug('[{}] Waiting for migrations, {} remaining'.format(
                self.host, len(server_list)))

        if server_list:
            log.fatal('[{}] Some migrations timed-out: {}'.format(
                self.host, server_list))
            return False

        log.debug('[{}] All servers migrated successfully'.format(
            self.host))
        return True
Ejemplo n.º 6
0
    def update(self):
        """Trigger the configured Jenkins job and optionally wait for it.

        The method authenticates against Jenkins, queues ``self.job``
        (rendering the ``build-arguments`` updater argument through
        ``jinja`` when present) and, if ``self.wait`` is truthy, waits
        first for the queue item to be assigned a build number and then
        for that build to report a result. Both waits share a single
        deadline of ``self.wait_timeout`` seconds and poll every
        ``self.wait_check_interval`` seconds.

        Returns:
            True if the job was queued and ``self.wait`` is falsy, or if
            the build finished with result ``'SUCCESS'``. False on
            authentication or queueing failure, an empty job name, a
            timeout, or any other build result.
        """
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        try:
            self.jenkins.get_whoami()
        except Exception:
            log.exception('[{}] [jenkins] Failed to authenticate'.format(
                self.host))
            return False

        if self.job is None:
            log.fatal('[{}] [jenkins] Empty job name'.format(self.host))
            return False

        raw_args = self.updater_args.get('build-arguments')
        try:
            if raw_args:
                queue_id = self.jenkins.build_job(
                    self.job,
                    jinja(raw_args, host=self.host, host_args=self.host_args))
            else:
                queue_id = self.jenkins.build_job(self.job)
        except Exception:
            log.exception('[{}] [jenkins] Failed to queue job {}'.format(
                self.host, self.job))
            return False

        log.info('[{}] [jenkins] Queued job {} (queue id {})'.format(
            self.host, self.job, queue_id))

        if not self.wait:
            return True

        timeout = datetime.now() + timedelta(seconds=self.wait_timeout)

        # Phase 1: wait until the queue item exposes a build number.
        while True:
            try:
                queue_item = self.jenkins.get_queue_item(queue_id)
                job_number = queue_item['executable']['number']
                break
            except KeyError:
                # A missing key is treated as "not started yet"; retry.
                sleep(self.wait_check_interval)
                log.debug('[{}] [jenkins] Waiting for job queue {}'.format(
                    self.host, self.job))
            except Exception:
                log.exception('[{}] [jenkins] Failed to queue job {}'.format(
                    self.host, self.job))
                return False

            if datetime.now() > timeout:
                log.fatal(
                    '[{}] [jenkins] Timeout waiting for job queue {}'.format(
                        self.host, self.job))
                return False

        log.info('[{}] [jenkins] Started job {}/{} (queue id {})'.format(
            self.host, self.job, job_number, queue_id))

        # Phase 2: poll the build until it reports a non-None result.
        done = False
        while not done and datetime.now() <= timeout:
            log.debug('[{}] [jenkins] Waiting for job run {}/{}'.format(
                self.host, self.job, job_number))
            build_info = self.jenkins.get_build_info(self.job, job_number)

            done = build_info['result'] is not None
            if not done:
                sleep(self.wait_check_interval)

        if not done:
            log.fatal(
                '[{}] [jenkins] Timeout waiting for job run {}/{}'.format(
                    self.host, self.job, job_number))
            return False

        return build_info['result'] == 'SUCCESS'