def restore(self):
    """Use the Thruk Rest API to enable notifications for this host.

    Returns:
        True when Thruk answered the request with status 200. False when
        no Thruk URL is configured, when no Nagios hostname is available
        on this instance, or when Thruk answered with any other status.
    """
    # Same guard as evacuate(): without a Thruk URL there is nothing to call.
    if self.thruk_url is None:
        return False

    # The Nagios hostname is stored on the instance by evacuate(). If that
    # step bailed out early the attribute may be missing, so do not send a
    # request for an unknown host.
    nagios_hostname = getattr(self, 'nagios_hostname', None)
    if nagios_hostname is None:
        log.fatal(
            '[{}] Failed to re-enable notifications: unknown Nagios name'
            .format(self.host))
        return False

    response = thruk_set_notifications(self.thruk_url,
                                       self.thruk_username,
                                       self.thruk_password,
                                       nagios_hostname, True)
    if response.status != 200:
        log.fatal('[{}] Failed to re-enable notifications for {}'.format(
            self.host, nagios_hostname))
        return False

    return True
def discover(job):
    """Build the host mapping described by the 'hosts' section of a job.

    Every entry under job['hosts'] is split into a discoverer name and its
    arguments. Entries naming an unknown discoverer are logged and skipped;
    the results of all known discoverers are merged into a single dict,
    later entries overriding earlier ones on key collisions.
    """
    found = {}
    for entry in job.get('hosts', []):
        name, args = str_or_dict(entry)
        discoverer_cls = discoverers.get(name)
        if discoverer_cls is not None:
            found.update(discoverer_cls(args).discover())
        else:
            log.fatal(
                '[amaltheia] Unknown host discoverer {}'.format(name))
    return found
def update(self):
    """Reboot the host over SSH and optionally wait for it to come back.

    Returns True immediately when waiting is disabled. Otherwise SSH
    connections are attempted until one succeeds or `wait_timeout` seconds
    have elapsed, and the result of the last connection attempt is returned.
    """
    ssh_cmd(self.host, self.host_args, 'sudo reboot')

    if not self.wait:
        log.debug('[{}] Not waiting for reboot'.format(self.host))
        return True

    deadline = datetime.now() + timedelta(seconds=self.wait_timeout)

    reachable = False
    while not reachable:
        if datetime.now() > deadline:
            # Ran out of time without a successful connection.
            log.fatal('[{}] Timeout waiting for reboot'.format(self.host))
            break
        log.debug('[{}] Waiting for reboot...'.format(self.host))
        reachable = ssh_try_connect(self.host,
                                    self.host_args,
                                    timeout=self.wait_check_interval)

    return reachable
def evacuate(self):
    """Use the Thruk Rest API to disable notifications for this host.

    The Nagios hostname is looked up through Thruk and stored on the
    instance as `nagios_hostname`, then notifications are switched off
    for that name.

    Returns:
        True when Thruk answered the request with status 200. False when
        no Thruk URL is configured, when the Nagios hostname could not be
        retrieved, or when Thruk answered with any other status.
    """
    if self.thruk_url is None:
        return False

    try:
        self.nagios_hostname = thruk_get_host(self.thruk_url,
                                              self.thruk_username,
                                              self.thruk_password,
                                              self.host)
    except (json.JSONDecodeError, ValueError, KeyError, TypeError):
        log.fatal('[{}] Failed to retrieve Nagios name'.format(self.host))
        # Stop here: without a resolved name the request below would use
        # an unset (or stale) nagios_hostname.
        return False

    response = thruk_set_notifications(self.thruk_url,
                                       self.thruk_username,
                                       self.thruk_password,
                                       self.nagios_hostname, False)
    if response.status != 200:
        log.fatal('[{}] Failed to disable notifications for {}'.format(
            self.host, self.nagios_hostname))
        return False

    return True
def evacuate(self):
    """Disable nova-compute on this host and migrate all instances away.

    Steps:
      1. Disable the nova-compute service for the host.
      2. Request live migration of the host's servers.
      3. Request (cold) migration as well, since live migration may be
         rejected for stopped servers.
      4. Poll the hypervisor's server list until it is empty or the
         timeout (`timeout` service argument, seconds per server,
         default 40) runs out.

    Returns:
        True when the 'skip-evacuate' service argument is set or when all
        servers left the host in time. False when a server was not
        accepted for migration or when the wait timed out.
    """
    if self.service_args.get('skip-evacuate'):
        return True

    host = quote(self.host)

    # Disable nova-compute
    openstack_cmd(
        'openstack compute service set {} nova-compute --disable'.format(
            host))

    # Retrieve list of VMs, indexable by their Instance ID
    server_list = openstack_cmd_table(
        'nova hypervisor-servers {}'.format(host))
    servers = {s['ID']: s for s in server_list}

    # Schedule live migration for running VMs
    result = openstack_cmd_table('nova host-evacuate-live {}'.format(host))
    for server in result:
        # setdefault: tolerate a UUID that was not in the initial listing
        # instead of failing with KeyError.
        entry = servers.setdefault(server['Server UUID'], {})
        if server['Live Migration Accepted'] == 'True':
            entry.update({'status': 'OK'})
        else:
            entry.update({
                'status': 'NOTOK',
                'error': server['Error Message']
            })

    # Errors with live migration may occur for VMs that are stopped.
    # Migrate them as well
    result = openstack_cmd_table(
        'nova host-servers-migrate {}'.format(host))
    for server in result:
        entry = servers.setdefault(server['Server UUID'], {})
        if server['Migration Accepted'] == 'True':
            entry.update({'status': 'OK'})
            # The entry has no 'error' key when live migration was already
            # accepted, so drop it only if present.
            entry.pop('error', None)
        elif entry.get('status', '') != 'OK':
            entry.update({
                'status': 'NOTOK',
                'error': server['Error Message']
            })

    # A server that neither command reported has no 'status' at all; it is
    # treated as not migrated rather than raising KeyError.
    errors = {k: v for k, v in servers.items() if v.get('status') != 'OK'}
    if errors:
        log.fatal('[{}] {}'.format(self.host, errors))
        return False

    # Wait for migrations to complete
    try:
        timeout_per_server = int(self.service_args.get('timeout', 40))
    except (ValueError, TypeError):
        log.debug('[{}] Defaulting to 40 seconds timeout'.format(
            self.host))
        timeout_per_server = 40

    # Total budget scales with the number of servers that have to move.
    timeout = len(server_list) * timeout_per_server
    while server_list and timeout > 0:
        timeout -= 5
        sleep(5)
        server_list = openstack_cmd_table(
            'nova hypervisor-servers {}'.format(host))
        log.debug('[{}] Waiting for migrations, {} remaining'.format(
            self.host, len(server_list)))

    if server_list:
        log.fatal('[{}] Some migrations timed-out: \n{}'.format(
            self.host, server_list))
        return False

    log.debug('[{}] All servers migrated successfully'.format(self.host))
    return True
def update(self):
    """Trigger the configured Jenkins job and optionally wait for it.

    The job is queued with the rendered 'build-arguments' updater argument
    when one is present, otherwise without parameters. When waiting is
    enabled, the queue item is polled until it is assigned a build number
    and the build is then polled until it reports a result, both within a
    single `wait_timeout` window.

    Returns:
        True when the job was queued and waiting is disabled, or when the
        build result is 'SUCCESS'. False on authentication failure, empty
        job name, any Jenkins API error, timeout, or any other build
        result.
    """
    # Only Exception is caught below so that KeyboardInterrupt and
    # SystemExit still stop the program instead of being logged as
    # Jenkins failures.
    try:
        self.jenkins.get_whoami()
    except Exception:
        log.exception('[{}] [jenkins] Failed to authenticate'.format(
            self.host))
        return False

    if self.job is None:
        log.fatal('[{}] [jenkins] Empty job name'.format(self.host))
        return False

    raw_args = self.updater_args.get('build-arguments')
    try:
        if raw_args:
            queue_id = self.jenkins.build_job(
                self.job,
                jinja(raw_args, host=self.host, host_args=self.host_args))
        else:
            queue_id = self.jenkins.build_job(self.job)
    except Exception:
        log.exception('[{}] [jenkins] Failed to queue job {}'.format(
            self.host, self.job))
        return False

    log.info('[{}] [jenkins] Queued job {} (queue id {})'.format(
        self.host, self.job, queue_id))

    if not self.wait:
        return True

    timeout = datetime.now() + timedelta(seconds=self.wait_timeout)

    # Phase 1: wait for the queue item to be assigned a build number.
    while True:
        try:
            queue_item = self.jenkins.get_queue_item(queue_id)
            job_number = queue_item['executable']['number']
            break
        except KeyError:
            # No build number on the queue item yet; poll again.
            sleep(self.wait_check_interval)
            log.debug('[{}] [jenkins] Waiting for job queue {}'.format(
                self.host, self.job))
        except Exception:
            log.exception('[{}] [jenkins] Failed to queue job {}'.format(
                self.host, self.job))
            return False

        if datetime.now() > timeout:
            log.fatal(
                '[{}] [jenkins] Timeout waiting for job queue {}'.format(
                    self.host, self.job))
            return False

    log.info('[{}] [jenkins] Started job {}/{} (queue id {})'.format(
        self.host, self.job, job_number, queue_id))

    # Phase 2: wait for the build to report a result.
    done = False
    while not done and datetime.now() <= timeout:
        log.debug('[{}] [jenkins] Waiting for job run {}/{}'.format(
            self.host, self.job, job_number))
        try:
            build_info = self.jenkins.get_build_info(self.job, job_number)
        except Exception:
            # Handle API errors here the same way as every other Jenkins
            # call in this method: log and report failure.
            log.exception(
                '[{}] [jenkins] Failed to get build info for {}/{}'.format(
                    self.host, self.job, job_number))
            return False
        done = build_info['result'] is not None
        if not done:
            sleep(self.wait_check_interval)

    if not done:
        log.fatal(
            '[{}] [jenkins] Timeout waiting for job run {}/{}'.format(
                self.host, self.job, job_number))
        return False

    return build_info['result'] == 'SUCCESS'