def get_at_risk_service_instances(self, draining_hosts) -> List[ServiceInstance]:
    """Return one ServiceInstance per (service, instance) running on draining hosts.

    Fetches the marathon apps (with their tasks embedded) for every client
    returned by ``self.marathon_clients.get_all_clients()``, keeps the tasks
    whose ``host`` is in ``draining_hosts`` and builds a ``ServiceInstance``
    for each distinct (service, instance) pair, in first-seen order.

    :param draining_hosts: container of hostnames; only membership
        (``task.host in draining_hosts``) is tested
    :returns: list of ServiceInstance with no repeated (service, instance)
    """
    marathon_apps_with_clients = get_marathon_apps_with_clients(
        clients=self.marathon_clients.get_all_clients(),
        embed_tasks=True,
    )
    at_risk_tasks = [
        task
        for app, _client in marathon_apps_with_clients
        for task in app.tasks
        if task.host in draining_hosts
    ]
    self.log.info(f"At risk tasks: {at_risk_tasks}")

    service_instances: List[ServiceInstance] = []
    # (service, instance) pairs already queued: there is no need to add the
    # same instance to the bounce queue more than once, and a set lookup
    # avoids rescanning service_instances for every task.
    seen: set = set()
    for task in at_risk_tasks:
        app_id = task.app_id.strip('/')
        service, instance, _, __ = deformat_job_id(app_id)
        key = (service, instance)
        if key in seen:
            continue
        seen.add(key)
        # https://github.com/python/mypy/issues/2852
        service_instances.append(ServiceInstance(  # type: ignore
            service=service,
            instance=instance,
            cluster=self.config.get_cluster(),
            bounce_by=int(time.time()),
            watcher=type(self).__name__,
            bounce_timers=None,
            failures=0,
        ))
    return service_instances
def cleanup_apps(soa_dir):
    """Clean up old or invalid jobs/apps from marathon.

    Retrieves both a list of apps currently in marathon and a list of valid
    (service, instance) pairs in order to determine what to kill. App ids
    that cannot be parsed by ``deformat_job_id`` are logged and skipped;
    every parsed app whose (service, instance) is not valid is handed to
    ``delete_app``.

    :param soa_dir: The SOA config directory to read from
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    for app_id in running_app_ids:
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Logger.warn is a deprecated alias; warning() is the supported name.
            log.warning("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        if (service, instance) not in valid_services:
            delete_app(
                app_id=app_id,
                client=client,
                soa_dir=soa_dir,
            )
def delete_app(app_id, client):
    """Deletes a marathon app safely and logs to notify the user that it happened

    The deletion runs under ``bounce_lib.bounce_lock_zookeeper`` for the
    app's composed job id. An ``IOError`` is treated as "the app is being
    bounced" and skipped quietly (presumably raised when the lock is
    already held -- confirm against bounce_lib). Any other exception has
    its traceback written line by line through ``_log`` and is re-raised.

    :param app_id: full marathon app id, parseable by ``deformat_job_id``
    :param client: marathon client passed on to ``delete_marathon_app``
    """
    # Logger.warn is a deprecated alias; warning() is the supported name.
    log.warning("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    try:
        with bounce_lib.bounce_lock_zookeeper(marathon_tools.compose_job_id(service, instance)):
            bounce_lib.delete_marathon_app(app_id, client)
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=log_line,
            )
    except IOError:
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        # The cluster name does not change between log lines: look it up once
        # instead of reloading the system config for every traceback line.
        cluster = load_system_paasta_config().get_cluster()
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def get_at_risk_service_instances(
    self,
    draining_hosts: List[str],
) -> List[ServiceInstance]:
    """Return one ServiceInstance per (service, instance) running on draining hosts.

    Fetches the marathon apps (with their tasks embedded) for every client
    returned by ``self.marathon_clients.get_all_clients()``, keeps the tasks
    whose ``host`` is in ``draining_hosts`` and builds a ``ServiceInstance``
    for each distinct (service, instance) pair, in first-seen order. All
    timestamp fields are filled from ``time.time()`` at construction.

    :param draining_hosts: hostnames of the hosts being drained
    :returns: list of ServiceInstance with no repeated (service, instance)
    """
    marathon_apps_with_clients = get_marathon_apps_with_clients(
        clients=self.marathon_clients.get_all_clients(),
        embed_tasks=True,
    )
    at_risk_tasks = [
        task
        for app, _client in marathon_apps_with_clients
        for task in app.tasks
        if task.host in draining_hosts
    ]
    self.log.info(f"At risk tasks: {at_risk_tasks}")

    service_instances: List[ServiceInstance] = []
    # (service, instance) pairs already queued: there is no need to add the
    # same instance to the bounce queue more than once, and a set lookup
    # avoids rescanning service_instances for every task.
    seen: set = set()
    for task in at_risk_tasks:
        app_id = task.app_id.strip("/")
        service, instance, _, __ = deformat_job_id(app_id)
        key = (service, instance)
        if key in seen:
            continue
        seen.add(key)
        service_instances.append(
            ServiceInstance(
                service=service,
                instance=instance,
                bounce_by=time.time(),
                wait_until=time.time(),
                watcher=type(self).__name__,
                failures=0,
                enqueue_time=time.time(),
                bounce_start_time=time.time(),
            ),
        )
    return service_instances
def cleanup_apps(soa_dir):
    """Clean up old or invalid jobs/apps from marathon.

    Retrieves both a list of apps currently in marathon and a list of valid
    (service, instance) pairs in order to determine what to kill. App ids
    that cannot be parsed by ``deformat_job_id`` are logged and skipped;
    every parsed app whose (service, instance) is not valid is handed to
    ``delete_app``.

    :param soa_dir: The SOA config directory to read from
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    for app_id in running_app_ids:
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Logger.warn is a deprecated alias; warning() is the supported name.
            log.warning(
                "%s doesn't conform to paasta naming conventions? Skipping." % app_id,
            )
            continue
        if (service, instance) not in valid_services:
            delete_app(
                app_id=app_id,
                client=client,
                soa_dir=soa_dir,
            )
def delete_app(app_id, client):
    """Deletes a marathon app safely and logs to notify the user that it happened

    The deletion runs under ``bounce_lib.bounce_lock_zookeeper`` for the
    app's composed job id. An ``IOError`` is treated as "the app is being
    bounced" and skipped quietly (presumably raised when the lock is
    already held -- confirm against bounce_lib). Any other exception has
    its traceback written line by line through ``_log`` and is re-raised.

    :param app_id: full marathon app id, parseable by ``deformat_job_id``
    :param client: marathon client passed on to ``delete_marathon_app``
    """
    # Logger.warn is a deprecated alias; warning() is the supported name.
    log.warning("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    try:
        with bounce_lib.bounce_lock_zookeeper(
            marathon_tools.compose_job_id(service, instance),
        ):
            bounce_lib.delete_marathon_app(app_id, client)
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=log_line,
            )
    except IOError:
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        # The cluster name does not change between log lines: look it up once
        # instead of reloading the system config for every traceback line.
        cluster = load_system_paasta_config().get_cluster()
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def get_at_risk_service_instances(self, draining_hosts):
    """Return one ServiceInstance per (service, instance) running on draining hosts.

    Fetches all marathon apps (with their tasks embedded) through
    ``self.marathon_client``, keeps the tasks whose ``host`` is in
    ``draining_hosts`` and builds a ``ServiceInstance`` for each distinct
    (service, instance) pair, in first-seen order.

    :param draining_hosts: container of hostnames; only membership
        (``task.host in draining_hosts``) is tested
    :returns: list of ServiceInstance with no repeated (service, instance)
    """
    marathon_apps = get_all_marathon_apps(self.marathon_client, embed_tasks=True)
    at_risk_tasks = [
        task
        for app in marathon_apps
        for task in app.tasks
        if task.host in draining_hosts
    ]
    self.log.info("At risk tasks: {}".format(at_risk_tasks))

    service_instances = []
    # (service, instance) pairs already queued: there is no need to add the
    # same instance to the bounce queue more than once, and a set lookup
    # avoids rescanning service_instances for every task.
    seen = set()
    for task in at_risk_tasks:
        app_id = task.app_id.strip('/')
        service, instance, _, __ = deformat_job_id(app_id)
        key = (service, instance)
        if key in seen:
            continue
        seen.add(key)
        service_instances.append(
            ServiceInstance(
                service=service,
                instance=instance,
                cluster=self.config.get_cluster(),
                bounce_by=int(time.time()),
                watcher=type(self).__name__,
                bounce_timers=None,
                failures=0,
            ),
        )
    return service_instances
def delete_app(app_id, client, soa_dir):
    """Deletes a marathon app safely and logs to notify the user that it happened

    The deletion runs under ``bounce_lib.bounce_lock_zookeeper`` for the
    app's composed job id. After a successful delete, an OK event is sent
    for each of the instance's three checks (replication, setup and bounce
    progress) and a deploy event is logged. An ``IOError`` is treated as
    "the app is being bounced" and skipped quietly (presumably raised when
    the lock is already held -- confirm against bounce_lib). Any other
    exception has its traceback written line by line through ``_log`` and
    is re-raised.

    :param app_id: full marathon app id, parseable by ``deformat_job_id``
    :param client: marathon client passed on to ``delete_marathon_app``
    :param soa_dir: SOA config directory, forwarded to ``send_event``
    """
    # Logger.warn is a deprecated alias; warning() is the supported name.
    log.warning("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)

        # The same OK event goes to every per-instance check, in this order.
        check_name_templates = (
            'check_marathon_services_replication.%s',
            'setup_marathon_job.%s',
            'paasta_bounce_progress.%s',
        )
        for check_name_template in check_name_templates:
            send_event(
                service=service,
                check_name=check_name_template % short_app_id,
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output="This instance was removed and is no longer running",
            )

        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
            line=log_line,
        )
    except IOError:
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            # Reuse the cluster resolved before the try block rather than
            # reloading the system config for every traceback line.
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon.

    Retrieves both a list of apps currently in marathon and a list of valid
    (service, instance) pairs in order to determine what to kill. App ids
    that cannot be parsed by ``deformat_job_id`` are logged and left alone.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is sane to
        kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the fraction of running apps that
        would be killed exceeds ``kill_threshold`` and ``force`` is false
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    # Parsed (service, instance, git_sha, config_sha) of every well-formed app id.
    running_apps = []
    for app_id in running_app_ids:
        try:
            parsed_app = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Logger.warn is a deprecated alias; warning() is the supported name.
            log.warning(
                "%s doesn't conform to paasta naming conventions? Skipping." % app_id,
            )
            continue
        running_apps.append(parsed_app)

    apps_to_kill = [
        (service, instance, git_sha, config_sha)
        for service, instance, git_sha, config_sha in running_apps
        if (service, instance) not in valid_services
    ]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)

    # Safety valve: only evaluated when something is running, which also
    # keeps the division below from dividing by zero.
    if running_apps:
        above_kill_threshold = float(len(apps_to_kill)) / float(
            len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold,
            )
            raise DontKillEverythingError

    for running_app in apps_to_kill:
        # Rebuild the full marathon app id from its four parsed components.
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
def get_active_shas_for_marathon_apps(
    marathon_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]],
) -> Set[Tuple[str, str]]:
    """Collect the (git_sha, config_sha) pair of every given marathon app.

    The git sha is derived from the app's docker image url (long form) and
    the config sha is the fourth component of the app's deformatted id,
    with a leading "config" prefix removed when present.

    :param marathon_apps_with_clients: (app, client) pairs; the client
        half of each pair is not used
    :returns: set of distinct (git_sha, config_sha) tuples
    """
    prefix = "config"
    active_shas = set()
    for marathon_app, _client in marathon_apps_with_clients:
        image_url = marathon_app.container.docker.image
        git_sha = get_git_sha_from_dockerurl(image_url, long=True)
        _service, _instance, _git_hash, raw_config_sha = marathon_tools.deformat_job_id(
            marathon_app.id,
        )
        # Drop the "config" marker so only the hash itself is kept.
        config_sha = (
            raw_config_sha[len(prefix):]
            if raw_config_sha.startswith(prefix)
            else raw_config_sha
        )
        active_shas.add((git_sha, config_sha))
    return active_shas
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon.

    Retrieves both a list of apps currently in marathon and a list of valid
    (service, instance) pairs in order to determine what to kill. App ids
    that cannot be parsed by ``deformat_job_id`` are logged and left alone.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is sane to
        kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the fraction of running apps that
        would be killed exceeds ``kill_threshold`` and ``force`` is false
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    # Parsed (service, instance, git_sha, config_sha) of every well-formed app id.
    running_apps = []
    for app_id in running_app_ids:
        try:
            parsed_app = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # Logger.warn is a deprecated alias; warning() is the supported name.
            log.warning("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        running_apps.append(parsed_app)

    apps_to_kill = [
        (service, instance, git_sha, config_sha)
        for service, instance, git_sha, config_sha in running_apps
        if (service, instance) not in valid_services
    ]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)

    # Safety valve: only evaluated when something is running, which also
    # keeps the division below from dividing by zero.
    if running_apps:
        above_kill_threshold = float(len(apps_to_kill)) / float(len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold
            )
            raise DontKillEverythingError

    for running_app in apps_to_kill:
        # Rebuild the full marathon app id from its four parsed components.
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )