Example #1
0
 def get_at_risk_service_instances(self, draining_hosts) -> List[ServiceInstance]:
     """Build one ServiceInstance per (service, instance) found on a draining host.

     Calls get_marathon_apps_with_clients for all of
     ``self.marathon_clients`` with tasks embedded, keeps the tasks whose
     ``host`` is contained in ``draining_hosts`` and collapses them to
     unique ``(service, instance)`` pairs.

     :param draining_hosts: container of hosts; every task's ``host`` is
         tested against it with ``in``
     :returns: list of ServiceInstance in order of first appearance, at
         most one entry per ``(service, instance)`` pair
     """
     marathon_apps_with_clients = get_marathon_apps_with_clients(
         clients=self.marathon_clients.get_all_clients(),
         embed_tasks=True,
     )
     # The client half of each pair is not needed here.
     at_risk_tasks = [
         task
         for app, _client in marathon_apps_with_clients
         for task in app.tasks
         if task.host in draining_hosts
     ]
     self.log.info(f"At risk tasks: {at_risk_tasks}")

     service_instances: List[ServiceInstance] = []
     # Pairs that already have an entry: there is no need to add the same
     # instance to the bounce queue more than once, and a set lookup avoids
     # rescanning the whole result list for every task.
     seen = set()
     for task in at_risk_tasks:
         app_id = task.app_id.strip('/')
         service, instance, _, __ = deformat_job_id(app_id)
         key = (service, instance)
         if key in seen:
             continue
         seen.add(key)
         # https://github.com/python/mypy/issues/2852
         service_instances.append(ServiceInstance(  # type: ignore
             service=service,
             instance=instance,
             cluster=self.config.get_cluster(),
             bounce_by=int(time.time()),
             watcher=type(self).__name__,
             bounce_timers=None,
             failures=0,
         ))
     return service_instances
def cleanup_apps(soa_dir):
    """Delete marathon apps that do not belong to a valid service instance.

    Lists the app ids currently in marathon and the valid
    (service, instance) pairs for the cluster, then hands every running
    app whose pair is not valid to delete_app. App ids that cannot be
    parsed are skipped with a warning.

    :param soa_dir: The SOA config directory to read from"""
    log.info("Loading marathon configuration")
    config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )

    known_pairs = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    current_ids = marathon_tools.list_all_marathon_app_ids(client)

    for app_id in current_ids:
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _git_sha, _config_sha = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        # Apps that are still configured are left alone.
        if (service, instance) in known_pairs:
            continue
        delete_app(app_id=app_id, client=client, soa_dir=soa_dir)
def delete_app(app_id, client):
    """Delete a marathon app while holding its bounce lock and log the result.

    :param app_id: marathon app id; must be parseable by deformat_job_id
    :param client: marathon client handed to bounce_lib.delete_marathon_app

    An IOError raised inside the try block is logged at debug level and
    not re-raised. Any other exception is written line by line to the
    'deploy' log at 'debug' level and then re-raised.
    """
    log.warn("%s appears to be old; attempting to delete" % app_id)
    service, instance, _git_sha, _config_sha = marathon_tools.deformat_job_id(app_id)
    try:
        lock_name = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(lock_name):
            bounce_lib.delete_marathon_app(app_id, client)
            message = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=message,
            )
    except IOError:
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        # Header first, then the traceback split into individual lines.
        header = 'Exception raised during cleanup of service %s:' % service
        trace_lines = traceback.format_exc().rstrip().split("\n")
        for text in [header] + trace_lines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=text,
            )
        raise
Example #4
0
 def get_at_risk_service_instances(
         self, draining_hosts: List[str]) -> List[ServiceInstance]:
     """Return one ServiceInstance per (service, instance) with a task on a draining host.

     Marathon apps are fetched with their tasks embedded; tasks whose
     ``host`` appears in ``draining_hosts`` are reduced to unique
     ``(service, instance)`` pairs.

     :param draining_hosts: hosts being drained; each task's ``host`` is
         matched against this list
     :returns: list of ServiceInstance in order of first appearance, at
         most one per ``(service, instance)`` pair; all four timestamp
         fields are set from ``time.time()`` at construction
     """
     marathon_apps_with_clients = get_marathon_apps_with_clients(
         clients=self.marathon_clients.get_all_clients(), embed_tasks=True)
     # Only the app half of each (app, client) pair is used.
     at_risk_tasks = [
         task
         for app, _client in marathon_apps_with_clients
         for task in app.tasks
         if task.host in draining_hosts
     ]
     self.log.info(f"At risk tasks: {at_risk_tasks}")

     service_instances: List[ServiceInstance] = []
     # Track queued pairs in a set so the same instance is not added to the
     # bounce queue more than once, without rescanning service_instances
     # for every task.
     queued = set()
     for task in at_risk_tasks:
         app_id = task.app_id.strip("/")
         service, instance, _, __ = deformat_job_id(app_id)
         pair = (service, instance)
         if pair in queued:
             continue
         queued.add(pair)
         service_instances.append(
             ServiceInstance(
                 service=service,
                 instance=instance,
                 bounce_by=time.time(),
                 wait_until=time.time(),
                 watcher=type(self).__name__,
                 failures=0,
                 enqueue_time=time.time(),
                 bounce_start_time=time.time(),
             ))
     return service_instances
Example #5
0
def cleanup_apps(soa_dir):
    """Remove running marathon apps whose (service, instance) is not valid.

    Compares the app ids reported by marathon against the valid pairs
    returned by get_services_for_cluster and calls delete_app for each
    app that has no valid pair. Ids that raise InvalidJobNameError when
    parsed are logged and skipped.

    :param soa_dir: The SOA config directory to read from"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    marathon_client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    configured = get_services_for_cluster(
        instance_type='marathon',
        soa_dir=soa_dir,
    )
    live_app_ids = marathon_tools.list_all_marathon_app_ids(marathon_client)

    for app_id in live_app_ids:
        log.debug("Checking app id %s", app_id)
        try:
            service, instance, _sha, _cfg = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            log.warn(
                "%s doesn't conform to paasta naming conventions? Skipping." %
                app_id)
            continue
        still_configured = (service, instance) in configured
        if not still_configured:
            delete_app(
                app_id=app_id,
                client=marathon_client,
                soa_dir=soa_dir,
            )
Example #6
0
def delete_app(app_id, client):
    """Safely delete a marathon app and record the deletion in the deploy log.

    The delete runs inside bounce_lib.bounce_lock_zookeeper for the
    composed job id, followed by an 'event'-level log line.

    :param app_id: marathon app id; must be parseable by deformat_job_id
    :param client: marathon client handed to bounce_lib.delete_marathon_app

    IOError inside the try block is logged at debug level and not
    re-raised; every other exception is logged to the 'deploy' component
    at 'debug' level, one traceback line per entry, then re-raised.
    """
    log.warn("%s appears to be old; attempting to delete" % app_id)
    parsed = marathon_tools.deformat_job_id(app_id)
    service, instance, _, __ = parsed
    try:
        job_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(job_id):
            bounce_lib.delete_marathon_app(app_id, client)
            deleted_msg = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=deleted_msg,
            )
    except IOError:
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        report = ['Exception raised during cleanup of service %s:' % service]
        report += traceback.format_exc().rstrip().split("\n")
        for entry in report:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=entry,
            )
        raise
Example #7
0
 def get_at_risk_service_instances(self, draining_hosts):
     """Return one ServiceInstance per (service, instance) with a task on a draining host.

     Marathon apps are fetched through ``self.marathon_client`` with
     tasks embedded; tasks whose ``host`` is contained in
     ``draining_hosts`` are reduced to unique ``(service, instance)``
     pairs.

     :param draining_hosts: container of hosts; every task's ``host`` is
         tested against it with ``in``
     :returns: list of ServiceInstance in order of first appearance, at
         most one entry per ``(service, instance)`` pair
     """
     marathon_apps = get_all_marathon_apps(self.marathon_client,
                                           embed_tasks=True)
     at_risk_tasks = [
         task for app in marathon_apps for task in app.tasks
         if task.host in draining_hosts
     ]
     self.log.info("At risk tasks: {}".format(at_risk_tasks))

     service_instances = []
     # Pairs that already have an entry: the same instance must not be
     # added to the bounce queue more than once, and a set lookup replaces
     # a scan of service_instances for every task.
     already_added = set()
     for task in at_risk_tasks:
         app_id = task.app_id.strip('/')
         service, instance, _, __ = deformat_job_id(app_id)
         pair = (service, instance)
         if pair in already_added:
             continue
         already_added.add(pair)
         service_instances.append(
             ServiceInstance(
                 service=service,
                 instance=instance,
                 cluster=self.config.get_cluster(),
                 bounce_by=int(time.time()),
                 watcher=type(self).__name__,
                 bounce_timers=None,
                 failures=0,
             ))
     return service_instances
def delete_app(app_id, client, soa_dir):
    """Delete a marathon app under its bounce lock and report the removal.

    After the delete, an OK event is sent for each of the three
    per-instance checks (replication, setup_marathon_job, bounce
    progress) and an 'event'-level line is written to the deploy log.

    :param app_id: marathon app id; must be parseable by deformat_job_id
    :param client: marathon client handed to bounce_lib.delete_marathon_app
    :param soa_dir: SOA config directory, forwarded to send_event
    :raises Exception: any exception other than IOError raised inside the
        try block is logged line by line and re-raised; IOError is logged
        at debug level and not re-raised
    """
    log.warn("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
        # The same OK event goes to every check tied to this instance; only
        # the check name differs.
        for check_template in (
            'check_marathon_services_replication.%s',
            'setup_marathon_job.%s',
            'paasta_bounce_progress.%s',
        ):
            send_event(
                service=service,
                check_name=check_template % short_app_id,
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output="This instance was removed and is no longer running",
            )
        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
            line=log_line,
        )
    except IOError:
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            # Reuse the cluster resolved before the try block: reloading the
            # system config here would repeat the work for every line, and a
            # failure in that reload would mask the exception being reported.
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
Example #9
0
def delete_app(app_id, client, soa_dir):
    """Safely delete a marathon app, resolve its checks and log the deletion.

    The app is deleted while holding bounce_lib.bounce_lock_zookeeper for
    the composed job id. Afterwards an OK event is sent for the
    replication, setup_marathon_job and bounce-progress checks of the
    instance, and an 'event'-level deploy log line is written.

    :param app_id: marathon app id; must be parseable by deformat_job_id
    :param client: marathon client handed to bounce_lib.delete_marathon_app
    :param soa_dir: SOA config directory, forwarded to send_event
    :raises Exception: any exception other than IOError raised inside the
        try block is logged line by line and re-raised; IOError is logged
        at debug level and not re-raised
    """
    log.warn("%s appears to be old; attempting to delete" % app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
        # One identical OK event per check name; the lock is already
        # released at this point.
        check_templates = (
            'check_marathon_services_replication.%s',
            'setup_marathon_job.%s',
            'paasta_bounce_progress.%s',
        )
        for template in check_templates:
            send_event(
                service=service,
                check_name=template % short_app_id,
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output="This instance was removed and is no longer running",
            )
        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
            line=log_line,
        )
    except IOError:
        log.debug("%s is being bounced, skipping" % app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            # `cluster` was resolved before the try block, so it is always
            # bound here. Using it avoids a config reload per traceback line
            # and keeps a reload failure from hiding the original exception.
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
Example #10
0
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Delete running marathon apps that have no valid (service, instance).

    Parses every running app id, selects the apps whose
    (service, instance) pair is not among the valid services for the
    cluster, and deletes them, unless doing so would exceed the kill
    threshold.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the fraction of running apps
        selected for deletion is greater than kill_threshold and force
        is false"""
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    valid_services = get_services_for_cluster(
        instance_type='marathon',
        soa_dir=soa_dir,
    )
    raw_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    # Parsed (service, instance, git_sha, config_sha) for each well-formed id.
    running_apps = []
    for raw_id in raw_app_ids:
        try:
            parsed = marathon_tools.deformat_job_id(raw_id)
        except InvalidJobNameError:
            log.warn(
                "%s doesn't conform to paasta naming conventions? Skipping." %
                raw_id)
        else:
            running_apps.append(parsed)

    apps_to_kill = []
    for service, instance, git_sha, config_sha in running_apps:
        if (service, instance) in valid_services:
            continue
        apps_to_kill.append((service, instance, git_sha, config_sha))

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)

    if running_apps:
        kill_fraction = float(len(apps_to_kill)) / float(len(running_apps))
        if kill_fraction > float(kill_threshold) and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold, )
            raise DontKillEverythingError

    for doomed in apps_to_kill:
        delete_app(
            app_id=marathon_tools.format_job_id(*doomed),
            client=client,
            soa_dir=soa_dir,
        )
Example #11
0
def get_active_shas_for_marathon_apps(
    marathon_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]],
) -> Set[Tuple[str, str]]:
    """Collect the (git_sha, config_sha) pair of every given marathon app.

    The git sha is obtained from the app's docker image via
    get_git_sha_from_dockerurl(..., long=True); the config sha is the
    fourth field of the deformatted app id with a leading "config"
    removed when present.

    :param marathon_apps_with_clients: (app, client) pairs; only the app
        is inspected
    :returns: set of (git_sha, config_sha) tuples
    """
    config_prefix = "config"
    active_shas = set()
    for app, _client in marathon_apps_with_clients:
        image = app.container.docker.image
        git_sha = get_git_sha_from_dockerurl(image, long=True)
        _service, _instance, _git, config_sha = marathon_tools.deformat_job_id(app.id)
        stripped = (
            config_sha[len(config_prefix):]
            if config_sha.startswith(config_prefix)
            else config_sha
        )
        active_shas.add((git_sha, stripped))
    return active_shas
Example #12
0
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Kill marathon apps that are running but no longer valid.

    Running app ids are parsed into (service, instance, git_sha,
    config_sha); those whose (service, instance) is not a valid service
    for the cluster are re-formatted into app ids and passed to
    delete_app. Unparseable ids are skipped with a warning.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when more than kill_threshold of the
        running apps would be killed and force is false"""
    log.info("Loading marathon configuration")
    config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        config.get_url(),
        config.get_username(),
        config.get_password(),
    )

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    listed_ids = marathon_tools.list_all_marathon_app_ids(client)

    running_apps = []
    for listed_id in listed_ids:
        try:
            job_fields = marathon_tools.deformat_job_id(listed_id)
        except InvalidJobNameError:
            log.warn("%s doesn't conform to paasta naming conventions? Skipping." % listed_id)
            continue
        running_apps.append(job_fields)

    # Keep the full 4-tuple so the app id can be rebuilt for deletion.
    apps_to_kill = []
    for service, instance, git_sha, config_sha in running_apps:
        if (service, instance) not in valid_services:
            apps_to_kill.append((service, instance, git_sha, config_sha))

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)

    if running_apps:
        doomed_count = float(len(apps_to_kill))
        running_count = float(len(running_apps))
        too_many = doomed_count / running_count > float(kill_threshold)
        if too_many and not force:
            log.critical("Paasta was about to kill more than %s of the running services, this "
                         "is probably a BAD mistake!, run again with --force if you "
                         "really need to destroy everything" % kill_threshold)
            raise DontKillEverythingError

    for job_fields in apps_to_kill:
        stale_app_id = marathon_tools.format_job_id(*job_fields)
        delete_app(app_id=stale_app_id, client=client, soa_dir=soa_dir)