Ejemplo n.º 1
0
 def test_bounce_lock_zookeeper(self):
     """Check the client/lock lifecycle driven by bounce_lock_zookeeper.

     KazooClient and load_system_paasta_config are patched inside
     paasta_tools.bounce_lib; after entering and leaving the context
     manager the test verifies that the client was built from the
     configured hosts, started, used to take and release the named lock,
     and finally stopped.
     """
     lock_name = 'watermelon'
     fake_zk_hosts = 'awjti42ior'
     lock_double = mock.Mock()
     zk_double = mock.MagicMock(Lock=mock.Mock(return_value=lock_double))

     kazoo_patcher = mock.patch(
         'paasta_tools.bounce_lib.KazooClient',
         return_value=zk_double,
         autospec=True,
     )
     config_patcher = mock.patch(
         'paasta_tools.bounce_lib.load_system_paasta_config',
         return_value=mock.Mock(get_zk_hosts=lambda: fake_zk_hosts),
         autospec=True,
     )
     # The patchers are entered in the same order as they were created.
     with kazoo_patcher as client_patch, config_patcher as hosts_patch:
         with bounce_lib.bounce_lock_zookeeper(lock_name):
             pass

         # Configuration lookup and client construction.
         hosts_patch.assert_called_once_with()
         client_patch.assert_called_once_with(
             hosts=fake_zk_hosts,
             timeout=bounce_lib.ZK_LOCK_CONNECT_TIMEOUT_S,
         )

         # Client start, lock acquire/release, client stop.
         expected_lock_path = f'{bounce_lib.ZK_LOCK_PATH}/{lock_name}'
         zk_double.start.assert_called_once_with()
         zk_double.Lock.assert_called_once_with(expected_lock_path)
         lock_double.acquire.assert_called_once_with(timeout=1)
         lock_double.release.assert_called_once_with()
         zk_double.stop.assert_called_once_with()
Ejemplo n.º 2
0
 def test_bounce_lock_zookeeper(self):
     """Check the client/lock lifecycle driven by bounce_lock_zookeeper.

     KazooClient and load_system_paasta_config are patched inside
     paasta_tools.bounce_lib; the test then asserts that the client is
     created from the configured hosts, started, used to acquire and
     release the named lock, and stopped.
     """
     lock_name = 'watermelon'
     fake_lock = mock.Mock()
     fake_zk = mock.MagicMock(Lock=mock.Mock(return_value=fake_lock))
     fake_zk_hosts = 'awjti42ior'
     # contextlib.nested no longer exists in Python 3; a single ``with``
     # statement listing several context managers enters them in the same
     # left-to-right order and works on Python 2.7 as well.
     with mock.patch(
             'paasta_tools.bounce_lib.KazooClient',
             return_value=fake_zk,
             autospec=True,
     ) as client_patch, mock.patch(
             'paasta_tools.bounce_lib.load_system_paasta_config',
             return_value=mock.Mock(get_zk_hosts=lambda: fake_zk_hosts),
             autospec=True,
     ) as hosts_patch:
         with bounce_lib.bounce_lock_zookeeper(lock_name):
             pass
         hosts_patch.assert_called_once_with()
         client_patch.assert_called_once_with(
             hosts=fake_zk_hosts,
             timeout=bounce_lib.ZK_LOCK_CONNECT_TIMEOUT_S,
         )
         fake_zk.start.assert_called_once_with()
         fake_zk.Lock.assert_called_once_with('%s/%s' % (bounce_lib.ZK_LOCK_PATH, lock_name))
         fake_lock.acquire.assert_called_once_with(timeout=1)
         fake_lock.release.assert_called_once_with()
         fake_zk.stop.assert_called_once_with()
Ejemplo n.º 3
0
def delete_app(app_id, client):
    """Delete a marathon app under its bounce lock and log that it happened.

    :param app_id: Marathon app id; split with
        ``marathon_tools.deformat_job_id`` to recover the service and instance
    :param client: Passed through to ``bounce_lib.delete_marathon_app``
    :raises Exception: any error other than ``IOError`` is written line by
        line to the deploy log at debug level and then re-raised
    """
    # Logger.warn is a deprecated alias; Logger.warning is the supported name.
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    try:
        with bounce_lib.bounce_lock_zookeeper(marathon_tools.compose_job_id(service, instance)):
            bounce_lib.delete_marathon_app(app_id, client)
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=log_line,
            )
    except IOError:
        # NOTE(review): presumably raised when the bounce lock is already
        # held by another process -- confirm against bounce_lock_zookeeper.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        # Look the cluster up once instead of reloading the system config
        # for every traceback line.
        cluster = load_system_paasta_config().get_cluster()
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
Ejemplo n.º 4
0
def delete_app(app_id, client):
    """Delete a marathon app under its bounce lock and log that it happened.

    :param app_id: Marathon app id; split with
        ``marathon_tools.deformat_job_id`` to recover the service and instance
    :param client: Passed through to ``bounce_lib.delete_marathon_app``
    :raises Exception: any error other than ``IOError`` is written line by
        line to the deploy log at debug level and then re-raised
    """
    # Logger.warn is a deprecated alias; Logger.warning is the supported name.
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    try:
        with bounce_lib.bounce_lock_zookeeper(
                marathon_tools.compose_job_id(service, instance)):
            bounce_lib.delete_marathon_app(app_id, client)
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(service=service,
                 component='deploy',
                 level='event',
                 cluster=load_system_paasta_config().get_cluster(),
                 instance=instance,
                 line=log_line)
    except IOError:
        # NOTE(review): presumably raised when the bounce lock is already
        # held by another process -- confirm against bounce_lock_zookeeper.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        # Look the cluster up once instead of reloading the system config
        # for every traceback line.
        cluster = load_system_paasta_config().get_cluster()
        for logline in loglines:
            _log(service=service,
                 component='deploy',
                 level='debug',
                 cluster=cluster,
                 instance=instance,
                 line=logline)
        raise
Ejemplo n.º 5
0
def deploy_marathon_service(service, instance, client, soa_dir,
                            marathon_config, marathon_apps):
    """Deploy the given service instance and process the return code.

    The whole operation runs while holding the bounce lock for the
    instance. The outcome of ``setup_service`` is reported through
    ``send_event``; a failure raised by it is reported as CRITICAL.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param client: A MarathonClient object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_config: The service instance's configuration dict
        (not referenced inside this function)
    :param marathon_apps: A list of all marathon app objects
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd.
        status is 1 when the configuration could not be read or setup raised
        one of the handled exceptions, otherwise 0.
        bounce_in_seconds instructs how long until the deployd should try another bounce;
        None means that it is in a steady state and doesn't need to bounce again
    """
    job_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(job_id):
            # Step 1: load the instance configuration.
            try:
                job_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                skip_details = (
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                )
                log.debug("No deployments found for %s.%s in cluster %s. Skipping." % skip_details)
                return (0, None)
            except NoConfigurationForServiceError:
                failure_details = (
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                )
                log.error(
                    "Could not read marathon configuration file for %s.%s in cluster %s" % failure_details
                )
                return (1, None)

            # Step 2: run the setup and report its outcome to sensu.
            try:
                status, output, bounce_again_in_seconds = setup_service(
                    service, instance, client, job_config, marathon_apps, soa_dir,
                )
                if status:
                    sensu_status = pysensu_yelp.Status.CRITICAL
                else:
                    sensu_status = pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return (0, bounce_again_in_seconds)
            except (
                KeyError,
                TypeError,
                AttributeError,
                InvalidInstanceConfig,
                NoSlavesAvailableError,
            ):
                trace_text = traceback.format_exc()
                log.error(trace_text)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, trace_text)
                return (1, None)
    except bounce_lib.LockHeldException:
        log.error("Instance %s already being bounced. Exiting", job_id)
        return (0, None)
Ejemplo n.º 6
0
def delete_app(app_id, client, soa_dir):
    """Delete a marathon app under its bounce lock, resolve its checks and log it.

    After the app is deleted, an OK event is sent for each per-instance
    check so that they stop reporting for the removed instance.

    :param app_id: Marathon app id; split with
        ``marathon_tools.deformat_job_id`` to recover the service and instance
    :param client: Passed through to ``bounce_lib.delete_marathon_app``
    :param soa_dir: Passed through to ``send_event``
    :raises Exception: any error other than ``IOError`` is written line by
        line to the deploy log at debug level and then re-raised
    """
    # Logger.warn is a deprecated alias; Logger.warning is the supported name.
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
        # Same OK event for every check tied to this instance, in the
        # original order.
        for check_prefix in (
            'check_marathon_services_replication',
            'setup_marathon_job',
            'paasta_bounce_progress',
        ):
            send_event(
                service=service,
                check_name='%s.%s' % (check_prefix, short_app_id),
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output="This instance was removed and is no longer running",
            )
        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
            line=log_line,
        )
    except IOError:
        # NOTE(review): presumably raised when the bounce lock is already
        # held by another process -- confirm against bounce_lock_zookeeper.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            # Reuse the cluster resolved above rather than reloading the
            # system config while handling an error.
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
Ejemplo n.º 7
0
def delete_app(app_id, client, soa_dir):
    """Delete a marathon app under its bounce lock, resolve its checks and log it.

    After the app is deleted, an OK event is sent for each per-instance
    check so that they stop reporting for the removed instance.

    :param app_id: Marathon app id; split with
        ``marathon_tools.deformat_job_id`` to recover the service and instance
    :param client: Passed through to ``bounce_lib.delete_marathon_app``
    :param soa_dir: Passed through to ``send_event``
    :raises Exception: any error other than ``IOError`` is written line by
        line to the deploy log at debug level and then re-raised
    """
    # Logger.warn is a deprecated alias; Logger.warning is the supported name.
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    resolved_checks = (
        'check_marathon_services_replication',
        'setup_marathon_job',
        'paasta_bounce_progress',
    )
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
        # Same OK event for every check tied to this instance, in the
        # original order.
        for check_prefix in resolved_checks:
            send_event(
                service=service,
                check_name='%s.%s' % (check_prefix, short_app_id),
                soa_dir=soa_dir,
                status=pysensu_yelp.Status.OK,
                overrides={},
                output="This instance was removed and is no longer running",
            )
        log_line = "Deleted stale marathon job that looks lost: %s" % app_id
        _log(
            service=service,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance,
            line=log_line,
        )
    except IOError:
        # NOTE(review): presumably raised when the bounce lock is already
        # held by another process -- confirm against bounce_lock_zookeeper.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            # Reuse the cluster resolved above rather than reloading the
            # system config while handling an error.
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
Ejemplo n.º 8
0
def deploy_marathon_service(service, instance, client, soa_dir,
                            marathon_config, marathon_apps):
    """Deploy one service instance under its bounce lock and report to sensu.

    :param service: The service name
    :param instance: The instance of the service
    :param client: Passed through to ``setup_service``
    :param soa_dir: Passed to the config loader, ``setup_service`` and ``send_event``
    :param marathon_config: Not referenced inside this function
    :param marathon_apps: Passed through to ``setup_service``
    :returns: 1 when the configuration could not be read or setup raised one
        of the handled exceptions; 0 otherwise (including when there are no
        deployments or the instance is already being bounced)
    """
    job_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(job_id):
            # Step 1: load the instance configuration.
            try:
                job_config = marathon_tools.load_marathon_service_config(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                skip_details = (
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                )
                log.debug("No deployments found for %s.%s in cluster %s. Skipping." % skip_details)
                return 0
            except NoConfigurationForServiceError:
                failure_details = (
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                )
                log.error(
                    "Could not read marathon configuration file for %s.%s in cluster %s" % failure_details
                )
                return 1

            # Step 2: run the setup and report its outcome to sensu.
            try:
                status, output = setup_service(
                    service, instance, client, job_config, marathon_apps, soa_dir,
                )
                if status:
                    sensu_status = pysensu_yelp.Status.CRITICAL
                else:
                    sensu_status = pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
                trace_text = traceback.format_exc()
                log.error(trace_text)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, trace_text)
                return 1
    except bounce_lib.LockHeldException:
        log.error("Instance %s already being bounced. Exiting", job_id)
        return 0
Ejemplo n.º 9
0
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Passed to drain_lib.get_drain_method as drain_method_params.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Passed through to do_bounce.
    :returns: A tuple of (status, output) to be used with send_sensu_event.
        status is 1 when the drain or bounce method is not recognized or the
        instance is already being bounced, 0 otherwise.
    :raises ValueError: if more than one existing app matches config['id']
    :raises Exception: anything raised while bouncing is logged to the deploy
        log and re-raised"""
    def log_deploy_error(errormsg, level='event'):
        # Honor the requested level; it used to be hard-coded to 'event',
        # which sent debug-level tracebacks out as events.
        return _log(service=service,
                    line=errormsg,
                    component='deploy',
                    level=level,
                    cluster=cluster,
                    instance=instance)

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service,
                                                     instance,
                                                     client,
                                                     embed_failures=True)
    # Split the matching apps into the one described by `config` and the rest.
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" %
                             len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service,
                                                     nerve_ns,
                                                     **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = get_old_happy_unhappy_draining_tasks(
        other_apps, drain_method, service, nerve_ns, bounce_health_params)

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            client.scale_app(app_id=new_app.id,
                             instances=config['instances'],
                             force=True)
        elif new_app.instances > config['instances']:
            # Scaling down: hand the surplus tasks of the new app to the
            # "old app" buckets, taking draining tasks first, then unhappy,
            # then happy ones.
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) -
                config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks),
                                         num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            protected_draining_tasks.update(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks),
                                        num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks),
                                      num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]
        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )

        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
Ejemplo n.º 10
0
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Passed to drain_lib.get_drain_method as drain_method_params.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Passed through to do_bounce.
    :returns: A tuple of (status, output) to be used with send_sensu_event.
        status is 1 when the drain or bounce method is not recognized or the
        instance is already being bounced, 0 otherwise.
    :raises ValueError: if more than one existing app matches config['id']
    :raises Exception: anything raised while bouncing is logged to the deploy
        log and re-raised"""

    def log_deploy_error(errormsg, level='event'):
        # Honor the requested level; it used to be hard-coded to 'event',
        # which sent debug-level tracebacks out as events.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    # Split the matching apps into the one described by `config` and the rest.
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = get_old_happy_unhappy_draining_tasks(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params
    )

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            client.scale_app(app_id=new_app.id, instances=config['instances'], force=True)
        elif new_app.instances > config['instances']:
            # Scaling down: hand the surplus tasks of the new app to the
            # "old app" buckets, taking draining tasks first, then unhappy,
            # then happy ones.
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            protected_draining_tasks.update(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]
        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )

        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
Ejemplo n.º 11
0
def deploy_marathon_service(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    marathon_apps_with_clients: Optional[Sequence[Tuple[MarathonApp,
                                                        MarathonClient]]],
    system_paasta_config: Optional[SystemPaastaConfig] = None,
) -> Tuple[int, Optional[float]]:
    """deploy the service instance given and process return code
    if there was an error we send a sensu alert.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param clients: A MarathonClients object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_apps_with_clients: (app, client) pairs for the instance;
        when None they are fetched with
        marathon_tools.get_marathon_apps_with_clients
    :param system_paasta_config: System config to use; when None it is loaded
        with load_system_paasta_config()
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd
        bounce_in_seconds instructs how long until the deployd should try another bounce
        None means that it is in a steady state and doesn't need to bounce again
    """
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()

    short_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(
                short_id, system_paasta_config=system_paasta_config):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    system_paasta_config.get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                log.debug(
                    "No deployments found for %s.%s in cluster %s. Skipping." %
                    (service, instance, system_paasta_config.get_cluster()))
                return 0, None
            except NoConfigurationForServiceError:
                error_msg = (
                    "Could not read marathon configuration file for %s.%s in cluster %s"
                    % (service, instance, system_paasta_config.get_cluster()))
                log.error(error_msg)
                return 1, None

            # The caller may not have the app list yet; look it up using the
            # clients that apply to this job's configuration.
            if marathon_apps_with_clients is None:
                marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
                    clients=clients.get_all_clients_for_service(
                        job_config=service_instance_config),
                    service_name=service,
                    instance_name=instance,
                    embed_tasks=True,
                )

            try:
                with a_sync.idle_event_loop():
                    status, output, bounce_again_in_seconds = setup_service(
                        service=service,
                        instance=instance,
                        clients=clients,
                        job_config=service_instance_config,
                        marathon_apps_with_clients=marathon_apps_with_clients,
                        soa_dir=soa_dir,
                        system_paasta_config=system_paasta_config,
                    )
                sensu_status = (pysensu_yelp.Status.CRITICAL
                                if status else pysensu_yelp.Status.OK)
                send_event(
                    service,
                    instance,
                    soa_dir,
                    sensu_status,
                    output,
                    system_paasta_config,
                    service_instance_config,
                )
                # The setup outcome is reported through sensu above; the
                # returned status stays 0 whatever `status` was.
                return 0, bounce_again_in_seconds
            except (
                    KeyError,
                    TypeError,
                    AttributeError,
                    InvalidInstanceConfig,
                    NoSlavesAvailableError,
            ):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(
                    service,
                    instance,
                    soa_dir,
                    pysensu_yelp.Status.CRITICAL,
                    error_str,
                    system_paasta_config,
                    service_instance_config,
                )
                return 1, None
    except bounce_lib.LockHeldException:
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0, None
Ejemplo n.º 12
0
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Forwarded as-is to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Forwarded as-is to do_bounce.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output) to be used with send_sensu_event.
              status is 0 on success and 1 when the drain method or bounce
              method is not recognized, or when the bounce lock is already held.
    :raises ValueError: if more than one existing app matches the id in ``config``.
                        Any exception raised while bouncing is logged and re-raised."""
    def log_deploy_error(errormsg, level='event'):
        # Forward the requested level; it used to be hard-coded to 'event',
        # which silently ignored callers asking for level='debug'.
        return _log(service=service,
                    line=errormsg,
                    component='deploy',
                    level=level,
                    cluster=cluster,
                    instance=instance)

    short_id = marathon_tools.format_job_id(service, instance)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service,
                                                     instance,
                                                     client,
                                                     embed_failures=True)
    # Split the matching apps into the one carrying the desired config id and
    # every other (old) app.
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" %
                             len(new_app_list))
        new_app = new_app_list[0]
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service,
                                                     nerve_ns,
                                                     system_paasta_config,
                                                     **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
        system_paasta_config,
    )

    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.debug("Scaling %s from %d to %d instances.",
                      new_app.id, new_app.instances,
                      config['instances'] + num_at_risk_tasks)
            client.scale_app(app_id=new_app.id,
                             instances=config['instances'] + num_at_risk_tasks,
                             force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # Surplus tasks are handed to the "old app" buckets in this order:
        # already draining, unhappy, at-risk, and finally happy tasks.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) -
                config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
                system_paasta_config,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks),
                                         num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale -= tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks),
                                        num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale -= tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks),
                                        num_tasks_to_scale)
            old_app_at_risk_tasks[new_app.id] = set(
                scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale -= tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks),
                                      num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            # Only the happy tasks that were not handed over still count
            # as happy tasks of the new app.
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get(new_app.id, []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    old_app_at_risk_tasks=old_app_at_risk_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                    bounce_margin_factor=bounce_margin_factor,
                )

        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (
            service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
Ejemplo n.º 13
0
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    marathon_apps,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param marathon_apps: Forwarded to marathon_tools.get_matching_apps
                          (presumably the already-fetched marathon apps; confirm against caller).
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Forwarded as-is to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Forwarded as-is to do_bounce.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output) to be used with send_sensu_event.
              status is 0 on success and 1 when the drain method or bounce
              method is not recognized, or when the bounce lock is already held.
    :raises ValueError: if more than one existing app matches the id in ``config``.
                        Any exception raised while bouncing is logged and re-raised."""

    def log_deploy_error(errormsg, level='event'):
        # Forward the requested level; it used to be hard-coded to 'event',
        # which silently ignored callers asking for level='debug'.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance
        )

    short_id = marathon_tools.format_job_id(service, instance)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, marathon_apps)
    # Split the matching apps into the one carrying the desired config id and
    # every other (old) app.
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app = new_app_list[0]
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, system_paasta_config,
                                                     **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    (old_app_live_happy_tasks,
     old_app_live_unhappy_tasks,
     old_app_draining_tasks,
     old_app_at_risk_tasks,
     ) = get_tasks_by_state(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
        system_paasta_config,
    )

    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s from %d to %d instances.",
                     new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks)
            client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # Surplus tasks are handed to the "old app" buckets in this order:
        # already draining, unhappy, at-risk, and finally happy tasks.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
                system_paasta_config,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale -= tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale -= tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[new_app.id] = set(scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale -= tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            # Only the happy tasks that were not handed over still count
            # as happy tasks of the new app.
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get(new_app.id, []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    old_app_at_risk_tasks=old_app_at_risk_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                    bounce_margin_factor=bounce_margin_factor,
                )

        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Forwarded as-is to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Forwarded as-is to do_bounce.
    :returns: A tuple of (status, output) to be used with send_sensu_event.
              status is 0 on success and 1 when the drain method or bounce
              method is not recognized, or when the bounce lock is already held.
    :raises ValueError: if more than one existing app matches the id in ``config``.
                        Any exception raised while bouncing is logged and re-raised."""

    def log_deploy_error(errormsg, level="event"):
        # Forward the requested level; it used to be hard-coded to "event",
        # which silently ignored callers asking for level="debug".
        return _log(
            service=service, line=errormsg, component="deploy", level=level, cluster=cluster, instance=instance
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    # Split the matching apps into the one carrying the desired config id and
    # every other (old) app.
    new_app_list = [a for a in existing_apps if a.id == "/%s" % config["id"]]
    other_apps = [a for a in existing_apps if a.id != "/%s" % config["id"]]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app = new_app_list[0]
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = "ERROR: drain_method not recognized: %s. Must be one of (%s)" % (
            drain_method_name,
            ", ".join(drain_lib.list_drain_methods()),
        )
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_tasks, old_app_draining_tasks = get_old_live_draining_tasks(other_apps, drain_method)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
    # `paasta mark-for-deployment`), then we should undrain them.
    if new_app_running:
        for task in new_app.tasks:
            drain_method.stop_draining(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = "ERROR: bounce_method not recognized: %s. Must be one of (%s)" % (
                bounce_method,
                ", ".join(bounce_lib.list_bounce_methods()),
            )
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_tasks=old_app_live_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )

        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        # Emit the traceback one line per log entry.
        loglines = ["Exception raised during deploy of service %s:" % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level="debug")
        raise

    return (0, "Service deployed.")