Example #1
0
def get_service_instances_needing_update(
    marathon_clients: MarathonClients,
    instances: Collection[Tuple[str, str]],
    cluster: str,
) -> List[Tuple[str, str, MarathonServiceConfig, str]]:
    marathon_apps = {}
    for marathon_client in marathon_clients.get_all_clients():
        marathon_apps.update(
            {app.id: app
             for app in get_all_marathon_apps(marathon_client)})

    marathon_app_ids = marathon_apps.keys()
    service_instances = []
    for service, instance in instances:
        try:
            config = load_marathon_service_config_no_cache(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=DEFAULT_SOA_DIR,
            )
            config_app = config.format_marathon_app_dict()
            app_id = "/{}".format(config_app["id"])
        # Not ideal but we rely on a lot of user input to create the app dict
        # and we really can't afford to bail if just one app definition is malformed
        except Exception as e:
            print("ERROR: Skipping {}.{} because: '{}'".format(
                service, instance, str(e)))
            continue
        if (app_id not in marathon_app_ids
                or marathon_apps[app_id].instances != config_app["instances"]):
            service_instances.append((service, instance, config, app_id))
    return service_instances
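A minimal usage sketch of how a deployd-style caller might drive this function. The import path paasta_tools.deployd.common is inferred from the mock.patch targets in the tests below; the client and instance wiring is illustrative, not the real deployd worker code.

# Hedged usage sketch: assumes get_service_instances_needing_update lives in
# paasta_tools.deployd.common (the tests below patch that module) and that the
# caller already has a MarathonClients object and a list of (service, instance) pairs.
from paasta_tools.deployd.common import get_service_instances_needing_update

def report_outdated_instances(marathon_clients, instances, cluster):
    """Print each service.instance whose desired config differs from what Marathon runs."""
    for service, instance, config, app_id in get_service_instances_needing_update(
        marathon_clients=marathon_clients,
        instances=instances,  # e.g. [("my_service", "main"), ("my_service", "canary")]
        cluster=cluster,
    ):
        print("{}.{} ({}) needs an update".format(service, instance, app_id))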
Example #2
0
def test_get_service_instances_that_need_bouncing_at_risk():
    with mock.patch(
            "paasta_tools.list_marathon_service_instances.get_desired_marathon_configs",
            autospec=True,
    ) as mock_get_desired_marathon_configs, mock.patch(
            "paasta_tools.list_marathon_service_instances.get_num_at_risk_tasks",
            autospec=True,
    ) as mock_get_num_at_risk_tasks, mock.patch(
            "paasta_tools.list_marathon_service_instances.get_draining_hosts",
            autospec=True):
        mock_get_desired_marathon_configs.return_value = (
            {
                "fake--service.fake--instance.sha.config": {
                    "instances": 5
                }
            },
            {
                "fake--service.fake--instance.sha.config":
                mock.Mock(get_marathon_shard=mock.Mock(return_value=None))
            },
        )
        fake_apps = [
            mock.MagicMock(instances=5,
                           id="/fake--service.fake--instance.sha.config")
        ]
        mock_client = mock.MagicMock(list_apps=mock.MagicMock(
            return_value=fake_apps))
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        mock_get_num_at_risk_tasks.return_value = 1
        assert set(
            list_marathon_service_instances.
            get_service_instances_that_need_bouncing(
                marathon_clients=fake_clients,
                soa_dir="/fake/soa/dir")) == {"fake_service.fake_instance"}
Example #3
0
def get_service_instances_needing_update(
    marathon_clients: MarathonClients,
    instances: Collection[Tuple[str, str]],
    cluster: str,
) -> List[Tuple[str, str]]:
    marathon_apps = {}
    for marathon_client in marathon_clients.get_all_clients():
        marathon_apps.update(
            {app.id: app
             for app in get_all_marathon_apps(marathon_client)})

    marathon_app_ids = marathon_apps.keys()
    service_instances = []
    for service, instance in instances:
        try:
            config = load_marathon_service_config_no_cache(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=DEFAULT_SOA_DIR,
            )
            config_app = config.format_marathon_app_dict()
            app_id = '/{}'.format(config_app['id'])
        except (NoDockerImageError, InvalidJobNameError,
                NoDeploymentsAvailable) as e:
            print("DEBUG: Skipping %s.%s because: '%s'" %
                  (service, instance, str(e)))
            continue
        if app_id not in marathon_app_ids:
            service_instances.append((service, instance))
        elif marathon_apps[app_id].instances != config_app['instances']:
            service_instances.append((service, instance))
    return service_instances
Example #4
0
def test_create_marathon_dashboard(
    mock_get_services_for_cluster, mock_pscl, mock_load_system_paasta_config
):
    soa_dir = "/fake/soa/dir"
    cluster = "fake_cluster"
    mock_load_system_paasta_config.return_value = SystemPaastaConfig(
        {"dashboard_links": {}}, "fake_directory"
    )
    mock_get_services_for_cluster.return_value = [
        ("fake_service", "foo"),
        ("fake_service", "bar"),
    ]
    mock_pscl.return_value.instance_configs.return_value = [
        MarathonServiceConfig("fake_service", "fake_cluster", "foo", {}, {}, soa_dir),
        MarathonServiceConfig("fake_service", "fake_cluster", "bar", {}, {}, soa_dir),
    ]

    mock_client = mock.Mock(servers=["hi"])
    mock_clients = MarathonClients(current=[mock_client], previous=[mock_client])

    expected_output = {
        "fake_cluster": [
            {"service": "fake_service", "instance": "foo", "shard_url": "hi"},
            {"service": "fake_service", "instance": "bar", "shard_url": "hi"},
        ]
    }
    assert (
        marathon_dashboard.create_marathon_dashboard(
            cluster=cluster, soa_dir=soa_dir, marathon_clients=mock_clients
        )
        == expected_output
    )
Example #5
0
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: Optional[MarathonClients] = None,
    system_paasta_config: Optional[SystemPaastaConfig] = None,
) -> Marathon_Dashboard:
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(
        system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(
            marathon_servers=marathon_servers, cached=False)
    for service_instance in instances:
        service: str = service_instance[0]
        instance: str = service_instance[1]
        service_config: MarathonServiceConfig = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            load_deployments=False,
            soa_dir=soa_dir,
        )
        client: MarathonClient = marathon_clients.get_current_client_for_service(
            job_config=service_config)
        dashboard_links: Dict = system_paasta_config.get_dashboard_links()
        shard_url: str = client.servers[0]
        if 'Marathon RO' in dashboard_links[cluster]:
            marathon_links = dashboard_links[cluster]['Marathon RO']
            if isinstance(marathon_links, list):
                for shard_number, shard in enumerate(marathon_servers.current):
                    if shard.url[0] == shard_url:
                        shard_url = marathon_links[shard_number]
            elif isinstance(marathon_links, str):
                shard_url = marathon_links.split(' ')[0]
        service_info: Marathon_Dashboard_Item = {
            'service': service,
            'instance': instance,
            'shard_url': shard_url,
        }
        dashboard[cluster].append(service_info)
    return dashboard
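Both versions of create_marathon_dashboard return a mapping from cluster name to a list of Marathon_Dashboard_Item dicts. The shape below mirrors the expected_output in the tests (Examples #4 and #8) and is shown only to make the return type concrete.

# Return shape of create_marathon_dashboard, mirroring the tests above.
dashboard = {
    "fake_cluster": [
        {"service": "fake_service", "instance": "foo", "shard_url": "hi"},
        {"service": "fake_service", "instance": "bar", "shard_url": "hi"},
    ],
}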
Example #6
0
def status_marathon_job_verbose(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    cluster: str,
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    dashboards: Dict[marathon_tools.MarathonClient, str],
) -> Tuple[List[MarathonTask], str]:
    """Returns detailed information about a marathon apps for a service
    and instance. Does not make assumptions about what the *exact*
    appid is, but instead does a fuzzy match on any marathon apps
    that match the given service.instance"""
    all_tasks: List[MarathonTask] = []
    all_output: List[str] = []
    # For verbose mode, we want to see *any* matching app, since it may
    # not be the one that we think should be deployed. For example,
    # during a bounce we want to see both the old and new ones.
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=clients.get_all_clients_for_service(job_config),
        embed_tasks=True,
    )

    autoscaling_info = get_autoscaling_info(clients, job_config)
    if autoscaling_info:
        all_output.append("  Autoscaling Info:")
        headers = [
            field.replace("_", " ").capitalize()
            for field in ServiceAutoscalingInfo._fields
        ]
        table = [headers, autoscaling_info]
        all_output.append('\n'.join(
            ["    %s" % line for line in format_table(table)]))

    for app, client in marathon_tools.get_matching_apps_with_clients(
            service, instance, marathon_apps_with_clients):
        tasks, output = get_verbose_status_of_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
        )
        all_tasks.extend(tasks)
        all_output.append(output)
    return all_tasks, "\n".join(all_output)
Example #7
0
def test_get_service_instances_that_need_bouncing():
    with mock.patch(
            'paasta_tools.list_marathon_service_instances.get_desired_marathon_configs',
            autospec=True,
    ) as mock_get_desired_marathon_configs, mock.patch(
            'paasta_tools.list_marathon_service_instances.get_num_at_risk_tasks',
            autospec=True,
    ) as mock_get_num_at_risk_tasks, mock.patch(
            'paasta_tools.list_marathon_service_instances.get_draining_hosts',
            autospec=True,
    ):
        mock_get_desired_marathon_configs.return_value = (
            {
                'fake--service.fake--instance.sha.config': {
                    'instances': 5
                },
                'fake--service2.fake--instance.sha.config': {
                    'instances': 5
                },
            },
            {
                'fake--service.fake--instance.sha.config':
                mock.Mock(get_marathon_shard=mock.Mock(return_value=None)),
                'fake--service2.fake--instance.sha.config':
                mock.Mock(get_marathon_shard=mock.Mock(return_value=None)),
            },
        )

        fake_apps = [
            mock.MagicMock(instances=5,
                           id='/fake--service.fake--instance.sha.config2'),
            mock.MagicMock(instances=5,
                           id='/fake--service2.fake--instance.sha.config'),
        ]
        mock_client = mock.MagicMock(list_apps=mock.MagicMock(
            return_value=fake_apps))
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])

        mock_get_num_at_risk_tasks.return_value = 0
        assert set(
            list_marathon_service_instances.
            get_service_instances_that_need_bouncing(
                marathon_clients=fake_clients,
                soa_dir='/fake/soa/dir',
            )) == {'fake_service.fake_instance'}
Example #8
0
def test_create_marathon_dashboard(mock_get_services_for_cluster, mock_pscl,
                                   mock_load_system_paasta_config):
    soa_dir = '/fake/soa/dir'
    cluster = 'fake_cluster'
    mock_load_system_paasta_config.return_value = SystemPaastaConfig(
        {
            'dashboard_links': {},
        },
        'fake_directory',
    )
    mock_get_services_for_cluster.return_value = [
        ('fake_service', 'foo'),
        ('fake_service', 'bar'),
    ]
    mock_pscl.return_value.instance_configs.return_value = [
        MarathonServiceConfig('fake_service', 'fake_cluster', 'foo', {}, {},
                              soa_dir),
        MarathonServiceConfig('fake_service', 'fake_cluster', 'bar', {}, {},
                              soa_dir),
    ]

    mock_client = mock.Mock(servers=['hi'])
    mock_clients = MarathonClients(current=[mock_client],
                                   previous=[mock_client])

    expected_output = {
        'fake_cluster': [
            {
                'service': 'fake_service',
                'instance': 'foo',
                'shard_url': 'hi',
            },
            {
                'service': 'fake_service',
                'instance': 'bar',
                'shard_url': 'hi',
            },
        ],
    }
    assert marathon_dashboard.create_marathon_dashboard(
        cluster=cluster,
        soa_dir=soa_dir,
        marathon_clients=mock_clients,
    ) == expected_output
Example #9
0
def deploy_marathon_service(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    marathon_apps_with_clients: Optional[Collection[Tuple[MarathonApp,
                                                          MarathonClient]]],
) -> Tuple[int, Optional[float]]:
    """deploy the service instance given and proccess return code
    if there was an error we send a sensu alert.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param clients: A MarathonClients object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_apps: A list of all marathon app objects
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd
        bounce_in_seconds instructs how long until the deployd should try another bounce
        None means that it is in a steady state and doesn't need to bounce again
    """
    short_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(short_id):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                log.debug(
                    "No deployments found for %s.%s in cluster %s. Skipping." %
                    (service, instance,
                     load_system_paasta_config().get_cluster()))
                return 0, None
            except NoConfigurationForServiceError:
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                            (service, instance, load_system_paasta_config().get_cluster())
                log.error(error_msg)
                return 1, None

            if marathon_apps_with_clients is None:
                marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
                    clients=clients.get_all_clients_for_service(
                        job_config=service_instance_config),
                    embed_tasks=True,
                )

            try:
                with a_sync.idle_event_loop():
                    status, output, bounce_again_in_seconds = setup_service(
                        service=service,
                        instance=instance,
                        clients=clients,
                        job_config=service_instance_config,
                        marathon_apps_with_clients=marathon_apps_with_clients,
                        soa_dir=soa_dir,
                    )
                sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0, bounce_again_in_seconds
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig,
                    NoSlavesAvailableError):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir,
                           pysensu_yelp.Status.CRITICAL, error_str)
                return 1, None
    except bounce_lib.LockHeldException:
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0, None
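A minimal sketch of how a caller might interpret the (status, bounce_in_seconds) contract described in the docstring above; the retry loop is illustrative and not the real paasta-deployd worker, which re-enqueues work instead of sleeping.

import time

def bounce_until_steady(service, instance, clients, soa_dir):
    # Illustrative loop: deploy_marathon_service is the function defined above.
    # A None bounce interval signals a steady state; a number means "try again
    # after that many seconds".
    while True:
        status, bounce_in_seconds = deploy_marathon_service(
            service=service,
            instance=instance,
            clients=clients,
            soa_dir=soa_dir,
            marathon_apps_with_clients=None,  # let the function fetch the apps itself
        )
        if status != 0 or bounce_in_seconds is None:
            return status
        time.sleep(bounce_in_seconds)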
Example #10
0
def deploy_service(
    service: str,
    instance: str,
    marathon_jobid: str,
    config: marathon_tools.FormattedMarathonAppDict,
    clients: marathon_tools.MarathonClients,
    marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]],
    bounce_method: str,
    drain_method_name: str,
    drain_method_params: Dict[str, Any],
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Tuple[int, str, Optional[float]]:
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param clients: A MarathonClients object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output, bounce_in_seconds) to be used with send_sensu_event"""
    def log_deploy_error(errormsg: str, level: str = 'event') -> None:
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service=service,
        instance=instance,
        marathon_apps_with_clients=marathon_apps_with_clients,
    )

    new_client = clients.get_current_client_for_service(job_config)

    new_apps_with_clients_list: List[Tuple[MarathonApp, MarathonClient]] = []
    other_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]] = []

    for a, c in existing_apps_with_clients:
        if a.id == '/%s' % config['id'] and c == new_client:
            new_apps_with_clients_list.append((a, c))
        else:
            other_apps_with_clients.append((a, c))

    serviceinstance = "%s.%s" % (service, instance)

    if new_apps_with_clients_list:
        new_app, new_client = new_apps_with_clients_list[0]
        if len(new_apps_with_clients_list) != 1:
            raise ValueError(
                "Only expected one app per ID per shard; found %d" %
                len(new_apps_with_clients_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app,
            service,
            nerve_ns,
            system_paasta_config,
            **bounce_health_params,
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg, None)

    try:
        draining_hosts = get_draining_hosts()
    except ReadTimeout as e:
        errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e
        return (1, errormsg, 60)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps_with_clients=other_apps_with_clients,
        drain_method=drain_method,
        service=service,
        nerve_ns=nerve_ns,
        bounce_health_params=bounce_health_params,
        system_paasta_config=system_paasta_config,
        log_deploy_error=log_deploy_error,
        draining_hosts=draining_hosts,
    )

    # The first thing we need to do is take up the "slack" of old apps, to stop
    # them from launching new things that we are going to have to end up draining
    # and killing anyway.
    for a, c in other_apps_with_clients:
        marathon_tools.take_up_slack(app=a, client=c)

    num_at_risk_tasks = 0
    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(
            new_app, draining_hosts=draining_hosts)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s up from %d to %d instances." %
                     (new_app.id, new_app.instances,
                      config['instances'] + num_at_risk_tasks))
            new_client.scale_app(app_id=new_app.id,
                                 instances=config['instances'] +
                                 num_at_risk_tasks,
                                 force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) -
                config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                app=new_app,
                drain_method=drain_method,
                service=service,
                nerve_ns=nerve_ns,
                bounce_health_params=bounce_health_params,
                system_paasta_config=system_paasta_config,
                log_deploy_error=log_deploy_error,
                draining_hosts=draining_hosts,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks),
                                         num_tasks_to_scale)
            old_app_draining_tasks[(new_app.id, new_client)] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks),
                                        num_tasks_to_scale)
            old_app_live_unhappy_tasks[(new_app.id, new_client)] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy], )
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks),
                                        num_tasks_to_scale)
            old_app_at_risk_tasks[(new_app.id, new_client)] = set(
                scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks),
                                      num_tasks_to_scale)
            old_app_live_happy_tasks[(new_app.id, new_client)] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

            # slack represents the extra instances that are configured in
            # marathon but don't have a launched task yet. When scaling down we want to
            # reduce this slack so marathon doesn't get a chance to launch a new task in
            # that space that we will then have to drain and kill again.
            marathon_tools.take_up_slack(client=new_client, app=new_app)

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get((new_app.id, new_client),
                                                      []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg, None)

        bounce_again_in_seconds = do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            clients=clients,
            soa_dir=soa_dir,
            job_config=job_config,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (
            service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time",
                None)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (
            service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise
    if num_at_risk_tasks:
        bounce_again_in_seconds = 60
    elif new_app_running:
        if new_app.instances > config['instances']:
            bounce_again_in_seconds = 60
    return (0, 'Service deployed.', bounce_again_in_seconds)
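A quick worked example of the scale-up arithmetic in deploy_service above: the target instance count is the configured count plus the number of tasks on at-risk (draining) hosts, so replacements can start before those tasks are killed. The numbers below are illustrative.

# Illustrative numbers only.
desired_instances = 5   # config['instances']
num_at_risk_tasks = 2   # tasks currently running on hosts marked for draining
current_instances = 5   # new_app.instances as reported by Marathon

scale_target = desired_instances + num_at_risk_tasks
if current_instances < scale_target:
    print("Scaling up from %d to %d instances." % (current_instances, scale_target))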
Example #11
0
def do_bounce(
    bounce_func: bounce_lib.BounceMethod,
    drain_method: drain_lib.DrainMethod,
    config: marathon_tools.FormattedMarathonAppDict,
    new_app_running: bool,
    happy_new_tasks: List[Tuple[MarathonTask, MarathonClient]],
    old_app_live_happy_tasks: Dict[Tuple[str, MarathonClient],
                                   Set[MarathonTask]],
    old_app_live_unhappy_tasks: Dict[Tuple[str, MarathonClient],
                                     Set[MarathonTask]],
    old_app_draining_tasks: Dict[Tuple[str, MarathonClient],
                                 Set[MarathonTask]],
    old_app_at_risk_tasks: Dict[Tuple[str, MarathonClient], Set[MarathonTask]],
    service: str,
    bounce_method: str,
    serviceinstance: str,
    cluster: str,
    instance: str,
    marathon_jobid: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    job_config: marathon_tools.MarathonServiceConfig,
    bounce_margin_factor: float = 1.0,
) -> Optional[float]:
    def log_bounce_action(line: str, level: str = 'debug') -> None:
        return _log(
            service=service,
            line=line,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    # log if we're not in a steady state.
    if any([
        (not new_app_running),
            old_app_live_happy_tasks.keys(),
    ]):
        log_bounce_action(
            line=' '.join([
                '%s bounce in progress on %s.' %
                (bounce_method, serviceinstance),
                'New marathon app %s %s.' %
                (marathon_jobid,
                 ('exists' if new_app_running else 'not created yet')),
                '%d new tasks to bring up.' %
                (config['instances'] - len(happy_new_tasks)),
                '%d old tasks receiving traffic and happy.' %
                len(bounce_lib.flatten_tasks(old_app_live_happy_tasks)),
                '%d old tasks unhappy.' %
                len(bounce_lib.flatten_tasks(old_app_live_unhappy_tasks)),
                '%d old tasks draining.' %
                len(bounce_lib.flatten_tasks(old_app_draining_tasks)),
                '%d old tasks at risk.' %
                len(bounce_lib.flatten_tasks(old_app_at_risk_tasks)),
                '%d old apps.' % len(old_app_live_happy_tasks.keys()),
            ]),
            level='event',
        )
    else:
        log.debug("Nothing to do, bounce is in a steady state")

    new_client = clients.get_current_client_for_service(job_config)

    old_non_draining_tasks = (
        list(old_app_tasks_to_task_client_pairs(old_app_live_happy_tasks)) +
        list(old_app_tasks_to_task_client_pairs(old_app_live_unhappy_tasks)) +
        list(old_app_tasks_to_task_client_pairs(old_app_at_risk_tasks))
    )

    actions = bounce_func(
        new_config=config,
        new_app_running=new_app_running,
        happy_new_tasks=happy_new_tasks,
        old_non_draining_tasks=old_non_draining_tasks,
        margin_factor=bounce_margin_factor,
    )

    if actions['create_app'] and not new_app_running:
        log_bounce_action(line='%s bounce creating new app with app_id %s' %
                          (bounce_method, marathon_jobid), )
        with requests_cache.disabled():
            try:
                bounce_lib.create_marathon_app(
                    app_id=marathon_jobid,
                    config=config,
                    client=new_client,
                )
            except MarathonHttpError as e:
                if e.status_code == 409:
                    log.warning(
                        "Failed to create, app %s already exists. This means another bounce beat us to it."
                        " Skipping the rest of the bounce for this run" %
                        marathon_jobid, )
                    return 60
                raise

    tasks_to_kill = drain_tasks_and_find_tasks_to_kill(
        tasks_to_drain=actions['tasks_to_drain'],
        already_draining_tasks=old_app_tasks_to_task_client_pairs(
            old_app_draining_tasks),
        drain_method=drain_method,
        log_bounce_action=log_bounce_action,
        bounce_method=bounce_method,
        at_risk_tasks=old_app_tasks_to_task_client_pairs(
            old_app_at_risk_tasks),
    )

    tasks_to_kill_by_client: Dict[MarathonClient,
                                  List[MarathonTask]] = defaultdict(list)
    for task, client in tasks_to_kill:
        tasks_to_kill_by_client[client].append(task)

    for client, tasks in tasks_to_kill_by_client.items():
        kill_given_tasks(client=client,
                         task_ids=[task.id for task in tasks],
                         scale=True)

    for task in bounce_lib.flatten_tasks(old_app_at_risk_tasks):
        if task in tasks_to_kill:
            hostname = task.host
            try:
                reserve_all_resources([hostname])
            except HTTPError:
                log.warning("Failed to reserve resources on %s" % hostname)

    apps_to_kill: List[Tuple[str, MarathonClient]] = []
    for app, client in old_app_live_happy_tasks.keys():
        if app != '/%s' % marathon_jobid or client != new_client:
            live_happy_tasks = old_app_live_happy_tasks[(app, client)]
            live_unhappy_tasks = old_app_live_unhappy_tasks[(app, client)]
            draining_tasks = old_app_draining_tasks[(app, client)]
            at_risk_tasks = old_app_at_risk_tasks[(app, client)]

            remaining_tasks = (live_happy_tasks | live_unhappy_tasks
                               | draining_tasks | at_risk_tasks)
            for task, _ in tasks_to_kill:
                remaining_tasks.discard(task)

            if not remaining_tasks:
                apps_to_kill.append((app, client))

    if apps_to_kill:
        log_bounce_action(
            line='%s bounce removing old unused apps with app_ids: %s' % (
                bounce_method,
                ', '.join([app for app, client in apps_to_kill]),
            ), )
        with requests_cache.disabled():
            for app_id, client in apps_to_kill:
                bounce_lib.kill_old_ids([app_id], client)

    all_old_tasks: Set[MarathonTask] = set()
    all_old_tasks = set.union(all_old_tasks,
                              *old_app_live_happy_tasks.values())
    all_old_tasks = set.union(all_old_tasks,
                              *old_app_live_unhappy_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_draining_tasks.values())
    all_old_tasks = set.union(all_old_tasks, *old_app_at_risk_tasks.values())

    if all_old_tasks or (not new_app_running):
        # Still have more work to do, try again in 60 seconds
        return 60
    else:
        # log if we appear to be finished
        if all([
            (apps_to_kill or tasks_to_kill),
                apps_to_kill == list(old_app_live_happy_tasks),
                tasks_to_kill == all_old_tasks,
        ]):
            log_bounce_action(
                line='%s bounce on %s finishing. Now running %s' % (
                    bounce_method,
                    serviceinstance,
                    marathon_jobid,
                ),
                level='event',
            )
        return None
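do_bounce treats the bounce method as a pure decision function: it receives the new config, the happy new tasks, and the old non-draining tasks, and returns an actions dict with 'create_app' and 'tasks_to_drain' keys. The toy stand-in below only makes that shape concrete; real methods are looked up via bounce_lib.get_bounce_method_func and may behave quite differently.

# Toy bounce method matching the call/return contract used by do_bounce above.
def toy_bounce_method(new_config, new_app_running, happy_new_tasks,
                      old_non_draining_tasks, margin_factor=1.0):
    actions = {
        "create_app": not new_app_running,   # create the new app if it isn't up yet
        "tasks_to_drain": set(),             # (task, client) pairs to start draining
    }
    # Only start draining old tasks once enough new tasks are happy.
    if len(happy_new_tasks) >= int(new_config["instances"] * margin_factor):
        actions["tasks_to_drain"] = set(old_non_draining_tasks)
    return actions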
Example #12
0
def create_marathon_dashboard(
        cluster: str,
        soa_dir: str = DEFAULT_SOA_DIR,
        marathon_clients: Optional[MarathonClients] = None,
        system_paasta_config: Optional[SystemPaastaConfig] = None,
) -> Marathon_Dashboard:
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(marathon_servers=marathon_servers, cached=False)

    dashboard_links: Dict = system_paasta_config.get_dashboard_links()
    marathon_links = dashboard_links.get(cluster, {}).get('Marathon RO')

    # e.g. 'http://10.64.97.75:5052': 'http://marathon-norcal-prod.yelpcorp.com'
    shard_url_to_marathon_link_dict: Dict[str, str] = {}
    if isinstance(marathon_links, list):
        # Sanity check and log error if necessary
        if len(marathon_links) != len(marathon_servers.current):
            log.error('len(marathon_links) != len(marathon_servers.current). This may be a cause for concern')
        for shard_number, shard in enumerate(marathon_servers.current):
            shard_url_to_marathon_link_dict[shard.url[0]] = marathon_links[shard_number]
    elif isinstance(marathon_links, str):
        # In this case, the shard url will be the same for every service instance
        static_shard_url = marathon_links.split(' ')[0]
        return {cluster: [{'service': si[0], 'instance': si[1], 'shard_url': static_shard_url} for si in instances]}

    # Set up with service as key since we will instantiate one PSCL per service
    service_instances_dict: Dict[str, Set[str]] = defaultdict(set)
    for si in instances:
        service, instance = si[0], si[1]
        service_instances_dict[service].add(instance)

    for service, instance_set in service_instances_dict.items():
        pscl = PaastaServiceConfigLoader(
            service=service,
            soa_dir=soa_dir,
            load_deployments=False,
        )
        for marathon_service_config in pscl.instance_configs(cluster, MarathonServiceConfig):
            if marathon_service_config.get_instance() in instance_set:
                client: MarathonClient = \
                    marathon_clients.get_current_client_for_service(job_config=marathon_service_config)
                ip_url: str = client.servers[0]
                # Convert to a marathon link if possible, else default to the original IP address
                shard_url: str = shard_url_to_marathon_link_dict.get(ip_url, ip_url)
                service_info: Marathon_Dashboard_Item = {
                    'service': service,
                    'instance': marathon_service_config.get_instance(),
                    'shard_url': shard_url,
                }
                dashboard[cluster].append(service_info)
    return dashboard
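The shard_url_to_marathon_link_dict above is just a lookup from a raw shard address to a human-facing dashboard link, with the raw address as the fallback; a tiny illustration using the made-up values from the inline comment.

# Values mirror the "e.g." comment inside create_marathon_dashboard above.
shard_url_to_marathon_link_dict = {
    "http://10.64.97.75:5052": "http://marathon-norcal-prod.yelpcorp.com",
}
ip_url = "http://10.64.97.75:5052"
shard_url = shard_url_to_marathon_link_dict.get(ip_url, ip_url)
assert shard_url == "http://marathon-norcal-prod.yelpcorp.com"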
Example #13
0
def test_get_service_instances_needing_update():
    with mock.patch(
            "paasta_tools.deployd.common.get_all_marathon_apps", autospec=True
    ) as mock_get_marathon_apps, mock.patch(
            "paasta_tools.deployd.common.load_marathon_service_config_no_cache",
            autospec=True,
    ) as mock_load_marathon_service_config:
        mock_marathon_apps = [
            mock.Mock(id="/universe.c137.c1.g1", instances=2),
            mock.Mock(id="/universe.c138.c1.g1", instances=2),
        ]
        mock_get_marathon_apps.return_value = mock_marathon_apps
        mock_service_instances = [("universe", "c137"), ("universe", "c138")]
        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c137.c1.g1",
                "instances": 2
            })),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   "westeros-prod")
        assert mock_get_marathon_apps.called
        calls = [
            mock.call(
                service="universe",
                instance="c137",
                cluster="westeros-prod",
                soa_dir=DEFAULT_SOA_DIR,
            ),
            mock.call(
                service="universe",
                instance="c138",
                cluster="westeros-prod",
                soa_dir=DEFAULT_SOA_DIR,
            ),
        ]
        mock_load_marathon_service_config.assert_has_calls(calls)
        assert ret == [("universe", "c138", mock.ANY)]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c137.c1.g1",
                "instances": 3
            })),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   "westeros-prod")
        assert ret == [("universe", "c137", mock.ANY),
                       ("universe", "c138", mock.ANY)]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=NoDockerImageError)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=NoSlavesAvailableError)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=InvalidJobNameError)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=NoDeploymentsAvailable)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=Exception)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                "id": "universe.c138.c2.g2",
                "instances": 2
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   "westeros-prod")
        assert ret == [("universe", "c138", mock.ANY)]
Example #14
0
def perform_command(
    command: str,
    service: str,
    instance: str,
    cluster: str,
    verbose: int,
    soa_dir: str,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    app_id: Optional[str] = None,
) -> int:
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param clients: A MarathonClients object
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()["id"]
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            paasta_print(
                "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?"
                % job_id
            )
            return 1

    normal_instance_count = job_config.get_instances()

    current_client = clients.get_current_client_for_service(job_config)

    if command == "restart":
        restart_marathon_job(service, instance, app_id, current_client, cluster)
    elif command == "status":
        paasta_print(
            status_desired_state(service, instance, current_client, job_config)
        )
        dashboards = get_marathon_dashboard_links(clients, system_config)
        tasks, out = status_marathon_job(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            normal_instance_count=normal_instance_count,
            clients=clients,
            job_config=job_config,
            desired_app_id=app_id,
            verbose=verbose,
        )
        paasta_print(out)
        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service, namespace=job_config.get_nerve_namespace(), soa_dir=soa_dir
        )

        paasta_print(
            status_mesos_tasks(service, instance, normal_instance_count, verbose)
        )

        proxy_port = service_namespace_config.get("proxy_port")
        if proxy_port is not None:
            normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
                service, instance, cluster
            )
            paasta_print(
                status_smartstack_backends(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    job_config=job_config,
                    service_namespace_config=service_namespace_config,
                    tasks=tasks,
                    expected_count=normal_smartstack_count,
                    soa_dir=soa_dir,
                    verbose=verbose > 0,
                    synapse_port=system_config.get_synapse_port(),
                    synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
                    system_deploy_blacklist=system_config.get_deploy_blacklist(),
                    system_deploy_whitelist=system_config.get_deploy_whitelist(),
                )
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
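perform_command returns a unix-style code, so a thin CLI entry point can hand it straight to sys.exit. A minimal hedged sketch; the argument plumbing (loading clients and job_config) is assumed to happen elsewhere.

import sys

def run_status(service, instance, cluster, soa_dir, clients, job_config):
    # perform_command is defined above; 0 means success, non-zero means failure,
    # which maps directly onto the process exit status.
    return_code = perform_command(
        command="status",
        service=service,
        instance=instance,
        cluster=cluster,
        verbose=0,
        soa_dir=soa_dir,
        clients=clients,
        job_config=job_config,
    )
    sys.exit(return_code)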
Example #15
0
def status_marathon_job(
    service: str,
    instance: str,
    cluster: str,
    soa_dir: str,
    dashboards: Dict[marathon_tools.MarathonClient, str],
    normal_instance_count: int,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    desired_app_id: str,
    verbose: int,
) -> Tuple[List[MarathonTask], str]:
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=clients.get_all_clients_for_service(job_config),
        embed_tasks=True,
        service_name=service,
    )
    all_tasks = []
    all_output = [
        ""
    ]  # One entry that will be replaced with status_marathon_job_human output later.

    running_instances = 0

    if verbose > 0:
        autoscaling_info = get_autoscaling_info(marathon_apps_with_clients, job_config)
        if autoscaling_info:
            all_output.append("  Autoscaling Info:")
            headers = [
                field.replace("_", " ").capitalize()
                for field in ServiceAutoscalingInfo._fields
            ]
            table = [headers, humanize_autoscaling_info(autoscaling_info)]
            all_output.append(
                "\n".join(["    %s" % line for line in format_table(table)])
            )

    deploy_status_for_desired_app = "Waiting for bounce"
    matching_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service, instance, marathon_apps_with_clients
    )
    for app, client in matching_apps_with_clients:
        all_tasks.extend(app.tasks)
        (
            deploy_status_for_current_app,
            running_instances_for_current_app,
            out,
        ) = status_marathon_app(
            marathon_client=client,
            app=app,
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            verbose=verbose,
        )
        if app.id.lstrip("/") == desired_app_id.lstrip("/"):
            deploy_status_for_desired_app = marathon_tools.MarathonDeployStatus.tostring(
                deploy_status_for_current_app
            )

        running_instances += running_instances_for_current_app
        all_output.append(out)

    all_output[0] = status_marathon_job_human(
        service=service,
        instance=instance,
        deploy_status=deploy_status_for_desired_app,
        desired_app_id=desired_app_id,
        app_count=len(matching_apps_with_clients),
        running_instances=running_instances,
        normal_instance_count=normal_instance_count,
    )

    return all_tasks, "\n".join(all_output)
Example #16
0
def test_get_service_instances_needing_update():
    with mock.patch(
            'paasta_tools.deployd.common.get_all_marathon_apps',
            autospec=True,
    ) as mock_get_marathon_apps, mock.patch(
            'paasta_tools.deployd.common.load_marathon_service_config_no_cache',
            autospec=True,
    ) as mock_load_marathon_service_config:
        mock_marathon_apps = [
            mock.Mock(id='/universe.c137.c1.g1', instances=2),
            mock.Mock(id='/universe.c138.c1.g1', instances=2),
        ]
        mock_get_marathon_apps.return_value = mock_marathon_apps
        mock_service_instances = [('universe', 'c137'), ('universe', 'c138')]
        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                'id': 'universe.c137.c1.g1',
                'instances': 2,
            })),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                'id': 'universe.c138.c2.g2',
                'instances': 2,
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   'westeros-prod')
        assert mock_get_marathon_apps.called
        calls = [
            mock.call(
                service='universe',
                instance='c137',
                cluster='westeros-prod',
                soa_dir=DEFAULT_SOA_DIR,
            ),
            mock.call(
                service='universe',
                instance='c138',
                cluster='westeros-prod',
                soa_dir=DEFAULT_SOA_DIR,
            ),
        ]
        mock_load_marathon_service_config.assert_has_calls(calls)
        assert ret == [('universe', 'c138')]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                'id': 'universe.c137.c1.g1',
                'instances': 3,
            })),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                'id': 'universe.c138.c2.g2',
                'instances': 2,
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   'westeros-prod')
        assert ret == [('universe', 'c137'), ('universe', 'c138')]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=NoDockerImageError)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                'id': 'universe.c138.c2.g2',
                'instances': 2,
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   'westeros-prod')
        assert ret == [('universe', 'c138')]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=InvalidJobNameError)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                'id': 'universe.c138.c2.g2',
                'instances': 2,
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   'westeros-prod')
        assert ret == [('universe', 'c138')]

        mock_configs = [
            mock.Mock(format_marathon_app_dict=mock.Mock(
                side_effect=NoDeploymentsAvailable)),
            mock.Mock(format_marathon_app_dict=mock.Mock(return_value={
                'id': 'universe.c138.c2.g2',
                'instances': 2,
            })),
        ]
        mock_load_marathon_service_config.side_effect = mock_configs
        mock_client = mock.Mock(servers=["foo"])
        fake_clients = MarathonClients(current=[mock_client],
                                       previous=[mock_client])
        ret = get_service_instances_needing_update(fake_clients,
                                                   mock_service_instances,
                                                   'westeros-prod')
        assert ret == [('universe', 'c138')]