def test_bounce_lock_zookeeper(self):
    """bounce_lock_zookeeper connects to ZK, takes the named lock, then cleans up.

    KazooClient and load_system_paasta_config are patched inside
    paasta_tools.bounce_lib so that no real ZooKeeper connection is made.
    """
    name = 'watermelon'
    zk_hosts = 'awjti42ior'
    lock = mock.Mock()
    zk_client = mock.MagicMock(Lock=mock.Mock(return_value=lock))
    system_config = mock.Mock(get_zk_hosts=lambda: zk_hosts)

    kazoo_patcher = mock.patch(
        'paasta_tools.bounce_lib.KazooClient',
        return_value=zk_client,
        autospec=True,
    )
    config_patcher = mock.patch(
        'paasta_tools.bounce_lib.load_system_paasta_config',
        return_value=system_config,
        autospec=True,
    )

    with kazoo_patcher as kazoo_cls, config_patcher as load_config:
        with bounce_lib.bounce_lock_zookeeper(name):
            pass

        # The config is loaded once and its hosts are handed to the client.
        load_config.assert_called_once_with()
        kazoo_cls.assert_called_once_with(
            hosts=zk_hosts,
            timeout=bounce_lib.ZK_LOCK_CONNECT_TIMEOUT_S,
        )

        # Full lifecycle: start -> Lock -> acquire -> release -> stop.
        zk_client.start.assert_called_once_with()
        zk_client.Lock.assert_called_once_with(f'{bounce_lib.ZK_LOCK_PATH}/{name}')
        lock.acquire.assert_called_once_with(timeout=1)
        lock.release.assert_called_once_with()
        zk_client.stop.assert_called_once_with()
def test_bounce_lock_zookeeper(self):
    """bounce_lock_zookeeper connects to ZK, takes the named lock, then cleans up.

    KazooClient and load_system_paasta_config are patched inside
    paasta_tools.bounce_lib so that no real ZooKeeper connection is made.
    """
    lock_name = 'watermelon'
    fake_lock = mock.Mock()
    fake_zk = mock.MagicMock(Lock=mock.Mock(return_value=fake_lock))
    fake_zk_hosts = 'awjti42ior'
    # A single multi-manager ``with`` replaces contextlib.nested, which is
    # deprecated in Python 2.7 and does not exist in Python 3.
    with mock.patch(
        'paasta_tools.bounce_lib.KazooClient',
        return_value=fake_zk,
        autospec=True,
    ) as client_patch, mock.patch(
        'paasta_tools.bounce_lib.load_system_paasta_config',
        return_value=mock.Mock(get_zk_hosts=lambda: fake_zk_hosts),
        autospec=True,
    ) as hosts_patch:
        with bounce_lib.bounce_lock_zookeeper(lock_name):
            pass
        hosts_patch.assert_called_once_with()
        client_patch.assert_called_once_with(
            hosts=fake_zk_hosts,
            timeout=bounce_lib.ZK_LOCK_CONNECT_TIMEOUT_S,
        )
        fake_zk.start.assert_called_once_with()
        fake_zk.Lock.assert_called_once_with('%s/%s' % (bounce_lib.ZK_LOCK_PATH, lock_name))
        fake_lock.acquire.assert_called_once_with(timeout=1)
        fake_lock.release.assert_called_once_with()
        fake_zk.stop.assert_called_once_with()
def delete_app(app_id, client):
    """Delete a marathon app under its bounce lock and log that it happened.

    :param app_id: Full marathon app id; parsed with
        marathon_tools.deformat_job_id to recover the service and instance.
    :param client: Marathon client handed to bounce_lib.delete_marathon_app.
    :raises Exception: any failure other than IOError is written line by line
        to the 'deploy' log at 'debug' level and then re-raised.
    """
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    try:
        with bounce_lib.bounce_lock_zookeeper(marathon_tools.compose_job_id(service, instance)):
            bounce_lib.delete_marathon_app(app_id, client)
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=log_line,
            )
    except IOError:
        # NOTE(review): this treats IOError as "lock is held"; confirm that
        # bounce_lock_zookeeper actually signals a held lock with IOError.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        # Look the cluster up once instead of re-loading config per line.
        cluster = load_system_paasta_config().get_cluster()
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def delete_app(app_id, client):
    """Delete a marathon app under its bounce lock and log that it happened.

    :param app_id: Full marathon app id; parsed with
        marathon_tools.deformat_job_id to recover the service and instance.
    :param client: Marathon client handed to bounce_lib.delete_marathon_app.
    :raises Exception: any failure other than IOError is written line by line
        to the 'deploy' log at 'debug' level and then re-raised.
    """
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    try:
        with bounce_lib.bounce_lock_zookeeper(marathon_tools.compose_job_id(service, instance)):
            bounce_lib.delete_marathon_app(app_id, client)
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=load_system_paasta_config().get_cluster(),
                instance=instance,
                line=log_line,
            )
    except IOError:
        # NOTE(review): this treats IOError as "lock is held"; confirm that
        # bounce_lock_zookeeper actually signals a held lock with IOError.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        # Look the cluster up once instead of re-loading config per line.
        cluster = load_system_paasta_config().get_cluster()
        for logline in loglines:
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def deploy_marathon_service(service, instance, client, soa_dir, marathon_config, marathon_apps):
    """Deploy the given service instance and process the return code.

    The whole deploy runs under the instance's bounce lock. The outcome of
    setup_service is reported with send_event; if one of the handled
    exceptions is raised, a CRITICAL event carrying the traceback is sent.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param client: A MarathonClient object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_config: The service instance's configuration dict
        (not read by this function)
    :param marathon_apps: A list of all marathon app objects
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd.
        bounce_in_seconds instructs how long until the deployd should try another bounce;
        None means that it is in a steady state and doesn't need to bounce again
    """
    job_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(job_id):
            # Step 1: load the instance configuration for this cluster.
            try:
                cluster = load_system_paasta_config().get_cluster()
                job_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    cluster,
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                details = (service, instance, load_system_paasta_config().get_cluster())
                log.debug("No deployments found for %s.%s in cluster %s. Skipping." % details)
                return 0, None
            except NoConfigurationForServiceError:
                details = (service, instance, load_system_paasta_config().get_cluster())
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % details
                log.error(error_msg)
                return 1, None

            # Step 2: run the deploy and report its result to sensu.
            try:
                status, output, bounce_again_in_seconds = setup_service(
                    service,
                    instance,
                    client,
                    job_config,
                    marathon_apps,
                    soa_dir,
                )
                if status:
                    sensu_status = pysensu_yelp.Status.CRITICAL
                else:
                    sensu_status = pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0, bounce_again_in_seconds
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig, NoSlavesAvailableError):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
                return 1, None
    except bounce_lib.LockHeldException:
        log.error("Instance %s already being bounced. Exiting", job_id)
        return 0, None
def delete_app(app_id, client, soa_dir):
    """Delete a marathon app under its bounce lock, resolve its checks, and log it.

    After the app is deleted, an OK event is sent for each per-instance check
    name so they do not remain in their previous state for an instance that
    no longer exists.

    :param app_id: Full marathon app id; parsed with
        marathon_tools.deformat_job_id to recover the service and instance.
    :param client: Marathon client handed to bounce_lib.delete_marathon_app.
    :param soa_dir: Passed through to send_event.
    :raises Exception: any failure other than IOError is written line by line
        to the 'deploy' log at 'debug' level and then re-raised.
    """
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
            # Same OK event for every per-instance check, in the original order.
            for check_prefix in (
                'check_marathon_services_replication',
                'setup_marathon_job',
                'paasta_bounce_progress',
            ):
                send_event(
                    service=service,
                    check_name='%s.%s' % (check_prefix, short_app_id),
                    soa_dir=soa_dir,
                    status=pysensu_yelp.Status.OK,
                    overrides={},
                    output="This instance was removed and is no longer running",
                )
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=cluster,
                instance=instance,
                line=log_line,
            )
    except IOError:
        # NOTE(review): this treats IOError as "lock is held"; confirm that
        # bounce_lock_zookeeper actually signals a held lock with IOError.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            # Reuse the cluster resolved above: re-loading config per line is
            # wasteful and could raise, masking the exception being reported.
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def delete_app(app_id, client, soa_dir):
    """Delete a marathon app under its bounce lock, resolve its checks, and log it.

    After the app is deleted, an OK event is sent for each per-instance check
    name so they do not remain in their previous state for an instance that
    no longer exists.

    :param app_id: Full marathon app id; parsed with
        marathon_tools.deformat_job_id to recover the service and instance.
    :param client: Marathon client handed to bounce_lib.delete_marathon_app.
    :param soa_dir: Passed through to send_event.
    :raises Exception: any failure other than IOError is written line by line
        to the 'deploy' log at 'debug' level and then re-raised.
    """
    log.warning("%s appears to be old; attempting to delete", app_id)
    service, instance, _, __ = marathon_tools.deformat_job_id(app_id)
    cluster = load_system_paasta_config().get_cluster()
    try:
        short_app_id = marathon_tools.compose_job_id(service, instance)
        with bounce_lib.bounce_lock_zookeeper(short_app_id):
            bounce_lib.delete_marathon_app(app_id, client)
            # Same OK event for every per-instance check, in the original order.
            for check_prefix in (
                'check_marathon_services_replication',
                'setup_marathon_job',
                'paasta_bounce_progress',
            ):
                send_event(
                    service=service,
                    check_name='%s.%s' % (check_prefix, short_app_id),
                    soa_dir=soa_dir,
                    status=pysensu_yelp.Status.OK,
                    overrides={},
                    output="This instance was removed and is no longer running",
                )
            log_line = "Deleted stale marathon job that looks lost: %s" % app_id
            _log(
                service=service,
                component='deploy',
                level='event',
                cluster=cluster,
                instance=instance,
                line=log_line,
            )
    except IOError:
        # NOTE(review): this treats IOError as "lock is held"; confirm that
        # bounce_lock_zookeeper actually signals a held lock with IOError.
        log.debug("%s is being bounced, skipping", app_id)
    except Exception:
        loglines = ['Exception raised during cleanup of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            # Reuse the cluster resolved above: re-loading config per line is
            # wasteful and could raise, masking the exception being reported.
            _log(
                service=service,
                component='deploy',
                level='debug',
                cluster=cluster,
                instance=instance,
                line=logline,
            )
        raise
def deploy_marathon_service(service, instance, client, soa_dir, marathon_config, marathon_apps):
    """Deploy one service instance under its bounce lock and report to sensu.

    :param service: The service name
    :param instance: The instance name
    :param client: Passed through to setup_service
    :param soa_dir: Passed to the config loader, setup_service and send_event
    :param marathon_config: Not read by this function
    :param marathon_apps: Passed through to setup_service
    :returns: 0 when the deploy ran, when there were no deployments, or when
        the lock was already held; 1 when the configuration could not be read
        or setup_service raised one of the handled exceptions
    """
    job_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(job_id):
            # Step 1: load the instance configuration for this cluster.
            try:
                cluster = load_system_paasta_config().get_cluster()
                job_config = marathon_tools.load_marathon_service_config(
                    service,
                    instance,
                    cluster,
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                details = (service, instance, load_system_paasta_config().get_cluster())
                log.debug("No deployments found for %s.%s in cluster %s. Skipping." % details)
                return 0
            except NoConfigurationForServiceError:
                details = (service, instance, load_system_paasta_config().get_cluster())
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % details
                log.error(error_msg)
                return 1

            # Step 2: run the deploy and report its result to sensu.
            try:
                status, output = setup_service(
                    service,
                    instance,
                    client,
                    job_config,
                    marathon_apps,
                    soa_dir,
                )
                if status:
                    sensu_status = pysensu_yelp.Status.CRITICAL
                else:
                    sensu_status = pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
                return 1
    except bounce_lib.LockHeldException:
        log.error("Instance %s already being bounced. Exiting", job_id)
        return 0
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.

    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Passed to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Passed through to do_bounce.
    :returns: A tuple of (status, output) to be used with send_sensu_event.
        Status is 1 when the drain or bounce method is not recognized or the
        bounce lock is already held, 0 otherwise.
    :raises ValueError: if more than one existing app has the new app's id.
    """
    def log_deploy_error(errormsg, level='event'):
        # Forward the requested level so that lines sent with level='debug'
        # (the traceback below) are not emitted as 'event' lines.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = \
        get_old_happy_unhappy_draining_tasks(
            other_apps,
            drain_method,
            service,
            nerve_ns,
            bounce_health_params,
        )

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            client.scale_app(app_id=new_app.id, instances=config['instances'], force=True)
        elif new_app.instances > config['instances']:
            # Scaling down: hand the surplus tasks of the new app to the
            # "old app" buckets, taking draining tasks first, then unhappy,
            # then happy ones.
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            protected_draining_tasks.update(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise
    return (0, 'Service deployed.')
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.

    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Passed to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Passed through to do_bounce.
    :returns: A tuple of (status, output) to be used with send_sensu_event.
        Status is 1 when the drain or bounce method is not recognized or the
        bounce lock is already held, 0 otherwise.
    :raises ValueError: if more than one existing app has the new app's id.
    """
    def log_deploy_error(errormsg, level='event'):
        # Forward the requested level so that lines sent with level='debug'
        # (the traceback below) are not emitted as 'event' lines.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = \
        get_old_happy_unhappy_draining_tasks(
            other_apps,
            drain_method,
            service,
            nerve_ns,
            bounce_health_params,
        )

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            client.scale_app(app_id=new_app.id, instances=config['instances'], force=True)
        elif new_app.instances > config['instances']:
            # Scaling down: hand the surplus tasks of the new app to the
            # "old app" buckets, taking draining tasks first, then unhappy,
            # then happy ones.
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            protected_draining_tasks.update(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise
    return (0, 'Service deployed.')
def deploy_marathon_service(
    service: str,
    instance: str,
    clients: marathon_tools.MarathonClients,
    soa_dir: str,
    marathon_apps_with_clients: Optional[Sequence[Tuple[MarathonApp, MarathonClient]]],
    system_paasta_config: Optional[SystemPaastaConfig] = None,
) -> Tuple[int, Optional[float]]:
    """deploy the service instance given and process return code
    if there was an error we send a sensu alert.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param clients: A MarathonClients object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_apps_with_clients: A sequence of (app, client) tuples. When None,
        it is fetched with marathon_tools.get_marathon_apps_with_clients for this
        service instance.
    :param system_paasta_config: The system paasta config to use. When None, it is
        loaded with load_system_paasta_config().
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd
        bounce_in_seconds instructs how long until the deployd should try another bounce
        None means that it is in a steady state and doesn't need to bounce again
    """
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()

    short_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(short_id, system_paasta_config=system_paasta_config):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    system_paasta_config.get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                log.debug(
                    "No deployments found for %s.%s in cluster %s. Skipping."
                    % (service, instance, system_paasta_config.get_cluster())
                )
                return 0, None
            except NoConfigurationForServiceError:
                error_msg = (
                    "Could not read marathon configuration file for %s.%s in cluster %s"
                    % (service, instance, system_paasta_config.get_cluster())
                )
                log.error(error_msg)
                return 1, None

            # Only query marathon when the caller did not supply the apps.
            if marathon_apps_with_clients is None:
                marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
                    clients=clients.get_all_clients_for_service(job_config=service_instance_config),
                    service_name=service,
                    instance_name=instance,
                    embed_tasks=True,
                )

            try:
                with a_sync.idle_event_loop():
                    status, output, bounce_again_in_seconds = setup_service(
                        service=service,
                        instance=instance,
                        clients=clients,
                        job_config=service_instance_config,
                        marathon_apps_with_clients=marathon_apps_with_clients,
                        soa_dir=soa_dir,
                        system_paasta_config=system_paasta_config,
                    )
                sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
                send_event(
                    service,
                    instance,
                    soa_dir,
                    sensu_status,
                    output,
                    system_paasta_config,
                    service_instance_config,
                )
                return 0, bounce_again_in_seconds
            except (
                KeyError,
                TypeError,
                AttributeError,
                InvalidInstanceConfig,
                NoSlavesAvailableError,
            ):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(
                    service,
                    instance,
                    soa_dir,
                    pysensu_yelp.Status.CRITICAL,
                    error_str,
                    system_paasta_config,
                    service_instance_config,
                )
                return 1, None
    except bounce_lib.LockHeldException:
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0, None
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.

    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Passed to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Passed through to do_bounce.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output) to be used with send_sensu_event.
        Status is 1 when the drain or bounce method is not recognized or the
        bounce lock is already held, 0 otherwise.
    :raises ValueError: if more than one existing app has the new app's id.
    """
    def log_deploy_error(errormsg, level='event'):
        # Forward the requested level so that lines sent with level='debug'
        # (the traceback below) are not emitted as 'event' lines.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    short_id = marathon_tools.format_job_id(service, instance)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app,
            service,
            nerve_ns,
            system_paasta_config,
            **bounce_health_params
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
        system_paasta_config,
    )

    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.debug("Scaling %s from %d to %d instances." %
                      (new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks))
            client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
                system_paasta_config,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            # Surplus tasks are moved to the "old app" buckets in this order:
            # draining, unhappy, at-risk, then happy.
            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[new_app.id] = set(scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get(new_app.id, []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    old_app_at_risk_tasks=old_app_at_risk_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                    bounce_margin_factor=bounce_margin_factor,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise
    return (0, 'Service deployed.')
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    marathon_apps,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param marathon_apps: The apps handed to marathon_tools.get_matching_apps to
                          find the ones belonging to this service instance
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Passed through to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Passed through to do_bounce.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output) to be used with send_sensu_event.
              Status is 1 when the drain or bounce method is not recognized or
              when the bounce lock is already held, and 0 on success.
    :raises ValueError: if more than one existing app matches config['id'].
    """

    def log_deploy_error(errormsg, level='event'):
        # Forward the caller-supplied level; previously this was hard-coded to
        # 'event', so level='debug' tracebacks were logged as events.
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    short_id = marathon_tools.format_job_id(service, instance)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, marathon_apps)

    # Split the matching apps into the one we want running and everything else.
    new_app_id = '/%s' % config['id']
    new_app_list = [a for a in existing_apps if a.id == new_app_id]
    other_apps = [a for a in existing_apps if a.id != new_app_id]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(
            new_app,
            service,
            nerve_ns,
            system_paasta_config,
            **bounce_health_params
        )
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
        system_paasta_config,
    )

    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app)
        target_instances = config['instances'] + num_at_risk_tasks
        if new_app.instances < target_instances:
            log.info(
                "Scaling %s from %d to %d instances.",
                new_app.id, new_app.instances, target_instances,
            )
            client.scale_app(app_id=new_app.id, instances=target_instances, force=True)
        elif new_app.instances > config['instances']:
            # If we have more than the specified number of instances running,
            # we will want to drain some of them. The surplus is taken in this
            # order: already-draining, then unhappy, then at-risk, then happy.
            # The chosen tasks are filed under new_app.id in the old_app_*
            # dicts (presumably so do_bounce handles them like old-app tasks).
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
                system_paasta_config,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[new_app.id] = set(scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            # Only the happy tasks we are keeping still count as happy new tasks.
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get(new_app.id, []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    old_app_at_risk_tasks=old_app_at_risk_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                    bounce_margin_factor=bounce_margin_factor,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: Passed through to drain_lib.get_drain_method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: Passed through to do_bounce.
    :returns: A tuple of (status, output) to be used with send_sensu_event.
              Status is 1 when the drain or bounce method is not recognized or
              when the bounce lock is already held, and 0 on success.
    :raises ValueError: if more than one existing app matches config["id"].
    """

    def log_deploy_error(errormsg, level="event"):
        # Forward the caller-supplied level; previously this was hard-coded to
        # "event", so level="debug" tracebacks were logged as events.
        return _log(
            service=service,
            line=errormsg,
            component="deploy",
            level=level,
            cluster=cluster,
            instance=instance,
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)

    # Split the matching apps into the one we want running and everything else.
    new_app_id = "/%s" % config["id"]
    new_app_list = [a for a in existing_apps if a.id == new_app_id]
    other_apps = [a for a in existing_apps if a.id != new_app_id]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = "ERROR: drain_method not recognized: %s. Must be one of (%s)" % (
            drain_method_name,
            ", ".join(drain_lib.list_drain_methods()),
        )
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_tasks, old_app_draining_tasks = get_old_live_draining_tasks(other_apps, drain_method)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
    # `paasta mark-for-deployment`), then we should undrain them.
    if new_app_running:
        for task in new_app.tasks:
            drain_method.stop_draining(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = "ERROR: bounce_method not recognized: %s. Must be one of (%s)" % (
                bounce_method,
                ", ".join(bounce_lib.list_bounce_methods()),
            )
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_tasks=old_app_live_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        # Emit the traceback one line per log entry, at debug level.
        loglines = ["Exception raised during deploy of service %s:" % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level="debug")
        raise

    return (0, "Service deployed.")