def test_deploy_service_already_bouncing(self):
    """deploy_service returns (1, message) when another bounce already holds the lock."""
    fake_bounce = "areallygoodbouncestrategy"
    fake_drain_method = "noop"
    fake_name = "how_many_strings"
    fake_instance = "will_i_need_to_think_of"
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance, "gityourmom", "configyourdad")
    fake_config = {"id": fake_id, "instances": 2}
    # An existing app with one task, representing the previous deployment.
    old_app_id = "%s2" % fake_id
    old_task = mock.Mock(id="old_task_id", app_id=old_app_id)
    old_app = mock.Mock(id=old_app_id, tasks=[old_task])
    fake_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=[old_app]),
        kill_task=mock.Mock(spec=lambda app_id, id, scale=False: None)
    )
    fake_bounce_func = mock.create_autospec(
        bounce_lib.brutal_bounce,
        return_value={"create_app": True, "tasks_to_drain": [old_task]}
    )
    fake_short_id = marathon_tools.format_job_id(fake_name, fake_instance)
    with contextlib.nested(
        mock.patch("paasta_tools.bounce_lib.get_bounce_method_func", return_value=fake_bounce_func, autospec=True),
        # bounce_lock_zookeeper raising LockHeldException simulates a bounce
        # already in progress for this instance.
        mock.patch(
            "paasta_tools.bounce_lib.bounce_lock_zookeeper",
            side_effect=bounce_lib.LockHeldException,
            autospec=True
        ),
        mock.patch(
            "paasta_tools.bounce_lib.get_happy_tasks",
            autospec=True,
            side_effect=lambda x, _, __, **kwargs: x
        ),
        mock.patch("paasta_tools.setup_marathon_job._log", autospec=True),
        mock.patch("paasta_tools.setup_marathon_job.load_system_paasta_config", autospec=True),
    ) as (_, _, _, _, mock_load_system_paasta_config):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value="fake_cluster")
        result = setup_marathon_job.deploy_service(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=fake_id,
            config=fake_config,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method,
            drain_method_params={},
            nerve_ns=fake_instance,
            bounce_health_params={},
            soa_dir="fake_soa_dir",
        )
        # Non-zero status plus a human-readable explanation of the held lock.
        assert result == (1, "Instance %s is already being bounced." % fake_short_id)
def test_deploy_service_unknown_drain_method(self):
    """deploy_service returns (1, errormsg) for an unrecognized drain method name."""
    fake_bounce = "exists"
    fake_drain_method = "doesntexist"
    fake_name = "whoa"
    fake_instance = "the_earth_is_tiny"
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_apps = [mock.Mock(id=fake_id, tasks=[]), mock.Mock(id=("%s2" % fake_id), tasks=[])]
    fake_client = mock.MagicMock(list_apps=mock.Mock(return_value=fake_apps))
    fake_config = {"id": fake_id, "instances": 2}
    # The error message enumerates the registered drain methods patched below.
    errormsg = "ERROR: drain_method not recognized: doesntexist. Must be one of (exists1, exists2)"
    expected = (1, errormsg)
    with contextlib.nested(
        mock.patch("paasta_tools.setup_marathon_job._log", autospec=True),
        mock.patch("paasta_tools.setup_marathon_job.load_system_paasta_config", autospec=True),
        # Replace the drain-method registry with two known entries.
        mock.patch("paasta_tools.drain_lib._drain_methods", new={"exists1": mock.Mock(), "exists2": mock.Mock()}),
    ) as (mock_log, mock_load_system_paasta_config, mock_drain_methods):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value="fake_cluster")
        actual = setup_marathon_job.deploy_service(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=fake_id,
            config=fake_config,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method,
            drain_method_params={},
            nerve_ns=fake_instance,
            bounce_health_params={},
            soa_dir="fake_soa_dir",
        )
        # The failure should be logged exactly once.
        assert mock_log.call_count == 1
        assert expected == actual
def test_status_mesos_tasks_verbose():
    """status_mesos_tasks_verbose renders both running and non-running task sections
    and passes each raw task plus the id-shortener through the row formatters."""
    with contextlib.nested(
        mock.patch("paasta_tools.mesos_tools.get_running_tasks_from_active_frameworks", autospec=True),
        mock.patch("paasta_tools.mesos_tools.get_non_running_tasks_from_active_frameworks", autospec=True),
        mock.patch("paasta_tools.mesos_tools.format_running_mesos_task_row", autospec=True),
        mock.patch("paasta_tools.mesos_tools.format_non_running_mesos_task_row", autospec=True),
    ) as (
        get_running_mesos_tasks_patch,
        get_non_running_mesos_tasks_patch,
        format_running_mesos_task_row_patch,
        format_non_running_mesos_task_row_patch,
    ):
        get_running_mesos_tasks_patch.return_value = ["doing a lap"]
        get_non_running_mesos_tasks_patch.return_value = ["eating a burrito"]
        format_running_mesos_task_row_patch.return_value = ["id", "host", "mem", "cpu", "time"]
        format_non_running_mesos_task_row_patch.return_value = ["id", "host", "time", "state"]
        # BUG FIX: a stray wrapping made job_id a 1-tuple, passing a tuple where
        # status_mesos_tasks_verbose expects the job-id string itself.
        job_id = format_job_id("fake_service", "fake_instance")

        def get_short_task_id(_):
            return "short_task_id"

        actual = mesos_tools.status_mesos_tasks_verbose(job_id, get_short_task_id)
        assert "Running Tasks" in actual
        assert "Non-Running Tasks" in actual
        format_running_mesos_task_row_patch.assert_called_once_with("doing a lap", get_short_task_id)
        format_non_running_mesos_task_row_patch.assert_called_once_with("eating a burrito", get_short_task_id)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon service instance in this cluster.

    Takes a cluster-wide autoscaling lock, gathers marathon and mesos task
    state once up front, then runs the autoscaler per instance. A failure for
    one instance is logged and does not abort the remaining instances.

    :param soa_dir: The SOA configuration directory to read from.
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            log.info("Inspecting %s for autoscaling" % job_id)
                            # NOTE(review): get_app is called per task here — presumably cheap or
                            # cached; verify before scaling to many tasks.
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or
                                               not marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        # Another autoscaler run holds the lock; skipping is intended behavior.
        # (Removed a redundant `pass` that followed this log call.)
        log.warning("Skipping autoscaling run for services because the lock is held")
def test_get_old_live_draining_tasks_empty(self):
    """Apps with no tasks yield empty live and draining sets, keyed by app id."""
    job_id = marathon_tools.format_job_id('whoa', 'the_earth_is_tiny')
    apps = [
        mock.Mock(id=job_id, tasks=[]),
        mock.Mock(id=('%s2' % job_id), tasks=[]),
    ]
    # is_draining is irrelevant here: there are no tasks to classify.
    drain_method = mock.Mock(is_draining=lambda _: True)
    live_tasks, draining_tasks = setup_marathon_job.get_old_live_draining_tasks(apps, drain_method)
    assert live_tasks == {app.id: set() for app in apps}
    assert draining_tasks == {app.id: set() for app in apps}
def status_mesos_tasks(service, instance, normal_instance_count):
    """Return a one-line, colorized summary of this instance's Mesos task count.

    :param service: service name
    :param instance: instance name
    :param normal_instance_count: the expected number of running tasks
    :returns: a status string, or an error string if Mesos timed out
    """
    job_id = marathon_tools.format_job_id(service, instance)
    # We have to add a spacer at the end to make sure we only return
    # things for service.main and not service.main_foo
    filter_string = job_id + marathon_tools.MESOS_TASK_SPACER
    try:
        count = len(
            select_tasks_by_id(
                get_cached_list_of_running_tasks_from_frameworks(), filter_string))
    except ReadTimeout:
        return "Error: talking to Mesos timed out. It may be overloaded."
    # Pick one color for both the status word and the (count/expected) figure.
    if count >= normal_instance_count:
        colorize = PaastaColors.green
        status = colorize("Healthy")
    elif count == 0:
        colorize = PaastaColors.red
        status = colorize("Critical")
    else:
        colorize = PaastaColors.yellow
        status = colorize("Warning")
    count_str = colorize("(%d/%d)" % (count, normal_instance_count))
    running_string = PaastaColors.bold('TASK_RUNNING')
    return "Mesos: %s - %s tasks in the %s state." % (status, count_str, running_string)
def test_deploy_service_logs_exceptions(self):
    """deploy_service re-raises bounce errors after logging them with a traceback."""
    fake_bounce = "WHEEEEEEEEEEEEEEEE"
    fake_drain_method = "noop"
    fake_name = "whoa"
    fake_instance = "the_earth_is_tiny"
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_apps = [mock.Mock(id=fake_id, tasks=[]), mock.Mock(id=("%s2" % fake_id), tasks=[])]
    fake_client = mock.MagicMock(list_apps=mock.Mock(return_value=fake_apps))
    fake_config = {"id": fake_id, "instances": 2}
    with contextlib.nested(
        mock.patch("paasta_tools.setup_marathon_job._log", autospec=True),
        # Make the bounce-method lookup blow up to exercise the error path.
        mock.patch("paasta_tools.setup_marathon_job.bounce_lib.get_bounce_method_func", side_effect=IOError("foo")),
        mock.patch("paasta_tools.setup_marathon_job.load_system_paasta_config", autospec=True),
    ) as (mock_log, mock_bounce, mock_load_system_paasta_config):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value="fake_cluster")
        with raises(IOError):
            setup_marathon_job.deploy_service(
                service=fake_name,
                instance=fake_instance,
                marathon_jobid=fake_id,
                config=fake_config,
                client=fake_client,
                bounce_method=fake_bounce,
                drain_method_name=fake_drain_method,
                drain_method_params={},
                nerve_ns=fake_instance,
                bounce_health_params={},
                soa_dir="fake_soa_dir",
            )
        # First log line mentions the service; second carries the traceback.
        assert fake_name in mock_log.mock_calls[0][2]["line"]
        assert "Traceback" in mock_log.mock_calls[1][2]["line"]
def test_get_old_live_draining_tasks_not_empty(self):
    """Tasks are partitioned into live vs draining via drain_method.is_draining."""
    job_id = marathon_tools.format_job_id('whoa', 'the_earth_is_tiny')

    def make_task(state):
        return mock.Mock(_drain_state=state)

    apps = []
    for app_id in (job_id, '%s2' % job_id):
        apps.append(mock.Mock(id=app_id, tasks=[make_task('up'), make_task('down')]))
    # 'down' tasks count as draining; 'up' tasks as live.
    drain_method = mock.Mock(is_draining=lambda task: task._drain_state == 'down')
    live_tasks, draining_tasks = setup_marathon_job.get_old_live_draining_tasks(apps, drain_method)
    assert live_tasks == {app.id: set([app.tasks[0]]) for app in apps}
    assert draining_tasks == {app.id: set([app.tasks[1]]) for app in apps}
def http_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, endpoint='status', *args, **kwargs):
    """
    Gets the average utilization of a service across all of its tasks, where
    the utilization of a task is read from a HTTP endpoint on the host. The
    HTTP endpoint must return JSON with a 'utilization' key with a value from 0
    to 1.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from

    :returns: the service's average utilization, from 0 to 1
    """
    job_id = format_job_id(marathon_service_config.service, marathon_service_config.instance)
    endpoint = endpoint.lstrip('/')

    def get_short_job_id(task_id):
        # Keep only the service.instance portion of the full task id.
        return MESOS_TASK_SPACER.join(task_id.split(MESOS_TASK_SPACER, 2)[:2])

    samples = []
    for task in marathon_tasks:
        # Only poll tasks belonging to this job that have healthcheck results.
        if get_short_job_id(task.id) != job_id or not task.health_check_results:
            continue
        url = 'http://%s:%s/%s' % (task.host, task.ports[0], endpoint)
        try:
            samples.append(float(requests.get(url).json()['utilization']))
        except Exception:
            # Best effort: skip tasks whose endpoint is unreachable or malformed.
            pass
    if not samples:
        raise IngesterNoDataError("Couldn't get any data from http endpoint %s for %s.%s" % (
            endpoint, marathon_service_config.service, marathon_service_config.instance))
    return sum(samples) / len(samples)
def test_status_mesos_tasks_verbose():
    """status_mesos_tasks_verbose renders both running and non-running task sections
    and passes each raw task plus the id-shortener through the row formatters."""
    with contextlib.nested(
        mock.patch('paasta_tools.mesos_tools.get_running_tasks_from_active_frameworks', autospec=True,),
        mock.patch('paasta_tools.mesos_tools.get_non_running_tasks_from_active_frameworks', autospec=True,),
        mock.patch('paasta_tools.mesos_tools.format_running_mesos_task_row', autospec=True,),
        mock.patch('paasta_tools.mesos_tools.format_non_running_mesos_task_row', autospec=True,),
    ) as (
        get_running_mesos_tasks_patch,
        get_non_running_mesos_tasks_patch,
        format_running_mesos_task_row_patch,
        format_non_running_mesos_task_row_patch,
    ):
        get_running_mesos_tasks_patch.return_value = ['doing a lap']
        get_non_running_mesos_tasks_patch.return_value = ['eating a burrito']
        format_running_mesos_task_row_patch.return_value = ['id', 'host', 'mem', 'cpu', 'disk', 'time']
        format_non_running_mesos_task_row_patch.return_value = ['id', 'host', 'time', 'state']
        # BUG FIX: a stray trailing comma made job_id a 1-tuple; pass the
        # job-id string status_mesos_tasks_verbose expects.
        job_id = format_job_id('fake_service', 'fake_instance')

        def get_short_task_id(_):
            return 'short_task_id'

        actual = mesos_tools.status_mesos_tasks_verbose(job_id, get_short_task_id)
        assert 'Running Tasks' in actual
        assert 'Non-Running Tasks' in actual
        format_running_mesos_task_row_patch.assert_called_once_with('doing a lap', get_short_task_id)
        format_non_running_mesos_task_row_patch.assert_called_once_with('eating a burrito', get_short_task_id)
def test_deploy_service_logs_exceptions(self):
    """deploy_service re-raises bounce errors after logging them with a traceback."""
    fake_bounce = 'WHEEEEEEEEEEEEEEEE'
    fake_drain_method = 'noop'
    fake_name = 'whoa'
    fake_instance = 'the_earth_is_tiny'
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_apps = [mock.Mock(id=fake_id, tasks=[]), mock.Mock(id=('%s2' % fake_id), tasks=[])]
    fake_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=fake_apps))
    fake_config = {'id': fake_id, 'instances': 2}
    with contextlib.nested(
        # NOTE(review): patch targets are unqualified ('setup_marathon_job...')
        # here, unlike the 'paasta_tools.'-prefixed targets used elsewhere —
        # confirm the module import path this test file relies on.
        mock.patch('setup_marathon_job._log', autospec=True),
        # Make the bounce-method lookup blow up to exercise the error path.
        mock.patch('setup_marathon_job.bounce_lib.get_bounce_method_func', side_effect=IOError('foo')),
        mock.patch('setup_marathon_job.load_system_paasta_config', autospec=True),
    ) as (mock_log, mock_bounce, mock_load_system_paasta_config):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value='fake_cluster')
        with raises(IOError):
            setup_marathon_job.deploy_service(
                service=fake_name,
                instance=fake_instance,
                marathon_jobid=fake_id,
                config=fake_config,
                client=fake_client,
                bounce_method=fake_bounce,
                drain_method_name=fake_drain_method,
                drain_method_params={},
                nerve_ns=fake_instance,
                bounce_health_params={},
                soa_dir='fake_soa_dir',
            )
        # First log line mentions the service; second carries the traceback.
        assert fake_name in mock_log.mock_calls[0][2]["line"]
        assert 'Traceback' in mock_log.mock_calls[1][2]["line"]
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon service instance in this cluster.

    Takes a cluster-wide autoscaling lock, selects instances that have a
    max_instances cap, desired state 'start', and a non-bespoke decision
    policy, then runs the autoscaler per instance. A failure for one instance
    is logged and does not abort the remaining instances.

    :param soa_dir: The SOA configuration directory to read from.
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                # Only instances with a max_instances cap that should be running
                # are candidates for autoscaling.
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)
            if configs:
                marathon_config = load_marathon_config()
                all_marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        # Bespoke instances manage their own scaling.
                        if config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                            try:
                                job_id = format_job_id(config.service, config.instance)
                                # Healthy tasks only: those with healthcheck results.
                                marathon_tasks = {
                                    task.id: task
                                    for task in all_marathon_tasks
                                    if job_id == get_short_job_id(task.id) and task.health_check_results
                                }
                                if not marathon_tasks:
                                    raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                                mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                                autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                            except Exception as e:
                                # BUG FIX: a stray `raise e` preceded this call, making the
                                # logging unreachable and aborting the loop over the remaining
                                # configs; log the failure and continue instead.
                                write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        # Another autoscaler run holds the lock; skip this run silently.
        pass
def test_get_old_live_draining_tasks_not_empty(self):
    """Each app's tasks are split into live ('up') and draining ('down') sets."""
    job_id = marathon_tools.format_job_id("whoa", "the_earth_is_tiny")

    def build_task(state):
        return mock.Mock(_drain_state=state)

    apps = [
        mock.Mock(id=app_id, tasks=[build_task("up"), build_task("down")])
        for app_id in (job_id, "%s2" % job_id)
    ]
    drain_method = mock.Mock(is_draining=lambda task: task._drain_state == "down")
    result = setup_marathon_job.get_old_live_draining_tasks(apps, drain_method)
    live_by_app, draining_by_app = result
    # Index 0 is the 'up' task, index 1 the 'down' task in each app.
    assert live_by_app == {app.id: set([app.tasks[0]]) for app in apps}
    assert draining_by_app == {app.id: set([app.tasks[1]]) for app in apps}
def filter_autoscaling_tasks(marathon_apps, all_mesos_tasks, config):
    """Select the healthy marathon tasks (and matching mesos tasks) for one instance.

    :param marathon_apps: all marathon app objects to scan
    :param all_mesos_tasks: all mesos task dicts to filter
    :param config: the instance's MarathonServiceConfig
    :returns: (dict of task id -> marathon task, list of matching mesos tasks)
    :raises MetricsProviderNoDataError: if no healthy marathon task is found
    """
    job_id_prefix = "%s%s" % (format_job_id(
        service=config.service, instance=config.instance), MESOS_TASK_SPACER)
    log.info("Inspecting %s for autoscaling" % job_id_prefix)

    def _is_healthy(task, app):
        # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
        # are healthy. We assume tasks with no healthcheck results but a defined
        # healthcheck to be unhealthy (unless they are "old" in which case we
        # assume that marathon has screwed up and stopped healthchecking but that
        # they are healthy
        return (is_task_healthy(task) or not app.health_checks or
                is_old_task_missing_healthchecks(task, app))

    marathon_tasks = {
        task.id: task
        for app in marathon_apps
        for task in app.tasks
        if task.id.startswith(job_id_prefix) and _is_healthy(task, app)
    }
    if not marathon_tasks:
        raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
    mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
    return (marathon_tasks, mesos_tasks)
def test_deploy_service_logs_exceptions(self):
    """deploy_service re-raises bounce errors after logging them with a traceback."""
    fake_bounce = 'WHEEEEEEEEEEEEEEEE'
    fake_drain_method = 'noop'
    fake_name = 'whoa'
    fake_instance = 'the_earth_is_tiny'
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_apps = [mock.Mock(id=fake_id, tasks=[]), mock.Mock(id=('%s2' % fake_id), tasks=[])]
    fake_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=fake_apps))
    fake_config = {'id': fake_id, 'instances': 2}
    with contextlib.nested(
        mock.patch('paasta_tools.setup_marathon_job._log', autospec=True),
        # Make the bounce-method lookup blow up to exercise the error path.
        mock.patch('paasta_tools.setup_marathon_job.bounce_lib.get_bounce_method_func', side_effect=IOError('foo')),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True),
    ) as (mock_log, mock_bounce, mock_load_system_paasta_config):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value='fake_cluster')
        with raises(IOError):
            setup_marathon_job.deploy_service(
                service=fake_name,
                instance=fake_instance,
                marathon_jobid=fake_id,
                config=fake_config,
                client=fake_client,
                bounce_method=fake_bounce,
                drain_method_name=fake_drain_method,
                drain_method_params={},
                nerve_ns=fake_instance,
                bounce_health_params={},
                soa_dir='fake_soa_dir',
            )
        # First log line mentions the service; second carries the traceback.
        assert fake_name in mock_log.mock_calls[0][2]["line"]
        assert 'Traceback' in mock_log.mock_calls[1][2]["line"]
def test_setup_service_srv_already_exists(self):
    """setup_service still deploys when the app already exists in marathon."""
    fake_name = "if_trees_could_talk"
    fake_instance = "would_they_scream"
    # get_app returning truthy simulates the app already being present.
    fake_client = mock.MagicMock(get_app=mock.Mock(return_value=True))
    full_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_complete = {"seven": "full", "eight": "frightened", "nine": "eaten", "id": full_id}
    with contextlib.nested(
        mock.patch("paasta_tools.marathon_tools.create_complete_config", return_value=fake_complete, autospec=True),
        mock.patch(
            "paasta_tools.marathon_tools.load_marathon_config",
            return_value=self.fake_marathon_config,
            autospec=True,
        ),
        mock.patch("paasta_tools.setup_marathon_job.deploy_service", autospec=True),
    ) as (create_config_patch, get_config_patch, deploy_service_patch):
        setup_marathon_job.setup_service(
            service=fake_name,
            instance=fake_instance,
            client=fake_client,
            marathon_config=self.fake_marathon_config,
            service_marathon_config=self.fake_marathon_service_config,
            soa_dir=None,
        )
        # The complete config is built once and a deploy is still attempted.
        create_config_patch.assert_called_once_with(fake_name, fake_instance, self.fake_marathon_config)
        assert deploy_service_patch.call_count == 1
def deploy_marathon_service(service, instance, client, soa_dir, marathon_config, marathon_apps):
    """Deploy the service instance given and process the return code;
    if there was an error we send a sensu alert.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param client: A MarathonClient object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_config: The service instance's configuration dict
    :param marathon_apps: A list of all marathon app objects
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd
        bounce_in_seconds instructs how long until the deployd should try another bounce
        None means that it is in a steady state and doesn't need to bounce again
    """
    short_id = marathon_tools.format_job_id(service, instance)
    try:
        # Hold the bounce lock for the whole setup so two deploys of the same
        # instance cannot run concurrently.
        with bounce_lib.bounce_lock_zookeeper(short_id):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config_no_cache(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                # Nothing deployed yet for this instance: not an error.
                log.debug("No deployments found for %s.%s in cluster %s. Skipping."
                          % (service, instance, load_system_paasta_config().get_cluster()))
                return 0, None
            except NoConfigurationForServiceError:
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                            (service, instance, load_system_paasta_config().get_cluster())
                log.error(error_msg)
                return 1, None

            try:
                status, output, bounce_again_in_seconds = setup_service(
                    service,
                    instance,
                    client,
                    service_instance_config,
                    marathon_apps,
                    soa_dir,
                )
                # Non-zero setup status maps to a CRITICAL sensu event.
                sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0, bounce_again_in_seconds
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig, NoSlavesAvailableError):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
                return 1, None
    except bounce_lib.LockHeldException:
        # Another deploy holds the lock; treat as success and let deployd retry.
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0, None
def write_to_log(config, line, level='event'):
    """Write one line to the 'deploy' log stream for config's service/instance.

    :param config: an object exposing service, instance, and cluster attributes
    :param line: the message to log; prefixed with the instance's job id
    :param level: log level label, defaults to 'event'
    """
    prefix = format_job_id(config.service, config.instance)
    _log(
        service=config.service,
        line="%s: %s" % (prefix, line),
        component='deploy',
        level=level,
        cluster=config.cluster,
        instance=config.instance,
    )
async def marathon_mesos_status(service: str, instance: str, verbose: int) -> MutableMapping[str, Any]:
    """Collect Mesos-side status for a marathon instance.

    Returns a mapping with 'running_task_count' and, when verbose > 0, detailed
    'running_tasks' / 'non_running_tasks' entries (or 'error_message' if Mesos
    timed out).

    :param service: service name
    :param instance: instance name
    :param verbose: verbosity level; > 0 enables per-task detail with log tails
    """
    mesos_status: MutableMapping[str, Any] = {}
    job_id = marathon_tools.format_job_id(service, instance)
    # The spacer suffix prevents "service.main" from also matching "service.main_foo".
    job_id_filter_string = f"{job_id}{marathon_tools.MESOS_TASK_SPACER}"
    try:
        running_and_active_tasks = select_tasks_by_id(
            await get_cached_list_of_running_tasks_from_frameworks(),
            job_id=job_id_filter_string,
        )
    except (ReadTimeout, asyncio.TimeoutError):
        return {
            "error_message": "Talking to Mesos timed out. It may be overloaded."
        }
    mesos_status["running_task_count"] = len(running_and_active_tasks)
    if verbose > 0:
        num_tail_lines = calculate_tail_lines(verbose)
        # Kick off detail fetches for all running tasks concurrently.
        running_task_dict_futures = []
        for task in running_and_active_tasks:
            running_task_dict_futures.append(
                asyncio.ensure_future(
                    get_mesos_running_task_dict(task, num_tail_lines)))
        non_running_tasks = select_tasks_by_id(
            await get_cached_list_of_not_running_tasks_from_frameworks(),
            job_id=job_id_filter_string,
        )
        # Keep only the 10 most recent non-running tasks, newest first.
        non_running_tasks.sort(
            key=lambda task: get_first_status_timestamp(task) or 0)
        non_running_tasks = list(reversed(non_running_tasks[-10:]))
        non_running_task_dict_futures = []
        for task in non_running_tasks:
            non_running_task_dict_futures.append(
                asyncio.ensure_future(
                    get_mesos_non_running_task_dict(task, num_tail_lines)))
        all_task_dict_futures = (running_task_dict_futures +
                                 non_running_task_dict_futures)
        # asyncio.wait requires a non-empty future set.
        if len(all_task_dict_futures):
            await asyncio.wait(all_task_dict_futures)
        mesos_status["running_tasks"] = [
            task_future.result() for task_future in running_task_dict_futures
        ]
        mesos_status["non_running_tasks"] = [
            task_future.result() for task_future in non_running_task_dict_futures
        ]
    return mesos_status
def marathon_log_line_passes_filter(line, levels, service, components, clusters):
    """Given a (JSON-formatted) log line where the message is a Marathon log line,
    return True if the line should be displayed given the provided service; return
    False otherwise."""
    try:
        parsed = json.loads(line)
    except ValueError:
        log.debug('Trouble parsing line as json. Skipping. Line: %r' % line)
        return False
    # A line matches if the service's job-id prefix appears in the message.
    message = parsed.get('message', '')
    return format_job_id(service, '') in message
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the kill fraction exceeds
        kill_threshold and force is not set
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(
        marathon_config.get_url(),
        marathon_config.get_username(),
        marathon_config.get_password(),
    )

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    running_apps = []
    for app_id in running_app_ids:
        try:
            app_id = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # FIX: log.warn is a deprecated alias; use log.warning.
            log.warning(
                "%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        running_apps.append(app_id)
    # Kill any app whose (service, instance) is no longer configured.
    apps_to_kill = [(service, instance, git_sha, config_sha)
                    for service, instance, git_sha, config_sha in running_apps
                    if (service, instance) not in valid_services]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if running_apps:
        # Safety valve: refuse to kill a large fraction of the cluster.
        above_kill_threshold = float(len(apps_to_kill)) / float(
            len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical(
                "Paasta was about to kill more than %s of the running services, this "
                "is probably a BAD mistake!, run again with --force if you "
                "really need to destroy everything" % kill_threshold,
            )
            raise DontKillEverythingError

    for running_app in apps_to_kill:
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
def check_healthy_marathon_tasks_for_service_instance(instance_config, expected_count, all_tasks):
    """Emit a replication event for an instance not registered in smartstack,
    based on how many of its marathon tasks are healthy.

    :param instance_config: the instance's config object
    :param expected_count: how many tasks should be available
    :param all_tasks: every marathon task to filter through
    """
    app_id = format_job_id(instance_config.service, instance_config.instance)
    log.info("Checking %s in marathon as it is not in smartstack" % app_id)
    healthy_count = filter_healthy_marathon_instances_for_short_app_id(
        all_tasks=all_tasks,
        app_id=app_id)
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=healthy_count,
    )
def check_healthy_marathon_tasks_for_service_instance(client, service, instance, cluster, soa_dir, expected_count):
    """Emit a replication event for an instance not registered in smartstack,
    based on how many of its marathon tasks are healthy.

    :param client: a MarathonClient object
    :param service: service name
    :param instance: instance name
    :param cluster: cluster name
    :param soa_dir: path to yelpsoa configs
    :param expected_count: how many tasks should be available
    """
    app_id = format_job_id(service, instance)
    log.info("Checking %s in marathon as it is not in smartstack" % app_id)
    healthy_count = get_healthy_marathon_instances_for_short_app_id(client, app_id)
    send_event_if_under_replication(
        service=service,
        instance=instance,
        cluster=cluster,
        expected_count=expected_count,
        num_available=healthy_count,
        soa_dir=soa_dir,
    )
def marathon_log_line_passes_filter(line, levels, service, components, clusters, start_time=None, end_time=None):
    """Given a (JSON-formatted) log line where the message is a Marathon log line,
    return True if the line should be displayed given the provided service; return
    False otherwise."""
    try:
        parsed = json.loads(line)
    except ValueError:
        log.debug('Trouble parsing line as json. Skipping. Line: %r' % line)
        return False
    # Reject lines outside the requested time window first.
    timestamp = isodate.parse_datetime(parsed.get('timestamp'))
    if not check_timestamp_in_range(timestamp, start_time, end_time):
        return False
    # A line matches if the service's job-id prefix appears in the message.
    return format_job_id(service, '') in parsed.get('message', '')
def status_mesos_tasks(service, instance, normal_instance_count):
    """Return a one-line, colorized summary of this instance's Mesos task count.

    :param service: service name
    :param instance: instance name
    :param normal_instance_count: the expected number of running tasks
    """
    job_id = marathon_tools.format_job_id(service, instance)
    # NOTE(review): unlike the variant that appends MESOS_TASK_SPACER, this
    # filters by the bare job_id — presumably get_running_tasks_from_active_frameworks
    # matches exactly, otherwise "service.main" could also match "service.main_foo";
    # confirm the helper's matching semantics.
    running_and_active_tasks = get_running_tasks_from_active_frameworks(job_id)
    count = len(running_and_active_tasks)
    if count >= normal_instance_count:
        status = PaastaColors.green("Healthy")
        count = PaastaColors.green("(%d/%d)" % (count, normal_instance_count))
    elif count == 0:
        status = PaastaColors.red("Critical")
        count = PaastaColors.red("(%d/%d)" % (count, normal_instance_count))
    else:
        status = PaastaColors.yellow("Warning")
        count = PaastaColors.yellow("(%d/%d)" % (count, normal_instance_count))
    running_string = PaastaColors.bold('TASK_RUNNING')
    return "Mesos: %s - %s tasks in the %s state." % (status, count, running_string)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """Autoscale every eligible marathon service instance in this cluster.

    Takes a cluster-wide autoscaling lock, selects instances that have a
    max_instances cap, desired state 'start', and a non-bespoke decision
    policy, then runs the autoscaler per instance. A failure for one instance
    is logged and does not abort the remaining instances.

    :param soa_dir: The SOA configuration directory to read from.
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                # Only non-bespoke instances with a max_instances cap that
                # should be running are candidates for autoscaling.
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                        and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                    configs.append(service_config)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            # NOTE(review): get_app is called once per task here —
                            # presumably cheap or cached; verify before scaling up.
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or
                                               not marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        # Another autoscaler run holds the lock; skip this run silently.
        pass
def cleanup_apps(soa_dir, kill_threshold=0.5, force=False):
    """Clean up old or invalid jobs/apps from marathon. Retrieves
    both a list of apps currently in marathon and a list of valid
    app ids in order to determine what to kill.

    :param soa_dir: The SOA config directory to read from
    :param kill_threshold: The decimal fraction of apps we think is
        sane to kill when this job runs.
    :param force: Force the cleanup if we are above the kill_threshold
    :raises DontKillEverythingError: when the kill fraction exceeds
        kill_threshold and force is not set
    """
    log.info("Loading marathon configuration")
    marathon_config = marathon_tools.load_marathon_config()
    log.info("Connecting to marathon")
    client = marathon_tools.get_marathon_client(marathon_config.get_url(),
                                                marathon_config.get_username(),
                                                marathon_config.get_password())

    valid_services = get_services_for_cluster(instance_type='marathon', soa_dir=soa_dir)
    running_app_ids = marathon_tools.list_all_marathon_app_ids(client)

    running_apps = []
    for app_id in running_app_ids:
        try:
            app_id = marathon_tools.deformat_job_id(app_id)
        except InvalidJobNameError:
            # FIX: log.warn is a deprecated alias; use log.warning.
            log.warning("%s doesn't conform to paasta naming conventions? Skipping." % app_id)
            continue
        running_apps.append(app_id)
    # Kill any app whose (service, instance) is no longer configured.
    apps_to_kill = [(service, instance, git_sha, config_sha)
                    for service, instance, git_sha, config_sha in running_apps
                    if (service, instance) not in valid_services]

    log.debug("Running apps: %s" % running_apps)
    log.debug("Valid apps: %s" % valid_services)
    log.debug("Terminating: %s" % apps_to_kill)
    if running_apps:
        # Safety valve: refuse to kill a large fraction of the cluster.
        above_kill_threshold = float(len(apps_to_kill)) / float(len(running_apps)) > float(kill_threshold)
        if above_kill_threshold and not force:
            log.critical("Paasta was about to kill more than %s of the running services, this "
                         "is probably a BAD mistake!, run again with --force if you "
                         "really need to destroy everything" % kill_threshold)
            raise DontKillEverythingError

    for running_app in apps_to_kill:
        app_id = marathon_tools.format_job_id(*running_app)
        delete_app(
            app_id=app_id,
            client=client,
            soa_dir=soa_dir,
        )
def test_status_mesos_tasks_verbose(test_case):
    """Verbose status output should include both task sections and truncate
    the non-running list.

    Parametrized: ``test_case`` is (tail_lines, expected_format_tail_call_count).
    """
    tail_lines, expected_format_tail_call_count = test_case
    filter_string = format_job_id('fake_service', 'fake_instance')
    with asynctest.patch(
        'paasta_tools.mesos_tools.get_cached_list_of_running_tasks_from_frameworks',
        autospec=True,
        return_value=[{'id': filter_string}],
    ), asynctest.patch(
        'paasta_tools.mesos_tools.get_cached_list_of_not_running_tasks_from_frameworks',
        autospec=True,
    ) as get_cached_list_of_not_running_tasks_from_frameworks_patch, asynctest.patch(
        'paasta_tools.mesos_tools.format_running_mesos_task_row',
        autospec=True,
    ) as format_running_mesos_task_row_patch, asynctest.patch(
        'paasta_tools.mesos_tools.format_non_running_mesos_task_row',
        autospec=True,
    ) as format_non_running_mesos_task_row_patch, asynctest.patch(
        'paasta_tools.mesos_tools.format_stdstreams_tail_for_task',
        autospec=True,
    ) as format_stdstreams_tail_for_task_patch:
        template_task_return = {
            'id': filter_string,
            'statuses': [{'timestamp': '##########'}],
            'state': 'NOT_RUNNING',
        }
        non_running_mesos_tasks = []
        for _ in range(15):  # exercise the code that sorts/truncates the list of non running tasks
            # NOTE(review): .copy() is shallow, so every task_return shares the
            # same 'statuses' list; all 15 tasks end up with the final random
            # timestamp. copy.deepcopy would actually exercise the sort — confirm.
            task_return = template_task_return.copy()
            task_return['statuses'][0]['timestamp'] = str(1457109986 + random.randrange(-60 * 60 * 24, 60 * 60 * 24))
            non_running_mesos_tasks.append(task_return)
        get_cached_list_of_not_running_tasks_from_frameworks_patch.return_value = non_running_mesos_tasks

        format_running_mesos_task_row_patch.return_value = ['id', 'host', 'mem', 'cpu', 'time']
        format_non_running_mesos_task_row_patch.return_value = ['id', 'host', 'time', 'state']
        format_stdstreams_tail_for_task_patch.return_value = ['tail']

        actual = mesos_tools.status_mesos_tasks_verbose(
            filter_string=filter_string,
            get_short_task_id=mock.sentinel.get_short_task_id,
            tail_lines=tail_lines,
        )
        assert 'Running Tasks' in actual
        assert 'Non-Running Tasks' in actual
        format_running_mesos_task_row_patch.assert_called_once_with(
            {'id': filter_string},
            mock.sentinel.get_short_task_id,
        )
        assert format_non_running_mesos_task_row_patch.call_count == 10  # maximum n of tasks we display
        assert format_stdstreams_tail_for_task_patch.call_count == expected_format_tail_call_count
def test_status_mesos_tasks_verbose(test_case):
    """Python 2 variant: verbose status should render both task sections.

    Parametrized: ``test_case`` is (tail_lines, expected_format_tail_call_count).
    """
    tail_lines, expected_format_tail_call_count = test_case
    with contextlib.nested(
        mock.patch('paasta_tools.mesos_tools.get_running_tasks_from_active_frameworks', autospec=True,),
        mock.patch('paasta_tools.mesos_tools.get_non_running_tasks_from_active_frameworks', autospec=True,),
        mock.patch('paasta_tools.mesos_tools.format_running_mesos_task_row', autospec=True,),
        mock.patch('paasta_tools.mesos_tools.format_non_running_mesos_task_row', autospec=True,),
        mock.patch('paasta_tools.mesos_tools.format_stdstreams_tail_for_task', autospec=True,),
    ) as (
        get_running_mesos_tasks_patch,
        get_non_running_mesos_tasks_patch,
        format_running_mesos_task_row_patch,
        format_non_running_mesos_task_row_patch,
        format_stdstreams_tail_for_task_patch,
    ):
        get_running_mesos_tasks_patch.return_value = ['doing a lap']
        template_task_return = {
            'statuses': [{'timestamp': '##########'}],
            'state': 'NOT_RUNNING',
        }
        non_running_mesos_tasks = []
        for _ in xrange(15):  # exercise the code that sorts/truncates the list of non running tasks
            # NOTE(review): shallow .copy() shares one 'statuses' list across all
            # 15 tasks, so they all end with the last timestamp — confirm intent.
            task_return = template_task_return.copy()
            task_return['statuses'][0]['timestamp'] = str(1457109986 + random.randrange(-60 * 60 * 24, 60 * 60 * 24))
            non_running_mesos_tasks.append(task_return)
        get_non_running_mesos_tasks_patch.return_value = non_running_mesos_tasks
        format_running_mesos_task_row_patch.return_value = ['id', 'host', 'mem', 'cpu', 'time']
        format_non_running_mesos_task_row_patch.return_value = ['id', 'host', 'time', 'state']
        format_stdstreams_tail_for_task_patch.return_value = ['tail']
        # NOTE(review): the trailing comma makes job_id a 1-tuple, not a string —
        # likely unintended, though harmless here since the consumers are mocked.
        job_id = format_job_id('fake_service', 'fake_instance'),

        def get_short_task_id(_):
            return 'short_task_id'
        actual = mesos_tools.status_mesos_tasks_verbose(
            job_id=job_id,
            get_short_task_id=get_short_task_id,
            tail_lines=tail_lines,
        )
        assert 'Running Tasks' in actual
        assert 'Non-Running Tasks' in actual
        format_running_mesos_task_row_patch.assert_called_once_with('doing a lap', get_short_task_id)
        assert format_non_running_mesos_task_row_patch.call_count == 10  # maximum n of tasks we display
        assert format_stdstreams_tail_for_task_patch.call_count == expected_format_tail_call_count
def test_setup_service_srv_already_exists(self):
    """When the app already exists in marathon, setup_service should still
    build the complete config and hand off to deploy_service."""
    fake_name = 'if_trees_could_talk'
    fake_instance = 'would_they_scream'
    # get_app returning truthy simulates "the app is already registered".
    fake_client = mock.MagicMock(get_app=mock.Mock(return_value=True))
    full_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_complete = {
        'seven': 'full',
        'eight': 'frightened',
        'nine': 'eaten',
        'id': full_id,
    }
    with contextlib.nested(
        mock.patch(
            'paasta_tools.marathon_tools.create_complete_config',
            return_value=fake_complete,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.marathon_tools.load_marathon_config',
            return_value=self.fake_marathon_config,
            autospec=True,
        ),
        mock.patch(
            'setup_marathon_job.deploy_service',
            autospec=True,
        ),
    ) as (
        create_config_patch,
        get_config_patch,
        deploy_service_patch,
    ):
        setup_marathon_job.setup_service(
            service=fake_name,
            instance=fake_instance,
            client=fake_client,
            marathon_config=self.fake_marathon_config,
            service_marathon_config=self.fake_marathon_service_config,
            soa_dir=None,
        )
        create_config_patch.assert_called_once_with(
            fake_name,
            fake_instance,
            self.fake_marathon_config,
        )
        assert deploy_service_patch.call_count == 1
def test_setup_service_srv_already_exists(self):
    """Variant of the already-exists test that patches deploy_service via the
    fully qualified paasta_tools.setup_marathon_job path."""
    fake_name = 'if_trees_could_talk'
    fake_instance = 'would_they_scream'
    # get_app returning truthy simulates "the app is already registered".
    fake_client = mock.MagicMock(get_app=mock.Mock(return_value=True))
    full_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_complete = {
        'seven': 'full',
        'eight': 'frightened',
        'nine': 'eaten',
        'id': full_id,
    }
    with contextlib.nested(
        mock.patch(
            'paasta_tools.marathon_tools.create_complete_config',
            return_value=fake_complete,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.marathon_tools.load_marathon_config',
            return_value=self.fake_marathon_config,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.setup_marathon_job.deploy_service',
            autospec=True,
        ),
    ) as (
        create_config_patch,
        get_config_patch,
        deploy_service_patch,
    ):
        setup_marathon_job.setup_service(
            service=fake_name,
            instance=fake_instance,
            client=fake_client,
            marathon_config=self.fake_marathon_config,
            service_marathon_config=self.fake_marathon_service_config,
            soa_dir=None,
        )
        create_config_patch.assert_called_once_with(
            fake_name,
            fake_instance,
            self.fake_marathon_config,
        )
        assert deploy_service_patch.call_count == 1
def test_status_mesos_tasks_verbose():
    """status_mesos_tasks_verbose should render both the running and the
    non-running task sections and format one row per task."""
    with contextlib.nested(
        mock.patch(
            'paasta_tools.mesos_tools.get_running_tasks_from_active_frameworks',
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.mesos_tools.get_non_running_tasks_from_active_frameworks',
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.mesos_tools.format_running_mesos_task_row',
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.mesos_tools.format_non_running_mesos_task_row',
            autospec=True,
        ),
    ) as (
        get_running_mesos_tasks_patch,
        get_non_running_mesos_tasks_patch,
        format_running_mesos_task_row_patch,
        format_non_running_mesos_task_row_patch,
    ):
        get_running_mesos_tasks_patch.return_value = ['doing a lap']
        get_non_running_mesos_tasks_patch.return_value = ['eating a burrito']
        format_running_mesos_task_row_patch.return_value = [
            'id', 'host', 'mem', 'cpu', 'time'
        ]
        format_non_running_mesos_task_row_patch.return_value = [
            'id', 'host', 'time', 'state'
        ]
        # Fixed: the original line ended with a stray trailing comma, which made
        # job_id an accidental 1-tuple instead of the job-id string.
        job_id = format_job_id('fake_service', 'fake_instance')

        def get_short_task_id(_):
            return 'short_task_id'
        actual = mesos_tools.status_mesos_tasks_verbose(
            job_id, get_short_task_id)
        assert 'Running Tasks' in actual
        assert 'Non-Running Tasks' in actual
        format_running_mesos_task_row_patch.assert_called_once_with(
            'doing a lap', get_short_task_id)
        format_non_running_mesos_task_row_patch.assert_called_once_with(
            'eating a burrito', get_short_task_id)
def status_mesos_tasks(service, instance, normal_instance_count):
    """Return a one-line, colorized summary of how many mesos tasks for this
    service instance are running versus how many are expected.

    :param service: service name
    :param instance: instance name
    :param normal_instance_count: the number of tasks expected to be running
    :returns: a human-readable status string
    """
    job_id = marathon_tools.format_job_id(service, instance)
    # We have to add a spacer at the end to make sure we only return
    # things for service.main and not service.main_foo
    filter_string = "%s%s" % (job_id, marathon_tools.MESOS_TASK_SPACER)
    num_running = len(get_running_tasks_from_active_frameworks(filter_string))
    # Pick one colorizer for both the status word and the count so they match.
    if num_running >= normal_instance_count:
        colorize = PaastaColors.green
        status = colorize("Healthy")
    elif num_running == 0:
        colorize = PaastaColors.red
        status = colorize("Critical")
    else:
        colorize = PaastaColors.yellow
        status = colorize("Warning")
    count = colorize("(%d/%d)" % (num_running, normal_instance_count))
    running_string = PaastaColors.bold('TASK_RUNNING')
    return "Mesos: %s - %s tasks in the %s state." % (status, count, running_string)
def status_mesos_tasks( service: str, instance: str, normal_instance_count: int, verbose: int, ) -> str: job_id = marathon_tools.format_job_id(service, instance) # We have to add a spacer at the end to make sure we only return # things for service.main and not service.main_foo filter_string = f"{job_id}{marathon_tools.MESOS_TASK_SPACER}" try: count = len( select_tasks_by_id( a_sync.block(get_cached_list_of_running_tasks_from_frameworks), filter_string)) if count >= normal_instance_count: status = PaastaColors.green("Healthy") count_str = PaastaColors.green("(%d/%d)" % (count, normal_instance_count)) elif count == 0: status = PaastaColors.red("Critical") count_str = PaastaColors.red("(%d/%d)" % (count, normal_instance_count)) else: status = PaastaColors.yellow("Warning") count_str = PaastaColors.yellow("(%d/%d)" % (count, normal_instance_count)) running_string = PaastaColors.bold('TASK_RUNNING') output = f"Mesos: {status} - {count_str} tasks in the {running_string} state." except ReadTimeout: return "Error: talking to Mesos timed out. It may be overloaded." if verbose > 0: tail_lines = calculate_tail_lines(verbose_level=verbose) output += '\n' + status_mesos_tasks_verbose( filter_string=filter_string, get_short_task_id=get_short_task_id, tail_lines=tail_lines, ) return output
def deploy_marathon_service(service, instance, client, soa_dir, marathon_config, marathon_apps):
    """Deploy one service instance, guarded by the per-instance zookeeper bounce lock.

    :param service: service name
    :param instance: instance name
    :param client: A MarathonClient object
    :param soa_dir: The SOA configuration directory to read from
    :param marathon_config: The marathon configuration
    :param marathon_apps: A list of all marathon app objects
    :returns: 0 on success or deliberate skip, 1 on failure
    """
    short_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(short_id):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                # Nothing has been deployed for this instance yet; not an error.
                log.debug("No deployments found for %s.%s in cluster %s. Skipping." %
                          (service, instance, load_system_paasta_config().get_cluster()))
                return 0
            except NoConfigurationForServiceError:
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                            (service, instance, load_system_paasta_config().get_cluster())
                log.error(error_msg)
                return 1

            try:
                status, output = setup_service(service, instance, client, service_instance_config,
                                               marathon_apps, soa_dir)
                # Any nonzero setup status is reported to sensu as CRITICAL.
                sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
                # Configuration-shaped failures: log, alert, and report failure.
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
                return 1
    except bounce_lib.LockHeldException:
        # Another deploy of this instance is in flight; treat as a clean skip.
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0
def test_status_mesos_tasks_verbose(test_case):
    """Verbose status should include both task sections, truncate the
    non-running list to 10 rows, and tail stdstreams as requested.

    Parametrized: ``test_case`` is (tail_stdstreams, expected_format_tail_call_count).
    """
    tail_stdstreams, expected_format_tail_call_count = test_case
    with contextlib.nested(
        mock.patch("paasta_tools.mesos_tools.get_running_tasks_from_active_frameworks", autospec=True),
        mock.patch("paasta_tools.mesos_tools.get_non_running_tasks_from_active_frameworks", autospec=True),
        mock.patch("paasta_tools.mesos_tools.format_running_mesos_task_row", autospec=True),
        mock.patch("paasta_tools.mesos_tools.format_non_running_mesos_task_row", autospec=True),
        mock.patch("paasta_tools.mesos_tools.format_stdstreams_tail_for_task", autospec=True),
    ) as (
        get_running_mesos_tasks_patch,
        get_non_running_mesos_tasks_patch,
        format_running_mesos_task_row_patch,
        format_non_running_mesos_task_row_patch,
        format_stdstreams_tail_for_task_patch,
    ):
        get_running_mesos_tasks_patch.return_value = ["doing a lap"]
        template_task_return = {"statuses": [{"timestamp": "##########"}], "state": "NOT_RUNNING"}
        non_running_mesos_tasks = []
        for _ in xrange(15):  # exercise the code that sorts/truncates the list of non running tasks
            # NOTE(review): shallow .copy() shares one "statuses" list across all
            # 15 tasks, so they all end with the last timestamp — confirm intent.
            task_return = template_task_return.copy()
            task_return["statuses"][0]["timestamp"] = str(1457109986 + random.randrange(-60 * 60 * 24, 60 * 60 * 24))
            non_running_mesos_tasks.append(task_return)
        get_non_running_mesos_tasks_patch.return_value = non_running_mesos_tasks
        format_running_mesos_task_row_patch.return_value = ["id", "host", "mem", "cpu", "time"]
        format_non_running_mesos_task_row_patch.return_value = ["id", "host", "time", "state"]
        format_stdstreams_tail_for_task_patch.return_value = ["tail"]
        # NOTE(review): job_id is a 1-tuple here — likely an accidental trailing
        # comma preserved from an older revision; harmless since consumers are mocked.
        job_id = (format_job_id("fake_service", "fake_instance"),)

        def get_short_task_id(_):
            return "short_task_id"
        actual = mesos_tools.status_mesos_tasks_verbose(job_id, get_short_task_id, tail_stdstreams)
        assert "Running Tasks" in actual
        assert "Non-Running Tasks" in actual
        format_running_mesos_task_row_patch.assert_called_once_with("doing a lap", get_short_task_id)
        assert format_non_running_mesos_task_row_patch.call_count == 10  # maximum n of tasks we display
        assert format_stdstreams_tail_for_task_patch.call_count == expected_format_tail_call_count
def test_deploy_service_unknown_drain_method(self):
    """deploy_service should log once and return (1, error) when the drain
    method name is not registered in drain_lib."""
    fake_bounce = 'exists'
    fake_drain_method = 'doesntexist'
    fake_name = 'whoa'
    fake_instance = 'the_earth_is_tiny'
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_apps = [mock.Mock(id=fake_id, tasks=[]), mock.Mock(id=('%s2' % fake_id), tasks=[])]
    fake_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=fake_apps))
    fake_config = {'id': fake_id, 'instances': 2}

    errormsg = 'ERROR: drain_method not recognized: doesntexist. Must be one of (exists1, exists2)'
    expected = (1, errormsg)

    with contextlib.nested(
        mock.patch('setup_marathon_job._log', autospec=True),
        mock.patch('setup_marathon_job.load_system_paasta_config', autospec=True),
        # Replace the drain-method registry wholesale so the error message is
        # deterministic regardless of what drain methods really exist.
        mock.patch(
            'paasta_tools.drain_lib._drain_methods',
            new={'exists1': mock.Mock(), 'exists2': mock.Mock()},
        )
    ) as (mock_log, mock_load_system_paasta_config, mock_drain_methods):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value='fake_cluster')
        actual = setup_marathon_job.deploy_service(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=fake_id,
            config=fake_config,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method,
            drain_method_params={},
            nerve_ns=fake_instance,
            bounce_health_params={},
            soa_dir='fake_soa_dir',
        )
        assert mock_log.call_count == 1
    assert expected == actual
def test_deploy_service_unknown_drain_method(self):
    """Variant of the unknown-drain-method test that patches via the fully
    qualified paasta_tools.setup_marathon_job module path."""
    fake_bounce = 'exists'
    fake_drain_method = 'doesntexist'
    fake_name = 'whoa'
    fake_instance = 'the_earth_is_tiny'
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_apps = [mock.Mock(id=fake_id, tasks=[]), mock.Mock(id=('%s2' % fake_id), tasks=[])]
    fake_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=fake_apps))
    fake_config = {'id': fake_id, 'instances': 2}

    errormsg = 'ERROR: drain_method not recognized: doesntexist. Must be one of (exists1, exists2)'
    expected = (1, errormsg)

    with contextlib.nested(
        mock.patch('paasta_tools.setup_marathon_job._log', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True),
        # Replace the drain-method registry wholesale so the error message is
        # deterministic regardless of what drain methods really exist.
        mock.patch(
            'paasta_tools.drain_lib._drain_methods',
            new={'exists1': mock.Mock(), 'exists2': mock.Mock()},
        )
    ) as (mock_log, mock_load_system_paasta_config, mock_drain_methods):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value='fake_cluster')
        actual = setup_marathon_job.deploy_service(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=fake_id,
            config=fake_config,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method,
            drain_method_params={},
            nerve_ns=fake_instance,
            bounce_health_params={},
            soa_dir='fake_soa_dir',
        )
        assert mock_log.call_count == 1
    assert expected == actual
def test_deploy_service_unknown_bounce(self):
    """deploy_service logs once and returns (1, error) for an unknown bounce
    method, without ever creating an app."""
    unknown_bounce = "WHEEEEEEEEEEEEEEEE"
    service = "whoa"
    instance = "the_earth_is_tiny"
    job_id = marathon_tools.format_job_id(service, instance)
    running_apps = [
        mock.Mock(id=job_id, tasks=[]),
        mock.Mock(id="%s2" % job_id, tasks=[]),
    ]
    client = mock.MagicMock(list_apps=mock.Mock(return_value=running_apps))
    app_config = {"id": job_id, "instances": 2}

    valid_methods = ", ".join(list_bounce_methods())
    expected = (
        1,
        "ERROR: bounce_method not recognized: %s. Must be one of (%s)" % (unknown_bounce, valid_methods),
    )

    with contextlib.nested(
        mock.patch("paasta_tools.setup_marathon_job._log", autospec=True),
        mock.patch("paasta_tools.setup_marathon_job.load_system_paasta_config", autospec=True),
    ) as (mock_log, mock_load_system_paasta_config):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value="fake_cluster")
        actual = setup_marathon_job.deploy_service(
            service=service,
            instance=instance,
            marathon_jobid=job_id,
            config=app_config,
            client=client,
            bounce_method=unknown_bounce,
            drain_method_name="noop",
            drain_method_params={},
            nerve_ns=instance,
            bounce_health_params={},
            soa_dir="fake_soa_dir",
        )
        assert mock_log.call_count == 1
    assert expected == actual
    client.list_apps.assert_called_once_with(embed_failures=True)
    assert client.create_app.call_count == 0
def test_deploy_service_unknown_bounce(self):
    """deploy_service should log once, return (1, error), and create no app
    when the bounce method name is unknown (paasta_tools patch paths)."""
    fake_bounce = 'WHEEEEEEEEEEEEEEEE'
    fake_drain_method = 'noop'
    fake_name = 'whoa'
    fake_instance = 'the_earth_is_tiny'
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_apps = [mock.Mock(id=fake_id, tasks=[]), mock.Mock(id=('%s2' % fake_id), tasks=[])]
    fake_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=fake_apps))
    fake_config = {'id': fake_id, 'instances': 2}

    # Expected message mirrors the one deploy_service builds from the registry.
    errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
        (fake_bounce, ', '.join(list_bounce_methods()))
    expected = (1, errormsg)

    with contextlib.nested(
        mock.patch('paasta_tools.setup_marathon_job._log', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True),
    ) as (mock_log, mock_load_system_paasta_config):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value='fake_cluster')
        actual = setup_marathon_job.deploy_service(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=fake_id,
            config=fake_config,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method,
            drain_method_params={},
            nerve_ns=fake_instance,
            bounce_health_params={},
            soa_dir='fake_soa_dir',
        )
        assert mock_log.call_count == 1
    assert expected == actual
    fake_client.list_apps.assert_called_once_with(embed_failures=True)
    assert fake_client.create_app.call_count == 0
def test_deploy_service_unknown_bounce(self):
    """Variant of the unknown-bounce test that patches via the top-level
    setup_marathon_job module path."""
    fake_bounce = 'WHEEEEEEEEEEEEEEEE'
    fake_drain_method = 'noop'
    fake_name = 'whoa'
    fake_instance = 'the_earth_is_tiny'
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance)
    fake_apps = [mock.Mock(id=fake_id, tasks=[]), mock.Mock(id=('%s2' % fake_id), tasks=[])]
    fake_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=fake_apps))
    fake_config = {'id': fake_id, 'instances': 2}

    # Expected message mirrors the one deploy_service builds from the registry.
    errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
        (fake_bounce, ', '.join(list_bounce_methods()))
    expected = (1, errormsg)

    with contextlib.nested(
        mock.patch('setup_marathon_job._log', autospec=True),
        mock.patch('setup_marathon_job.load_system_paasta_config', autospec=True),
    ) as (mock_log, mock_load_system_paasta_config):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value='fake_cluster')
        actual = setup_marathon_job.deploy_service(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=fake_id,
            config=fake_config,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method,
            drain_method_params={},
            nerve_ns=fake_instance,
            bounce_health_params={},
            soa_dir='fake_soa_dir',
        )
        assert mock_log.call_count == 1
    assert expected == actual
    fake_client.list_apps.assert_called_once_with(embed_failures=True)
    assert fake_client.create_app.call_count == 0
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: A dictionary of options for the drain method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: The SOA configuration directory to read from
    :returns: A tuple of (status, output) to be used with send_sensu_event"""

    def log_deploy_error(errormsg, level='event'):
        # Fixed: previously this hardcoded level='event', silently ignoring the
        # caller-supplied level (e.g. the 'debug' level used for tracebacks below).
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    # Partition into the app we are deploying vs. everything older.
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = get_old_happy_unhappy_draining_tasks(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params)

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            # Under-scaled: grow the app up to the configured instance count.
            client.scale_app(app_id=new_app.id, instances=config['instances'], force=True)
        elif new_app.instances > config['instances']:
            # Over-scaled: pick surplus tasks to treat as "old" so the bounce
            # drains them, preferring already-draining, then unhappy, then happy.
            num_tasks_to_scale = max(
                min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            protected_draining_tasks.update(
                scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(
                scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(
                scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]
        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    marathon_apps,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param marathon_apps: A list of all marathon app objects
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: A dictionary of options for the drain method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: The SOA configuration directory to read from
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output) to be used with send_sensu_event"""

    def log_deploy_error(errormsg, level='event'):
        # Fixed: previously this hardcoded level='event', silently ignoring the
        # caller-supplied level (e.g. the 'debug' level used for tracebacks below).
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance
        )

    short_id = marathon_tools.format_job_id(service, instance)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, marathon_apps)
    # Partition into the app we are deploying vs. everything older.
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, system_paasta_config,
                                                     **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    (old_app_live_happy_tasks,
     old_app_live_unhappy_tasks,
     old_app_draining_tasks,
     old_app_at_risk_tasks,
     ) = get_tasks_by_state(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
        system_paasta_config,
    )

    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            # Under-scaled (including replacements for at-risk tasks): grow the app.
            log.info("Scaling %s from %d to %d instances." %
                     (new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks))
            client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
                system_paasta_config,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            # Prefer shedding already-draining, then unhappy, then at-risk,
            # and only then happy tasks.
            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[new_app.id] = set(scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get(new_app.id, []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    old_app_at_risk_tasks=old_app_at_risk_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                    bounce_margin_factor=bounce_margin_factor,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
def test_setup_service_srv_does_not_exist(self):
    """setup_service should create a missing app via deploy_service and pass through its result.

    Fix: the deploy_service patch target was 'setup_marathon_job.deploy_service' (no
    package prefix), inconsistent with every other patch target in this file
    ('paasta_tools.setup_marathon_job....'). With the unprefixed path the patch binds to a
    different module object than the one setup_service actually calls when the module is
    imported as paasta_tools.setup_marathon_job, so the real deploy_service would run.
    """
    fake_name = 'if_talk_was_cheap'
    fake_instance = 'psychatrists_would_be_broke'
    fake_response = mock.Mock(
        json=mock.Mock(return_value={'message': 'test'}))
    # get_app raising NotFoundError simulates the app not yet existing in marathon.
    fake_client = mock.MagicMock(get_app=mock.Mock(
        side_effect=marathon.exceptions.NotFoundError(fake_response)))
    full_id = marathon_tools.format_job_id(fake_name, fake_instance, 'oogabooga', 'bananafanafofooga')
    fake_complete = {
        'do': 'you',
        'even': 'dota',
        'id': full_id,
        'docker_image': 'fake_docker_registry/fake_docker_image',
    }
    fake_bounce = 'trampoline'
    fake_drain_method = 'noop'
    fake_drain_method_params = {}
    with contextlib.nested(
        mock.patch(
            'paasta_tools.marathon_tools.create_complete_config',
            return_value=fake_complete,
            autospec=True,
        ),
        mock.patch(
            # was 'setup_marathon_job.deploy_service'; prefixed for consistency with
            # the other patch targets in this module
            'paasta_tools.setup_marathon_job.deploy_service',
            return_value=(111, 'Never'),
            autospec=True,
        ),
        mock.patch.object(
            self.fake_marathon_service_config,
            'get_bounce_method',
            return_value=fake_bounce,
            autospec=True,
        ),
        mock.patch.object(
            self.fake_marathon_service_config,
            'get_drain_method',
            return_value=fake_drain_method,
            autospec=True,
        ),
        mock.patch.object(
            self.fake_marathon_service_config,
            'get_drain_method_params',
            return_value=fake_drain_method_params,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.marathon_tools.load_marathon_service_config',
            return_value=self.fake_marathon_service_config,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.marathon_tools.load_service_namespace_config',
            return_value=self.fake_service_namespace_config,
            autospec=True,
        ),
    ) as (
        create_config_patch,
        deploy_service_patch,
        get_bounce_patch,
        get_drain_method_patch,
        get_drain_method_params_patch,
        read_service_conf_patch,
        read_namespace_conf_patch,
    ):
        status, output = setup_marathon_job.setup_service(
            service=fake_name,
            instance=fake_instance,
            client=fake_client,
            marathon_config=self.fake_marathon_config,
            service_marathon_config=self.fake_marathon_service_config,
            soa_dir=None,
        )
        # setup_service should simply forward deploy_service's (status, output).
        assert status == 111
        assert output == 'Never'
        create_config_patch.assert_called_once_with(
            fake_name,
            fake_instance,
            self.fake_marathon_config
        )
        get_bounce_patch.assert_called_once_with()
        get_drain_method_patch.assert_called_once_with(read_namespace_conf_patch.return_value)
        deploy_service_patch.assert_called_once_with(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=full_id,
            config=fake_complete,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method,
            drain_method_params=fake_drain_method_params,
            nerve_ns=self.fake_marathon_service_config.get_nerve_namespace(),
            bounce_health_params=self.fake_marathon_service_config.get_bounce_health_params(
                read_namespace_conf_patch.return_value),
            soa_dir=None,
        )
def test_deploy_service_already_bouncing(self):
    """If the zookeeper bounce lock is already held, deploy_service should bail out
    with status 1 and a human-readable message naming the short job id."""
    bounce_strategy = 'areallygoodbouncestrategy'
    drain_name = 'noop'
    service = 'how_many_strings'
    instance = 'will_i_need_to_think_of'
    job_id = marathon_tools.format_job_id(service, instance, 'gityourmom', 'configyourdad')
    marathon_config = {'id': job_id, 'instances': 2}

    # One pre-existing app with a single live task.
    existing_app_id = '%s2' % job_id
    existing_task = mock.Mock(id="old_task_id", app_id=existing_app_id)
    existing_app = mock.Mock(id=existing_app_id, tasks=[existing_task])
    marathon_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=[existing_app]),
        kill_task=mock.Mock(spec=lambda app_id, id, scale=False: None),
    )

    bounce_func = mock.create_autospec(
        bounce_lib.brutal_bounce,
        return_value={
            "create_app": True,
            "tasks_to_drain": [existing_task],
        },
    )
    short_id = marathon_tools.format_job_id(service, instance)

    patch_stack = contextlib.nested(
        mock.patch(
            'paasta_tools.bounce_lib.get_bounce_method_func',
            return_value=bounce_func,
            autospec=True,
        ),
        # Raising LockHeldException simulates a bounce already in progress.
        mock.patch(
            'paasta_tools.bounce_lib.bounce_lock_zookeeper',
            side_effect=bounce_lib.LockHeldException,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.bounce_lib.get_happy_tasks',
            autospec=True,
            side_effect=lambda x, _, __, **kwargs: x,
        ),
        mock.patch('paasta_tools.setup_marathon_job._log', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True),
    )
    with patch_stack as (_, _, _, _, mock_load_system_paasta_config):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value='fake_cluster')
        result = setup_marathon_job.deploy_service(
            service=service,
            instance=instance,
            marathon_jobid=job_id,
            config=marathon_config,
            client=marathon_client,
            bounce_method=bounce_strategy,
            drain_method_name=drain_name,
            drain_method_params={},
            nerve_ns=instance,
            bounce_health_params={},
            soa_dir='fake_soa_dir',
        )
        assert result == (1, "Instance %s is already being bounced." % short_id)
def test_deploy_service_known_bounce(self):
    """End-to-end happy path for deploy_service with a recognized bounce method.

    Sets up one old app with three tasks in distinct states (to-drain, already-draining,
    keep-alive) and verifies that deploy_service drains/kills exactly the right tasks,
    creates the new app, and logs the expected number of events.
    """
    fake_bounce = 'areallygoodbouncestrategy'
    fake_drain_method_name = 'noop'
    fake_name = 'how_many_strings'
    fake_instance = 'will_i_need_to_think_of'
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance, 'git11111111', 'config11111111')
    fake_config = {'id': fake_id, 'instances': 2}

    # Old app (different git/config SHAs) whose tasks are in three different states.
    old_app_id = marathon_tools.format_job_id(fake_name, fake_instance, 'git22222222', 'config22222222')
    old_task_to_drain = mock.Mock(id="old_task_to_drain", app_id=old_app_id)
    old_task_is_draining = mock.Mock(id="old_task_is_draining", app_id=old_app_id)
    old_task_dont_drain = mock.Mock(id="old_task_dont_drain", app_id=old_app_id)
    # Marathon app ids come back with a leading slash.
    old_app = mock.Mock(id="/%s" % old_app_id, tasks=[old_task_to_drain, old_task_is_draining, old_task_dont_drain])
    fake_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=[old_app]),
        kill_task=mock.Mock(spec=lambda app_id, id, scale=False: None),
    )
    # The bounce strategy asks for the new app to be created and one task drained.
    fake_bounce_func = mock.create_autospec(
        bounce_lib.brutal_bounce,
        return_value={
            "create_app": True,
            "tasks_to_drain": [old_task_to_drain],
        }
    )
    # Only old_task_is_draining reports as draining; everything is safe to kill.
    fake_drain_method = mock.Mock(is_draining=lambda t: t is old_task_is_draining, is_safe_to_kill=lambda t: True)
    with contextlib.nested(
        mock.patch(
            'paasta_tools.bounce_lib.get_bounce_method_func',
            return_value=fake_bounce_func,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.bounce_lib.bounce_lock_zookeeper',
            autospec=True
        ),
        mock.patch(
            'paasta_tools.bounce_lib.get_happy_tasks',
            autospec=True,
            side_effect=lambda x, _, __, **kwargs: x,
        ),
        mock.patch('paasta_tools.bounce_lib.kill_old_ids', autospec=True),
        mock.patch('paasta_tools.bounce_lib.create_marathon_app', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job._log', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True),
        mock.patch('paasta_tools.drain_lib.get_drain_method', return_value=fake_drain_method),
    ) as (_, _, _, kill_old_ids_patch, create_marathon_app_patch, mock_log, mock_load_system_paasta_config, _):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value='fake_cluster')
        result = setup_marathon_job.deploy_service(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=fake_id,
            config=fake_config,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method_name,
            drain_method_params={},
            nerve_ns=fake_instance,
            bounce_health_params={},
            soa_dir='fake_soa_dir',
        )
        assert result[0] == 0, "Expected successful result; got (%d, %s)" % result
        fake_client.list_apps.assert_called_once_with(embed_failures=True)
        # The app creation goes through the patched bounce_lib helper, not the client.
        assert fake_client.create_app.call_count == 0
        # The already-draining task must be excluded from old_app_live_tasks.
        fake_bounce_func.assert_called_once_with(
            new_config=fake_config,
            new_app_running=False,
            happy_new_tasks=[],
            old_app_live_tasks={old_app.id: set([old_task_to_drain, old_task_dont_drain])},
        )
        # Both the newly-drained task and the re-drained one get drain() called.
        assert fake_drain_method.drain.call_count == 2
        fake_drain_method.drain.assert_any_call(old_task_is_draining)
        fake_drain_method.drain.assert_any_call(old_task_to_drain)
        # Draining tasks deemed safe to kill are killed with scale=True.
        assert fake_client.kill_task.call_count == 2
        fake_client.kill_task.assert_any_call(old_app_id, old_task_is_draining.id, scale=True)
        fake_client.kill_task.assert_any_call(old_app_id, old_task_to_drain.id, scale=True)
        create_marathon_app_patch.assert_called_once_with(fake_config['id'], fake_config, fake_client)
        assert kill_old_ids_patch.call_count == 0
        # We should call _log 5 times:
        # 1. bounce starts
        # 2. create new app
        # 3. draining old tasks
        # 4. remove old apps
        # 5. bounce finishes
        assert mock_log.call_count == 5
def test_setup_service_srv_does_not_exist(self):
    """When marathon reports the app missing, setup_service must create it via
    deploy_service and forward deploy_service's (status, output) unchanged."""
    service_name = 'if_talk_was_cheap'
    service_instance = 'psychatrists_would_be_broke'
    not_found_response = mock.Mock(
        json=mock.Mock(return_value={'message': 'test'}))
    # get_app raising NotFoundError simulates the app not existing yet.
    marathon_client = mock.MagicMock(get_app=mock.Mock(
        side_effect=marathon.exceptions.NotFoundError(not_found_response)))
    job_id = marathon_tools.format_job_id(service_name, service_instance, 'oogabooga', 'bananafanafofooga')
    complete_config = {
        'do': 'you',
        'even': 'dota',
        'id': job_id,
        'docker_image': 'fake_docker_registry/fake_docker_image',
    }
    bounce_strategy = 'trampoline'
    drain_name = 'noop'
    drain_params = {}
    with contextlib.nested(
        mock.patch(
            'paasta_tools.marathon_tools.create_complete_config',
            return_value=complete_config,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.setup_marathon_job.deploy_service',
            return_value=(111, 'Never'),
            autospec=True,
        ),
        mock.patch.object(
            self.fake_marathon_service_config,
            'get_bounce_method',
            return_value=bounce_strategy,
            autospec=True,
        ),
        mock.patch.object(
            self.fake_marathon_service_config,
            'get_drain_method',
            return_value=drain_name,
            autospec=True,
        ),
        mock.patch.object(
            self.fake_marathon_service_config,
            'get_drain_method_params',
            return_value=drain_params,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.marathon_tools.load_marathon_service_config',
            return_value=self.fake_marathon_service_config,
            autospec=True,
        ),
        mock.patch(
            'paasta_tools.marathon_tools.load_service_namespace_config',
            return_value=self.fake_service_namespace_config,
            autospec=True,
        ),
    ) as (
        create_config_patch,
        deploy_service_patch,
        get_bounce_patch,
        get_drain_method_patch,
        get_drain_method_params_patch,
        read_service_conf_patch,
        read_namespace_conf_patch,
    ):
        status, output = setup_marathon_job.setup_service(
            service=service_name,
            instance=service_instance,
            client=marathon_client,
            marathon_config=self.fake_marathon_config,
            service_marathon_config=self.fake_marathon_service_config,
            soa_dir=None,
        )
        # deploy_service's result is passed straight through.
        assert status == 111
        assert output == 'Never'
        create_config_patch.assert_called_once_with(
            service_name,
            service_instance,
            self.fake_marathon_config
        )
        get_bounce_patch.assert_called_once_with()
        get_drain_method_patch.assert_called_once_with(read_namespace_conf_patch.return_value)
        deploy_service_patch.assert_called_once_with(
            service=service_name,
            instance=service_instance,
            marathon_jobid=job_id,
            config=complete_config,
            client=marathon_client,
            bounce_method=bounce_strategy,
            drain_method_name=drain_name,
            drain_method_params=drain_params,
            nerve_ns=self.fake_marathon_service_config.get_nerve_namespace(),
            bounce_health_params=self.fake_marathon_service_config.get_bounce_health_params(
                read_namespace_conf_patch.return_value),
            soa_dir=None,
        )
def test_setup_service_srv_does_not_exist(self):
    """When the app does not exist in marathon, setup_service should call deploy_service
    with the complete config and forward its (status, output) result unchanged."""
    fake_name = "if_talk_was_cheap"
    fake_instance = "psychatrists_would_be_broke"
    fake_response = mock.Mock(json=mock.Mock(return_value={"message": "test"}))
    # get_app raising NotFoundError simulates the app not yet existing in marathon.
    fake_client = mock.MagicMock(get_app=mock.Mock(side_effect=marathon.exceptions.NotFoundError(fake_response)))
    full_id = marathon_tools.format_job_id(fake_name, fake_instance, "oogabooga", "bananafanafofooga")
    fake_complete = {
        "do": "you",
        "even": "dota",
        "id": full_id,
        "docker_image": "fake_docker_registry/fake_docker_image",
    }
    fake_bounce = "trampoline"
    fake_drain_method = "noop"
    fake_drain_method_params = {}
    with contextlib.nested(
        mock.patch("paasta_tools.marathon_tools.create_complete_config", return_value=fake_complete, autospec=True),
        mock.patch("paasta_tools.setup_marathon_job.deploy_service", return_value=(111, "Never"), autospec=True),
        mock.patch.object(
            self.fake_marathon_service_config, "get_bounce_method", return_value=fake_bounce, autospec=True
        ),
        mock.patch.object(
            self.fake_marathon_service_config, "get_drain_method", return_value=fake_drain_method, autospec=True
        ),
        mock.patch.object(
            self.fake_marathon_service_config,
            "get_drain_method_params",
            return_value=fake_drain_method_params,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.marathon_tools.load_marathon_service_config",
            return_value=self.fake_marathon_service_config,
            autospec=True,
        ),
        mock.patch(
            "paasta_tools.marathon_tools.load_service_namespace_config",
            return_value=self.fake_service_namespace_config,
            autospec=True,
        ),
    ) as (
        create_config_patch,
        deploy_service_patch,
        get_bounce_patch,
        get_drain_method_patch,
        get_drain_method_params_patch,
        read_service_conf_patch,
        read_namespace_conf_patch,
    ):
        status, output = setup_marathon_job.setup_service(
            service=fake_name,
            instance=fake_instance,
            client=fake_client,
            marathon_config=self.fake_marathon_config,
            service_marathon_config=self.fake_marathon_service_config,
            soa_dir=None,
        )
        # deploy_service's result is passed straight through.
        assert status == 111
        assert output == "Never"
        create_config_patch.assert_called_once_with(fake_name, fake_instance, self.fake_marathon_config)
        get_bounce_patch.assert_called_once_with()
        get_drain_method_patch.assert_called_once_with(read_namespace_conf_patch.return_value)
        deploy_service_patch.assert_called_once_with(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=full_id,
            config=fake_complete,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method,
            drain_method_params=fake_drain_method_params,
            nerve_ns=self.fake_marathon_service_config.get_nerve_namespace(),
            bounce_health_params=self.fake_marathon_service_config.get_bounce_health_params(
                read_namespace_conf_patch.return_value
            ),
            soa_dir=None,
        )
def test_deploy_service_known_bounce(self):
    """End-to-end happy path for deploy_service with a recognized bounce method.

    One old app carries three tasks in distinct states (to-drain, already-draining,
    keep-alive); deploy_service should drain/kill exactly the right tasks, create the
    new app through bounce_lib, and emit the expected number of log events.
    """
    fake_bounce = "areallygoodbouncestrategy"
    fake_drain_method_name = "noop"
    fake_name = "how_many_strings"
    fake_instance = "will_i_need_to_think_of"
    fake_id = marathon_tools.format_job_id(fake_name, fake_instance, "git11111111", "config11111111")
    fake_config = {"id": fake_id, "instances": 2}
    # Old app (different git/config SHAs) whose tasks are in three different states.
    old_app_id = marathon_tools.format_job_id(fake_name, fake_instance, "git22222222", "config22222222")
    old_task_to_drain = mock.Mock(id="old_task_to_drain", app_id=old_app_id)
    old_task_is_draining = mock.Mock(id="old_task_is_draining", app_id=old_app_id)
    old_task_dont_drain = mock.Mock(id="old_task_dont_drain", app_id=old_app_id)
    # Marathon app ids come back with a leading slash.
    old_app = mock.Mock(id="/%s" % old_app_id, tasks=[old_task_to_drain, old_task_is_draining, old_task_dont_drain])
    fake_client = mock.MagicMock(
        list_apps=mock.Mock(return_value=[old_app]),
        kill_task=mock.Mock(spec=lambda app_id, id, scale=False: None)
    )
    # The bounce strategy asks for the new app to be created and one task drained.
    fake_bounce_func = mock.create_autospec(
        bounce_lib.brutal_bounce,
        return_value={"create_app": True, "tasks_to_drain": [old_task_to_drain]}
    )
    # Only old_task_is_draining reports as draining; everything is safe to kill.
    fake_drain_method = mock.Mock(is_draining=lambda t: t is old_task_is_draining, is_safe_to_kill=lambda t: True)
    with contextlib.nested(
        mock.patch("paasta_tools.bounce_lib.get_bounce_method_func", return_value=fake_bounce_func, autospec=True),
        mock.patch("paasta_tools.bounce_lib.bounce_lock_zookeeper", autospec=True),
        mock.patch(
            "paasta_tools.bounce_lib.get_happy_tasks", autospec=True, side_effect=lambda x, _, __, **kwargs: x
        ),
        mock.patch("paasta_tools.bounce_lib.kill_old_ids", autospec=True),
        mock.patch("paasta_tools.bounce_lib.create_marathon_app", autospec=True),
        mock.patch("paasta_tools.setup_marathon_job._log", autospec=True),
        mock.patch("paasta_tools.setup_marathon_job.load_system_paasta_config", autospec=True),
        mock.patch("paasta_tools.drain_lib.get_drain_method", return_value=fake_drain_method),
    ) as (_, _, _, kill_old_ids_patch, create_marathon_app_patch, mock_log, mock_load_system_paasta_config, _):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value="fake_cluster")
        result = setup_marathon_job.deploy_service(
            service=fake_name,
            instance=fake_instance,
            marathon_jobid=fake_id,
            config=fake_config,
            client=fake_client,
            bounce_method=fake_bounce,
            drain_method_name=fake_drain_method_name,
            drain_method_params={},
            nerve_ns=fake_instance,
            bounce_health_params={},
            soa_dir="fake_soa_dir",
        )
        assert result[0] == 0, "Expected successful result; got (%d, %s)" % result
        fake_client.list_apps.assert_called_once_with(embed_failures=True)
        # App creation goes through the patched bounce_lib helper, not the client.
        assert fake_client.create_app.call_count == 0
        # The already-draining task must be excluded from old_app_live_tasks.
        fake_bounce_func.assert_called_once_with(
            new_config=fake_config,
            new_app_running=False,
            happy_new_tasks=[],
            old_app_live_tasks={old_app.id: set([old_task_to_drain, old_task_dont_drain])},
        )
        # Both the newly-drained task and the re-drained one get drain() called.
        assert fake_drain_method.drain.call_count == 2
        fake_drain_method.drain.assert_any_call(old_task_is_draining)
        fake_drain_method.drain.assert_any_call(old_task_to_drain)
        # Draining tasks deemed safe to kill are killed with scale=True.
        assert fake_client.kill_task.call_count == 2
        fake_client.kill_task.assert_any_call(app_id=old_app_id, task_id=old_task_is_draining.id, scale=True)
        fake_client.kill_task.assert_any_call(app_id=old_app_id, task_id=old_task_to_drain.id, scale=True)
        create_marathon_app_patch.assert_called_once_with(fake_config["id"], fake_config, fake_client)
        assert kill_old_ids_patch.call_count == 0
        # We should call _log 5 times:
        # 1. bounce starts
        # 2. create new app
        # 3. draining old tasks
        # 4. remove old apps
        # 5. bounce finishes
        assert mock_log.call_count == 5
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param drain_method_params: A dictionary of options for the drain method.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param soa_dir: The SOA configuration directory to read from
    :returns: A tuple of (status, output) to be used with send_sensu_event"""

    def log_deploy_error(errormsg, level='event'):
        # BUGFIX: previously this hardcoded level='event', silently ignoring the
        # caller's level argument (e.g. the level='debug' traceback logging below).
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    # Marathon returns app ids with a leading slash; split the matches into the
    # app we are deploying and everything else (old apps to be bounced away).
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = get_old_happy_unhappy_draining_tasks(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params
    )

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            client.scale_app(app_id=new_app.id, instances=config['instances'], force=True)
        elif new_app.instances > config['instances']:
            # The new app is over-scaled; pick tasks to hand over to the bounce for
            # removal, preferring (in order) already-draining, unhappy, then happy tasks.
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            # Remember which draining tasks we are keeping drained on purpose so the
            # undrain pass below does not revive them.
            protected_draining_tasks.update(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]
        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            # The zookeeper lock ensures only one bounce per service.instance at a time.
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        # Log the traceback line by line at debug level, then re-raise.
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
def filter_autoscaling_tasks(
    marathon_apps: Sequence[MarathonApp],
    all_mesos_tasks: Sequence[Task],
    config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
) -> Tuple[Mapping[str, MarathonTask], Sequence[Task]]:
    """Find the tasks that are serving traffic. We care about this because many tasks have a period of high CPU when
    they first start up, during which they warm up code, load and process data, etc., and we don't want this high load
    to drag our overall load estimate upwards. Allowing these tasks to count towards overall load could cause a cycle of
    scaling up, seeing high load due to new warming-up containers, scaling up, until we hit max_instances.

    However, accidentally omitting a task that actually is serving traffic will cause us to underestimate load; this is
    generally much worse than overestimating, since it can cause us to incorrectly scale down or refuse to scale up when
    necessary. For this reason, we look at several sources of health information, and if they disagree, assume the task
    is serving traffic.

    :returns: a tuple of (healthy marathon tasks keyed by task id, the mesos tasks
        whose ids match one of those marathon tasks)
    :raises MetricsProviderNoDataError: if no healthy marathon task is found at all
    """
    job_id_prefix = "{}{}".format(
        format_job_id(service=config.service, instance=config.instance),
        MESOS_TASK_SPACER,
    )

    # Get a dict of healthy tasks, we assume tasks with no healthcheck defined are healthy.
    # We assume tasks with no healthcheck results but a defined healthcheck to be unhealthy, unless they are "old" in
    # which case we assume that Marathon has screwed up and stopped healthchecking but that they are healthy.

    log.info("Inspecting %s for autoscaling" % job_id_prefix)
    # Only apps whose id (minus the leading "/") belongs to this service.instance.
    relevant_tasks_by_app: Dict[MarathonApp, List[MarathonTask]] = {
        app: app.tasks
        for app in marathon_apps
        if app.id.lstrip("/").startswith(job_id_prefix)
    }
    healthy_marathon_tasks: Dict[str, MarathonTask] = {}

    # First health source: Marathon's own healthcheck results (with the two
    # assumptions described in the comment above).
    for app, tasks in relevant_tasks_by_app.items():
        for task in tasks:
            if (is_task_healthy(task) or not app.health_checks
                    or is_old_task_missing_healthchecks(task, app)):
                healthy_marathon_tasks[task.id] = task

    # Second health source: smartstack/haproxy, if this namespace is registered there.
    # Any task haproxy considers up is merged in, even if Marathon disagreed.
    service_namespace_config = load_service_namespace_config(
        service=config.service, namespace=config.get_nerve_namespace())
    if service_namespace_config.is_in_smartstack():
        for task in filter_tasks_in_smartstack(
            tasks=[
                task
                for tasks in relevant_tasks_by_app.values()
                for task in tasks
            ],
            service=config.service,
            nerve_ns=config.get_nerve_namespace(),
            system_paasta_config=system_paasta_config,
            max_hosts_to_query=20,
            haproxy_min_fraction_up=0.01,  # Be very liberal. See docstring above for rationale.
        ):
            healthy_marathon_tasks[task.id] = task

    if not healthy_marathon_tasks:
        raise MetricsProviderNoDataError(
            "Couldn't find any healthy marathon tasks")
    # Keep only the mesos tasks that correspond to a healthy marathon task.
    mesos_tasks = [
        task for task in all_mesos_tasks if task["id"] in healthy_marathon_tasks
    ]
    return (healthy_marathon_tasks, mesos_tasks)