def restart_job(self, cluster, role, environment, jobname, jobspec=None, instances=None):
    """Restart an Aurora job through the Aurora client API.

    Args:
        cluster, role, environment, jobname: components used to build the
            ``AuroraJobKey`` of the job to restart.
        jobspec: passed to ``self.make_job_config`` to build the job config.
        instances: instances to restart, normalised through
            ``self.pack_instance_list``. ``None`` (the default) is treated
            as an empty list.

    Returns:
        A ``(job_path, errors)`` tuple. ``errors`` is ``None`` on success and
        a list of message strings when the configuration could not be built
        or the restart call did not return ``ResponseCode.OK``.
    """
    # A fresh list per call: a literal ``[]`` default would be one object
    # shared by every invocation of this method.
    if instances is None:
        instances = []

    job_key = AuroraJobKey(cluster, role, environment, jobname)
    logger.info("request to restart => %s", job_key.to_path())

    instances = self.pack_instance_list(instances)
    try:
        config = self.make_job_config(job_key, jobspec)
    except Exception as e:
        # Deliberately broad: any configuration failure is reported to the
        # caller as an error tuple instead of propagating.
        return (job_key.to_path(), ["Failed to restart Aurora job",
                                    "Can not create job configuration object because",
                                    str(e)])

    # these are the default values from apache.aurora.client.commands.core.restart()
    updater_config = UpdaterConfig(
        1,   # options.batch_size
        60,  # options.restart_threshold
        30,  # options.watch_secs
        0,   # options.max_per_shard_failures
        0    # options.max_total_failures
    )

    api = make_client(job_key.cluster)
    # instances = all shards, health check = 3 sec
    resp = api.restart(job_key, instances, updater_config, 3, config=config)
    if resp.responseCode != ResponseCode.OK:
        logger.warning("aurora -- restart job failed")
        responseStr = self.response_string(resp)
        logger.warning(responseStr)
        return (job_key.to_path(), ["Error reported by aurora client:", responseStr])

    logger.info("aurora -- restart job successful")
    return (job_key.to_path(), None)
def get_status_query_response(cls):
    """Return an OK ``Response`` carrying three canned job update summaries."""
    # (update id, job key parts, status, created ms, last modified ms)
    canned = [
        ("hello", ('west', 'mcc', 'test', 'hello'),
         JobUpdateStatus.ROLLING_FORWARD, 1411404927, 14114056030),
        ("goodbye", ('west', 'mch', 'prod', 'goodbye'),
         JobUpdateStatus.ROLLING_BACK, 1411300632, 14114092632),
        ("gasp", ('west', 'mcq', 'devel', 'gasp'),
         JobUpdateStatus.ROLL_FORWARD_PAUSED, 1411600891, 1411800891),
    ]

    update_summaries = []
    for update_id, key_parts, status, created_ms, modified_ms in canned:
        update_summaries.append(JobUpdateSummary(
            updateId=update_id,
            jobKey=AuroraJobKey(*key_parts),
            user="******",
            state=JobUpdateState(
                status=status,
                createdTimestampMs=created_ms,
                lastModifiedTimestampMs=modified_ms)))

    query_response = Response()
    query_response.responseCode = ResponseCode.OK
    query_response.result = Result()
    summaries_result = GetJobUpdateSummariesResult()
    summaries_result.updateSummaries = update_summaries
    query_response.result.getJobUpdateSummariesResult = summaries_result
    return query_response
def execute(self, context):
    """Query job updates and print one entry per returned update summary.

    Output is a JSON array when ``context.options.write_json`` is set, and
    two text lines per update otherwise. Returns ``EXIT_OK``.
    """
    options = context.options
    cluster = options.cluster
    api = context.get_api(cluster)
    response = api.query_job_updates(
        role=options.role,
        job_key=options.jobspec,
        user=options.user,
        update_statuses=options.status)
    context.log_response_and_raise(response)

    summaries = response.result.getJobUpdateSummariesResult.updateSummaries
    path_of = lambda s: AuroraJobKey.from_thrift(cluster, s.jobKey).to_path()
    status_of = lambda s: JobUpdateStatus._VALUES_TO_NAMES[s.state.status]

    if options.write_json:
        entries = []
        for update in summaries:
            entries.append({
                "jobkey": path_of(update),
                "id": update.updateId,
                "user": update.user,
                "started": update.state.createdTimestampMs,
                "lastModified": update.state.lastModifiedTimestampMs,
                "status": status_of(update)
            })
        context.print_out(
            json.dumps(entries, indent=2, separators=[',', ': '], sort_keys=False))
        return EXIT_OK

    for update in summaries:
        state = update.state
        headline = "Job: %s, Id: %s, User: %s, Status: %s" % (
            path_of(update), update.updateId, update.user, status_of(update))
        context.print_out(headline)
        timestamps = "Created: %s, Last Modified %s" % (
            state.createdTimestampMs, state.lastModifiedTimestampMs)
        context.print_out(timestamps, indent=2)
    return EXIT_OK
def execute(self, context):
    """List job updates matching the filter in ``context.options.filter``.

    A full job key is built only when role, env and job are all given;
    otherwise the query is made by role and, if an env was requested, the
    summaries are narrowed by environment locally. Returns ``EXIT_OK``.
    """
    update_filter = context.options.filter
    cluster = update_filter.cluster

    job_key = None
    missing_part = (update_filter.role is None
                    or update_filter.env is None
                    or update_filter.job is None)
    if not missing_part:
        job_key = AuroraJobKey(
            cluster=cluster,
            role=update_filter.role,
            env=update_filter.env,
            name=update_filter.job)

    api = context.get_api(cluster)

    # Expand every requested status group into its member statuses.
    filter_statuses = set()
    for group_name in context.options.status:
        filter_statuses.update(self.STATUS_GROUPS[group_name])

    response = api.query_job_updates(
        role=None if job_key is not None else update_filter.role,
        job_key=job_key,
        update_statuses=filter_statuses or None,
        user=context.options.user)
    context.log_response_and_raise(response)

    # The API does not offer a way to query by environment, so if that filter is requested, we
    # perform a more broad role-based query and filter here.
    summaries = response.result.getJobUpdateSummariesResult.updateSummaries
    if job_key is None and update_filter.env is not None:
        wanted_env = update_filter.env
        summaries = [s for s in summaries if s.key.job.environment == wanted_env]

    path_of = lambda s: AuroraJobKey.from_thrift(cluster, s.key.job).to_path()
    status_of = lambda s: JobUpdateStatus._VALUES_TO_NAMES[s.state.status]

    if context.options.write_json:
        entries = [
            {
                "job": path_of(s),
                "id": s.key.id,
                "user": s.user,
                "started": format_timestamp(s.state.createdTimestampMs),
                "last_modified": format_timestamp(s.state.lastModifiedTimestampMs),
                "status": status_of(s)
            }
            for s in summaries
        ]
        context.print_out(
            json.dumps(entries, indent=2, separators=[',', ': '], sort_keys=False))
        return EXIT_OK

    # The header is only printed when there is at least one row to show.
    if summaries:
        context.print_out(self.HEADER)
    for s in summaries:
        row = self.FORMAT_STR.format(
            path_of(s),
            s.key.id,
            status_of(s),
            s.user,
            format_timestamp(s.state.createdTimestampMs),
            format_timestamp(s.state.lastModifiedTimestampMs))
        context.print_out(row)
    return EXIT_OK
def test_query_job_updates(self):
    """Check that query_job_updates sends the expected JobUpdateQuery to the proxy."""
    api, mock_proxy = self.mock_api()
    job_key = AuroraJobKey("foo", "role", "env", "name")
    rolling_forward = {JobUpdateStatus.ROLLING_FORWARD}
    expected_query = JobUpdateQuery(
        jobKey=job_key.to_thrift(),
        updateStatuses=rolling_forward)

    api.query_job_updates(
        job_key=job_key,
        update_statuses=expected_query.updateStatuses)

    mock_proxy.getJobUpdateSummaries.assert_called_once_with(expected_query)
def setUp(self):
    """Create the mock scheduler, the ``Sla`` built on it, and the job key fixtures."""
    scheduler = Mock()
    self._scheduler = scheduler
    self._sla = Sla(scheduler)

    cluster = Cluster(name='cl')
    role, env, name = 'mesos', 'test', 'job'
    self._cluster = cluster
    self._role = role
    self._name = name
    self._env = env
    self._job_key = AuroraJobKey(cluster.name, role, env, name)
    self._min_count = 1
def test_query_job_updates(self):
    """Check that the proxy receives a query for the job key and ROLLING_FORWARD status."""
    api, mock_proxy = self.mock_api()
    job_key = AuroraJobKey("foo", "role", "env", "name")

    thrift_key = job_key.to_thrift()
    query = JobUpdateQuery(
        jobKey=thrift_key,
        updateStatuses={JobUpdateStatus.ROLLING_FORWARD})
    requested_statuses = query.updateStatuses

    api.query_job_updates(job_key=job_key, update_statuses=requested_statuses)

    mock_proxy.getJobUpdateSummaries.assert_called_once_with(query)
def test_add_instances(self):
    """Check that add_instances calls the proxy with an InstanceKey and the count."""
    api, mock_proxy = self.mock_api()
    job_key = AuroraJobKey("foo", "role", "env", "name")
    success = self.create_simple_success_response()
    mock_proxy.addInstances.return_value = success

    api.add_instances(job_key, 1, 10)

    expected_key = InstanceKey(jobKey=job_key.to_thrift(), instanceId=1)
    mock_proxy.addInstances.assert_called_once_with(expected_key, 10)
def test_add_instances(self):
    """Adding instances forwards the instance key (instance 1) and a count of 10."""
    api, mock_proxy = self.mock_api()
    job_key = AuroraJobKey("foo", "role", "env", "name")
    mock_proxy.addInstances.return_value = self.create_simple_success_response()

    instance_id, count = 1, 10
    api.add_instances(job_key, instance_id, count)

    mock_proxy.addInstances.assert_called_once_with(
        InstanceKey(jobKey=job_key.to_thrift(), instanceId=instance_id),
        count)
def assert_kill_calls(cls, api, instance_range=None, instances=None):
    """Assert the exact sequence of ``api.kill_job`` calls.

    With a truthy ``instances`` a single call carrying that list is expected;
    otherwise one call per element of ``instance_range`` is expected, each
    carrying a one-element list.
    """
    def expected_call(batch):
        return call(AuroraJobKey.from_path(cls.TEST_JOBSPEC), batch)

    if not instances:
        wanted = [expected_call([index]) for index in instance_range]
    else:
        wanted = [expected_call(instances)]
    assert api.kill_job.mock_calls == wanted
def test_inequality(self):
    """A key differing from the base in any single field must compare unequal."""
    base = AuroraJobKey('cluster', 'role', 'env', 'name')
    variants = (
        ('XXXXXXX', 'role', 'env', 'name'),
        ('cluster', 'XXXX', 'env', 'name'),
        ('cluster', 'role', 'XXX', 'name'),
        ('cluster', 'role', 'env', 'XXXX'),
    )
    others = [AuroraJobKey(*fields) for fields in variants]
    for other in others:
        # Exercise both the != and the == operators.
        assert base != other
        assert not (base == other)
def instance_specifier(spec_str):
    """Parse ``CLUSTER/ROLE/ENV/JOB[/INSTANCES]`` into a ``TaskInstanceKey``.

    A four-part spec selects ``ALL_INSTANCES``; a five-part spec has its last
    part parsed by ``parse_instances``.

    Raises:
        ValueError: if ``spec_str`` is ``None`` or the empty string.
        ArgumentTypeError: if the spec has neither four nor five parts.
    """
    if spec_str is None or spec_str == '':
        raise ValueError('Instance specifier must be non-empty')

    parts = spec_str.split('/')
    part_count = len(parts)
    if part_count != 4 and part_count != 5:
        raise ArgumentTypeError('Instance specifier must be a CLUSTER/ROLE/ENV/JOB/INSTANCES tuple')

    cluster, role, env, name = parts[:4]
    jobkey = AuroraJobKey(cluster, role, env, name)
    if part_count == 4:
        return TaskInstanceKey(jobkey, ALL_INSTANCES)
    return TaskInstanceKey(jobkey, parse_instances(parts[4]))
def test_task_query(self):
    """task_query keeps every job key and uses LIVE_STATES for the statuses."""
    cluster_name = self._cluster.name
    jobs = {
        AuroraJobKey(cluster_name, self._role, self._env, job_name)
        for job_name in ('j1', 'j2', 'j3', 'j4')
    }

    query = task_query(job_keys=jobs)

    expected_count, actual_count = len(jobs), len(query.jobKeys)
    assert expected_count == actual_count, (
        'Expected length:%s, Actual:%s' % (len(jobs), len(query.jobKeys)))
    assert LIVE_STATES == query.statuses, (
        'Expected:%s, Actual:%s' % (LIVE_STATES, query.statuses))
def test_disambiguate_args_or_die_unambiguous_with_no_config(self):
    """A full job path with no config yields (api, job key, None)."""
    def factory(*_):
        return self._api

    job_key = AuroraJobKey(self.CLUSTER.name, self.ROLE, self.ENV, self.NAME)
    expected = (self._api, job_key, None)

    result = LiveJobDisambiguator.disambiguate_args_or_die(
        [self.JOB_PATH],
        None,
        client_factory=factory)

    assert result == expected
def test_killall_job(self):
    """Run ``job killall --no-batching`` and check a single kill_job call is made."""
    job_path = 'west/bozo/test/hello'
    mock_context = FakeAuroraCommandContext()
    mock_scheduler_proxy = Mock()
    patches = (
        patch('time.sleep'),
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context),
        patch('apache.aurora.client.factory.CLUSTERS', new=self.TEST_CLUSTERS),
    )
    with contextlib.nested(*patches):
        api = mock_context.get_api('west')
        mock_scheduler_proxy.getTasksStatus.return_value = self.create_status_call_result()
        api.kill_job.return_value = self.get_kill_job_response()
        mock_scheduler_proxy.killTasks.return_value = self.get_kill_job_response()
        killed_task = self.create_mock_task(ScheduleStatus.KILLED)
        mock_context.add_expected_status_query_result(
            self.create_status_call_result(killed_task))

        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            cmd = AuroraCommandLine()
            argv = ['job', 'killall', '--no-batching', '--config=%s' % fp.name, job_path]
            cmd.execute(argv)

        # Now check that the right API calls got made.
        assert api.kill_job.call_count == 1
        api.kill_job.assert_called_with(AuroraJobKey.from_path(job_path), None)
        self.assert_scheduler_called(api, self.get_expected_task_query(), 2)
def test_start_update_command_line_succeeds(self):
    """``beta-update start`` succeeds, prints the update URL and calls the API once."""
    mock_context = FakeAuroraCommandContext()
    resp = self.create_simple_success_response()
    resp.result = Result(startJobUpdateResult=StartJobUpdateResult(updateId="id"))
    patches = (
        patch('apache.aurora.client.cli.update.Update.create_context',
              return_value=mock_context),
        patch('apache.aurora.client.factory.CLUSTERS', new=self.TEST_CLUSTERS),
    )
    with contextlib.nested(*patches):
        mock_api = mock_context.get_api(self.TEST_CLUSTER)
        mock_api.start_job_update.return_value = resp

        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            cmd = AuroraCommandLine()
            argv = ['beta-update', 'start', self.TEST_JOBSPEC, fp.name]
            result = cmd.execute(argv)
            assert result == EXIT_OK

        update_page = mock_context.get_update_page(
            mock_api, AuroraJobKey.from_path(self.TEST_JOBSPEC), "id")
        update_url_msg = StartUpdate.UPDATE_MSG_TEMPLATE % (update_page)

        assert mock_api.start_job_update.call_count == 1
        args, kwargs = mock_api.start_job_update.call_args
        assert isinstance(args[0], AuroraConfig)
        assert args[1] is None
        assert mock_context.get_out() == [update_url_msg]
        assert mock_context.get_err() == []
def test_start_update_command_line_succeeds(self):
    """Starting an update from the command line reports the update URL on stdout."""
    mock_context = FakeAuroraCommandContext()
    update_result = StartJobUpdateResult(updateId="id")
    resp = self.create_simple_success_response()
    resp.result = Result(startJobUpdateResult=update_result)

    with contextlib.nested(
            patch('apache.aurora.client.cli.update.Update.create_context',
                  return_value=mock_context),
            patch('apache.aurora.client.factory.CLUSTERS',
                  new=self.TEST_CLUSTERS)):
        mock_api = mock_context.get_api('west')
        mock_api.start_job_update.return_value = resp

        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            cmd = AuroraCommandLine()
            result = cmd.execute(
                ['beta-update', 'start', self.TEST_JOBSPEC, fp.name])
            assert result == EXIT_OK

        job_key = AuroraJobKey.from_path(self.TEST_JOBSPEC)
        update_url_msg = StartUpdate.UPDATE_MSG_TEMPLATE % (
            mock_context.get_update_page(mock_api, job_key, "id"))

        assert mock_api.start_job_update.call_count == 1
        args, kwargs = mock_api.start_job_update.call_args
        config_arg, instances_arg = args[0], args[1]
        assert isinstance(config_arg, AuroraConfig)
        assert instances_arg is None
        assert mock_context.get_out() == [update_url_msg]
        assert mock_context.get_err() == []
def test_kill_lock_error_batches(self):
    """Verify that the batch kill path short circuits and includes the lock error message."""
    command = KillCommand()
    jobkey = AuroraJobKey("cluster", "role", "env", "job")

    mock_options = mock_verb_options(command)
    mock_options.instance_spec = TaskInstanceKey(jobkey, [1])
    mock_options.no_batching = False

    fake_context = FakeAuroraCommandContext()
    fake_context.set_options(mock_options)
    running_task = AuroraClientCommandTest.create_scheduled_task(1, ScheduleStatus.RUNNING)
    fake_context.add_expected_query_result(
        AuroraClientCommandTest.create_query_call_result(running_task))

    mock_api = fake_context.get_api('test')
    lock_error = AuroraClientCommandTest.create_blank_response(
        ResponseCode.LOCK_ERROR, "Error.")
    mock_api.kill_job.return_value = lock_error

    with pytest.raises(Context.CommandError):
        command.execute(fake_context)

    mock_api.kill_job.assert_called_once_with(
        jobkey,
        mock_options.instance_spec.instance)
    self.assert_lock_message(fake_context)
def test_simple_successful_killall_job(self):
    """Run a test of the "kill" command against a mocked-out API.

    Verifies that the kill command sends the right API RPCs, and performs
    the correct tests on the result.
    """
    mock_options = self.setup_mock_options()
    mock_config = Mock()
    (mock_api, mock_scheduler_proxy) = self.create_mock_api()
    mock_api.kill_job.return_value = self.get_kill_job_response()
    mock_scheduler_proxy.killTasks.return_value = self.get_kill_job_response()

    # Successive status queries report RUNNING, then KILLING, then KILLED.
    status_sequence = (ScheduleStatus.RUNNING, ScheduleStatus.KILLING, ScheduleStatus.KILLED)
    mock_query_results = [
        self.create_mock_status_query_result(status) for status in status_sequence
    ]
    mock_scheduler_proxy.getTasksWithoutConfigs.side_effect = mock_query_results

    patches = (
        patch('time.sleep'),
        patch('apache.aurora.client.commands.core.make_client', return_value=mock_api),
        patch('twitter.common.app.get_options', return_value=mock_options),
        patch('apache.aurora.client.commands.core.get_job_config', return_value=mock_config),
    )
    with contextlib.nested(*patches):
        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            killall(['west/mchucarroll/test/hello', fp.name], mock_options)

        # Now check that the right API calls got made.
        self.assert_kill_job_called(mock_api)
        expected_key = AuroraJobKey(
            cluster=self.TEST_CLUSTER,
            role=self.TEST_ROLE,
            env=self.TEST_ENV,
            name=self.TEST_JOB)
        mock_api.kill_job.assert_called_with(expected_key, None, config=mock_config)
        self.assert_scheduler_called(mock_api, self.get_expected_task_query(), 3)
def test_killall_job(self):
    """Unbatched ``job killall`` issues exactly one kill_job call for the whole job."""
    target = 'west/bozo/test/hello'
    mock_context = FakeAuroraCommandContext()
    mock_scheduler_proxy = Mock()

    with contextlib.nested(
            patch('threading._Event.wait'),
            patch('apache.aurora.client.cli.jobs.Job.create_context',
                  return_value=mock_context),
            patch('apache.aurora.client.factory.CLUSTERS',
                  new=self.TEST_CLUSTERS)):
        api = mock_context.get_api('west')
        status_result = self.create_status_call_result()
        mock_scheduler_proxy.getTasksWithoutConfigs.return_value = status_result
        api.kill_job.return_value = self.get_kill_job_response()
        mock_scheduler_proxy.killTasks.return_value = self.get_kill_job_response()
        mock_context.add_expected_status_query_result(
            self.create_status_call_result(
                self.create_mock_task(ScheduleStatus.KILLED)))

        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            config_flag = '--config=%s' % fp.name
            cmd = AuroraCommandLine()
            cmd.execute(['job', 'killall', '--no-batching', config_flag, target])

        # Now check that the right API calls got made.
        assert api.kill_job.call_count == 1
        api.kill_job.assert_called_with(AuroraJobKey.from_path(target), None)
        self.assert_scheduler_called(api, self.get_expected_task_query(), 2)
def test_successful_batched_killall_job(self):
    """Run a test of the "kill" command against a mocked-out API.

    Verifies that the kill command sends the right API RPCs, and performs
    the correct tests on the result. Uses a batch size of 5.
    """
    mock_options = self.setup_mock_options()
    mock_options.batch_size = 5
    mock_config = Mock()
    (mock_api, mock_scheduler_proxy) = self.create_mock_api()
    mock_api.kill_job.return_value = self.get_kill_job_response()
    mock_api.check_status.return_value = self.create_status_call_result()

    core = 'apache.aurora.client.commands.core'
    patches = (
        patch(core + '.make_client', return_value=mock_api),
        patch('twitter.common.app.get_options', return_value=mock_options),
        patch(core + '.get_job_config', return_value=mock_config),
        patch(core + '.JobMonitor'),
    )
    with contextlib.nested(*patches):
        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            killall(['west/mchucarroll/test/hello', fp.name], mock_options)

        # Now check that the right API calls got made.
        assert mock_api.kill_job.call_count == 4
        last_batch = [15, 16, 17, 18, 19]
        expected_key = AuroraJobKey(
            cluster=self.TEST_CLUSTER,
            role=self.TEST_ROLE,
            env=self.TEST_ENV,
            name=self.TEST_JOB)
        mock_api.kill_job.assert_called_with(expected_key, last_batch)
def test_kill_job_with_instances_batched_large(self):
    """Killing instances 0,2,4-13 is split into three kill_job calls, the last for [12, 13]."""
    job_path = 'west/bozo/test/hello'
    mock_context = FakeAuroraCommandContext()
    patches = (
        patch('threading._Event.wait'),
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context),
        patch('apache.aurora.client.factory.CLUSTERS', new=self.TEST_CLUSTERS),
    )
    with contextlib.nested(*patches):
        api = mock_context.get_api('west')
        mock_context.add_expected_status_query_result(self.create_status_call_result())
        api.kill_job.return_value = self.get_kill_job_response()
        killed = self.create_mock_task(ScheduleStatus.KILLED)
        mock_context.add_expected_status_query_result(self.create_status_call_result(killed))

        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            cmd = AuroraCommandLine()
            argv = ['job', 'kill', '--config=%s' % fp.name, 'west/bozo/test/hello/0,2,4-13']
            cmd.execute(argv)

        # Now check that the right API calls got made.
        assert api.kill_job.call_count == 3
        api.kill_job.assert_called_with(AuroraJobKey.from_path(job_path), [12, 13])
        # Expect total 5 calls (3 from JobMonitor).
        self.assert_scheduler_called(api, self.get_expected_task_query([12, 13]), 5)
def test_create_with_lock(self):
    """A JOB_UPDATING_ERROR from create_job raises CommandError and shows the lock message."""
    command = CreateJobCommand()
    jobkey = AuroraJobKey("cluster", "role", "env", "job")

    mock_options = mock_verb_options(command)
    mock_options.jobspec = jobkey
    mock_options.config_file = "/tmp/whatever"

    fake_context = FakeAuroraCommandContext()
    fake_context.set_options(mock_options)

    # A config whose raw() job has no cron schedule.
    mock_job_config = Mock()
    mock_job_config.has_cron_schedule.return_value = False
    mock_config = create_autospec(spec=AuroraConfig, spec_set=True, instance=True)
    mock_config.raw.return_value = mock_job_config
    fake_context.get_job_config = Mock(return_value=mock_config)

    mock_api = fake_context.get_api("test")
    updating_error = AuroraClientCommandTest.create_blank_response(
        ResponseCode.JOB_UPDATING_ERROR, "Error.")
    mock_api.create_job.return_value = updating_error

    with pytest.raises(Context.CommandError):
        command.execute(fake_context)

    mock_api.create_job.assert_called_once_with(mock_config)
    self.assert_lock_message(fake_context)
def test_update_status(self):
    """``beta-update status`` prints the update details and queries by job key."""
    mock_context = FakeAuroraCommandContext()
    api = mock_context.get_api('west')
    api.query_job_updates.return_value = self.get_status_query_response()
    api.get_job_update_details.return_value = self.get_update_details_response()

    # NOTE(review): leading whitespace inside these strings is reproduced as
    # it appears in the source - confirm against the command's real output.
    expected_output = [
        "Job: west/mcc/test/hello, UpdateID: fake-update-identifier",
        "Started YYYY-MM-DD HH:MM:SS, last updated: YYYY-MM-DD HH:MM:SS",
        "Current status: ROLLING_FORWARD",
        "Update events:",
        " Status: ROLLING_FORWARD at YYYY-MM-DD HH:MM:SS",
        " Status: ROLL_FORWARD_PAUSED at YYYY-MM-DD HH:MM:SS",
        " Status: ROLLING_FORWARD at YYYY-MM-DD HH:MM:SS",
        "Instance events:",
        " Instance 1 at YYYY-MM-DD HH:MM:SS: INSTANCE_UPDATING",
        " Instance 2 at YYYY-MM-DD HH:MM:SS: INSTANCE_UPDATING",
        " Instance 1 at YYYY-MM-DD HH:MM:SS: INSTANCE_UPDATED",
        " Instance 2 at YYYY-MM-DD HH:MM:SS: INSTANCE_UPDATED"
    ]

    patches = (
        patch('apache.aurora.client.cli.update.Update.create_context',
              return_value=mock_context),
        patch('apache.aurora.client.factory.CLUSTERS', new=self.TEST_CLUSTERS),
    )
    with contextlib.nested(*patches):
        cmd = AuroraCommandLine()
        result = cmd.execute(["beta-update", "status", "west/mcc/test/hello"])
        assert result == EXIT_OK
        assert mock_context.get_out() == expected_output
        mock_context.get_api("west").query_job_updates.assert_called_with(
            job_key=AuroraJobKey('west', 'mcc', 'test', 'hello'))
def create_probe_hosts(cls, num_hosts, predicted, safe, safe_in):
    """Build a host -> [JobUpTimeDetails] mapping with one job per host.

    Hosts are named ``h0 .. h<num_hosts-1>`` and host ``i`` carries job
    ``west/role/env/job<i>``. Returns the ``defaultdict`` itself.
    """
    hosts = defaultdict(list)
    for index in range(num_hosts):
        job_key = AuroraJobKey.from_path("west/role/env/job%s" % index)
        details = DomainUpTimeSlaVector.JobUpTimeDetails(job_key, predicted, safe, safe_in)
        hosts["h%s" % index].append(details)
    return hosts
def test_killall_job_something_else(self):
    """Batched ``job killall`` makes four kill_job calls, the last for instances 15-19."""
    job_path = 'west/bozo/test/hello'
    mock_context = FakeAuroraCommandContext()
    mock_scheduler_proxy = create_autospec(spec=SchedulerThriftApiSpec, instance=True)
    patches = (
        patch('threading._Event.wait'),
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context),
        patch('apache.aurora.client.factory.CLUSTERS', new=self.TEST_CLUSTERS),
    )
    with contextlib.nested(*patches):
        api = mock_context.get_api('west')
        api.kill_job.return_value = self.get_kill_job_response()
        mock_context.add_expected_status_query_result(self.create_status_call_result())
        mock_scheduler_proxy.killTasks.return_value = self.get_kill_job_response()
        killed = self.create_mock_task(ScheduleStatus.KILLED)
        mock_context.add_expected_status_query_result(self.create_status_call_result(killed))

        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            cmd = AuroraCommandLine()
            cmd.execute(['job', 'killall', '--config=%s' % fp.name, job_path])

        # Now check that the right API calls got made.
        assert api.kill_job.call_count == 4
        instances = [15, 16, 17, 18, 19]
        api.kill_job.assert_called_with(AuroraJobKey.from_path(job_path), instances)
        self.assert_scheduler_called(api, self.get_expected_task_query(instances), 6)
def disambiguate_args_or_die(cls, args, options, client_factory=AuroraClientAPI):
    """
    Returns a (AuroraClientAPI, AuroraJobKey, AuroraConfigFile:str) tuple if one can be found
    given the args, potentially querying the scheduler with the returned client. Calls die()
    with an appropriate error message otherwise.

    Arguments:
      args: args from app command invocation.
      options: options from app command invocation. must have env and cluster attributes.
      client_factory: a callable (cluster) -> AuroraClientAPI.
    """
    arg_count = len(args)
    if arg_count <= 0:
        die('job path is required')

    try:
        job_key = AuroraJobKey.from_path(args[0])
        client = client_factory(job_key.cluster)
        # the config for hooks
        config_file = None
        if len(args) > 1:
            config_file = args[1]
        return client, job_key, config_file
    except AuroraJobKey.Error:
        # args[0] is not a CLUSTER/ROLE/ENV/NAME path: treat the arguments
        # as the older "role [name]" form and take the cluster from options.
        log.warning("Failed to parse job path, falling back to compatibility mode")
        role = None
        if len(args) > 0:
            role = args[0]
        name = None
        if len(args) > 1:
            name = args[1]
        env = None
        config_file = None  # deprecated form does not support hooks functionality
        cluster = options.cluster
        if not cluster:
            die('cluster is required')
        client = client_factory(cluster)
        resolved_key = cls._disambiguate_or_die(client, role, env, name)
        return client, resolved_key, config_file
def run(args, options):
    """usage: run cluster/role/env/job cmd

    Runs a shell command on all machines currently hosting shards of a single job.

    This feature supports the same command line wildcards that are used to
    populate a job's commands.

    This means anything in the {{mesos.*}} and {{thermos.*}} namespaces.
    """
    # TODO(William Farner): Add support for invoking on individual shards.
    # TODO(Kevin Sweeney): Restore the ability to run across jobs with globs (See MESOS-3010).
    if not args:
        die("job path is required")
    job_path = args.pop(0)

    # Everything left in args is the shell command to execute.
    command = " ".join(args)

    # Build the equivalent new-style invocation for the deprecation notice.
    # NOTE(review): instances_spec is assigned but never read, so the suggested
    # command does not include the job path - confirm whether that is intended.
    instances_spec = job_path
    new_cmd = ["task", "run"]
    threads = options.num_threads
    if threads != 1:
        new_cmd.append("--threads=%s" % threads)
    if options.ssh_user is not None:
        new_cmd.append("--ssh-user=%s" % options.ssh_user)
    if options.executor_sandbox:
        new_cmd.append("--executor-sandbox")
    new_cmd.append('"%s"' % command)
    # NOTE(review): the label passed here is "ssh" although this is the run
    # command - verify against v1_deprecation_warning's expected argument.
    v1_deprecation_warning("ssh", new_cmd)

    try:
        cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
    except AuroraJobKey.Error as e:
        die('Invalid job path "%s": %s' % (job_path, e))

    cluster = CLUSTERS[cluster_name]
    runner = DistributedCommandRunner(cluster, role, env, [name], options.ssh_user)
    runner.run(
        command,
        parallelism=options.num_threads,
        executor_sandbox=options.executor_sandbox)
def test_safe_domain_override_jobs(self):
    """Test successful execution of the sla_list_safe_domain command with override_jobs option."""
    mock_vector = self.create_mock_vector(self.create_hosts(3, 80, 100))
    admin = 'apache.aurora.client.commands.admin'
    with temporary_file() as fp:
        # One override line: job path, percentage, duration.
        fp.write('west/role/env/job1 30 200s')
        fp.flush()
        mock_options = self.setup_mock_options(override=fp.name)
        with contextlib.nested(
                patch(admin + '.AuroraClientAPI', new=create_autospec(spec=AuroraClientAPI)),
                patch(admin + '.print_results'),
                patch(admin + '.CLUSTERS', new=self.TEST_CLUSTERS),
                patch('twitter.common.app.get_options', return_value=mock_options)
        ) as (mock_api, mock_print_results, _clusters, _get_options):
            mock_api.return_value.sla_get_safe_domain_vector.return_value = mock_vector

            sla_list_safe_domain(['west', '50', '100s'])

            job_key = AuroraJobKey.from_path('west/role/env/job1')
            override = {job_key: JobUpTimeLimit(job_key, 30, 200)}
            mock_vector.get_safe_hosts.assert_called_once_with(
                50.0, 100.0, override, DEFAULT_GROUPING)
            mock_print_results.assert_called_once_with(['h0', 'h1', 'h2'])
def get_job_config(job_spec, config_file, options):
    """Load the config for ``job_spec`` from ``config_file`` via ``get_config``.

    ``job_spec`` is first parsed as a CLUSTER/ROLE/ENV/NAME path. If parsing
    fails, a deprecation warning is emitted, ``job_spec`` is used as the bare
    job name and cluster/env come from ``options``. ``options.json`` and
    ``options.bindings`` are optional and default to ``False`` and ``()``.
    """
    try:
        job_key = AuroraJobKey.from_path(job_spec)
        select_cluster, select_role, select_env, jobname = (
            job_key.cluster, job_key.role, job_key.env, job_key.name)
    except AuroraJobKey.Error:
        deprecation_warning('Please refer to your job in CLUSTER/ROLE/ENV/NAME format.')
        select_cluster = options.cluster if options.cluster else None
        select_env = options.env
        select_role = None
        jobname = job_spec

    # Not every command defines these options.
    json_option = getattr(options, 'json', False)
    bindings = getattr(options, 'bindings', ())

    return get_config(
        jobname,
        config_file,
        json_option,
        bindings,
        select_cluster=select_cluster,
        select_role=select_role,
        select_env=select_env)
def create_hosts(cls, num_hosts, percentage, duration):
    """Return a one-element list holding a host -> [JobUpTimeLimit] mapping.

    Hosts are named ``h0 .. h<num_hosts-1>`` and host ``i`` carries a limit
    for job ``west/role/env/job<i>``.
    """
    hosts = defaultdict(list)
    for index in range(num_hosts):
        job_key = AuroraJobKey.from_path('west/role/env/job%s' % index)
        limit = JobUpTimeLimit(job_key, percentage, duration)
        hosts['h%s' % index].append(limit)
    return [hosts]
def setUp(self):
    """Patch NonHookedAuroraClientAPI onto a fake base that records each API call."""
    self.RETURN_VALUE = "foo"
    case = self

    def record(bound_method, *call_args):
        # Remember the call as a partial so tests can compare it, then hand
        # back the canned return value.
        case.API_CALL = functools.partial(bound_method, *call_args)
        return case.RETURN_VALUE

    class FakeAuroraClientAPI(object):

        def kill_job(self, job_key, instances=None, lock=None):
            return record(self.kill_job, job_key, instances, lock)

        def restart(self, job_key, shards, restart_settings):
            return record(self.restart, job_key, shards, restart_settings)

        def start_cronjob(self, job_key):
            return record(self.start_cronjob, job_key)

    self._patch_bases(NonHookedAuroraClientAPI, (FakeAuroraClientAPI,))
    self.api = NonHookedAuroraClientAPI()

    # Test args passed in to check that these are proxied un-modified
    self.test_job_key = AuroraJobKey.from_path("a/b/c/d")
    self.test_config = "bar"
    self.test_shards = "baz"
    self.test_lock = "lock"
    self.health_check_interval_seconds = "baa"
def test_kill_job_with_instances_batched_large(self):
    """``job kill`` on instances 0,2,4-13 ends with a kill_job call for [12, 13]."""
    mock_context = FakeAuroraCommandContext()

    with contextlib.nested(
            patch('threading._Event.wait'),
            patch('apache.aurora.client.cli.jobs.Job.create_context',
                  return_value=mock_context),
            patch('apache.aurora.client.factory.CLUSTERS',
                  new=self.TEST_CLUSTERS)):
        api = mock_context.get_api('west')
        initial_status = self.create_status_call_result()
        mock_context.add_expected_status_query_result(initial_status)
        api.kill_job.return_value = self.get_kill_job_response()
        final_status = self.create_status_call_result(
            self.create_mock_task(ScheduleStatus.KILLED))
        mock_context.add_expected_status_query_result(final_status)

        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            config_flag = '--config=%s' % fp.name
            cmd = AuroraCommandLine()
            cmd.execute(['job', 'kill', config_flag, 'west/bozo/test/hello/0,2,4-13'])

        # Now check that the right API calls got made.
        last_batch = [12, 13]
        assert api.kill_job.call_count == 3
        api.kill_job.assert_called_with(
            AuroraJobKey.from_path('west/bozo/test/hello'), last_batch)
        # Expect total 5 calls (3 from JobMonitor).
        self.assert_scheduler_called(api, self.get_expected_task_query([12, 13]), 5)
def test_safe_domain_override_jobs(self):
    """Test successful execution of the sla_list_safe_domain command with override_jobs option."""
    overridden_job = 'west/role/env/job1'
    mock_vector = self.create_mock_vector(self.create_hosts(3, 80, 100))
    with temporary_file() as fp:
        fp.write('west/role/env/job1 30 200s')
        fp.flush()
        mock_options = self.setup_mock_options(override=fp.name)
        patches = (
            patch('apache.aurora.client.commands.admin.AuroraClientAPI',
                  new=Mock(spec=AuroraClientAPI)),
            patch('apache.aurora.client.commands.admin.print_results'),
            patch('apache.aurora.client.commands.admin.CLUSTERS', new=self.TEST_CLUSTERS),
            patch('twitter.common.app.get_options', return_value=mock_options),
        )
        with contextlib.nested(*patches) as (
                mock_api, mock_print_results, _clusters, _get_options):
            mock_api.return_value.sla_get_safe_domain_vector.return_value = mock_vector

            sla_list_safe_domain(['west', '50', '100s'])

            job_key = AuroraJobKey.from_path(overridden_job)
            override = {job_key: JobUpTimeLimit(job_key, 30, 200)}
            mock_vector.get_safe_hosts.assert_called_once_with(
                50.0, 100.0, override, DEFAULT_GROUPING)
            mock_print_results.assert_called_once_with(['h0', 'h1', 'h2'])
def create_probe_hosts(cls, num_hosts, predicted, safe, safe_in):
    """Return a one-element list holding a host -> [JobUpTimeDetails] mapping.

    Hosts are named ``h0 .. h<num_hosts-1>`` and host ``i`` carries the
    details for job ``west/role/env/job<i>``.
    """
    hosts = defaultdict(list)
    for index in range(num_hosts):
        job_key = AuroraJobKey.from_path('west/role/env/job%s' % index)
        details = JobUpTimeDetails(job_key, predicted, safe, safe_in)
        hosts['h%s' % index].append(details)
    return [hosts]
def test_killall_job_wait_until_timeout(self):
    """``job killall`` returns EXIT_TIMEOUT when every status query still shows RUNNING."""
    job_path = 'west/bozo/test/hello'
    mock_context = FakeAuroraCommandContext()
    mock_scheduler_proxy = create_autospec(spec=SchedulerThriftApiSpec, instance=True)
    patches = (
        patch('threading._Event.wait'),
        patch('apache.aurora.client.cli.jobs.Job.create_context', return_value=mock_context),
        patch('apache.aurora.client.factory.CLUSTERS', new=self.TEST_CLUSTERS),
    )
    with contextlib.nested(*patches):
        api = mock_context.get_api('west')
        mock_scheduler_proxy.getTasksWithoutConfigs.return_value = (
            self.create_status_call_result())
        api.kill_job.return_value = self.get_kill_job_response()
        mock_scheduler_proxy.killTasks.return_value = self.get_kill_job_response()

        # Queue eight status results that all report the task as RUNNING.
        remaining = 8
        while remaining > 0:
            running = self.create_mock_task(ScheduleStatus.RUNNING)
            mock_context.add_expected_status_query_result(
                self.create_status_call_result(running))
            remaining -= 1

        with temporary_file() as fp:
            fp.write(self.get_valid_config())
            fp.flush()
            cmd = AuroraCommandLine()
            argv = ['job', 'killall', '--no-batching', '--config=%s' % fp.name, job_path]
            assert EXIT_TIMEOUT == cmd.execute(argv)

        # Now check that the right API calls got made.
        assert api.kill_job.call_count == 1
        api.kill_job.assert_called_with(AuroraJobKey.from_path(job_path), None)
        self.assert_scheduler_called(api, self.get_expected_task_query(), 8)
def get_job_config(job_spec, config_file, options):
    """Resolve ``job_spec`` and load the matching job from ``config_file``.

    A ``job_spec`` that parses as CLUSTER/ROLE/ENV/NAME supplies the cluster,
    role, env and job name selectors. Otherwise a deprecation warning is
    issued and ``job_spec`` is taken as the job name, with cluster and env
    read from ``options`` and no role selector. Missing ``options.json`` /
    ``options.bindings`` fall back to ``False`` / ``()``.
    """
    selectors = {}
    try:
        job_key = AuroraJobKey.from_path(job_spec)
        selectors['select_cluster'] = job_key.cluster
        selectors['select_env'] = job_key.env
        selectors['select_role'] = job_key.role
        jobname = job_key.name
    except AuroraJobKey.Error:
        deprecation_warning(
            'Please refer to your job in CLUSTER/ROLE/ENV/NAME format.')
        selectors['select_cluster'] = options.cluster if options.cluster else None
        selectors['select_env'] = options.env
        selectors['select_role'] = None
        jobname = job_spec

    json_option = getattr(options, 'json', False)
    bindings = getattr(options, 'bindings', ())

    return get_config(jobname, config_file, json_option, bindings, **selectors)
def setUp(self):
    """Install a recording fake as the base of NonHookedAuroraClientAPI.

    Each fake method stores a ``functools.partial`` of itself and the
    arguments it received in ``API_CALL`` on the test case, then returns the
    test case's ``RETURN_VALUE``.
    """
    self.RETURN_VALUE = 'foo'
    test_case = self

    class FakeAuroraClientAPI(object):
        def cancel_update(self, job_key):
            test_case.API_CALL = functools.partial(self.cancel_update, job_key)
            return test_case.RETURN_VALUE

        def kill_job(self, job_key, instances=None, lock=None):
            test_case.API_CALL = functools.partial(
                self.kill_job, job_key, instances, lock)
            return test_case.RETURN_VALUE

        def restart(self, job_key, shards, updater_config, health_check_interval_seconds):
            test_case.API_CALL = functools.partial(
                self.restart,
                job_key,
                shards,
                updater_config,
                health_check_interval_seconds)
            return test_case.RETURN_VALUE

        def start_cronjob(self, job_key):
            test_case.API_CALL = functools.partial(self.start_cronjob, job_key)
            return test_case.RETURN_VALUE

    self._patch_bases(NonHookedAuroraClientAPI, (FakeAuroraClientAPI, ))
    self.api = NonHookedAuroraClientAPI()

    # Test args passed in to check that these are proxied un-modified
    self.test_job_key = AuroraJobKey.from_path('a/b/c/d')
    self.test_config = 'bar'
    self.test_shards = 'baz'
    self.test_lock = 'lock'
    self.test_updater_config = 'blah'
    self.health_check_interval_seconds = 'baa'
def test_restart_with_lock(self):
    """A LOCK_ERROR response to restart must surface as Context.CommandError.

    Also checks that restart was called once with the options-derived
    arguments and that ``assert_lock_message`` holds for the context.
    """
    job_key = AuroraJobKey("cluster", "role", "env", "job")
    restart_command = RestartCommand()

    options = mock_verb_options(restart_command)
    options.instance_spec = TaskInstanceKey(job_key, [])

    context = FakeAuroraCommandContext()
    context.set_options(options)

    api = context.get_api("test")
    api.restart.return_value = AuroraClientCommandTest.create_blank_response(
        ResponseCode.LOCK_ERROR, "Error.")

    with pytest.raises(Context.CommandError):
        restart_command.execute(context)

    expected_updater_config = UpdaterConfig(
        options.batch_size,
        options.restart_threshold,
        options.watch_secs,
        options.max_per_instance_failures,
        options.max_total_failures)

    api.restart.assert_called_once_with(
        job_key,
        options.instance_spec.instance,
        expected_updater_config,
        options.healthcheck_interval_seconds,
        config=None)
    self.assert_lock_message(context)
def setUp(self):
    """Install a recording fake as the base of NonHookedAuroraClientAPI.

    Each fake method stores a ``functools.partial`` of itself and the
    arguments it received in ``API_CALL`` on the test case, then returns the
    test case's ``RETURN_VALUE``.
    """
    self.RETURN_VALUE = 'foo'
    test_case = self

    class FakeAuroraClientAPI(object):
        def kill_job(self, job_key, instances=None, lock=None):
            test_case.API_CALL = functools.partial(
                self.kill_job, job_key, instances, lock)
            return test_case.RETURN_VALUE

        def restart(self, job_key, shards, restart_settings):
            test_case.API_CALL = functools.partial(
                self.restart, job_key, shards, restart_settings)
            return test_case.RETURN_VALUE

        def start_cronjob(self, job_key):
            test_case.API_CALL = functools.partial(self.start_cronjob, job_key)
            return test_case.RETURN_VALUE

    self._patch_bases(NonHookedAuroraClientAPI, (FakeAuroraClientAPI, ))
    self.api = NonHookedAuroraClientAPI()

    # Test args passed in to check that these are proxied un-modified
    self.test_job_key = AuroraJobKey.from_path('a/b/c/d')
    self.test_config = 'bar'
    self.test_shards = 'baz'
    self.test_lock = 'lock'
    self.health_check_interval_seconds = 'baa'
def really_killall(args, options):
    """Helper for testing purposes: make it easier to mock out the actual kill process,
    while testing hooks in the command dispatch process.

    ``args[0]`` is the job path; an optional ``args[1]`` is the config file
    used for hooks.
    """
    maybe_disable_hooks(options)
    job_path = args[0]
    job_key = AuroraJobKey.from_path(job_path)
    config_file = args[1] if len(args) > 1 else None  # the config for hooks

    # Build the equivalent new-style command line for the deprecation notice.
    replacement = ["job", "killall", job_path]
    if config_file is not None:
        replacement.append("--config=%s" % config_file)
    if options.open_browser:
        replacement.append("--open-browser")
    if options.batch_size is not None:
        replacement.append("--batch-size=%s" % options.batch_size)
    if options.max_total_failures is not None:
        replacement.append("--max-total-failures=%s" % options.max_total_failures)
    v1_deprecation_warning("killall", replacement)

    if config_file:
        config = get_job_config(job_key.to_path(), config_file, options)
    else:
        config = None

    api = make_client(job_key.cluster)
    if options.batch_size is None:
        resp = api.kill_job(job_key, None, config=config)
        check_and_log_response(resp)
    else:
        # NOTE(review): this reads options.max_failures_option while the
        # replacement command above reads options.max_total_failures; confirm
        # which attribute the option parser actually populates.
        kill_in_batches(
            api, job_key, None, options.batch_size, options.max_failures_option)

    handle_open(
        api.scheduler_proxy.scheduler_client().url,
        job_key.role,
        job_key.env,
        job_key.name)
    wait_kill_tasks(api.scheduler_proxy, job_key)
def execute(self, context):
    """Start a job update for the job named by the instance spec.

    Raises ``context.CommandError`` when the job config has a cron schedule
    or when ``start_job_update`` raises ``AuroraClientAPI.UpdateConfigError``.
    When the response carries a result, prints the update page URL and, if
    ``--wait`` was requested, returns the outcome of ``wait_for_update``.
    Otherwise prints the combined response messages. Returns EXIT_OK in all
    non-waiting cases.
    """
    options = context.options
    job = options.instance_spec.jobkey
    requested = options.instance_spec.instance
    instances = None if requested == ALL_INSTANCES else requested

    config = context.get_job_config(job, options.config_file)
    if config.raw().has_cron_schedule():
        raise context.CommandError(
            EXIT_COMMAND_FAILURE,
            "Cron jobs may only be updated with \"aurora cron schedule\" command")

    api = context.get_api(config.cluster())
    try:
        resp = api.start_job_update(config, options.message, instances)
    except AuroraClientAPI.UpdateConfigError as e:
        raise context.CommandError(EXIT_INVALID_CONFIGURATION, e.message)

    context.log_response_and_raise(
        resp,
        err_code=EXIT_API_ERROR,
        err_msg="Failed to start update due to error:")

    if not resp.result:
        # No result payload: just relay whatever the scheduler said.
        context.print_out(combine_messages(resp))
        return EXIT_OK

    update_key = resp.result.startJobUpdateResult.key
    job_key = AuroraJobKey.from_thrift(config.cluster(), update_key.job)
    url = get_update_page(api, job_key, resp.result.startJobUpdateResult.key.id)
    context.print_out(self.UPDATE_MSG_TEMPLATE % url)

    if options.wait:
        return wait_for_update(context, self._clock, api, update_key)
    return EXIT_OK
def assert_cancel_update_called(cls, mock_api):
    """Assert that ``cancel_update`` was invoked once for the test job.

    Running cancel update should result in calling the API ``cancel_update``
    method exactly once, with the test job's AuroraJobKey and ``config=None``.

    Raises AssertionError when the call count or the call arguments differ.
    """
    expected_key = AuroraJobKey(
        cls.TEST_CLUSTER, cls.TEST_ROLE, cls.TEST_ENV, cls.TEST_JOB)
    assert mock_api.cancel_update.call_count == 1
    # `called_with` is not a Mock assertion: on a plain Mock it auto-creates a
    # truthy child mock, so `assert mock.called_with(...)` can never fail.
    # `assert_called_with` performs the real argument comparison.
    mock_api.cancel_update.assert_called_with(expected_key, config=None)
def get_jobs_matching_key(self, key):
    """Finds all jobs matching a key containing wildcard segments.

    A key without any '*' in its cluster, role, env or name is returned as a
    single-element list holding an equivalent AuroraJobKey. Otherwise the job
    list for the selected clusters is fetched through ``self.get_job_list``
    and filtered with ``fnmatch`` on role, env and name.

    This is potentially slow!
    TODO(mchucarroll): insert a warning to users about slowness if the key contains wildcards!
    """
    components = (key.cluster, key.role, key.env, key.name)
    if not any('*' in component for component in components):
        # Fully bound key: nothing to expand.
        return [AuroraJobKey(key.cluster, key.role, key.env, key.name)]

    # For cluster, we can expand the list of things we're looking for directly.
    # For other key elements, we need to just get a list of the jobs on the clusters,
    # and filter it for things that match.
    clusters_to_search = CLUSTERS if key.cluster == '*' else [key.cluster]
    candidates = self.get_job_list(clusters_to_search, key.role)
    return [
        candidate for candidate in candidates
        if fnmatch(candidate.role, key.role)
        and fnmatch(candidate.env, key.env)
        and fnmatch(candidate.name, key.name)
    ]
def create_hosts(cls, num_hosts, percentage, duration):
    """Build a mapping of host name to a one-element list of JobUpTimeLimit.

    Hosts are named ``h0 .. h<num_hosts-1>``; host ``i`` gets a limit for the
    job ``west/role/env/job<i>`` with the given percentage and duration.
    Returns a ``defaultdict(list)``.
    """
    hosts = defaultdict(list)
    for index in range(num_hosts):
        job_key = AuroraJobKey.from_path("west/role/env/job%s" % index)
        limit = DomainUpTimeSlaVector.JobUpTimeLimit(job_key, percentage, duration)
        hosts["h%s" % index].append(limit)
    return hosts
def execute(self, context):
    """Query job update summaries and print them as JSON or plain text.

    The query is filtered by the role, jobspec, user and status options.
    With ``write_json`` set, one JSON array of summary objects is printed;
    otherwise two text lines are printed per summary. Returns EXIT_OK.
    """
    options = context.options
    cluster = options.cluster
    api = context.get_api(cluster)
    response = api.query_job_updates(
        role=options.role,
        job_key=options.jobspec,
        user=options.user,
        update_statuses=options.status)
    context.log_response_and_raise(response)

    summaries = response.result.getJobUpdateSummariesResult.updateSummaries

    def job_path(summary):
        return AuroraJobKey.from_thrift(cluster, summary.jobKey).to_path()

    def status_name(summary):
        return JobUpdateStatus._VALUES_TO_NAMES[summary.state.status]

    if options.write_json:
        entries = [
            {
                "jobkey": job_path(summary),
                "id": summary.updateId,
                "user": summary.user,
                "started": summary.state.createdTimestampMs,
                "lastModified": summary.state.lastModifiedTimestampMs,
                "status": status_name(summary)
            }
            for summary in summaries
        ]
        context.print_out(
            json.dumps(entries, indent=2, separators=[',', ': '], sort_keys=False))
        return EXIT_OK

    for summary in summaries:
        created = summary.state.createdTimestampMs
        last_modified = summary.state.lastModifiedTimestampMs
        headline = "Job: %s, Id: %s, User: %s, Status: %s" % (
            job_path(summary), summary.updateId, summary.user, status_name(summary))
        context.print_out(headline)
        context.print_out(
            "Created: %s, Last Modified %s" % (created, last_modified), indent=2)
    return EXIT_OK
def setUp(self):
    """Create a StartUpdate command wired to a fake context and mock API.

    The mocked options carry an instance spec for the job derived from
    ``UPDATE_KEY.job`` with no explicit instance list.
    """
    command = StartUpdate()
    job_key = AuroraJobKey.from_thrift("cluster", UPDATE_KEY.job)

    options = mock_verb_options(command)
    options.instance_spec = TaskInstanceKey(job_key, None)

    context = FakeAuroraCommandContext()
    context.set_options(options)

    self._command = command
    self._job_key = job_key
    self._mock_options = options
    self._fake_context = context
    self._mock_api = context.get_api('UNUSED')
def setUp(self):
    """Create a ListUpdates command wired to a fake context and mock API.

    The mocked options carry an UpdateFilter that selects only the test
    cluster (role, env and job are left as None).
    """
    command = ListUpdates()
    job_key = AuroraJobKey.from_thrift("cluster", UPDATE_KEY.job)

    options = mock_verb_options(command)
    options.filter = UpdateFilter(
        cluster=self.TEST_CLUSTER, role=None, env=None, job=None)

    context = FakeAuroraCommandContext()
    context.set_options(options)

    self._command = command
    self._job_key = job_key
    self._mock_options = options
    self._fake_context = context
    self._mock_api = context.get_api("UNUSED")
def ssh(args, options):
    """usage: ssh cluster/role/env/job shard [args...]

    Initiate an SSH session on the machine that a shard is running on.
    """
    # NOTE: `args` is consumed in place (job path, then shard); whatever is
    # left afterwards is the remote command. The error paths below rely on
    # die() not returning -- presumably it exits the process.
    if not args:
        die('Job path is required')
    job_path = args.pop(0)
    try:
        cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
    except AuroraJobKey.Error as e:
        die('Invalid job path "%s": %s' % (job_path, e))
    if not args:
        die('Shard is required')
    try:
        shard = int(args.pop(0))
    except ValueError:
        die('Shard must be an integer')

    # Equivalent new-style command, shown in the deprecation notice.
    newcmd = ["task", "ssh", "%s/%s" % (job_path, shard)]
    if options.tunnels:
        newcmd.append("--tunnels=%s" % options.tunnels)
    if options.ssh_user is not None:
        newcmd.append("--ssh-user=%s" % options.ssh_user)
    if options.executor_sandbox:
        newcmd.append("--executor-sandbox")
    if args:
        newcmd.append("--command=\"%s\"" % " ".join(args))
    v1_deprecation_warning("ssh", newcmd)

    api = make_client(cluster_name)
    resp = api.query(api.build_query(role, name, set([shard]), env=env))
    check_and_log_response(resp)

    # An OK response can still carry no task for the requested shard; fail
    # with a readable message instead of an IndexError/TypeError on tasks[0].
    tasks = resp.result.scheduleStatusResult.tasks
    if not tasks:
        die('No task found for shard %s of job %s' % (shard, job_path))
    first_task = tasks[0]
    assigned = first_task.assignedTask

    remote_cmd = ' '.join(args) if args else 'bash'
    command = DistributedCommandRunner.substitute(
        remote_cmd, first_task, api.cluster, executor_sandbox=options.executor_sandbox)

    ssh_command = ['ssh', '-t']
    # The login falls back to the role recorded on the task itself, not the
    # role parsed from the job path.
    task_role = assigned.task.owner.role
    slave_host = assigned.slaveHost

    for tunnel in options.tunnels:
        try:
            port, port_name = tunnel.split(':')
            port = int(port)
        except ValueError:
            die('Could not parse tunnel: %s. Must be of form PORT:NAME' % tunnel)
        if port_name not in assigned.assignedPorts:
            die('Task %s has no port named %s' % (assigned.taskId, port_name))
        ssh_command += [
            '-L', '%d:%s:%d' % (port, slave_host, assigned.assignedPorts[port_name])]

    ssh_command += ['%s@%s' % (options.ssh_user or task_role, slave_host), command]
    return subprocess.call(ssh_command)
def setUp(self):
    """Create an Sla instance over a mock scheduler plus a sample job key."""
    scheduler = Mock()
    self._scheduler = scheduler
    self._sla = Sla(scheduler)

    cluster = Cluster(name="cl")
    role, env, name = "mesos", "test", "job"
    self._cluster = cluster
    self._role = role
    self._name = name
    self._env = env
    self._job_key = AuroraJobKey(cluster.name, role, env, name)
    self._min_count = 1
def test_successful_schedule(self):
    """``cron schedule`` calls schedule_cron once and prints the job page URL."""
    job_key = AuroraJobKey("west", "bozo", "test", "hello")
    fake_context = FakeAuroraCommandContext()
    with contextlib.nested(
            patch('apache.aurora.client.cli.cron.CronNoun.create_context',
                  return_value=fake_context)):
        api = fake_context.get_api('west')
        api.schedule_cron.return_value = self.create_simple_success_response()

        with temporary_file() as fp:
            fp.write(self.get_valid_cron_config())
            fp.flush()
            command_line = AuroraCommandLine()
            command_line.execute(['cron', 'schedule', job_key.to_path(), fp.name])

        # Now check that the right API calls got made.
        # Check that schedule_cron was called exactly once, with an AuroraConfig parameter.
        schedule_cron = api.schedule_cron
        assert schedule_cron.call_count == 1
        assert isinstance(schedule_cron.call_args[0][0], AuroraConfig)

        # The last text printed out to the user should contain a url to the job
        assert fake_context.get_job_page(api, job_key) in fake_context.out[-1]
def killall(args, options):
    """usage: killall cluster/role/env/job

    Kills all tasks in a running job, blocking until all specified tasks have been terminated.
    """
    job_key = AuroraJobKey.from_path(args[0])

    # An optional second argument names the config file used for hooks.
    config_file = None
    if len(args) > 1:
        config_file = args[1]

    if config_file:
        config = get_job_config(job_key.to_path(), config_file, options)
    else:
        config = None

    api = make_client(job_key.cluster)
    response = api.kill_job(job_key, None, config=config)
    check_and_log_response(response)

    scheduler_url = api.scheduler_proxy.scheduler_client().url
    handle_open(scheduler_url, job_key.role, job_key.env, job_key.name)
def cancel_update_job(self, cluster, role, environment, jobname, jobspec=None):
    """Method to cancel an update of aurora job

    Runs ``<aurora_cmd> cancel_update <job path> [<jobspec file>]`` as a
    subprocess with stderr merged into stdout.

    Returns a ``(job_path, errors)`` tuple: ``errors`` is None on success,
    otherwise a list of message lines starting with
    "Error reported by aurora client:".
    """
    job_key = AuroraJobKey(cluster, role, environment, jobname)
    job_path = job_key.to_path()
    logger.info("request to cancel update of => %s", job_path)

    cmd_output = ""
    # Bound before the try so the finally clause is safe even when
    # make_jobspec_file() itself raises; previously that path hit an
    # UnboundLocalError that masked the real exception.
    jobspec_file = None
    try:
        cmd_args = [job_path]
        # aurora client requires jobspec be passed as file, no reading from STDIN
        jobspec_file = self.make_jobspec_file(jobspec)
        if jobspec_file is not None:
            cmd_args.append(jobspec_file.name)

        cmd_output = subprocess.check_output(
            [self.aurora_cmd, "cancel_update"] + cmd_args,
            stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        error_lines = e.output.splitlines()
        logger.warning("aurora client exit status: %d, details follow", e.returncode)
        for line in error_lines:
            logger.warning("> %s", line)
        logger.warning("----------------------------------------")
        return (job_path, ["Error reported by aurora client:"] + error_lines)
    finally:
        if jobspec_file:
            jobspec_file.close()

    if self.is_aurora_command_successful(cmd_output):
        logger.info("aurora -- cancel update successful")
        return (job_path, None)

    logger.warning("aurora -- cancel update job")
    return (job_path, ["Error reported by aurora client:"] + cmd_output.splitlines())