def test_shutdown(self):
    clusters = [
        MesosClusterRepository.get_cluster(address)
        for address in ['a', 'b', 'c']
    ]
    assert_equal(self.cluster_cls.call_count, 3)
    MesosClusterRepository.shutdown()
    for cluster in clusters:
        assert_equal(cluster.stop.call_count, 1)
def restore_state(self, action_runner):
    """Use the state manager to retrieve the persisted state and apply it
    to the configured Jobs.
    """
    log.info('restoring')
    states = self.state_watcher.restore(self.jobs.get_names())
    MesosClusterRepository.restore_state(states.get('mesos_state', {}))
    self.jobs.restore_state(states.get('job_state', {}), action_runner)
    self.state_watcher.save_metadata()
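# The restored-state mapping consumed above has (at least) the two keys
# used in the calls; the nested contents are placeholders, not taken
# from this section.
states = {
    'mesos_state': {},  # handed to MesosClusterRepository.restore_state
    'job_state': {},    # handed to self.jobs.restore_state, keyed by job name
}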
def _handle_shutdown(self, sig_num, stack_frame):
    log.info(f"Shutdown requested via {sig_num}")
    reactor.callLater(0, reactor.stop)
    waited = 0
    while reactor.running:
        if waited > 5:
            log.error("timed out waiting for reactor shutdown")
            break
        time.sleep(0.1)
        waited += 0.1
    if self.mcp:
        self.mcp.shutdown()
    MesosClusterRepository.shutdown()
    raise SystemExit(f"Terminating on signal {sig_num}")
def _handle_shutdown(self, sig_num, stack_frame):
    log.info("Shutdown requested via %s", sig_num)
    reactor.callLater(0, reactor.stop)
    waited = 0
    while reactor.running:
        if waited > 5:
            log.error("timed out waiting for reactor shutdown")
            break
        time.sleep(0.1)
        waited += 0.1
    if self.mcp:
        self.mcp.shutdown()
    MesosClusterRepository.shutdown()
    self.context.terminate(sig_num, stack_frame)
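# A minimal sketch of how a handler with this (sig_num, stack_frame)
# signature is typically installed. The helper name and the exact
# signals trapped are assumptions for illustration, not taken from
# this section.
import signal

def _register_signal_handlers(self):  # hypothetical helper
    signal.signal(signal.SIGTERM, self._handle_shutdown)
    signal.signal(signal.SIGINT, self._handle_shutdown)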
def submit_command(self):
    serializer = filehandler.OutputStreamSerializer(self.output_path)
    mesos_cluster = MesosClusterRepository.get_cluster()
    task = mesos_cluster.create_task(
        action_run_id=self.id,
        command=self.command,
        cpus=self.cpus,
        mem=self.mem,
        constraints=self.constraints,
        docker_image=self.docker_image,
        docker_parameters=self.docker_parameters,
        env=self.env,
        extra_volumes=self.extra_volumes,
        serializer=serializer,
    )
    if not task:  # Mesos is disabled
        self.fail(None)
        return
    self.mesos_task_id = task.get_mesos_id()
    # Watch before submitting, in case submit causes a transition
    self.watch(task)
    mesos_cluster.submit(task)
    return task
def recover(self):
    if not self.machine.check('running'):
        log.error(
            f'{self} unable to transition from {self.machine.state} '
            'to running for recovery'
        )
        return
    if self.mesos_task_id is None:
        log.error(f'{self} no task ID, cannot recover')
        self.fail_unknown()
        return

    log.info(f'{self} recovering Mesos run')
    serializer = filehandler.OutputStreamSerializer(self.output_path)
    mesos_cluster = MesosClusterRepository.get_cluster()
    task = self._create_mesos_task(
        mesos_cluster,
        serializer,
        self.mesos_task_id,
    )
    if not task:
        log.warning(
            f'{self} cannot recover, Mesos is disabled or '
            f'invalid task ID {self.mesos_task_id!r}'
        )
        self.fail_unknown()
        return

    self.watch(task)
    mesos_cluster.recover(task)
    # Reset status
    self.exit_status = None
    self.end_time = None
    self.transition_and_notify('running')
    return task
def test_configure(self):
    clusters = [
        MesosClusterRepository.get_cluster(address)
        for address in ['d', 'e']
    ]
    mock_volume = mock.Mock()
    options = mock.Mock(
        master_port=5000,
        secret='/dev/null',
        principal="fake-principal",
        role='tron',
        enabled=False,
        default_volumes=[mock_volume],
        dockercfg_location='auth',
        offer_timeout=1000,
    )
    with mock.patch(
        'tron.mesos.get_secret_from_file',
        autospec=True,
        return_value='test-secret',
    ):
        MesosClusterRepository.configure(options)

    expected_volume = mock_volume._asdict.return_value
    for cluster in clusters:
        cluster.set_enabled.assert_called_once_with(False)
        cluster.configure_tasks.assert_called_once_with(
            default_volumes=[expected_volume],
            dockercfg_location='auth',
            offer_timeout=1000,
        )

    # Next cluster we get should be initialized with the same settings
    MesosClusterRepository.get_cluster('f')
    self.cluster_cls.assert_called_with(
        mesos_address='f',
        mesos_master_port=5000,
        secret='test-secret',
        principal="fake-principal",
        mesos_role='tron',
        framework_id=None,
        enabled=False,
        default_volumes=[expected_volume],
        dockercfg_location='auth',
        offer_timeout=1000,
    )
def submit_command(self):
    serializer = filehandler.OutputStreamSerializer(self.output_path)
    mesos_cluster = MesosClusterRepository.get_cluster()
    task = self._create_mesos_task(mesos_cluster, serializer)
    if not task:  # Mesos is disabled
        self.fail(self.EXIT_MESOS_DISABLED)
        return
    self.mesos_task_id = task.get_mesos_id()
    # Watch before submitting, in case submit causes a transition
    self.watch(task)
    mesos_cluster.submit(task)
    return task
def recover(self):
    if self.mesos_task_id is None:
        log.error(f'{self} no task ID, cannot recover')
        return
    if not self.machine.check('running'):
        log.error(
            f'{self} unable to transition from {self.machine.state} '
            'to running for recovery'
        )
        return

    log.info(f'{self} recovering Mesos run')
    serializer = filehandler.OutputStreamSerializer(self.output_path)
    mesos_cluster = MesosClusterRepository.get_cluster()
    task = mesos_cluster.create_task(
        action_run_id=self.id,
        command=self.command,
        cpus=self.cpus,
        mem=self.mem,
        constraints=self.constraints,
        docker_image=self.docker_image,
        docker_parameters=self.docker_parameters,
        env=self.env,
        extra_volumes=self.extra_volumes,
        serializer=serializer,
        task_id=self.mesos_task_id,
    )
    if not task:
        log.warning(
            f'{self} cannot recover, Mesos is disabled or '
            f'invalid task ID {self.mesos_task_id!r}'
        )
        self.fail_unknown()
        return

    self.watch(task)
    mesos_cluster.recover(task)
    # Reset status
    self.exit_status = None
    self.end_time = None
    self.transition_and_notify('running')
    return task
def _kill_mesos_task(self):
    msgs = []
    if not self.is_active:
        msgs.append(f'Action is {self.state}, not running. Continuing anyway.')
    mesos_cluster = MesosClusterRepository.get_cluster()
    if self.mesos_task_id is None:
        msgs.append("Error: Can't find task id for the action.")
    else:
        msgs.append(f"Sending kill for {self.mesos_task_id}...")
        succeeded = mesos_cluster.kill(self.mesos_task_id)
        if succeeded:
            msgs.append(
                "Sent! It can take up to docker_stop_timeout "
                "(current setting is 2 mins) to stop."
            )
        else:
            msgs.append("Error while sending kill request. Please try again.")
    return '\n'.join(msgs)
def test_get_cluster_repeated_mesos_address(self):
    first = MesosClusterRepository.get_cluster('master-a.com')
    second = MesosClusterRepository.get_cluster('master-a.com')
    assert_equal(first, second)
    assert_equal(self.cluster_cls.call_count, 1)
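# A minimal sketch of the caching contract these tests pin down: one
# cluster object per master address, reused on repeated lookups, and
# stopped on shutdown. `cluster_cls` stands in for the real cluster
# class (patched as self.cluster_cls in the tests); everything below
# is illustrative, not the implementation in tron.mesos.
class ClusterRepositorySketch:
    cluster_cls = None  # injected/patched cluster factory
    clusters = {}

    @classmethod
    def get_cluster(cls, address):
        if address not in cls.clusters:
            # First lookup for this address: build and cache a cluster.
            cls.clusters[address] = cls.cluster_cls(mesos_address=address)
        return cls.clusters[address]

    @classmethod
    def shutdown(cls):
        # Stop every cached cluster, as test_shutdown asserts.
        for cluster in cls.clusters.values():
            cluster.stop()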