class MesosCluster:
    def __init__(
        self,
        mesos_address,
        mesos_master_port=None,
        secret=None,
        principal=None,
        mesos_role=None,
        framework_id=None,
        enabled=True,
        default_volumes=None,
        dockercfg_location=None,
        offer_timeout=None,
    ):
        self.mesos_address = mesos_address
        self.mesos_master_port = mesos_master_port
        self.secret = secret
        self.principal = principal
        self.mesos_role = mesos_role
        self.enabled = enabled
        self.default_volumes = default_volumes or []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout
        self.framework_id = framework_id
        self.processor = TaskProcessor()
        self.queue = PyDeferredQueue()
        self.deferred = None
        self.runner = None
        self.tasks = {}

        self.processor.load_plugin(
            provider_module='task_processing.plugins.mesos'
        )
        self.connect()

    def set_enabled(self, is_enabled):
        self.enabled = is_enabled
        if is_enabled:
            self.connect()
        else:
            self.stop(fail_tasks=True)

    def configure_tasks(
        self,
        default_volumes,
        dockercfg_location,
        offer_timeout,
    ):
        self.default_volumes = default_volumes
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout

    def connect(self):
        self.runner = self.get_runner(self.mesos_address, self.queue)
        self.handle_next_event()

    def handle_next_event(self, deferred_result=None):
        if self.deferred and not self.deferred.called:
            log.warning(
                'Already have handlers waiting for next event in queue, '
                'not adding more'
            )
            return
        self.deferred = self.queue.get()
        self.deferred.addCallback(self._process_event)
        self.deferred.addCallback(self.handle_next_event)
        self.deferred.addErrback(logError)
        self.deferred.addErrback(self.handle_next_event)

    def _check_connection(self):
        if self.runner.stopping:
            # Last framework was terminated for some reason, re-connect.
            log.info('Last framework stopped, re-connecting')
            self.connect()
        elif self.deferred.called:
            # Just in case callbacks are missing, re-add.
            self.handle_next_event()

    def submit(self, task):
        if not task:
            return
        if not self.enabled:
            task.log.info('Task failed to start, Mesos is disabled.')
            task.exited(1)
            return
        self._check_connection()
        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task

        env = task.get_config()['environment']
        clusterman_resource_str = env.get('CLUSTERMAN_RESOURCES')
        clusterman_metrics = get_clusterman_metrics()
        if clusterman_resource_str and clusterman_metrics:
            clusterman_resources = json.loads(clusterman_resource_str)
            cluster = env.get('EXECUTOR_CLUSTER', env.get('PAASTA_CLUSTER'))
            pool = env.get('EXECUTOR_POOL', env.get('PAASTA_POOL'))
            aws_region = staticconf.read(
                f'clusters.{cluster}.aws_region', namespace='clusterman',
            )
            metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(
                region_name=aws_region,
                app_identifier=pool,
            )
            with metrics_client.get_writer(
                clusterman_metrics.APP_METRICS, aggregate_meteorite_dims=True,
            ) as writer:
                for metric_key, metric_value in clusterman_resources.items():
                    writer.send((metric_key, int(time.time()), metric_value))

        self.runner.run(task.get_config())
        log.info(
            'Submitting task {} to {}'.format(
                mesos_task_id,
                self.mesos_address,
            ),
        )
        task.report_resources()

    def recover(self, task):
        if not task:
            return
        if not self.enabled:
            task.log.info('Could not recover task, Mesos is disabled.')
            task.exited(None)
            return
        self._check_connection()
        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        task.log.info(
            'TRON RESTARTED! Starting recovery procedure by reconciling '
            'state for this task from Mesos'
        )
        task.started()
        self.runner.reconcile(task.get_config())
        task.report_resources()

    def create_task(
        self,
        action_run_id,
        command,
        cpus,
        mem,
        disk,
        constraints,
        docker_image,
        docker_parameters,
        env,
        extra_volumes,
        serializer,
        task_id=None,
    ):
        if not self.runner:
            return None

        uris = [self.dockercfg_location] if self.dockercfg_location else []
        volumes = combine_volumes(self.default_volumes, extra_volumes)
        task_kwargs = {
            'name': action_run_id,
            'cmd': command,
            'cpus': cpus,
            'mem': mem,
            'disk': disk,
            'constraints': constraints,
            'image': docker_image,
            'docker_parameters': docker_parameters,
            'environment': env,
            'volumes': volumes,
            'uris': uris,
            'offer_timeout': self.offer_timeout,
        }
        task_config = self.runner.TASK_CONFIG_INTERFACE(**task_kwargs)

        if task_id is not None:
            try:
                task_config = task_config.set_task_id(task_id)
            except ValueError:
                log.error(f'Invalid {task_id} for {action_run_id}')
                return

        return MesosTask(action_run_id, task_config, serializer)

    def get_runner(self, mesos_address, queue):
        if not self.enabled:
            log.info('Mesos is disabled, not creating a framework.')
            return None

        if self.runner and not self.runner.stopping:
            log.info('Already have a running framework, not creating one.')
            return self.runner

        framework_name = 'tron-{}'.format(socket.gethostname())
        executor = self.processor.executor_from_config(
            provider='mesos_task',
            provider_config={
                'secret': self.secret,
                'principal': self.principal,
                'mesos_address': get_mesos_leader(
                    mesos_address, self.mesos_master_port,
                ),
                'role': self.mesos_role,
                'framework_name': framework_name,
                'framework_id': self.framework_id,
                'failover': True,
            },
        )

        def log_output(task_id, message, stream):
            logger = logging.getLogger(
                '{}.{}.{}'.format(
                    TASK_OUTPUT_LOGGER,
                    task_id,
                    stream,
                )
            )
            logger.info(message)

        logging_executor = self.processor.executor_from_config(
            provider='logging',
            provider_config={
                'downstream_executor': executor,
                'handler': log_output,
                'format_string': '{line}',
            },
        )
        return Subscription(logging_executor, queue)

    def _process_event(self, event):
        if event.kind == 'control':
            message = getattr(event, 'message', None)
            if message == 'stop':
                # Framework has been removed, stop it.
                log.warning('Framework has been stopped: {}'.format(event.raw))
                self.stop()
                MesosClusterRepository.remove(self.mesos_address)
            elif message == 'unknown':
                log.warning(
                    'Unknown error from Mesos master: {}'.format(event.raw)
                )
            elif message == 'registered':
                framework_id = event.raw['framework_id']['value']
                MesosClusterRepository.save(self.mesos_address, framework_id)
            else:
                log.warning('Unknown type of control event: {}'.format(event))
        elif event.kind == 'task':
            if not hasattr(event, 'task_id'):
                log.warning('Task event missing task_id: {}'.format(event))
                return
            if event.task_id not in self.tasks:
                log.warning(
                    'Received event for unknown task {}: {}'.format(
                        event.task_id,
                        event,
                    ),
                )
                return
            task = self.tasks[event.task_id]
            task.handle_event(event)
            if task.is_done:
                del self.tasks[event.task_id]
        else:
            log.warning('Unknown type of event: {}'.format(event))

    def stop(self, fail_tasks=False):
        self.framework_id = None
        if self.runner:
            self.runner.stop()

        # Clear message queue
        if self.deferred:
            self.deferred.cancel()
            self.deferred = None
        self.queue = PyDeferredQueue()

        if fail_tasks:
            for key, task in list(self.tasks.items()):
                task.exited(None)
                del self.tasks[key]

    def kill(self, task_id):
        return self.runner.kill(task_id)
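

# --- Usage sketch (not part of the original module) ---
# A minimal, hedged example of how the MesosCluster above might be driven.
# The master address, role, resource sizes, and the placeholder serializer
# are illustrative assumptions, not values from this codebase; a real caller
# in tron supplies its own state serializer and action run id, and a
# reachable Mesos master is required for connect() to succeed.
if __name__ == '__main__':
    cluster = MesosCluster(
        mesos_address='mesos-master.example.com',  # assumed master hostname
        mesos_master_port=5050,                    # default Mesos master port
        mesos_role='*',
        offer_timeout=300,                         # seconds to wait for an offer
    )

    # create_task returns None when no framework runner is connected, or
    # when the task_processing config interface rejects the given task_id.
    task = cluster.create_task(
        action_run_id='example_job.0.run_action',  # illustrative id
        command='echo hello',
        cpus=0.1,
        mem=64,
        disk=128,
        constraints=[],
        docker_image='busybox:latest',
        docker_parameters=[],
        env={},
        extra_volumes=[],
        serializer=None,  # placeholder; tron passes a filehandler/state serializer here
    )
    if task is not None:
        cluster.submit(task)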