def kill_children(signal, frame):
    """Signal handler: terminate the mesos and relay child processes.

    Follows the ``signal.signal`` handler convention: ``signal`` is the
    received signal number, ``frame`` the interrupted stack frame
    (unused).  Always exits the process with status 1.
    """
    log.error(
        'Received a signal that is trying to terminate this process.'
        ' Terminating mesos and relay child processes!',
        extra=dict(
            mesos_framework_name=ns.mesos_framework_name, signal=signal))
    # Best-effort shutdown: attempt each child independently so a failure
    # terminating one does not prevent terminating the other.
    try:
        mesos.terminate()
        log.info(
            'terminated mesos scheduler',
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
    # was a bare `except:` -- that would also swallow SystemExit and
    # KeyboardInterrupt; Exception is the widest sensible net here
    except Exception:
        log.exception(
            'could not terminate mesos scheduler',
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
    try:
        relay.terminate()
        log.info(
            'terminated relay',
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
    except Exception:
        log.exception(
            'could not terminate relay',
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
    sys.exit(1)
def _statusUpdate(self, driver, update):
    """Handle a mesos task status update and track consecutive failures.

    Failed/lost tasks increment the failure counter; finished/starting
    tasks decay it (never below zero).  When the counter reaches
    ``ns.max_failures`` the driver is stopped and MaxFailuresReached is
    raised.  Setting ``max_failures`` to -1 disables the accounting.
    """
    log.debug(
        'task status update: %s' % str(update.message),
        extra=dict(
            task_id=update.task_id.value, task_state=update.state,
            slave_id=update.slave_id.value, timestamp=update.timestamp,
            mesos_framework_name=self.ns.mesos_framework_name))
    if self.ns.max_failures == -1:
        # failure tracking disabled: never quit because of failed tasks
        return
    state = update.state
    if state in (mesos_pb2.TASK_FAILED, mesos_pb2.TASK_LOST):
        self.failures += 1
    elif state in (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_STARTING):
        # healthy activity gradually forgives earlier failures
        self.failures = max(self.failures - 1, 0)
    if self.failures >= self.ns.max_failures:
        log.error(
            "Max allowable number of failures reached",
            extra=dict(
                max_failures=self.failures,
                mesos_framework_name=self.ns.mesos_framework_name))
        driver.stop()
        raise MaxFailuresReached(self.failures)
def _create_task_add_task_resources(task, ns):
    """Attach the resources declared in ``ns.mesos_task_resources`` to ``task``.

    Scalar-, range-, and set-valued resources are each encoded with the
    matching mesos protobuf ``Value`` type, casting values with the
    typecast registered in SCALAR_KEYS / RANGE_KEYS / SET_KEYS.

    Raises:
        UserWarning: if any key in mesos_task_resources is not a
            recognized resource name.
    """
    task_resources = dict(ns.mesos_task_resources)
    seen = set()
    for key in set(SCALAR_KEYS).intersection(task_resources):
        seen.add(key)
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.SCALAR
        typecast = SCALAR_KEYS[key]
        resource.scalar.value = typecast(task_resources[key])
    for key in set(RANGE_KEYS).intersection(task_resources):
        seen.add(key)
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.RANGES
        # typecast is per-key, so hoist it out of the per-range loop
        typecast = RANGE_KEYS[key]
        for range_data in task_resources[key]:
            inst = resource.ranges.range.add()
            inst.begin = typecast(range_data[0])
            inst.end = typecast(range_data[1])
    for key in set(SET_KEYS).intersection(task_resources):
        typecast = SET_KEYS[key]
        seen.add(key)
        # BUGFIX: resources.add() was called twice here, leaving an extra
        # empty (nameless, typeless) resource attached to the task.
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.SET
        for elem in task_resources[key]:
            resource.set.item.append(typecast(elem))
    unrecognized_keys = set(task_resources).difference(seen)
    if unrecognized_keys:
        msg = "Unrecognized keys in task_resources!"
        log.error(msg, extra=dict(
            unrecognized_keys=unrecognized_keys,
            mesos_framework_name=ns.mesos_framework_name))
        raise UserWarning(
            "%s unrecognized_keys: %s" % (msg, unrecognized_keys))
def init_mesos_scheduler(ns, MV, exception_sender, mesos_ready):
    """Build the mesos scheduler driver and run it until it stops.

    Registers the framework described by ``ns`` with the mesos master and
    blocks in ``driver.run()``.  Exits the process with status 0 when the
    driver stops cleanly, 1 otherwise.

    Raises:
        ImportError: if the mesos native bindings are not installed.
    """
    import mesos.interface
    from mesos.interface import mesos_pb2
    try:
        import mesos.native
    except ImportError:
        log.error(
            "Oops! Mesos native bindings are not installed. You can download"
            " these binaries from mesosphere.",
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
        raise
    log.info(
        'starting mesos scheduler',
        extra=dict(mesos_framework_name=ns.mesos_framework_name))
    # Describe this framework to the mesos master.
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # empty: have mesos fill in the current user
    framework.name = "Relay.Mesos: %s" % ns.mesos_framework_name
    if ns.mesos_framework_principal:
        framework.principal = ns.mesos_framework_principal
    if ns.mesos_framework_role:
        framework.role = ns.mesos_framework_role
    if ns.mesos_checkpoint:
        framework.checkpoint = True
    # Wire our Scheduler into the native driver.
    scheduler = Scheduler(
        MV=MV, exception_sender=exception_sender,
        mesos_ready=mesos_ready, ns=ns)
    driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework, ns.mesos_master)
    atexit.register(driver.stop)
    # Blocks until the driver is stopped or aborted.
    status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
    driver.stop()  # Ensure that the driver process terminates.
    sys.exit(status)
def _create_task_add_task_resources(task, ns):
    """Attach the resources declared in ``ns.mesos_task_resources`` to ``task``.

    Scalar-, range-, and set-valued resources are each encoded with the
    matching mesos protobuf ``Value`` type, casting values with the
    typecast registered in SCALAR_KEYS / RANGE_KEYS / SET_KEYS.

    Raises:
        UserWarning: if any key in mesos_task_resources is not a
            recognized resource name.
    """
    task_resources = dict(ns.mesos_task_resources)
    seen = set()
    for key in set(SCALAR_KEYS).intersection(task_resources):
        seen.add(key)
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.SCALAR
        typecast = SCALAR_KEYS[key]
        resource.scalar.value = typecast(task_resources[key])
    for key in set(RANGE_KEYS).intersection(task_resources):
        seen.add(key)
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.RANGES
        # typecast is per-key, so hoist it out of the per-range loop
        typecast = RANGE_KEYS[key]
        for range_data in task_resources[key]:
            inst = resource.ranges.range.add()
            inst.begin = typecast(range_data[0])
            inst.end = typecast(range_data[1])
    for key in set(SET_KEYS).intersection(task_resources):
        typecast = SET_KEYS[key]
        seen.add(key)
        # BUGFIX: resources.add() was called twice here, leaving an extra
        # empty (nameless, typeless) resource attached to the task.
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.SET
        for elem in task_resources[key]:
            resource.set.item.append(typecast(elem))
    unrecognized_keys = set(task_resources).difference(seen)
    if unrecognized_keys:
        msg = "Unrecognized keys in task_resources!"
        log.error(msg, extra=dict(unrecognized_keys=unrecognized_keys,
                                  mesos_framework_name=ns.mesos_framework_name))
        raise UserWarning("%s unrecognized_keys: %s" % (msg, unrecognized_keys))
def _statusUpdate(self, driver, update):
    """Process one task status update from mesos, enforcing max_failures.

    A failed or lost task bumps ``self.failures``; a finished or starting
    task decrements it, clamped at zero.  Hitting ``ns.max_failures``
    stops the driver and raises MaxFailuresReached.  ``max_failures == -1``
    means "never give up" and skips the bookkeeping.
    """
    log.debug('task status update: %s' % str(update.message), extra=dict(
        task_id=update.task_id.value, task_state=update.state,
        slave_id=update.slave_id.value, timestamp=update.timestamp,
        mesos_framework_name=self.ns.mesos_framework_name))
    if self.ns.max_failures == -1:
        return  # don't quit even if you are getting failures
    m = mesos_pb2
    bad_states = [m.TASK_FAILED, m.TASK_LOST]
    good_states = [m.TASK_FINISHED, m.TASK_STARTING]
    if update.state in bad_states:
        self.failures += 1
    elif update.state in good_states:
        # decay the counter on healthy progress, never below zero
        self.failures = max(self.failures - 1, 0)
    if self.failures < self.ns.max_failures:
        return
    log.error("Max allowable number of failures reached", extra=dict(
        max_failures=self.failures,
        mesos_framework_name=self.ns.mesos_framework_name))
    driver.stop()
    raise MaxFailuresReached(self.failures)
def main(ns):
    """ Run Relay as a Mesos framework.
    Relay's event loop and the Mesos scheduler each run in separate
    processes and communicate through a multiprocessing.Pipe.  These two
    processes bounce control back and forth between mesos resourceOffers
    and Relay's warmer/cooler functions.  Relay warmer/cooler functions
    request that mesos tasks get spun up, but those requests are only
    filled if the mesos scheduler receives enough relevant offers.
    Relay's requests don't build up: only the largest request since the
    last fulfilled request is fulfilled at moment enough mesos resources
    are available.
    """
    # --mesos_master is required: bail out with usage info if absent.
    if ns.mesos_master is None:
        log.error(
            "Oops! You didn't define --mesos_master",
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
        build_arg_parser().print_usage()
        sys.exit(1)
    if not ns.mesos_task_resources:
        # log.warn is deprecated in the logging module; use log.warning
        log.warning(
            "You didn't define '--mesos_task_resources'."
            " Tasks may not start on slaves",
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
    log.info(
        "Starting Relay Mesos!",
        extra={k: str(v) for k, v in ns.__dict__.items()})
    # a distributed value storing the num and type of tasks mesos scheduler
    # should create at any given moment in time.
    # Sign of MV determines task type: warmer or cooler
    # ie. a positive value of n means n warmer tasks
    MV = mp.Array('d', [0, 0])  # max_val is a ctypes.c_int64
    # store exceptions that may be raised
    exception_receiver, exception_sender = mp.Pipe(False)
    # notify relay when mesos framework is ready
    mesos_ready = mp.Condition()
    # copy and then override warmer and cooler
    ns_relay = ns.__class__(**{k: v for k, v in ns.__dict__.items()})
    if ns.warmer:
        ns_relay.warmer = warmer_cooler_wrapper(MV, ns)
    if ns.cooler:
        ns_relay.cooler = warmer_cooler_wrapper(MV, ns)
    mesos_name = "Relay.Mesos Scheduler"
    mesos = mp.Process(
        target=catch(init_mesos_scheduler, exception_sender),
        kwargs=dict(ns=ns, MV=MV, exception_sender=exception_sender,
                    mesos_ready=mesos_ready),
        name=mesos_name)
    relay_name = "Relay.Runner Event Loop"
    relay = mp.Process(
        target=catch(init_relay, exception_sender),
        args=(ns_relay, mesos_ready, ns.mesos_framework_name),
        name=relay_name)
    mesos.start()  # start mesos framework
    relay.start()  # start relay's loop
    set_signals(mesos, relay, ns)
    # Supervise both children until one dies or raises.
    while True:
        if exception_receiver.poll():
            exception_receiver.recv()
            log.error(
                'Terminating child processes because one of them raised'
                ' an exception',
                extra=dict(
                    is_relay_alive=relay.is_alive(),
                    is_mesos_alive=mesos.is_alive(),
                    mesos_framework_name=ns.mesos_framework_name))
            break
        if not relay.is_alive():
            log.error(
                "Relay died. Check logs to see why.",
                extra=dict(mesos_framework_name=ns.mesos_framework_name))
            break
        if not mesos.is_alive():
            log.error(
                "Mesos Scheduler died and didn't notify me of its exception."
                " This may be a code bug. Check logs.",
                extra=dict(mesos_framework_name=ns.mesos_framework_name))
            break
        # save cpu cycles by checking for subprocess failures less often
        time.sleep(min(ns.delay, 5))
    relay.terminate()
    mesos.terminate()
    sys.exit(1)