Beispiel #1
0
 def kill_children(signal, frame):
     """Signal handler: terminate both child processes, then exit.

     Logs the received signal, calls ``terminate()`` on the ``mesos`` and
     ``relay`` objects taken from the enclosing scope (not visible in this
     excerpt), and finally exits the current process with status 1.

     Each termination is attempted independently, so a failure to stop one
     child does not prevent the attempt on the other.

     Args:
         signal: the signal number delivered to this process.
         frame: the interrupted stack frame (unused).
     """
     log.error(
         'Received a signal that is trying to terminate this process.'
         ' Terminating mesos and relay child processes!', extra=dict(
             mesos_framework_name=ns.mesos_framework_name,
             signal=signal))
     try:
         mesos.terminate()
         log.info(
             'terminated mesos scheduler',
             extra=dict(mesos_framework_name=ns.mesos_framework_name))
     except Exception:
         # Catch Exception rather than using a bare ``except:`` so that
         # SystemExit / KeyboardInterrupt are never silently swallowed here.
         log.exception(
             'could not terminate mesos scheduler',
             extra=dict(mesos_framework_name=ns.mesos_framework_name))
     try:
         relay.terminate()
         log.info(
             'terminated relay',
             extra=dict(mesos_framework_name=ns.mesos_framework_name))
     except Exception:
         log.exception(
             'could not terminate relay',
             extra=dict(mesos_framework_name=ns.mesos_framework_name))
     sys.exit(1)
Beispiel #2
0
    def _statusUpdate(self, driver, update):
        """Track task failures and stop the driver once too many occurred.

        Every update is logged at debug level.  When ``ns.max_failures`` is
        -1 nothing else happens.  Otherwise a failed or lost task increments
        ``self.failures``, a finished or starting task decrements it (never
        below zero), and reaching ``ns.max_failures`` stops the driver and
        raises ``MaxFailuresReached``.
        """
        summary = 'task status update: %s' % str(update.message)
        details = {
            'task_id': update.task_id.value,
            'task_state': update.state,
            'slave_id': update.slave_id.value,
            'timestamp': update.timestamp,
            'mesos_framework_name': self.ns.mesos_framework_name,
        }
        log.debug(summary, extra=details)

        # -1 means "never give up", regardless of how many tasks fail
        if self.ns.max_failures == -1:
            return

        bad_states = (mesos_pb2.TASK_FAILED, mesos_pb2.TASK_LOST)
        good_states = (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_STARTING)
        if update.state in bad_states:
            self.failures += 1
        elif update.state in good_states:
            self.failures = max(self.failures - 1, 0)

        if self.failures >= self.ns.max_failures:
            error_details = {
                'max_failures': self.failures,
                'mesos_framework_name': self.ns.mesos_framework_name,
            }
            log.error(
                "Max allowable number of failures reached",
                extra=error_details)
            driver.stop()
            raise MaxFailuresReached(self.failures)
Beispiel #3
0
def _create_task_add_task_resources(task, ns):
    """Copy the resources requested in ``ns.mesos_task_resources`` onto a task.

    Each key of ``ns.mesos_task_resources`` is looked up in SCALAR_KEYS,
    RANGE_KEYS and SET_KEYS (mappings of resource name -> typecast callable,
    defined elsewhere in this module).  For every recognized key exactly one
    entry is added to ``task.resources`` with the matching value type.

    Args:
        task: object exposing a ``resources`` collection with an ``add()``
            method (presumably a mesos TaskInfo protobuf -- confirm at caller).
        ns: namespace providing ``mesos_task_resources`` (anything accepted by
            ``dict()``) and ``mesos_framework_name`` (used for logging).

    Raises:
        UserWarning: if any key is in none of the three key tables.  Note
            that recognized resources have already been added by then.
    """
    task_resources = dict(ns.mesos_task_resources)
    seen = set()
    for key in set(SCALAR_KEYS).intersection(task_resources):
        seen.add(key)
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.SCALAR
        typecast = SCALAR_KEYS[key]
        resource.scalar.value = typecast(task_resources[key])

    for key in set(RANGE_KEYS).intersection(task_resources):
        seen.add(key)
        typecast = RANGE_KEYS[key]  # loop-invariant: one lookup per key
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.RANGES
        # each range_data is indexed as (begin, end)
        for range_data in task_resources[key]:
            inst = resource.ranges.range.add()
            inst.begin = typecast(range_data[0])
            inst.end = typecast(range_data[1])

    for key in set(SET_KEYS).intersection(task_resources):
        typecast = SET_KEYS[key]
        seen.add(key)
        # add() must be called exactly once per key: a second call would
        # leave an extra, empty resource entry on the task.
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.SET
        for elem in task_resources[key]:
            resource.set.item.append(typecast(elem))

    unrecognized_keys = set(task_resources).difference(seen)
    if unrecognized_keys:
        msg = "Unrecognized keys in task_resources!"
        log.error(msg, extra=dict(
            unrecognized_keys=unrecognized_keys,
            mesos_framework_name=ns.mesos_framework_name))
        raise UserWarning(
            "%s unrecognized_keys: %s" % (msg, unrecognized_keys))
Beispiel #4
0
def init_mesos_scheduler(ns, MV, exception_sender, mesos_ready):
    """Build the framework and scheduler driver, run it, then exit.

    The mesos modules are imported lazily inside this function.  If the
    native bindings cannot be imported the problem is logged and the
    ImportError is re-raised.

    This function does not return: once ``driver.run()`` comes back the
    driver is stopped and ``sys.exit`` is called with status 0 when the run
    result equals ``mesos_pb2.DRIVER_STOPPED``, and 1 otherwise.
    """
    import mesos.interface
    from mesos.interface import mesos_pb2
    try:
        import mesos.native
    except ImportError:
        log.error(
            "Oops! Mesos native bindings are not installed.  You can download"
            " these binaries from mesosphere.",
            extra={'mesos_framework_name': ns.mesos_framework_name})
        raise

    log.info(
        'starting mesos scheduler',
        extra={'mesos_framework_name': ns.mesos_framework_name})

    # describe the framework; optional fields are set only when configured
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # an empty user lets Mesos fill in the current one
    framework.name = "Relay.Mesos: %s" % ns.mesos_framework_name
    if ns.mesos_framework_principal:
        framework.principal = ns.mesos_framework_principal
    if ns.mesos_framework_role:
        framework.role = ns.mesos_framework_role
    if ns.mesos_checkpoint:
        framework.checkpoint = True

    # wire the scheduler into a driver pointed at the configured master
    scheduler = Scheduler(
        MV=MV,
        exception_sender=exception_sender,
        mesos_ready=mesos_ready,
        ns=ns)
    driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework, ns.mesos_master)
    atexit.register(driver.stop)

    # block until the driver finishes, then translate its result to a status
    stopped_cleanly = driver.run() == mesos_pb2.DRIVER_STOPPED
    driver.stop()  # make sure the driver process terminates
    if stopped_cleanly:
        sys.exit(0)
    sys.exit(1)
Beispiel #5
0
def _create_task_add_task_resources(task, ns):
    """Copy the resources requested in ``ns.mesos_task_resources`` onto a task.

    Each key of ``ns.mesos_task_resources`` is looked up in SCALAR_KEYS,
    RANGE_KEYS and SET_KEYS (mappings of resource name -> typecast callable,
    defined elsewhere in this module).  For every recognized key exactly one
    entry is added to ``task.resources`` with the matching value type.

    Args:
        task: object exposing a ``resources`` collection with an ``add()``
            method (presumably a mesos TaskInfo protobuf -- confirm at caller).
        ns: namespace providing ``mesos_task_resources`` (anything accepted by
            ``dict()``) and ``mesos_framework_name`` (used for logging).

    Raises:
        UserWarning: if any key is in none of the three key tables.  Note
            that recognized resources have already been added by then.
    """
    task_resources = dict(ns.mesos_task_resources)
    seen = set()
    for key in set(SCALAR_KEYS).intersection(task_resources):
        seen.add(key)
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.SCALAR
        typecast = SCALAR_KEYS[key]
        resource.scalar.value = typecast(task_resources[key])

    for key in set(RANGE_KEYS).intersection(task_resources):
        seen.add(key)
        typecast = RANGE_KEYS[key]  # loop-invariant: one lookup per key
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.RANGES
        # each range_data is indexed as (begin, end)
        for range_data in task_resources[key]:
            inst = resource.ranges.range.add()
            inst.begin = typecast(range_data[0])
            inst.end = typecast(range_data[1])

    for key in set(SET_KEYS).intersection(task_resources):
        typecast = SET_KEYS[key]
        seen.add(key)
        # add() must be called exactly once per key: a second call would
        # leave an extra, empty resource entry on the task.
        resource = task.resources.add()
        resource.name = key
        resource.type = mesos_pb2.Value.SET
        for elem in task_resources[key]:
            resource.set.item.append(typecast(elem))

    unrecognized_keys = set(task_resources).difference(seen)
    if unrecognized_keys:
        msg = "Unrecognized keys in task_resources!"
        log.error(msg,
                  extra=dict(unrecognized_keys=unrecognized_keys,
                             mesos_framework_name=ns.mesos_framework_name))
        raise UserWarning("%s unrecognized_keys: %s" %
                          (msg, unrecognized_keys))
Beispiel #6
0
    def _statusUpdate(self, driver, update):
        """Handle a task status update and enforce the failure limit.

        The update is always logged at debug level.  A ``ns.max_failures`` of
        -1 disables failure tracking.  Otherwise ``self.failures`` goes up by
        one for a failed or lost task and down by one (floored at zero) for a
        finished or starting task; when it reaches ``ns.max_failures`` the
        driver is stopped and ``MaxFailuresReached`` is raised.
        """
        ns = self.ns
        log.debug(
            'task status update: %s' % str(update.message),
            extra={
                'task_id': update.task_id.value,
                'task_state': update.state,
                'slave_id': update.slave_id.value,
                'timestamp': update.timestamp,
                'mesos_framework_name': ns.mesos_framework_name,
            })
        if ns.max_failures == -1:
            return  # failures are tolerated indefinitely

        state = update.state
        if state in (mesos_pb2.TASK_FAILED, mesos_pb2.TASK_LOST):
            self.failures += 1
        elif state in (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_STARTING):
            self.failures = max(self.failures - 1, 0)

        limit_reached = self.failures >= ns.max_failures
        if limit_reached:
            log.error(
                "Max allowable number of failures reached",
                extra={
                    'max_failures': self.failures,
                    'mesos_framework_name': ns.mesos_framework_name,
                })
            driver.stop()
            raise MaxFailuresReached(self.failures)
Beispiel #7
0
def main(ns):
    """
    Run Relay as a Mesos framework.
    Relay's event loop and the Mesos scheduler each run in separate processes
    and communicate through a multiprocessing.Pipe.

    These two processes bounce control back and forth between mesos
    resourceOffers and Relay's warmer/cooler functions.  Relay warmer/cooler
    functions request that mesos tasks get spun up, but those requests are only
    filled if the mesos scheduler receives enough relevant offers.  Relay's
    requests don't build up: only the largest request since the last fulfilled
    request is fulfilled at the moment enough mesos resources are available.

    This function never returns normally: it calls sys.exit(1) either right
    away when --mesos_master is undefined, or after the monitoring loop ends
    (a child reported an exception, or one of the children died) and both
    children have been terminated.
    """
    if ns.mesos_master is None:
        log.error(
            "Oops!  You didn't define --mesos_master",
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
        build_arg_parser().print_usage()
        sys.exit(1)
    if not ns.mesos_task_resources:
        # Logger.warn is a deprecated alias (removed in Python 3.13)
        log.warning(
            "You didn't define '--mesos_task_resources'."
            "  Tasks may not start on slaves",
            extra=dict(mesos_framework_name=ns.mesos_framework_name))
    log.info(
        "Starting Relay Mesos!",
        extra={k: str(v) for k, v in ns.__dict__.items()})

    # a distributed value storing the num and type of tasks mesos scheduler
    # should create at any given moment in time.
    # Sign of MV determines task type: warmer or cooler
    # ie. A positive value of n means n warmer tasks
    MV = mp.Array('d', [0, 0])  # shared array of two C doubles

    # store exceptions that may be raised
    # (duplex=False: first end only receives, second end only sends)
    exception_receiver, exception_sender = mp.Pipe(False)
    # notify relay when mesos framework is ready
    mesos_ready = mp.Condition()

    # copy and then override warmer and cooler
    ns_relay = ns.__class__(**dict(ns.__dict__))
    if ns.warmer:
        ns_relay.warmer = warmer_cooler_wrapper(MV, ns)
    if ns.cooler:
        ns_relay.cooler = warmer_cooler_wrapper(MV, ns)

    mesos_name = "Relay.Mesos Scheduler"
    mesos = mp.Process(
        target=catch(init_mesos_scheduler, exception_sender),
        kwargs=dict(ns=ns, MV=MV, exception_sender=exception_sender,
                    mesos_ready=mesos_ready),
        name=mesos_name)
    relay_name = "Relay.Runner Event Loop"
    relay = mp.Process(
        target=catch(init_relay, exception_sender),
        args=(ns_relay, mesos_ready, ns.mesos_framework_name),
        name=relay_name)
    mesos.start()  # start mesos framework
    relay.start()  # start relay's loop
    set_signals(mesos, relay, ns)

    # supervise both children until one fails or reports an exception
    while True:
        if exception_receiver.poll():
            # drain the message; its content is not used here
            exception_receiver.recv()
            log.error(
                'Terminating child processes because one of them raised'
                ' an exception', extra=dict(
                    is_relay_alive=relay.is_alive(),
                    is_mesos_alive=mesos.is_alive(),
                    mesos_framework_name=ns.mesos_framework_name))
            break
        if not relay.is_alive():
            log.error(
                "Relay died.  Check logs to see why.",
                extra=dict(mesos_framework_name=ns.mesos_framework_name))
            break
        if not mesos.is_alive():
            log.error(
                "Mesos Scheduler died and didn't notify me of its exception."
                "  This may be a code bug.  Check logs.",
                extra=dict(mesos_framework_name=ns.mesos_framework_name))
            break
        # save cpu cycles by checking for subprocess failures less often,
        # but never wait longer than 5 seconds between checks
        time.sleep(min(ns.delay, 5))

    relay.terminate()
    mesos.terminate()
    sys.exit(1)