Example #1
0
    def start_driver(self):
        name = 'OpenCluster'
        if self.options.name:
            name = "%s-%s" % (name, self.options.name)
        else:
            name = "%s-%s" % (
                name, datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"))

        if len(name) > 256:
            name = name[:256] + '...'

        framework = mesos_pb2.FrameworkInfo()
        framework.user = getuser()
        if framework.user == 'root':
            raise Exception("OpenCluster is not allowed to run as 'root'")
        framework.name = name
        framework.hostname = socket.gethostname()

        self.driver = MesosSchedulerDriver(self, framework, self.master)
        self.driver.start()
        logger.debug("Mesos Scheudler driver started")

        self.shuttingdown = False
        self.last_finish_time = time.time()
        self.stopped = False
Example #2
0
def main(master):
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s %(levelname)s] %(message)s')

    # Create a new executor
    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = 'ExampleExecutor'
    executor.name = executor.executor_id.value
    executor.command.value = os.path.abspath('./executor-skeleton.py')

    # Create a new framework
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ''  # the current user
    framework.name = 'ExampleFramework'
    framework.checkpoint = True

    implicitAcknowledgements = 1

    if os.getenv('EXAMPLE_AUTHENTICATE'):
        logging.info('Enabling framework authentication')

        credential = mesos_pb2.Credential()
        credential.principal = os.getenv('EXAMPLE_PRINCIPAL')
        credential.secret = os.getenv('EXAMPLE_SECRET')
        framework.principal = os.getenv('EXAMPLE_PRINCIPAL')

        driver = MesosSchedulerDriver(ExampleScheduler(executor), framework,
                                      master, implicitAcknowledgements,
                                      credential)
    else:
        framework.principal = framework.name

        driver = MesosSchedulerDriver(ExampleScheduler(executor), framework,
                                      master, implicitAcknowledgements)

    def signal_handler(signal, frame):
        logging.info('Shutting down')
        driver.stop()

    # driver.run() blocks, so we run it in a separate thread.
    # This way, we can catch a SIGINT to kill the framework.
    def run_driver_thread():
        status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        driver.stop()  # Ensure the driver process terminates
        sys.exit(status)

    driver_thread = Thread(target=run_driver_thread, args=())
    driver_thread.start()

    logging.info('Scheduler running, Ctrl-C to exit')
    signal.signal(signal.SIGINT, signal_handler)

    # Block the main thread while the driver thread is alive
    while driver_thread.is_alive():
        time.sleep(1)

    logging.info('Framework finished.')
    sys.exit(0)
Example #3
0
    def handle(self, **options):
        '''See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        '''

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info(u'Command starting: scale_scheduler')
        logger.info(u' - Master: %s', mesos_master)
        executor = mesos_pb2.ExecutorInfo()
        executor.executor_id.value = 'scale'
        executor.command.value = '%s %s scale_executor' % (settings.PYTHON_EXECUTABLE, settings.MANAGE_FILE)
        executor.name = 'Scale Executor (Python)'

        self.scheduler = ScaleScheduler(executor)

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = 'Scale Framework (Python)'

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error('Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error('Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

        status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1

        # Perform any required clean up operations like stopping background threads
        status = status or self._shutdown()

        logger.info(u'Command completed: scale_scheduler')
        sys.exit(status)
Example #4
0
    def run_scheduler(self, mesos_master):
        logger.info("I am the leader")
        self.scheduler = ScaleScheduler()
        self.scheduler.initialize()
        scheduler_mgr.hostname = socket.getfqdn()

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = os.getenv('DCOS_PACKAGE_FRAMEWORK_NAME', 'Scale')
        webserver_address = os.getenv('SCALE_WEBSERVER_ADDRESS')

        if webserver_address:
            framework.webui_url = webserver_address

        logger.info('Connecting to Mesos master at %s', mesos_master)

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error(
                    'Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error(
                    'Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework,
                                               mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework,
                                               mesos_master)

        try:
            status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        except:
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        #Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)
Example #5
0
    def __init__(self, scheduler, name, user='', master=os.getenv('MESOS_MASTER'),
                 implicit_acknowledge=1, *args, **kwargs):
        framework = FrameworkInfo(name=name, user=user, *args, **kwargs)
        scheduler = SchedulerProxy(scheduler)
        self.driver = MesosSchedulerDriver(scheduler, encode(framework),
                                           master, implicit_acknowledge)

        def shutdown(signal, frame):
            self.stop()

        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
        atexit.register(self.stop)
Example #6
0
    def run_scheduler(self, mesos_master):
        logger.info("I am the leader")
        self.scheduler = ScaleScheduler()

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = os.getenv('DCOS_PACKAGE_FRAMEWORK_NAME', 'Scale')
        webserver_address = os.getenv('SCALE_WEBSERVER_ADDRESS')

        if webserver_address:
            framework.webui_url = webserver_address

        logger.info('Connecting to Mesos master at %s', mesos_master)

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error('Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error('Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

        try:
            status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        except:
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        #Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)
def create_driver(framework_name,
                  scheduler,
                  system_paasta_config,
                  implicit_acks=False):
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = framework_name
    framework.failover_timeout = 604800
    framework.id.value = find_existing_id_if_exists_or_gen_new(framework.name)
    framework.checkpoint = True

    credential = mesos_pb2.Credential()
    credential.principal = system_paasta_config.get_paasta_native_config(
    )['principal']
    credential.secret = system_paasta_config.get_paasta_native_config(
    )['secret']

    framework.principal = system_paasta_config.get_paasta_native_config(
    )['principal']

    driver = MesosSchedulerDriver(
        scheduler, framework, '%s:%d' %
        (mesos_tools.get_mesos_leader(), mesos_tools.MESOS_MASTER_PORT),
        implicit_acks, credential)
    return driver
Example #8
0
    def __init__(self, scheduler, name, user='', master=os.getenv('MESOS_MASTER'),
                 implicit_acknowledge=1, *args, **kwargs):
        scheduler = SchedulerProxy(scheduler)
        framework = FrameworkInfo(name=name, user=user, *args, **kwargs)
        self.driver = MesosSchedulerDriver(scheduler, encode(framework),
                                           master, implicit_acknowledge)

        def shutdown(signal, frame):
            self.driver.stop()
        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
        atexit.register(self.driver.stop)
Example #9
0
class Running(object):

    def __init__(self, scheduler, name, user='', master=os.getenv('MESOS_MASTER'),
                 implicit_acknowledge=1, *args, **kwargs):
        scheduler = SchedulerProxy(scheduler)
        framework = FrameworkInfo(name=name, user=user, *args, **kwargs)
        self.driver = MesosSchedulerDriver(scheduler, encode(framework),
                                           master, implicit_acknowledge)

        def shutdown(signal, frame):
            self.stop()

        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
        atexit.register(self.stop)

    def run(self):
        return self.driver.run()

    def start(self):
        status = self.driver.start()
        assert status == mesos_pb2.DRIVER_RUNNING
        return status

    def stop(self):
        return self.driver.stop()

    def join(self):
        return self.driver.join()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()
        self.join()
        if exc_type:
            raise exc_type, exc_value, traceback
Example #10
0
class Running(object):

    def __init__(self, scheduler, name, user='', master=os.getenv('MESOS_MASTER'),
                 implicit_acknowledge=1, *args, **kwargs):
        framework = FrameworkInfo(name=name, user=user, *args, **kwargs)
        scheduler = SchedulerProxy(scheduler)
        self.driver = MesosSchedulerDriver(scheduler, encode(framework),
                                           master, implicit_acknowledge)

        def shutdown(signal, frame):
            self.stop()

        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
        atexit.register(self.stop)

    def run(self):
        return self.driver.run()

    def start(self):
        status = self.driver.start()
        assert status == mesos_pb2.DRIVER_RUNNING
        return status

    def stop(self):
        return self.driver.stop()

    def join(self):
        return self.driver.join()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()
        self.join()
        if exc_type:
            raise exc_type, exc_value, traceback
Example #11
0
class Running(object):
    def __init__(self,
                 scheduler,
                 name,
                 user='',
                 master=os.getenv('MESOS_MASTER'),
                 implicit_acknowledge=1,
                 *args,
                 **kwargs):
        scheduler = SchedulerProxy(scheduler)
        framework = FrameworkInfo(name=name, user=user, *args, **kwargs)
        self.driver = MesosSchedulerDriver(scheduler, encode(framework),
                                           master, implicit_acknowledge)

        def shutdown(signal, frame):
            self.driver.stop()

        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
        atexit.register(self.driver.stop)

    def run(self):
        return self.driver.run()

    def start(self):
        status = self.driver.start()
        assert status == mesos_pb2.DRIVER_RUNNING
        return status

    def stop(self):
        logging.info("Stopping Mesos driver")
        self.driver.stop()
        logging.info("Joining Mesos driver")
        result = self.driver.join()
        logging.info("Joined Mesos driver")
        if result != mesos_pb2.DRIVER_STOPPED:
            raise RuntimeError("Mesos driver failed with %i", result)

    def join(self):
        return self.driver.join()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, type, value, traceback):
        self.stop()
Example #12
0
class Running(object):

    def __init__(self, scheduler, name, user='', master=os.getenv('MESOS_MASTER'),
                 implicit_acknowledge=1, *args, **kwargs):
        scheduler = SchedulerProxy(scheduler)
        framework = FrameworkInfo(name=name, user=user, *args, **kwargs)
        self.driver = MesosSchedulerDriver(scheduler, encode(framework),
                                           master, implicit_acknowledge)

        def shutdown(signal, frame):
            self.driver.stop()
        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
        atexit.register(self.driver.stop)

    def run(self):
        return self.driver.run()

    def start(self):
        status = self.driver.start()
        assert status == mesos_pb2.DRIVER_RUNNING
        return status

    def stop(self):
        logging.info("Stopping Mesos driver")
        self.driver.stop()
        logging.info("Joining Mesos driver")
        result = self.driver.join()
        logging.info("Joined Mesos driver")
        if result != mesos_pb2.DRIVER_STOPPED:
            raise RuntimeError("Mesos driver failed with %i", result)

    def join(self):
        return self.driver.join()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, type, value, traceback):
        self.stop()
Example #13
0
def main(master):
    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = 'MinimalExecutor'
    executor.name = executor.executor_id.value
    executor.command.value = os.path.abspath('./executor-minimal.py')

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ''  # the current user
    framework.name = 'MinimalFramework'
    framework.checkpoint = True
    framework.principal = framework.name

    implicitAcknowledgements = 1

    driver = MesosSchedulerDriver(
        MinimalScheduler(executor),
        framework,
        master,
        implicitAcknowledgements
    )

    def signal_handler(signal, frame):
        driver.stop()

    def run_driver_thread():
        status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        driver.stop()
        sys.exit(status)

    driver_thread = Thread(target=run_driver_thread, args=())
    driver_thread.start()

    print('Scheduler running, Ctrl-C to quit.')
    signal.signal(signal.SIGINT, signal_handler)

    while driver_thread.is_alive():
        time.sleep(1)

    sys.exit(0)
Example #14
0
    def start_driver(self):
        name = 'OpenCluster'
        if self.options.name :
            name = "%s-%s" % (name,self.options.name)
        else:
            name = "%s-%s" % (name,datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"))

        if len(name) > 256:
            name = name[:256] + '...'

        framework = mesos_pb2.FrameworkInfo()
        framework.user = getuser()
        if framework.user == 'root':
            raise Exception("OpenCluster is not allowed to run as 'root'")
        framework.name = name
        framework.hostname = socket.gethostname()

        self.driver = MesosSchedulerDriver(self, framework, self.master)
        self.driver.start()
        logger.debug("Mesos Scheudler driver started")

        self.shuttingdown = False
        self.last_finish_time = time.time()
        self.stopped = False
Example #15
0
class MesosScheduler(Scheduler):
    def __init__(self, manager, master, options):
        Scheduler.__init__(self, manager)
        self.master = master
        self.cpus = options.cpus
        self.mem = parse_mem(options.mem)
        self.gpus = options.gpus
        self.task_per_node = options.parallel or multiprocessing.cpu_count()
        self.options = options
        self.group = options.group
        self.last_finish_time = 0
        self.executor = None
        self.driver = None
        self.lock = threading.RLock()
        self.task_waiting = []
        self.task_launched = {}
        self.slaveTasks = {}
        self.starting = False

    def start_driver(self):
        name = 'OpenCluster'
        if self.options.name:
            name = "%s-%s" % (name, self.options.name)
        else:
            name = "%s-%s" % (
                name, datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"))

        if len(name) > 256:
            name = name[:256] + '...'

        framework = mesos_pb2.FrameworkInfo()
        framework.user = getuser()
        if framework.user == 'root':
            raise Exception("OpenCluster is not allowed to run as 'root'")
        framework.name = name
        framework.hostname = socket.gethostname()

        self.driver = MesosSchedulerDriver(self, framework, self.master)
        self.driver.start()
        logger.debug("Mesos Scheudler driver started")

        self.shuttingdown = False
        self.last_finish_time = time.time()
        self.stopped = False
        #
        # def check():
        #     while self.started:
        #         now = time.time()
        #         if not self.task_waiting and now - self.last_finish_time > MAX_IDLE_TIME:
        #             logger.info("stop mesos scheduler after %d seconds idle", now - self.last_finish_time)
        #             self.shutdown()
        #             break
        #         time.sleep(1)
        #
        #         if len(self.task_success()) + len(self.task_failed) == self.taskNum:
        #             self.shutdown()
        # spawn(check)

    @safe
    def registered(self, driver, frameworkId, masterInfo):
        self.started = True
        logger.debug("connect to master %s:%s(%s), registered as %s",
                     int2ip(masterInfo.ip), masterInfo.port, masterInfo.id,
                     frameworkId.value)
        self.executor = self.getExecutorInfo(str(frameworkId.value))

    @safe
    def reregistered(self, driver, masterInfo):
        logger.warning("re-connect to mesos master %s:%s(%s)",
                       int2ip(masterInfo.ip), masterInfo.port, masterInfo.id)

    @safe
    def disconnected(self, driver):
        logger.debug("framework is disconnected")

    @safe
    def getExecutorInfo(self, framework_id):
        execInfo = mesos_pb2.ExecutorInfo()
        execInfo.executor_id.value = "multiframework"
        execInfo.command.value = '%s %s' % (
            sys.executable,  # /usr/bin/python.exe or .../python
            os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'simpleexecutor.py')))
        v = execInfo.command.environment.variables.add()
        v.name = 'UID'
        v.value = str(os.getuid())
        v = execInfo.command.environment.variables.add()
        v.name = 'GID'
        v.value = str(os.getgid())

        if hasattr(execInfo, 'framework_id'):
            execInfo.framework_id.value = str(framework_id)

        Script = os.path.realpath(sys.argv[0])
        if hasattr(execInfo, 'name'):
            execInfo.name = Script

        execInfo.data = marshal.dumps(
            (Script, os.getcwd(), sys.path, dict(os.environ),
             self.task_per_node, env.environ))

        return execInfo

    @safe
    def clearCache(self):
        self.task_launched.clear()
        self.slaveTasks.clear()

    @safe
    def submitTasks(self, tasks):
        if not tasks:
            return
        self.completionEvents.join(
        )  #Blocks until all items in the events queue have been gotten and processed.
        self.clearCache()
        self.task_waiting.extend(tasks)
        self.taskNum = self.taskNum + len(tasks)
        logger.debug("Got job with %d tasks", len(tasks))

        if not self.started and not self.starting:
            self.starting = True
            self.start_driver()
        while not self.started:
            self.lock.release()
            time.sleep(0.01)
            self.lock.acquire()

        self.requestMoreResources()
        self.manager.statusUpdate()

    def requestMoreResources(self):
        if self.started:
            self.driver.reviveOffers()

    @safe
    def resourceOffers(self, driver, offers):

        rf = mesos_pb2.Filters()
        if not self.task_waiting:
            rf.refuse_seconds = 5
            for o in offers:
                driver.launchTasks(o.id, [], rf)
            return

        random.shuffle(offers)
        self.last_offer_time = time.time()
        for offer in offers:
            if self.shuttingdown:
                print "Shutting down: declining offer on [%s]" % offer.hostname
                driver.declineOffer(offer.id)
                continue

            attrs = self.getAttributes(offer)
            if self.options.group and attrs.get(
                    'group', 'None') not in self.options.group:
                driver.launchTasks(offer.id, [], rf)
                continue

            cpus, mem, gpus = self.getResources(offer)
            logger.debug(
                "got resource offer %s: cpus:%s, mem:%s, gpus:%s at %s",
                offer.id.value, cpus, mem, gpus, offer.hostname)
            logger.debug("attributes,gpus:%s", attrs.get('gpus', None))
            sid = offer.slave_id.value
            tasks = []
            while (len(self.task_waiting) > 0 and cpus >= self.cpus
                   and mem >= self.mem and
                   (self.gpus == 0 or attrs.get('gpus', None) is not None)):

                logger.debug("Accepting resource on slave %s (%s)",
                             offer.slave_id.value, offer.hostname)
                t = self.task_waiting.pop()
                t.state = mesos_pb2.TASK_STARTING
                t.state_time = time.time()

                task = self.create_task(offer, t, cpus)
                tasks.append(task)

                self.task_launched[t.id] = t
                self.slaveTasks.setdefault(sid, set()).add(t.id)

                cpus -= self.cpus
                mem -= self.mem
                # gpus -= self.gpus

            operation = mesos_pb2.Offer.Operation()
            operation.type = mesos_pb2.Offer.Operation.LAUNCH
            operation.launch.task_infos.extend(tasks)
            driver.acceptOffers([offer.id], [operation])

    @safe
    def offerRescinded(self, driver, offer_id):
        logger.debug("rescinded offer: %s", offer_id)
        if self.task_waiting:
            self.requestMoreResources()

    def getResources(self, offer):
        cpus, mem, gpus = 0, 0, 0
        for r in offer.resources:
            if r.name == 'gpus':
                gpus = float(r.scalar.value)
            elif r.name == 'cpus':
                cpus = float(r.scalar.value)
            elif r.name == 'mem':
                mem = float(r.scalar.value)
        return cpus, mem, gpus

    def getResource(self, res, name):
        for r in res:
            if r.name == name:
                return r.scalar.value
        return 0

    def getAttribute(self, attrs, name):
        for r in attrs:
            if r.name == name:
                return r.scalar.value

    def getAttributes(self, offer):
        attrs = {}
        for a in offer.attributes:
            attrs[a.name] = a.scalar.value
        return attrs

    def create_task(self, offer, t, cpus):
        task = mesos_pb2.TaskInfo()

        task.task_id.value = t.id
        task.slave_id.value = offer.slave_id.value
        task.name = "task(%s/%d)" % (t.id, self.taskNum)
        task.executor.MergeFrom(self.executor)

        task.data = compress(cPickle.dumps((t, t.tried), -1))

        cpu = task.resources.add()
        cpu.name = "cpus"
        cpu.type = 0  # mesos_pb2.Value.SCALAR
        cpu.scalar.value = min(self.cpus, cpus)

        mem = task.resources.add()
        mem.name = "mem"
        mem.type = 0  # mesos_pb2.Value.SCALAR
        mem.scalar.value = self.mem
        #
        # gpu = task.resources.add()
        # gpu.name = "gpus"
        # gpu.type = 0 # mesos_pb2.Value.SCALAR
        # gpu.scalar.value = self.gpus

        return task

    @safe
    def statusUpdate(self, driver, update):
        logger.debug(
            "Task %s in state [%s]" %
            (update.task_id.value, mesos_pb2.TaskState.Name(update.state)))
        tid = str(update.task_id.value)

        if tid not in self.task_launched:
            # check failed after launched
            for t in self.task_waiting:
                if t.id == tid:
                    self.task_launched[tid] = t
                    self.task_waiting.remove(t)
                    break
            else:
                logger.debug("Task %s is finished, ignore it", tid)
                return

        t = self.task_launched[tid]
        t.state = update.state
        t.state_time = time.time()
        self.last_finish_time = t.state_time

        if update.state == mesos_pb2.TASK_RUNNING:
            self.started = True
            # to do task timeout handler
        elif update.state == mesos_pb2.TASK_LOST:
            self.task_launched.pop(tid)

            if t.tried < self.options.retry:
                t.tried += 1
                logger.warning("task %s lost, retry %s", t.id, update.state,
                               t.tried)
                self.task_waiting.append(t)  # try again
            else:
                self.taskEnded(
                    t, OtherFailure("task lost,exception:" + str(update.data)),
                    "task lost")

        elif update.state in (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_FAILED,
                              mesos_pb2.TASK_ERROR, mesos_pb2.TASK_KILLED):
            self.task_launched.pop(tid)

            slave = None
            for s in self.slaveTasks:
                if tid in self.slaveTasks[s]:
                    slave = s
                    self.slaveTasks[s].remove(tid)
                    break

            if update.state == mesos_pb2.TASK_FINISHED:
                self.taskEnded(t, Success(), update.data)

            if update.state == mesos_pb2.TASK_ERROR:
                logger.error(update.message)
                self.taskEnded(t, OtherFailure(update.message), update.message)
                driver.abort()
                self.shutdown()

            if update.state == mesos_pb2.TASK_FAILED or update.state == mesos_pb2.TASK_KILLED or update.state == mesos_pb2.TASK_LOST:
                if t.tried < self.options.retry:
                    t.tried += 1
                    logger.warning("task %s failed with %s, retry %s", t.id,
                                   update.state, t.tried)
                    self.task_waiting.append(t)  # try again
                else:
                    self.taskEnded(
                        t, OtherFailure("exception:" + str(update.data)), None)
                    logger.error("task %s failed on %s", t.id, slave)

        if not self.task_waiting:
            self.requestMoreResources()  # request more offers again

    @safe
    def check(self, driver):
        now = time.time()
        for tid, t in self.task_launched.items():
            if t.state == mesos_pb2.TASK_STARTING and t.state_time + 30 < now:
                logger.warning("task %s lauched failed, assign again", tid)
                if not self.task_waiting:
                    self.requestMoreResources()
                t.tried += 1
                t.state = -1
                self.task_launched.pop(tid)
                self.task_waiting.append(t)
            # TODO: check run time

    @safe
    def shutdown(self):
        if not self.started:
            return

        wait_started = datetime.datetime.now()
        while (len(self.task_launched) > 0) and \
          (SHUTDOWN_TIMEOUT > (datetime.datetime.now() - wait_started).seconds):
            time.sleep(1)

        logger.debug("total:%d, task finished: %d,task failed: %d",
                     self.taskNum, self.finished_count, self.fail_count)

        self.shuttingdown = True
        # self.driver.join()
        self.driver.stop(False)

        #self.driver = None
        logger.debug("scheduler stop!!!")
        self.stopped = True
        self.started = False

    @safe
    def error(self, driver, code):
        logger.warning("Mesos error message: %s", code)

    def defaultParallelism(self):
        return 16

    def frameworkMessage(self, driver, executor, slave, data):
        logger.warning("[slave %s] %s", slave.value, data)

    def executorLost(self, driver, executorId, slaveId, status):
        logger.warning("executor at %s %s lost: %s", slaveId.value,
                       executorId.value, status)
        self.slaveTasks.pop(slaveId.value, None)

    def slaveLost(self, driver, slaveId):
        logger.warning("slave %s lost", slaveId.value)
        self.slaveTasks.pop(slaveId.value, None)

    def killTask(self, job_id, task_id, tried):
        tid = mesos_pb2.TaskID()
        tid.value = "%s:%s:%s" % (job_id, task_id, tried)
        self.driver.killTask(tid)
Example #16
0
        logging.info("task {task} status is {status}".format(
            task=update.task_id.value,
            status=mesos_pb2.TaskState.Name(update.state)))

        if update.state == mesos_pb2.TASK_FINISHED:
            self.finished_task += 1
            if self.finished_task == len(self.numbers):
                logging.info("all task has finished")
                driver.stop()


if __name__ == '__main__':
    # make us a framework
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "hello-world"

    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = "default"
    executor.command.value = "python {}".format(
        os.path.abspath("./Executor.py"))
    executor.name = "Test Executor(Python)"
    executor.source = "python test"

    driver = MesosSchedulerDriver(
        MyScheduler(executor),
        framework,
        "192.168.12.179:5050"  # assumes running on the master
    )
    driver.run()
Example #17
0
    hello_executor.name = "Hello"
    hello_executor.command.value = "python hello_executor.py"

    uri_proto = hello_executor.command.uris.add()
    uri_proto.value = "http://kit-mesos-master:9000/hello_executor.py"
    uri_proto.extract = False

    goodbye_executor = mesos_pb2.ExecutorInfo()
    goodbye_executor.executor_id.value = "goodbye-executor"
    goodbye_executor.name = "GoodBye"
    goodbye_executor.command.value = "python goodbye_executor.py"

    uri_proto = goodbye_executor.command.uris.add()
    uri_proto.value = "http://kit-mesos-master:9000/goodbye_executor.py"
    uri_proto.extract = False

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "hello-world"

    helloWorldScheduler = HelloWorldScheduler(hello_executor, goodbye_executor)

    driver = MesosSchedulerDriver(helloWorldScheduler, framework,
                                  "kit-mesos-master:5050")
    driver.start()
    logging.info("Listening for Ctrl-C")
    signal.signal(signal.SIGINT, graceful_shutdown)
    while True:
        time.sleep(5)
    sys.exit(0)
Example #18
0
class MesosScheduler(Scheduler):
    def __init__(self, manager, master, options):
        Scheduler.__init__(self,manager)
        self.master = master
        self.cpus = options.cpus
        self.mem = parse_mem(options.mem)
        self.gpus = options.gpus
        self.task_per_node = options.parallel or multiprocessing.cpu_count()
        self.options = options
        self.group = options.group
        self.last_finish_time = 0
        self.executor = None
        self.driver = None
        self.lock = threading.RLock()
        self.task_waiting = []
        self.task_launched = {}
        self.slaveTasks = {}
        self.starting = False

    def start_driver(self):
        name = 'OpenCluster'
        if self.options.name :
            name = "%s-%s" % (name,self.options.name)
        else:
            name = "%s-%s" % (name,datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"))

        if len(name) > 256:
            name = name[:256] + '...'

        framework = mesos_pb2.FrameworkInfo()
        framework.user = getuser()
        if framework.user == 'root':
            raise Exception("OpenCluster is not allowed to run as 'root'")
        framework.name = name
        framework.hostname = socket.gethostname()

        self.driver = MesosSchedulerDriver(self, framework, self.master)
        self.driver.start()
        logger.debug("Mesos Scheudler driver started")

        self.shuttingdown = False
        self.last_finish_time = time.time()
        self.stopped = False
        #
        # def check():
        #     while self.started:
        #         now = time.time()
        #         if not self.task_waiting and now - self.last_finish_time > MAX_IDLE_TIME:
        #             logger.info("stop mesos scheduler after %d seconds idle", now - self.last_finish_time)
        #             self.shutdown()
        #             break
        #         time.sleep(1)
        #
        #         if len(self.task_success()) + len(self.task_failed) == self.taskNum:
        #             self.shutdown()
        # spawn(check)

    @safe
    def registered(self, driver, frameworkId, masterInfo):
        self.started = True
        logger.debug("connect to master %s:%s(%s), registered as %s",
            int2ip(masterInfo.ip), masterInfo.port, masterInfo.id,
            frameworkId.value)
        self.executor = self.getExecutorInfo(str(frameworkId.value))

    @safe
    def reregistered(self, driver, masterInfo):
        logger.warning("re-connect to mesos master %s:%s(%s)",
            int2ip(masterInfo.ip), masterInfo.port, masterInfo.id)

    @safe
    def disconnected(self, driver):
        logger.debug("framework is disconnected")

    @safe
    def getExecutorInfo(self, framework_id):
        execInfo = mesos_pb2.ExecutorInfo()
        execInfo.executor_id.value = "multiframework"
        execInfo.command.value = '%s %s' % (
            sys.executable, # /usr/bin/python.exe or .../python
            os.path.abspath(os.path.join(os.path.dirname(__file__), 'simpleexecutor.py'))
        )
        v = execInfo.command.environment.variables.add()
        v.name = 'UID'
        v.value = str(os.getuid())
        v = execInfo.command.environment.variables.add()
        v.name = 'GID'
        v.value = str(os.getgid())

        if hasattr(execInfo, 'framework_id'):
            execInfo.framework_id.value = str(framework_id)

        Script = os.path.realpath(sys.argv[0])
        if hasattr(execInfo, 'name'):
            execInfo.name = Script

        execInfo.data = marshal.dumps((Script, os.getcwd(), sys.path, dict(os.environ), self.task_per_node, env.environ))

        return execInfo
    @safe
    def clearCache(self):
        self.task_launched.clear()
        self.slaveTasks.clear()

    @safe
    def submitTasks(self, tasks):
        if not tasks:
            return
        self.completionEvents.join() #Blocks until all items in the events queue have been gotten and processed.
        self.clearCache()
        self.task_waiting.extend(tasks)
        self.taskNum = self.taskNum + len(tasks)
        logger.debug("Got job with %d tasks",  len(tasks))

        if not self.started and not self.starting:
            self.starting = True
            self.start_driver()
        while not self.started:
            self.lock.release()
            time.sleep(0.01)
            self.lock.acquire()

        self.requestMoreResources()
        self.manager.statusUpdate()

    def requestMoreResources(self):
        if self.started:
            self.driver.reviveOffers()

    @safe
    def resourceOffers(self, driver, offers):

        rf = mesos_pb2.Filters()
        if not self.task_waiting:
            rf.refuse_seconds = 5
            for o in offers:
                driver.launchTasks(o.id, [], rf)
            return

        random.shuffle(offers)
        self.last_offer_time = time.time()
        for offer in offers:
            if self.shuttingdown:
                print "Shutting down: declining offer on [%s]" % offer.hostname
                driver.declineOffer(offer.id)
                continue

            attrs = self.getAttributes(offer)
            if self.options.group and attrs.get('group', 'None') not in self.options.group:
                driver.launchTasks(offer.id, [], rf)
                continue

            cpus, mem, gpus = self.getResources(offer)
            logger.debug("got resource offer %s: cpus:%s, mem:%s, gpus:%s at %s", offer.id.value, cpus, mem, gpus, offer.hostname)
            logger.debug("attributes,gpus:%s",attrs.get('gpus', None))
            sid = offer.slave_id.value
            tasks = []
            while (len(self.task_waiting)>0 and cpus >= self.cpus and mem >= self.mem and (self.gpus==0 or attrs.get('gpus', None) is not None)):

                logger.debug("Accepting resource on slave %s (%s)", offer.slave_id.value, offer.hostname)
                t = self.task_waiting.pop()
                t.state = mesos_pb2.TASK_STARTING
                t.state_time = time.time()

                task = self.create_task(offer, t, cpus)
                tasks.append(task)

                self.task_launched[t.id] = t
                self.slaveTasks.setdefault(sid, set()).add(t.id)

                cpus -= self.cpus
                mem -= self.mem
                # gpus -= self.gpus

            operation = mesos_pb2.Offer.Operation()
            operation.type = mesos_pb2.Offer.Operation.LAUNCH
            operation.launch.task_infos.extend(tasks)
            driver.acceptOffers([offer.id], [operation])

    @safe
    def offerRescinded(self, driver, offer_id):
        logger.debug("rescinded offer: %s", offer_id)
        if self.task_waiting:
            self.requestMoreResources()

    def getResources(self, offer):
        cpus, mem, gpus = 0, 0, 0
        for r in offer.resources:
            if r.name == 'gpus':
                gpus = float(r.scalar.value)
            elif r.name == 'cpus':
                cpus = float(r.scalar.value)
            elif r.name == 'mem':
                mem = float(r.scalar.value)
        return cpus, mem, gpus

    def getResource(self, res, name):
        for r in res:
            if r.name == name:
                return r.scalar.value
        return 0

    def getAttribute(self, attrs, name):
        for r in attrs:
            if r.name == name:
                return r.scalar.value

    def getAttributes(self, offer):
        attrs = {}
        for a in offer.attributes:
            attrs[a.name] = a.scalar.value
        return attrs

    def create_task(self, offer, t, cpus):
        task = mesos_pb2.TaskInfo()

        task.task_id.value = t.id
        task.slave_id.value = offer.slave_id.value
        task.name = "task(%s/%d)" % (t.id, self.taskNum)
        task.executor.MergeFrom(self.executor)

        task.data = compress(cPickle.dumps((t, t.tried), -1))

        cpu = task.resources.add()
        cpu.name = "cpus"
        cpu.type = 0 # mesos_pb2.Value.SCALAR
        cpu.scalar.value = min(self.cpus, cpus)

        mem = task.resources.add()
        mem.name = "mem"
        mem.type = 0 # mesos_pb2.Value.SCALAR
        mem.scalar.value = self.mem
        #
        # gpu = task.resources.add()
        # gpu.name = "gpus"
        # gpu.type = 0 # mesos_pb2.Value.SCALAR
        # gpu.scalar.value = self.gpus

        return task

    @safe
    def statusUpdate(self, driver, update):
        logger.debug("Task %s in state [%s]" % (update.task_id.value, mesos_pb2.TaskState.Name(update.state)))
        tid = str(update.task_id.value)

        if tid not in self.task_launched:
            # check failed after launched
            for t in self.task_waiting:
                if t.id == tid:
                    self.task_launched[tid] = t
                    self.task_waiting.remove(t)
                    break
            else:
                logger.debug("Task %s is finished, ignore it", tid)
                return

        t = self.task_launched[tid]
        t.state = update.state
        t.state_time = time.time()
        self.last_finish_time = t.state_time

        if update.state == mesos_pb2.TASK_RUNNING:
            self.started = True
            # to do task timeout handler
        elif update.state == mesos_pb2.TASK_LOST:
            self.task_launched.pop(tid)

            if t.tried < self.options.retry:
                t.tried += 1
                logger.warning("task %s lost, retry %s", t.id, update.state, t.tried)
                self.task_waiting.append(t) # try again
            else:
                self.taskEnded(t, OtherFailure("task lost,exception:" + str(update.data)), "task lost")

        elif update.state in (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_FAILED, mesos_pb2.TASK_ERROR, mesos_pb2.TASK_KILLED):
            self.task_launched.pop(tid)

            slave = None
            for s in self.slaveTasks:
                if tid in self.slaveTasks[s]:
                    slave = s
                    self.slaveTasks[s].remove(tid)
                    break

            if update.state == mesos_pb2.TASK_FINISHED :
                self.taskEnded(t, Success(), update.data)

            if update.state == mesos_pb2.TASK_ERROR :
                logger.error(update.message)
                self.taskEnded(t, OtherFailure(update.message), update.message)
                driver.abort()
                self.shutdown()

            if update.state == mesos_pb2.TASK_FAILED or update.state == mesos_pb2.TASK_KILLED or update.state == mesos_pb2.TASK_LOST:
                if t.tried < self.options.retry:
                    t.tried += 1
                    logger.warning("task %s failed with %s, retry %s", t.id, update.state, t.tried)
                    self.task_waiting.append(t) # try again
                else:
                    self.taskEnded(t, OtherFailure("exception:" + str(update.data)), None)
                    logger.error("task %s failed on %s", t.id, slave)

        if not self.task_waiting:
            self.requestMoreResources() # request more offers again

    @safe
    def check(self, driver):
        now = time.time()
        for tid, t in self.task_launched.items():
            if t.state == mesos_pb2.TASK_STARTING and t.state_time + 30 < now:
                logger.warning("task %s lauched failed, assign again", tid)
                if not self.task_waiting:
                    self.requestMoreResources()
                t.tried += 1
                t.state = -1
                self.task_launched.pop(tid)
                self.task_waiting.append(t)
            # TODO: check run time

    @safe
    def shutdown(self):
        if not self.started:
            return

        wait_started = datetime.datetime.now()
        while (len(self.task_launched) > 0) and \
          (SHUTDOWN_TIMEOUT > (datetime.datetime.now() - wait_started).seconds):
            time.sleep(1)

        logger.debug("total:%d, task finished: %d,task failed: %d", self.taskNum, self.finished_count, self.fail_count)

        self.shuttingdown = True
        # self.driver.join()
        self.driver.stop(False)

        #self.driver = None
        logger.debug("scheduler stop!!!")
        self.stopped = True
        self.started = False

    @safe
    def error(self, driver, code):
        logger.warning("Mesos error message: %s", code)

    def defaultParallelism(self):
        return 16

    def frameworkMessage(self, driver, executor, slave, data):
        logger.warning("[slave %s] %s", slave.value, data)

    def executorLost(self, driver, executorId, slaveId, status):
        logger.warning("executor at %s %s lost: %s", slaveId.value, executorId.value, status)
        self.slaveTasks.pop(slaveId.value, None)

    def slaveLost(self, driver, slaveId):
        logger.warning("slave %s lost", slaveId.value)
        self.slaveTasks.pop(slaveId.value, None)

    def killTask(self, job_id, task_id, tried):
        tid = mesos_pb2.TaskID()
        tid.value = "%s:%s:%s" % (job_id, task_id, tried)
        self.driver.killTask(tid)
Example #19
0
                    print "NASEL FRAMEWORK"
                    reuse_framework_id = framework['id']
                    #exit(1)

        #exit(1)
    except ZookeeperNoMaster as ex:
        logging.error("Could not find any Mesos Master: {}".format(ex.message))
        exit(1)

    framework = mesos_pb2.FrameworkInfo()
    framework.user = "******"  # Have Mesos fill in the current user.
    framework.name = "vizceral"
    framework.failover_timeout=300

    #print framework

    if reuse_framework_id!="":
        mesos_framework = mesos_pb2.FrameworkID()
        mesos_framework.value=reuse_framework_id
        framework.id.MergeFrom(mesos_framework)
        logging.error("re-registring existing framework "+reuse_framework_id)

    driver = MesosSchedulerDriver(
        VizCeralScheduler(),
        framework,
        ZK_HOSTS

    )

    driver.run()
Example #20
0
    goodbye_executor.name = "GoodBye"
    goodbye_executor.command.value = "python goodbye_executor.py"

    uri_proto = goodbye_executor.command.uris.add()
    uri_proto.value = "http://kit-mesos-master:9000/goodbye_executor.py"
    uri_proto.extract = False

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "hello-world"

    httpd = SocketServer.TCPServer(("", 9000),
                                   SimpleHTTPServer.SimpleHTTPRequestHandler)

    def create_web_server():
        print "serving at port", 9000
        httpd.serve_forever()

    web_thread = threading.Thread(target=create_web_server)
    web_thread.start()

    driver = MesosSchedulerDriver(
        HelloWorldScheduler(hello_executor, goodbye_executor), framework,
        "zk://localhost:2181/mesos")
    driver.start()
    logging.info("Listening for Ctrl-C")
    signal.signal(signal.SIGINT, shutdown)
    while True:
        time.sleep(5)
    sys.exit(0)
Example #21
0
def run(api_url,
        mesos_master,
        user,
        config_dir,
        state_file,
        changes_request_limit,
        http_port,
        stats=None):
    scheduler = ChangesScheduler(state_file,
                                 api=ChangesAPI(api_url),
                                 stats=stats,
                                 blacklist=FileBlacklist(
                                     os.path.join(config_dir, 'blacklist')),
                                 changes_request_limit=changes_request_limit)

    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = "default"
    executor.command.value = os.path.abspath("./executor.py")
    executor.name = "Changes Executor"
    executor.source = "changes"

    framework = mesos_pb2.FrameworkInfo()
    framework.user = user
    framework.name = "Changes Scheduler"
    framework.principal = "changes"
    # Give the scheduler 1 week to restart before mesos cancels the tasks.
    # this is the setting recommended by the docs.
    framework.failover_timeout = 3600 * 24 * 7

    if scheduler.framework_id:
        framework.id.value = scheduler.framework_id
        executor.framework_id.value = scheduler.framework_id

    driver = MesosSchedulerDriver(scheduler, framework, mesos_master)

    stopped = threading.Event()

    def handle_interrupt(signal, frame):
        stopped.set()
        logging.info("Received interrupt, shutting down")
        logging.warning(
            "Not saving state. Will wait for running tasks to finish.")
        scheduler.shuttingDown.set()
        while scheduler.activeTasks > 0:
            logging.info("Waiting for %d tasks to finish running",
                         scheduler.activeTasks)
            sleep(5)
        driver.stop()

    def handle_sigterm(signal, frame):
        # TODO: Avoid save_state race conditions by having handle_sigterm()
        # only set shuttingDown, then do the actual save-state and driver.stop()
        # in the main thread after all other threads are join()ed.
        # Also, stopped doesn't appear to be used.
        stopped.set()
        logging.info("Received sigterm, shutting down")
        scheduler.shuttingDown.set()
        if scheduler.state_file:
            try:
                scheduler.save_state()
                logging.info("Successfully saved state to %s.", state_file)
            except Exception:
                logging.exception("Failed to save state")
                driver.stop()
                return
            # With `failover` set to true, we do not tell Mesos to stop the existing tasks
            # started by this framework. Instead, the tasks will run for
            # `fail_timeout` more seconds set above or we start a scheduler with
            # the same framework id.
            driver.stop(True)
        else:
            logging.warning(
                "State file location not set. Not saving state. Existing builds will be cancelled."
            )
            driver.stop()

    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_sigterm)

    driver.start()
    logging.info("Driver started")

    app = Flask("Changes Mesos Scheduler")
    app.add_url_rule('/api/state_json', 'state_json',
                     json_handler(scheduler.state_json))
    http_thread = threading.Thread(target=app.run, kwargs={'port': http_port})
    http_thread.start()

    scheduler.poll_changes_until_shutdown(driver, 5)
    status = 0
    if driver.join() == mesos_pb2.DRIVER_STOPPED:
        logging.info("Driver stopped cleanly.")
    else:
        # Ensure that the driver process terminates.
        status = 1
        logging.info("Stopping driver forcibly.")
        driver.stop()

    logging.info("Stopping HTTP server.")
    http_thread.terminate()
    http_thread.join()

    logging.info("Clean shutdown complete. Exiting status %d.", status)
    sys.exit(status)
Example #22
0
class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    help = 'Launches the Scale scheduler'

    def add_arguments(self, parser):
        parser.add_argument('-m',
                            '--master',
                            action='store',
                            default=settings.MESOS_MASTER,
                            help='The master to connect to')

    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info('Scale Scheduler %s', settings.VERSION)

        try:
            scheduler_zk = settings.SCHEDULER_ZK
        except:
            scheduler_zk = None

        if scheduler_zk is not None:
            import socket
            from scheduler import cluster_utils
            my_id = socket.gethostname()
            cluster_utils.wait_for_leader(scheduler_zk, my_id,
                                          self.run_scheduler, mesos_master)
        else:
            # leader election is disabled
            self.run_scheduler(mesos_master)

    def run_scheduler(self, mesos_master):
        logger.info("I am the leader")
        self.scheduler = ScaleScheduler()

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = os.getenv('DCOS_PACKAGE_FRAMEWORK_NAME', 'Scale')
        webserver_address = os.getenv('SCALE_WEBSERVER_ADDRESS')

        if webserver_address:
            framework.webui_url = webserver_address

        logger.info('Connecting to Mesos master at %s', mesos_master)

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error(
                    'Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error(
                    'Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework,
                                               mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework,
                                               mesos_master)

        try:
            status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        except:
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        #Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """
        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        try:
            if self.scheduler:
                self.scheduler.shutdown()
        except:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        try:
            if self.driver:
                self.driver.stop()
        except:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1
        return status
Example #23
0
class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    option_list = BaseCommand.option_list + (
        make_option('-m', '--master', action='store', type='str', default=settings.MESOS_MASTER,
                    help=('The master to connect to')),
    )

    help = 'Launches the Scale scheduler'

    def handle(self, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info('Scale Scheduler %s', settings.VERSION)

        try:
            scheduler_zk = settings.SCHEDULER_ZK
        except:
            scheduler_zk = None

        if scheduler_zk is not None:
            import socket
            from scheduler import cluster_utils
            my_id = socket.gethostname()
            cluster_utils.wait_for_leader(scheduler_zk, my_id, self.run_scheduler, mesos_master)
        else:
            # leader election is disabled
            self.run_scheduler(mesos_master)

    def run_scheduler(self, mesos_master):
        logger.info("I am the leader")
        self.scheduler = ScaleScheduler()

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = 'Scale'

        logger.info('Connecting to Mesos master at %s', mesos_master)

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error('Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error('Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

        try:
            status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        except:
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        #Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """
        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        try:
            if self.scheduler:
                self.scheduler.shutdown()
        except:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        try:
            if self.driver:
                self.driver.stop()
        except:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1
        return status
Example #24
0
        uri_proto.extract = False

    vilfredo = VilfredoMesosScheduler(paretoExecutor, sys.argv[2])

    # Use credentials for authentication if they are provided.
    credential = None
    if len(sys.argv) > 3:
        credentialsPath = sys.argv[3]
        with open(credentialsPath) as f:
            words = f.readline().split()
            credential = mesos_pb2.Credential()
            credential.principal = words[0]
            credential.secret = words[1]

    if credential is None:
        driver = MesosSchedulerDriver(vilfredo, framework, sys.argv[1])
    else:
        driver = MesosSchedulerDriver(vilfredo, framework, sys.argv[1],
                                      credential)

    # driver.run() blocks; we run it in a separate thread.
    def run_driver_async():
        status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        driver.stop()
        sys.exit(status)

    framework_thread = threading.Thread(target=run_driver_async)
    framework_thread.start()

    print "(Listening for Ctrl-C)"
    signal.signal(signal.SIGINT, hard_shutdown)
Example #25
0
            raise ValueError('ALL DONE')
        logging.info("Recieved resource offers: {}".format(
            [o.id.value for o in offers]))
        # whenever we get an offer, we accept it and use it to launch a task that
        # just echos hello world to stdout
        for offer in offers:
            step = steps.pop()
            task = new_task(offer)
            task.command.value = "echo Running step {}".format(step)
            logging.info("Launching task {task} "
                         "using offer {offer}.".format(task=task.task_id.value,
                                                       offer=offer.id.value))
            tasks = [task]
            driver.launchTasks(offer.id, tasks)
            if not steps:
                break


if __name__ == '__main__':
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "runner"
    scheduler = HelloWorldScheduler()
    scheduler.define_tasks()
    driver = MesosSchedulerDriver(
        scheduler,
        framework,
        "zk://localhost:2181/mesos"  # assumes running on the master
    )
    driver.run()
                mem.type = mesos_pb2.Value.SCALAR
                mem.scalar.value = 1

                time.sleep(2)
                logging.info("Launching task {task} "
                             "using offer {offer}.".format(task=task.task_id.value,
                                                           offer=offer.id.value))
                tasks = [task]

                driver.launchTasks(offer.id, tasks)

            else: 

                driver.stop()

if __name__ == '__main__':

    # make us a framework
    framework = mesos_pb2.FrameworkInfo()
    framework.user = "******"  # Have Mesos fill in the current user.
    framework.name = "hello-world"
    framework.checkpoint = True

    driver = MesosSchedulerDriver(
        HelloWorldScheduler(),
        framework,
        "127.0.0.1:5050/"  # assumes running on the master
    )

    sys.exit(0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1)
Example #27
0
        uri_proto.extract = False

    renderExecutor.name = "Renderer"

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "RENDLER"

    try:
        maxRenderTasks = int(sys.argv[3])
    except:
        maxRenderTasks = 0
    rendler = RenderingCrawler(sys.argv[1], maxRenderTasks, crawlExecutor,
                               renderExecutor)

    driver = MesosSchedulerDriver(rendler, framework, sys.argv[2])

    # driver.run() blocks; we run it in a separate thread
    def run_driver_async():
        status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        driver.stop()
        sys.exit(status)

    framework_thread = Thread(target=run_driver_async, args=())
    framework_thread.start()

    print "(Listening for Ctrl-C)"
    signal.signal(signal.SIGINT, graceful_shutdown)
    while framework_thread.is_alive():
        time.sleep(1)
Example #28
0
def run(api_url, mesos_master, user, config_dir, state_file, stats=None):
    scheduler = ChangesScheduler(config_dir, state_file, api=ChangesAPI(api_url), stats=stats)

    executor = mesos_pb2.ExecutorInfo()
    executor.executor_id.value = "default"
    executor.command.value = os.path.abspath("./executor.py")
    executor.name = "Changes Executor"
    executor.source = "changes"

    framework = mesos_pb2.FrameworkInfo()
    framework.user = user
    framework.name = "Changes Scheduler"
    framework.principal = "changes"
    # Give the scheduler 30s to restart before mesos cancels the tasks.
    framework.failover_timeout = 30

    if scheduler.framework_id:
        framework.id.value = scheduler.framework_id
        executor.framework_id.value = scheduler.framework_id

    driver = MesosSchedulerDriver(
        scheduler,
        framework,
        mesos_master)

    stopped = threading.Event()

    def handle_interrupt(signal, frame):
        stopped.set()
        logging.info("Received interrupt, shutting down")
        logging.warning("Not saving state. Will wait for running tasks to finish.")
        scheduler.shuttingDown.set()
        while scheduler.activeTasks > 0:
            logging.info("Waiting for %d tasks to finish running", scheduler.activeTasks)
            sleep(5)
        driver.stop()

    def handle_sigterm(signal, frame):
        stopped.set()
        logging.info("Received sigterm, shutting down")
        scheduler.shuttingDown.set()
        if scheduler.state_file:
            try:
                scheduler.save_state()
                logging.info("Successfully saved state to %s.", state_file)
            except Exception:
                logging.exception("Failed to save state")
                driver.stop()
                return
            # With `failover` set to true, we do not tell Mesos to stop the existing tasks
            # started by this framework. Instead, the tasks will run for
            # `fail_timeout` more seconds set above or we start a scheduler with
            # the same framework id.
            driver.stop(True)
        else:
            logging.warning("State file location not set. Not saving state. Existing builds will be cancelled.")
            driver.stop()

    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_sigterm)

    driver.start()
    logging.info("Driver started")
    while not stopped.is_set():
        stopped.wait(3)
    status = 0 if driver.join() == mesos_pb2.DRIVER_STOPPED else 1

    # Ensure that the driver process terminates.
    if status == 1:
        driver.stop()

    sys.exit(status)
                task.command.value = "python install_nuage_cni.py %s %s" % (
                    sys.argv[2], sys.argv[3])
                time.sleep(2)
                logging.info(
                    "Launching task {task} "
                    "using offer {offer}.".format(task=task.task_id.value,
                                                  offer=offer.id.value))
                tasks = [task]
                driver.launchTasks(offer.id, tasks)

    def statusUpdate(self, driver, update):
        print "Task %s is in state %d" % (update.task_id.value, update.state)
        if update.state == mesos_pb2.TASK_FINISHED:
            print "Task successfully executed on agent node"
            self.serviced_agents += 1
            if self.serviced_agents == len(agent_list):
                driver.stop()


if __name__ == '__main__':
    # Create a Mesos framework for CNI installation
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "install-cni"
    driver = MesosSchedulerDriver(
        InstallCNIScheduler(),
        framework,
        sys.argv[1] + ":5050"  # assumes running on the master
    )
    driver.run()
            elif update.task_id.value == "1":
                if update.message == "Command terminated with signal Killed: 9":
                    print "#### Passed forced shutdown"
                else:
                    print "!!!! Failed forced shutdown"

            self.tasksFinished += 1
            if self.tasksFinished == TOTAL_TASKS:
                driver.stop()

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "Usage: %s master" % sys.argv[0]
        sys.exit(1)

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""
    framework.name = "Escalation Framework (Python)"

    driver = MesosSchedulerDriver(
        EscalationScheduler(),
        framework,
        sys.argv[1])

    status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1

    # Ensure that the driver process terminates.
    driver.stop();

    sys.exit(status)
Example #31
0
def hard_shutdown(signal, frame):
    print "Shutting down..."
    driver.stop()


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "Usage: %s master command" % sys.argv[0]
        sys.exit(1)

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""
    framework.name = "Lanceur (Simple Launcher Framework in python)"

    driver = MesosSchedulerDriver(LaunchScheduler(sys.argv[2]), framework,
                                  sys.argv[1])

    print "Starting Mesos driver"

    # driver.run() blocks; we run it in a separate thread
    def run_driver_async():
        status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        driver.stop()
        sys.exit(status)

    framework_thread = threading.Thread(target=run_driver_async)
    framework_thread.start()

    print "(Listening for Ctrl-C)"
    signal.signal(signal.SIGINT, hard_shutdown)
    while framework_thread.is_alive():
Example #32
0
class Command(BaseCommand):
    '''Command that launches the Scale scheduler
    '''

    option_list = BaseCommand.option_list + (
        make_option('-m', '--master', action='store', type='str', default=settings.MESOS_MASTER,
                    help=('The master to connect to')),
    )

    help = 'Launches the Scale scheduler'

    def handle(self, **options):
        '''See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        '''

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info(u'Command starting: scale_scheduler')
        logger.info(u' - Master: %s', mesos_master)
        executor = mesos_pb2.ExecutorInfo()
        executor.executor_id.value = 'scale'
        executor.command.value = '%s %s scale_executor' % (settings.PYTHON_EXECUTABLE, settings.MANAGE_FILE)
        executor.name = 'Scale Executor (Python)'

        self.scheduler = ScaleScheduler(executor)

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = 'Scale Framework (Python)'

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error('Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error('Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

        status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1

        # Perform any required clean up operations like stopping background threads
        status = status or self._shutdown()

        logger.info(u'Command completed: scale_scheduler')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        '''See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        '''
        logger.info(u'Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        '''Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        '''
        status = 0

        try:
            if self.scheduler:
                self.scheduler.shutdown()
        except:
            logger.exception('Failed to properly shutdown scale scheduler.')
            status = 1

        try:
            if self.driver:
                self.driver.stop()
        except:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1
        return status
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "hello-world"

    helloWorldScheduler = HelloWorldScheduler(hello_executor, goodbye_executor)
    httpd = SocketServer.TCPServer(
        ("", 9000),
        SimpleHTTPServer.SimpleHTTPRequestHandler)


    def create_web_server():
        print "serving at port", 9000
        httpd.serve_forever()


    thread = threading.Thread(target=create_web_server)
    thread.start()

    driver = MesosSchedulerDriver(
        helloWorldScheduler,
        framework,
        "zk://localhost:2181/mesos"
    )
    driver.start()
    logging.info("Listening for Ctrl-C")
    signal.signal(signal.SIGINT, graceful_shutdown)
    while True:
        time.sleep(5)
    sys.exit(0)
Example #34
0
                            except Exception as e:
                                raise e
                            finally:
                                mesos_lock.release()

                            db.rhino_tasks.update({'name': task['name']}, {'$set': {'state': 'KILLED'}})
                            kill_those_that_depend_on(task['name'])

            kill_those_that_depend_on(doc['name'])

        db.rhino_tasks.update({'mesos_id': status.task_id.value},
                              {'$set': {'state': state, 'retCode': ret_code, 'message': message}})

if __name__ == '__main__':
    try:
        web_server_thread = threading.Thread(target=web_server, args=())
        web_server_thread.start()

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ""  # Have Mesos fill in the current user.
        framework.name = "rhino"
        mesos_driver = MesosSchedulerDriver(
            AppsomaRhinoScheduler(),
            framework,
            "zk://" + config['zookeeper_hosts'] + "/mesos"
        )
        mesos_driver.run()
    except KeyboardInterrupt:
        print "KeyboardInterrupt"
        os.kill(os.getpid(), 9)
Example #35
0
def start_factory_mesos():
    global pyroLoopCondition
    parser = OptionParser(
        usage="Usage: python factorymesos.py [options] <command>")
    parser.allow_interspersed_args = False
    parser.add_option("-s",
                      "--master",
                      type="string",
                      default="",
                      help="url of master (mesos://172.31.252.180:5050)")
    parser.add_option("-f",
                      "--factory",
                      type="string",
                      default="",
                      help="host:port of master (172.31.252.180:6666)")
    parser.add_option(
        "-w",
        "--warehouse_addr",
        type="string",
        default="",
        help=
        "kafka-172.31.252.182:9092|mysql-172.31.254.25:3306,db,username,password"
    )
    parser.add_option("-p",
                      "--task_per_node",
                      type="int",
                      default=0,
                      help="max number of tasks on one node (default: 0)")
    parser.add_option("-I",
                      "--image",
                      type="string",
                      help="image name for Docker")
    parser.add_option("-V",
                      "--volumes",
                      type="string",
                      help="volumes to mount into Docker")
    parser.add_option("-r",
                      "--retry",
                      type="int",
                      default=0,
                      help="retry times when failed (default: 0)")
    parser.add_option(
        "-e",
        "--config",
        type="string",
        default="/work/opencluster/config.ini",
        help=
        "absolute path of configuration file(default:/work/opencluster/config.ini)"
    )

    parser.add_option("-g",
                      "--group",
                      type="string",
                      default='',
                      help="which group to run (default: ''")
    parser.add_option(
        "-q",
        "--quiet",
        action="store_true",
        help="be quiet",
    )
    parser.add_option(
        "-v",
        "--verbose",
        action="store_true",
        help="show more useful log",
    )

    (options, command) = parser.parse_args()

    if not options:
        parser.print_help()
        sys.exit(2)

    if options.config:
        Conf.setConfigFile(options.config)

    options.master = options.master or Conf.getMesosMaster()
    options.warehouse_addr = options.warehouse_addr or Conf.getWareHouseAddr()

    servers = options.factory or Conf.getFactoryServers()
    servs = servers.split(",")
    server = servs[0].split(":")

    options.logLevel = (options.quiet and logging.ERROR
                        or options.verbose and logging.DEBUG or logging.INFO)
    setLogger(Conf.getFactoryServiceName(), "MESOS", options.logLevel)

    implicitAcknowledgements = 1
    if os.getenv("MESOS_EXPLICIT_ACKNOWLEDGEMENTS"):
        implicitAcknowledgements = 0
    sched = FactoryMesos(options, command, implicitAcknowledgements)

    driver = MesosSchedulerDriver(sched, sched.framework, options.master,
                                  implicitAcknowledgements)
    driver.start()
    logger.debug("Mesos Scheudler driver started")

    warehouse_addrs = options.warehouse_addr.split(",")

    def fetchTasksFromMySQL():
        global pyroLoopCondition
        mysqlIpAndPort = warehouse_addrs[0].split(":")
        last_data_time = time.time()

        while pyroLoopCondition:
            db = MySQLdb.connect(host=mysqlIpAndPort[0],
                                 port=int(mysqlIpAndPort[1]),
                                 db=warehouse_addrs[1],
                                 user=warehouse_addrs[2],
                                 passwd=warehouse_addrs[3])
            try:
                cur = db.cursor()
                curUpt = db.cursor()
                dataResults = cur.execute(
                    "select task_id,task_desc,task_start_time,status from t_task where status=0 order by priority asc limit 200"
                )
                results = cur.fetchmany(dataResults)
                for r in results:
                    sched.append_task(cPickle.loads(r[1]))
                    curUpt.execute(
                        "update t_task set task_start_time=now(),status=1 where task_id='"
                        + r[0] + "'")
                if len(results) > 0:
                    db.commit()
                    last_data_time = time.time()
                    driver.reviveOffers()

                if sched.tasks_total_len() > MAX_WAITING_TASK:
                    time.sleep(2)
                if time.time() - last_data_time > MAX_EMPTY_TASK_PERIOD:
                    time.sleep(10)

                if cur:
                    cur.close()
                if curUpt:
                    curUpt.close()
            finally:
                db.close()

    def fetchTasksFromKafka(priority):
        global pyroLoopCondition

        consumer = KafkaConsumer('OpenCluster%s' % priority,
                                 bootstrap_servers=[options.warehouse_addr],
                                 group_id="cnlab",
                                 auto_commit_enable=True,
                                 auto_commit_interval_ms=30 * 1000,
                                 auto_offset_reset='smallest')

        last_data_time = time.time()
        while pyroLoopCondition:
            for message in consumer.fetch_messages():
                logger.error("%s:%s:%s: key=%s " %
                             (message.topic, message.partition, message.offset,
                              message.key))
                sched.append_task(cPickle.loads(message.value))
                consumer.task_done(message)
                last_data_time = time.time()
            if sched.tasks_len(priority) > MAX_WAITING_TASK:
                time.sleep(2)
            if time.time() - last_data_time > MAX_EMPTY_TASK_PERIOD:
                time.sleep(10)

    if len(warehouse_addrs) > 2:
        spawn(fetchTasksFromMySQL)
    else:
        for i in range(1, sched.priority_size + 1):
            spawn(fetchTasksFromKafka, i)

    def handler(signm, frame):
        logger.warning("got signal %d, exit now", signm)
        sched.stop(3)

    signal.signal(signal.SIGTERM, handler)
    signal.signal(signal.SIGABRT, handler)

    try:
        while not sched.stopped:
            time.sleep(0.5)
            sched.check(driver)

            now = time.time()
            if now > sched.last_offer_time + 60 + random.randint(0, 5):
                logger.warning("too long to get offer, reviving...")
                sched.last_offer_time = now
                driver.reviveOffers()

    except KeyboardInterrupt:
        logger.warning(
            'stopped by KeyboardInterrupt. The Program is exiting gracefully! Please wait...'
        )
        sched.stop(4)

    #terminate pyrothread
    pyroLoopCondition = False

    time.sleep(5)
    driver.stop(False)
    sys.exit(sched.status)
Example #36
0
        task = mesos_pb2.TaskInfo()
        id = uuid.uuid4()
        task.task_id.value = str(id)
        task.slave_id.value = offer.slave_id.value
        task.name = "task {}".format(str(id))

        cpus = task.resources.add()
        cpus.name = "cpus"
        cpus.type = mesos_pb2.Value.SCALAR
        cpus.scalar.value = 0.1

        mem = task.resources.add()
        mem.name = "mem"
        mem.type = mesos_pb2.Value.SCALAR
        mem.scalar.value = 32

        return task

if __name__ == '__main__':
    log("XXX framework started")

    framework = mesos_pb2.FrameworkInfo()
    framework.user = "******"
    framework.name = "nixos-test-framework"
    driver = MesosSchedulerDriver(
        NixosTestScheduler(),
        framework,
        sys.argv[1] + ":5050"
    )
    driver.run()
Example #37
0
def main():

    global shutdown
    global accept_offers
    global driver

    cfg = get_configuration()

    # Configure logging
    setup_logging(cfg)

    framework = mesos_framework(cfg)
    #credentials = mesos_credentials(cfg)

    mesos_scheduler = OurJobScheduler(cfg)
    #driver = MesosSchedulerDriver(mesos_scheduler, framework,
    #                              cfg.mesos.master, cfg.mesos.imp_ack,
    #                              credentials)
    driver = MesosSchedulerDriver(mesos_scheduler, framework, cfg.mesos.master,
                                  cfg.mesos.imp_ack)

    shutdown = Shutdown()
    accept_offers = AcceptOffers()

    # driver.run() blocks, so run it in a separate thread.
    def run_driver_async():
        status = 0 if driver.run() == MesosPb2.DRIVER_STOPPED else 1

        if cfg.debug > 0:
            logger.debug('Stopping Driver')
        driver.stop()

        logger.info('Terminating Framework')
        sys.exit(status)

    framework_thread = Thread(target=run_driver_async, args=())
    framework_thread.start()

    logger.info('Beginning Processing')

    while framework_thread.is_alive():
        # If a shutdown has been requested, suppress offers and wait for the
        # framework thread to complete.
        if shutdown.flag:
            if cfg.debug > 0:
                logger.debug('Suppressing Offers')
            driver.suppressOffers()

            while framework_thread.is_alive():
                logger.debug('Child Thread Still Alive')
                sleep(5)

            break

        # If the max number of jobs are already running, suppress offers and
        # wait for some jobs to finish.
        if (mesos_scheduler.tasks_launched == cfg.mesos.max_jobs):
            driver.suppressOffers()

            if cfg.debug > 0:
                logger.debug('Suppressing Offers')

            # Sleep until we have room for more tasks
            while (not shutdown.flag
                   and mesos_scheduler.tasks_launched == cfg.mesos.max_jobs):
                if cfg.debug > 0:
                    logger.debug('Waiting for more available tasks')
                sleep(5)

            # Sleep until more processing is requested
            while not shutdown.flag and not mesos_scheduler.have_work():
                if cfg.debug > 0:
                    logger.debug('Waiting for more work')
                sleep(5)

            if not shutdown.flag:
                if cfg.debug > 0:
                    logger.debug('Reviving Offers')
                driver.reviveOffers()

        # If there's no new work to be done, suppress offers until we have
        # more work
        if not shutdown.flag and not mesos_scheduler.have_work():
            driver.suppressOffers()

            # Sleep until more processing is requested
            while not shutdown.flag and not mesos_scheduler.have_work():
                if cfg.debug > 0:
                    logger.debug('Waiting for more work')
                sleep(5)

            if not shutdown.flag:
                if cfg.debug > 0:
                    logger.debug('Reviving Offers')
                driver.reviveOffers()

        # Sleep for a second, so that we are not flying through the loop
        sleep(1)

    logger.info('Terminated Processing')
                         "using offer %s.",
                         task.task_id.value,
                         offer.id.value)
            driver.launchTasks(offer.id, [task])

    def statusUpdate(self, driver, update):
        '''
        when a task is started, over,
        killed or lost (slave crash, ....), this method
        will be triggered with a status message.
        '''
        logging.info("Task %s is in state %s" %
                     (update.task_id.value,
                      mesos_pb2.TaskState.Name(update.state)))

if __name__ == '__main__':
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "hello-world"
    driver = MesosSchedulerDriver(
        HelloWorldScheduler(),
        framework,
        "zk://localhost:2181/mesos"  # assumes running on the master
    )
    driver.start()
    logging.info("Listening for Ctrl-C")
    signal.signal(signal.SIGINT, shutdown)
    while True:
        time.sleep(5)
    sys.exit(0)
    def registered(self, driver, framework_id, master_info):
        logging.info("Registered with framework id: {}".format(framework_id))

    def resourceOffers(self, driver, offers):
        logging.info("Recieved resource offers: {}".format(
            [o.id.value for o in offers]))
        # whenever we get an offer, we accept it and use it to launch a task that
        # just echos hello world to stdout
        for offer in offers:
            task = new_task(offer)
            task.command.value = "echo hello world"
            time.sleep(2)
            logging.info("Launching task {task} "
                         "using offer {offer}.".format(task=task.task_id.value,
                                                       offer=offer.id.value))
            tasks = [task]
            driver.launchTasks(offer.id, tasks)


if __name__ == '__main__':
    # make us a framework
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ""  # Have Mesos fill in the current user.
    framework.name = "hello-world"
    driver = MesosSchedulerDriver(
        HelloWorldScheduler(),
        framework,
        "zk://34.215.39.254/mesos"  # assumes running on the master
    )
    driver.run()