Ejemplo n.º 1
0
class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    help = 'Launches the Scale scheduler'

    def add_arguments(self, parser):
        parser.add_argument('-m',
                            '--master',
                            action='store',
                            default=settings.MESOS_MASTER,
                            help='The master to connect to')

    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info('Scale Scheduler %s', settings.VERSION)

        try:
            scheduler_zk = settings.SCHEDULER_ZK
        except:
            scheduler_zk = None

        if scheduler_zk is not None:
            import socket
            from scheduler import cluster_utils
            my_id = socket.gethostname()
            cluster_utils.wait_for_leader(scheduler_zk, my_id,
                                          self.run_scheduler, mesos_master)
        else:
            # leader election is disabled
            self.run_scheduler(mesos_master)

    def run_scheduler(self, mesos_master):
        logger.info("I am the leader")
        self.scheduler = ScaleScheduler()

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = os.getenv('DCOS_PACKAGE_FRAMEWORK_NAME', 'Scale')
        webserver_address = os.getenv('SCALE_WEBSERVER_ADDRESS')

        if webserver_address:
            framework.webui_url = webserver_address

        logger.info('Connecting to Mesos master at %s', mesos_master)

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error(
                    'Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error(
                    'Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework,
                                               mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework,
                                               mesos_master)

        try:
            status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        except:
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        #Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """
        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        try:
            if self.scheduler:
                self.scheduler.shutdown()
        except:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        try:
            if self.driver:
                self.driver.stop()
        except:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1
        return status
Ejemplo n.º 2
0
class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    option_list = BaseCommand.option_list + (
        make_option('-m', '--master', action='store', type='str', default=settings.MESOS_MASTER,
                    help=('The master to connect to')),
    )

    help = 'Launches the Scale scheduler'

    def handle(self, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info('Scale Scheduler %s', settings.VERSION)

        try:
            scheduler_zk = settings.SCHEDULER_ZK
        except:
            scheduler_zk = None

        if scheduler_zk is not None:
            import socket
            from scheduler import cluster_utils
            my_id = socket.gethostname()
            cluster_utils.wait_for_leader(scheduler_zk, my_id, self.run_scheduler, mesos_master)
        else:
            # leader election is disabled
            self.run_scheduler(mesos_master)

    def run_scheduler(self, mesos_master):
        logger.info("I am the leader")
        self.scheduler = ScaleScheduler()

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = 'Scale'

        logger.info('Connecting to Mesos master at %s', mesos_master)

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error('Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error('Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

        try:
            status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1
        except:
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        #Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """
        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        try:
            if self.scheduler:
                self.scheduler.shutdown()
        except:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        try:
            if self.driver:
                self.driver.stop()
        except:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1
        return status
Ejemplo n.º 3
0
class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    help = 'Launches the Scale scheduler'

    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # Set up global shutdown
        global GLOBAL_SHUTDOWN
        GLOBAL_SHUTDOWN = self._shutdown

        logger.info('Scale Scheduler %s', settings.VERSION)

        self.run_scheduler(settings.MESOS_MASTER)

    def run_scheduler(self, mesos_master):
        logger.info("Scale rising...")
        self.scheduler = ScaleScheduler()
        self.scheduler.initialize()
        scheduler_mgr.hostname = socket.getfqdn()

        logger.info('Connecting to Mesos master at %s:', mesos_master)

        # By default use ZK for master detection
        self.client = MesosClient(
            mesos_urls=[settings.MESOS_MASTER],
            # We have to run tasks as root, so docker commands may be executed
            frameworkUser='******',
            frameworkName=settings.FRAMEWORK_NAME,
            frameworkHostname=scheduler_mgr.hostname,
            frameworkWebUI=settings.WEBSERVER_ADDRESS)
        if settings.SERVICE_SECRET:
            # We are in Enterprise mode and using service account
            self.client.set_service_account(json.loads(
                settings.SERVICE_SECRET))
        elif settings.PRINCIPAL and settings.SECRET:
            self.client.set_credentials(settings.PRINCIPAL, settings.SECRET)

        mesos_role = settings.MESOS_ROLE
        logger.info('Launching scheduler with role: %s' % mesos_role)
        self.client.set_role(settings.MESOS_ROLE)

        logger.info('Accepting offers from role: %s' %
                    settings.ACCEPTED_RESOURCE_ROLE)

        self.client.add_capability('GPU_RESOURCES')

        try:
            self.scheduler.run(self.client)
            status = 0
        except:
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        #Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """
        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        try:
            if self.scheduler:
                self.scheduler.shutdown()
        except:
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        return status
Ejemplo n.º 4
0
class Command(BaseCommand):
    '''Command that launches the Scale scheduler
    '''

    option_list = BaseCommand.option_list + (
        make_option('-m', '--master', action='store', type='str', default=settings.MESOS_MASTER,
                    help=('The master to connect to')),
    )

    help = 'Launches the Scale scheduler'

    def handle(self, **options):
        '''See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        '''

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info(u'Command starting: scale_scheduler')
        logger.info(u' - Master: %s', mesos_master)
        executor = mesos_pb2.ExecutorInfo()
        executor.executor_id.value = 'scale'
        executor.command.value = '%s %s scale_executor' % (settings.PYTHON_EXECUTABLE, settings.MANAGE_FILE)
        executor.name = 'Scale Executor (Python)'

        self.scheduler = ScaleScheduler(executor)

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = 'Scale Framework (Python)'

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error('Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error('Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework, mesos_master)

        status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1

        # Perform any required clean up operations like stopping background threads
        status = status or self._shutdown()

        logger.info(u'Command completed: scale_scheduler')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        '''See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        '''
        logger.info(u'Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        '''Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        '''
        status = 0

        try:
            if self.scheduler:
                self.scheduler.shutdown()
        except:
            logger.exception('Failed to properly shutdown scale scheduler.')
            status = 1

        try:
            if self.driver:
                self.driver.stop()
        except:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1
        return status
Ejemplo n.º 5
0
class Command(BaseCommand):
    '''Command that launches the Scale scheduler
    '''

    option_list = BaseCommand.option_list + (make_option(
        '-m',
        '--master',
        action='store',
        type='str',
        default=settings.MESOS_MASTER,
        help=('The master to connect to')), )

    help = 'Launches the Scale scheduler'

    def handle(self, **options):
        '''See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        '''

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # TODO: clean this up
        mesos_master = options.get('master')

        logger.info(u'Command starting: scale_scheduler')
        logger.info(u' - Master: %s', mesos_master)
        executor = mesos_pb2.ExecutorInfo()
        executor.executor_id.value = 'scale'
        executor.command.value = '%s %s scale_executor' % (
            settings.PYTHON_EXECUTABLE, settings.MANAGE_FILE)
        executor.name = 'Scale Executor (Python)'

        try:
            scheduler_zk = settings.SCHEDULER_ZK
        except:
            scheduler_zk = None

        if scheduler_zk is not None:
            import socket
            from scheduler import cluster_utils
            my_id = socket.gethostname()
            cluster_utils.wait_for_leader(scheduler_zk, my_id,
                                          self.run_scheduler, mesos_master,
                                          executor)
        else:
            # leader election is disabled
            self.run_scheduler(mesos_master, executor)

    def run_scheduler(self, mesos_master, executor):
        logger.info("I am the leader")
        self.scheduler = ScaleScheduler(executor)

        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''  # Have Mesos fill in the current user.
        framework.name = 'Scale Framework (Python)'

        # TODO(vinod): Make checkpointing the default when it is default on the slave.
        if MESOS_CHECKPOINT:
            logger.info('Enabling checkpoint for the framework')
            framework.checkpoint = True

        if MESOS_AUTHENTICATE:
            logger.info('Enabling authentication for the framework')

            if not DEFAULT_PRINCIPLE:
                logger.error(
                    'Expecting authentication principal in the environment')
                sys.exit(1)

            if not DEFAULT_SECRET:
                logger.error(
                    'Expecting authentication secret in the environment')
                sys.exit(1)

            credential = mesos_pb2.Credential()
            credential.principal = DEFAULT_PRINCIPLE
            credential.secret = DEFAULT_SECRET

            self.driver = MesosSchedulerDriver(self.scheduler, framework,
                                               mesos_master, credential)
        else:
            self.driver = MesosSchedulerDriver(self.scheduler, framework,
                                               mesos_master)

        status = 0 if self.driver.run() == mesos_pb2.DRIVER_STOPPED else 1

        # Perform any required clean up operations like stopping background threads
        status = status or self._shutdown()

        logger.info(u'Command completed: scale_scheduler')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        '''See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        '''
        logger.info(u'Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        '''Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        '''
        status = 0

        try:
            if self.scheduler:
                self.scheduler.shutdown()
        except:
            logger.exception('Failed to properly shutdown scale scheduler.')
            status = 1

        try:
            if self.driver:
                self.driver.stop()
        except:
            logger.exception('Failed to properly stop Mesos driver.')
            status = 1
        return status