コード例 #1
0
ファイル: slave.py プロジェクト: zerolugithub/ClusterRunner
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._is_in_shutdown_mode = False
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._logger = log.get_logger(__name__)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
            'is_in_shutdown_mode': self._is_in_shutdown_mode,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        If the slave is in shutdown mode, clear the build_id, kill the slave, and raise an error.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception(
                'Trying to mark slave idle while {} executors still in use.',
                self._num_executors_in_use.value())

        self.current_build_id = None

        if self._is_in_shutdown_mode:
            self.kill()
            raise SlaveMarkedForShutdownError

    def setup(self, build, executor_start_index):
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :type build: Build
        :param executor_start_index: The index the slave should number its executors from for this build
        :type executor_start_index: int
        """
        slave_project_type_params = build.build_request.build_parameters(
        ).copy()
        slave_project_type_params.update(
            build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': executor_start_index,
        }

        self.current_build_id = build.build_id()
        self._network.post_with_digest(setup_url, post_data, Secret.get())

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if self.is_alive():
            teardown_url = self._slave_api.url('build', self.current_build_id,
                                               'teardown')
            self._network.post(teardown_url)
        else:
            self._logger.notice(
                'Teardown request to slave {} was not sent since slave is disconnected.',
                self.url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise DeadSlaveError(
                'Tried to start a subjob on a dead slave! ({}, id: {})'.format(
                    self.url, self.id))

        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError(
                'Tried to start a subjob on a slave in shutdown mode. ({}, id: {})'
                .format(self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob, )).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        response = self._network.post_with_digest(execution_url,
                                                  post_data,
                                                  Secret.get(),
                                                  error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception(
                'Cannot claim executor on slave {}. No executors left.'.format(
                    self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception(
                'Cannot free executor on slave {}. All are free.'.format(
                    self.url))
        return new_count

    def is_alive(self, use_cached=True):
        """
        Is the slave API responsive?

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :type use_cached: bool
        :rtype: bool
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url())

            if not response.ok:
                self._is_alive = False
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data[
                        'slave']:
                    self._logger.warning(
                        '{}\'s API is missing key slave[\'is_alive\'].',
                        self.url)
                    self._is_alive = False
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning(
                        '{}\'s API key \'is_alive\' is not a boolean.',
                        self.url)
                    self._is_alive = False
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except requests.exceptions.ConnectionError:
            self._logger.warning('Slave with url {} is offline.', self.url)
            self._is_alive = False

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value

    def set_shutdown_mode(self):
        """
        Mark this slave as being in shutdown mode.  Slaves in shutdown mode will not get new subjobs and will be
        killed when they finish teardown, or killed immediately if they are not processing a build.
        """
        self._is_in_shutdown_mode = True
        if self.current_build_id is None:
            self.kill()

    def is_shutdown(self):
        """
        Whether the slave is in shutdown mode.
        """
        return self._is_in_shutdown_mode

    def kill(self):
        """
        Instructs the slave process to kill itself.
        """
        kill_url = self._slave_api.url('kill')
        self._network.post_with_digest(kill_url, {}, Secret.get())
        self.mark_dead()

    def mark_dead(self):
        """
        Marks the slave dead.
        """
        self.set_is_alive(False)
        self.current_build_id = None
コード例 #2
0
class ClusterSlave(object):

    API_VERSION = 'v1'

    def __init__(self, port, host, num_executors=10):
        """
        :param port: The port number the slave service is running on
        :type port: int
        :param host: The hostname at which the slave is reachable
        :type host: str
        :param num_executors: The number of executors this slave should operate with -- this determines how many
            concurrent subjobs the slave can execute.
        :type num_executors: int
        """
        self.port = port
        self.host = host
        self.is_alive = True
        self._slave_id = None
        self._num_executors = num_executors
        self._logger = log.get_logger(__name__)

        self._idle_executors = Queue(maxsize=num_executors)
        self.executors_by_id = {}
        for executor_id in range(num_executors):
            executor = SubjobExecutor(executor_id)
            self._idle_executors.put(executor)
            self.executors_by_id[executor_id] = executor

        self._master_url = None
        self._network = Network(min_connection_poolsize=num_executors)
        self._master_api = None  # wait until we connect to a master first

        self._project_type = None  # this will be instantiated during build setup
        self._current_build_id = None
        self._build_teardown_coin = None

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        executors_representation = [executor.api_representation() for executor in self.executors_by_id.values()]
        return {
            'is_alive': self.is_alive,
            'master_url': self._master_url,
            'current_build_id': self._current_build_id,
            'slave_id': self._slave_id,
            'executors': executors_representation,
        }

    def get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        """
        return 'Slave service is up. <Port: {}>'.format(self.port)

    def setup_build(self, build_id, project_type_params, build_executor_start_index):
        """
        Usually called once per build to do build-specific setup. Will block any subjobs from executing until setup
        completes. The actual setup is performed on another thread and will unblock subjobs (via an Event) once it
        finishes.

        :param build_id: The id of the build to run setup on
        :type build_id: int
        :param project_type_params: The parameters that define the project_type this build will execute in
        :type project_type_params: dict
        :param build_executor_start_index: How many executors have alreayd been allocated on other slaves for
        this build
        :type build_executor_start_index: int
        """
        self._logger.info('Executing setup for build {} (type: {}).', build_id, project_type_params.get('type'))
        self._current_build_id = build_id
        self._build_teardown_coin = SingleUseCoin()  # protects against build_teardown being executed multiple times

        # create an project_type instance for build-level operations
        self._project_type = util.create_project_type(project_type_params)

        # verify all executors are idle
        if not self._idle_executors.full():
            raise RuntimeError('Slave tried to setup build but not all executors are idle. ({}/{} executors idle.)'
                               .format(self._idle_executors.qsize(), self._num_executors))

        # Collect all the executors to pass to project_type.fetch_project(). This will create a new project_type for
        # each executor (for subjob-level operations).
        executors = list(self._idle_executors.queue)
        SafeThread(
            target=self._async_setup_build,
            name='Bld{}-Setup'.format(build_id),
            args=(executors, project_type_params, build_executor_start_index)
        ).start()

    def _async_setup_build(self, executors, project_type_params, build_executor_start_index):
        """
        Called from setup_build(). Do asynchronous setup for the build so that we can make the call to setup_build()
        non-blocking.

        :type executors: list[SubjobExecutor]
        :type project_type_params: dict
        :type build_executor_start_index: int
        """
        self._base_executor_index = build_executor_start_index
        try:
            self._project_type.fetch_project()
            for executor in executors:
                executor.configure_project_type(project_type_params)
            self._project_type.run_job_config_setup()

        except SetupFailureError as ex:
            self._logger.error(ex)
            self._logger.info('Notifying master that build setup has failed for build {}.', self._current_build_id)
            self._notify_master_of_state_change(SlaveState.SETUP_FAILED)

        else:
            self._logger.info('Notifying master that build setup is complete for build {}.', self._current_build_id)
            self._notify_master_of_state_change(SlaveState.SETUP_COMPLETED)

    def teardown_build(self, build_id=None):
        """
        Called at the end of each build on each slave before it reports back to the master that it is idle again.

        :param build_id: The build id to teardown -- this parameter is used solely for correctness checking of the
            master, to make sure that the master is not erroneously sending teardown commands for other builds.
        :type build_id: int | None
        """
        if self._current_build_id is None:
            raise BadRequestError('Tried to teardown a build but no build is active on this slave.')

        if build_id is not None and build_id != self._current_build_id:
            raise BadRequestError('Tried to teardown build {}, '
                                  'but slave is running build {}!'.format(build_id, self._current_build_id))
        SafeThread(
            target=self._async_teardown_build,
            name='Bld{}-Teardwn'.format(build_id)
        ).start()

    def _async_teardown_build(self):
        """
        Called from teardown_build(). Do asynchronous teardown for the build so that we can make the call to
        teardown_build() non-blocking. Also take care of posting back to the master when teardown is complete.
        """
        self._do_build_teardown_and_reset()
        while not self._idle_executors.full():
            time.sleep(1)
        self._send_master_idle_notification()

    def _do_build_teardown_and_reset(self, timeout=None):
        """
        Kill any currently running subjobs. Run the teardown_build commands for the current build (with an optional
        timeout). Clear attributes related to the currently running build.

        :param timeout: A maximum time in seconds to allow the teardown process to run before killing
        :type timeout: int | None
        """
        # Kill all subjob executors' processes. This only has an effect if we are tearing down before a build completes.
        for executor in self.executors_by_id.values():
            executor.kill()

        # Order matters! Spend the coin if it has been initialized.
        if not self._build_teardown_coin or not self._build_teardown_coin.spend() or not self._project_type:
            return  # There is no build to tear down or teardown is already in progress.

        self._logger.info('Executing teardown for build {}.', self._current_build_id)
        # todo: Catch exceptions raised during teardown_build so we don't skip notifying master of idle/disconnect.
        self._project_type.teardown_build(timeout=timeout)
        self._logger.info('Build teardown complete for build {}.', self._current_build_id)
        self._current_build_id = None
        self._project_type = None

    def _send_master_idle_notification(self):
        if not self._is_master_responsive():
            self._logger.notice('Could not post idle notification to master because master is unresponsive.')
            return

        # Notify master that this slave is finished with teardown and ready for a new build.
        self._logger.info('Notifying master that this slave is ready for new builds.')
        self._notify_master_of_state_change(SlaveState.IDLE)

    def _disconnect_from_master(self):
        """
        Perform internal bookkeeping, as well as notify the master, that this slave is disconnecting itself
        from the slave pool.
        """
        self.is_alive = False

        if not self._is_master_responsive():
            self._logger.notice('Could not post disconnect notification to master because master is unresponsive.')
            return

        # Notify master that this slave is shutting down and should not receive new builds.
        self._logger.info('Notifying master that this slave is disconnecting.')
        self._notify_master_of_state_change(SlaveState.DISCONNECTED)

    def connect_to_master(self, master_url=None):
        """
        Notify the master that this slave exists.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str | None
        """
        self.is_alive = True
        self._master_url = master_url or 'localhost:43000'
        self._master_api = UrlBuilder(self._master_url)
        connect_url = self._master_api.url('slave')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'num_executors': self._num_executors,
        }
        response = self._network.post(connect_url, data=data)
        self._slave_id = int(response.json().get('slave_id'))
        self._logger.info('Slave {}:{} connected to master on {}.', self.host, self.port, self._master_url)

        # We disconnect from the master before build_teardown so that the master stops sending subjobs. (Teardown
        # callbacks are executed in the reverse order that they're added, so we add the build_teardown callback first.)
        UnhandledExceptionHandler.singleton().add_teardown_callback(self._do_build_teardown_and_reset, timeout=30)
        UnhandledExceptionHandler.singleton().add_teardown_callback(self._disconnect_from_master)

    def _is_master_responsive(self):
        """
        Ping the master to check if it is still alive. Code using this method should treat the return value as a
        *probable* truth since the state of the master can change at any time. This method is not a replacement for
        error handling.

        :return: Whether the master is responsive or not
        :rtype: bool
        """
        # todo: This method repeats some logic we have in the deployment code (checking a service). We should DRY it up.
        is_responsive = True
        try:
            self._network.get(self._master_api.url())
        except requests.ConnectionError:
            is_responsive = False

        return is_responsive

    def start_working_on_subjob(self, build_id, subjob_id, subjob_artifact_dir, atomic_commands):
        """
        Begin working on a subjob with the given build id and subjob id. This just starts the subjob execution
        asynchronously on a separate thread.

        :type build_id: int
        :type subjob_id: int
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        :return: The text to return in the API response.
        :rtype: dict[str, int]
        """
        if build_id != self._current_build_id:
            raise BadRequestError('Attempted to start subjob {} for build {}, '
                                  'but current build id is {}.'.format(subjob_id, build_id, self._current_build_id))

        # get idle executor from queue to claim it as in-use (or block until one is available)
        executor = self._idle_executors.get()

        # Start a thread to execute the job (after waiting for setup to complete)
        SafeThread(
            target=self._execute_subjob,
            args=(build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands),
            name='Bld{}-Sub{}'.format(build_id, subjob_id),
        ).start()

        self._logger.info('Slave ({}:{}) has received subjob. (Build {}, Subjob {})', self.host, self.port, build_id,
                          subjob_id)
        return {'executor_id': executor.id}

    def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        """
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands,
                                               self._base_executor_index)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }
        files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)

    def _notify_master_of_state_change(self, new_state):
        """
        Send a state notification to the master. This is used to notify the master of events occurring on the slave
        related to build execution progress.

        :type new_state: SlaveState
        """
        state_url = self._master_api.url('slave', self._slave_id)
        self._network.put_with_digest(state_url, request_params={'slave': {'state': new_state}},
                                      secret=Secret.get(), error_on_failure=True)

    def kill(self):
        """
        Exits without error.
        """
        sys.exit(0)
コード例 #3
0
class ClusterSlave(object):

    API_VERSION = 'v1'

    def __init__(self, port, host, num_executors=10):
        """
        :param port: The port number the slave service is running on
        :type port: int
        :param host: The hostname at which the slave is reachable
        :type host: str
        :param num_executors: The number of executors this slave should operate with -- this determines how many
            concurrent subjobs the slave can execute.
        :type num_executors: int
        """
        self.port = port
        self.host = host
        self._slave_id = None
        self._num_executors = num_executors
        self._logger = log.get_logger(__name__)

        self._idle_executors = Queue(maxsize=num_executors)
        self.executors = {}
        for executor_id in range(num_executors):
            executor = SubjobExecutor(executor_id)
            self._idle_executors.put(executor)
            self.executors[executor_id] = executor

        self._setup_complete_event = Event()
        self._master_url = None
        self._network = Network(min_connection_poolsize=num_executors)
        self._master_api = None  # wait until we connect to a master first

        self._project_type = None  # this will be instantiated during build setup
        self._current_build_id = None

        UnhandledExceptionHandler.singleton().add_teardown_callback(self._async_teardown_build,
                                                                    should_disconnect_from_master=True)

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        executors_representation = [executor.api_representation() for executor in self.executors.values()]
        return {
            'connected': str(self._is_connected()),
            'master_url': self._master_url,
            'setup_complete': str(self._setup_complete_event.isSet()),
            'slave_id': self._slave_id,
            'executors': executors_representation,
        }

    def _is_connected(self):
        return self._master_url is not None

    def get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        """
        return 'Slave service is up. <Port: {}>'.format(self.port)

    def setup_build(self, build_id, project_type_params):
        """
        Usually called once per build to do build-specific setup. Will block any subjobs from executing until setup
        completes. The actual setup is performed on another thread and will unblock subjobs (via an Event) once it
        finishes.

        :param build_id: The id of the build to run setup on
        :type build_id: int
        :param project_type_params: The parameters that define the project_type this build will execute in
        :type project_type_params: dict
        """
        self._logger.info('Executing setup for build {} (type: {}).', build_id, project_type_params.get('type'))
        self._setup_complete_event.clear()
        self._current_build_id = build_id

        # create an project_type instance for build-level operations
        self._project_type = util.create_project_type(project_type_params)

        # verify all executors are idle
        if not self._idle_executors.full():
            raise RuntimeError('Slave tried to setup build but not all executors are idle. ({}/{} executors idle.)'
                               .format(self._idle_executors.qsize(), self._num_executors))

        # Collect all the executors to pass to project_type.setup_build(). This will create a new project_type for
        # each executor (for subjob-level operations).
        executors = list(self._idle_executors.queue)
        SafeThread(target=self._async_setup_build, args=(executors, project_type_params)).start()

    def _async_setup_build(self, executors, project_type_params):
        """
        Called from setup_build(). Do asynchronous setup for the build so that we can make the call to setup_build()
        non-blocking.
        """
        # todo(joey): It's strange that the project_type is setting up the executors, which in turn set up projects.
        # todo(joey): I think this can be untangled a bit -- we should call executor.configure_project_type() here.
        self._project_type.setup_build(executors, project_type_params)

        self._logger.info('Build setup complete for build {}.', self._current_build_id)
        self._setup_complete_event.set()  # free any subjob threads that are waiting for setup to complete

    def teardown_build(self, build_id=None):
        """
        Called at the end of each build on each slave before it reports back to the master that it is idle again.

        :param build_id: The build id to teardown -- this parameter is used solely for correctness checking of the
            master, to make sure that the master is not erroneously sending teardown commands for other builds.
        :type build_id: int | None
        """
        if self._current_build_id is None:
            raise BadRequestError('Tried to teardown a build but no build is active on this slave.')

        if build_id is not None and build_id != self._current_build_id:
            raise BadRequestError('Tried to teardown build {}, '
                                  'but slave is running build {}!'.format(build_id, self._current_build_id))

        self._logger.info('Executing teardown for build {}.', self._current_build_id)

        SafeThread(target=self._async_teardown_build).start()

    def _async_teardown_build(self, should_disconnect_from_master=False):
        """
        Called from teardown_build(). Do asynchronous teardown for the build so that we can make the call to
        teardown_build() non-blocking. Also take care of posting back to the master when teardown is complete.
        """
        if self._project_type:
            self._project_type.teardown_build()
            self._logger.info('Build teardown complete for build {}.', self._current_build_id)
            self._current_build_id = None
            self._project_type = None

        if not should_disconnect_from_master:
            # report back to master that this slave is finished with teardown and ready for a new build
            self._logger.info('Notifying master that this slave is ready for new builds.')
            idle_url = self._master_api.url('slave', self._slave_id, 'idle')
            response = self._network.post(idle_url)
            if response.status_code != http.client.OK:
                raise RuntimeError("Could not post teardown completion to master at {}".format(idle_url))

        elif self._is_master_responsive():
            # report back to master that this slave is shutting down and should not receive new builds
            self._logger.info('Notifying master to disconnect this slave.')
            disconnect_url = self._master_api.url('slave', self._slave_id, 'disconnect')
            response = self._network.post(disconnect_url)
            if response.status_code != http.client.OK:
                self._logger.error('Could not post disconnect notification to master at {}'.format(disconnect_url))

    def connect_to_master(self, master_url=None):
        """
        Notify the master that this slave exists.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str
        """
        self._master_url = master_url or 'localhost:43000'
        self._master_api = UrlBuilder(self._master_url)
        connect_url = self._master_api.url('slave')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'num_executors': self._num_executors,
        }
        response = self._network.post(connect_url, data)
        self._slave_id = int(response.json().get('slave_id'))
        self._logger.info('Slave {}:{} connected to master on {}.', self.host, self.port, self._master_url)

    def _is_master_responsive(self):
        """
        Ping the master to check if it is still alive. Code using this method should treat the return value as a
        *probable* truth since the state of the master can change at any time. This method is not a replacement for
        error handling.

        :return: Whether the master is responsive or not
        :rtype: bool
        """
        # todo: This method repeats some logic we have in the deployment code (checking a service). We should DRY it up.
        is_responsive = True
        try:
            self._network.get(self._master_api.url())
        except requests.ConnectionError:
            is_responsive = False

        return is_responsive

    def start_working_on_subjob(self, build_id, subjob_id, subjob_artifact_dir, atomic_commands):
        """
        Begin working on a subjob with the given build id and subjob id. This just starts the subjob execution
        asynchronously on a separate thread.

        :type build_id: int
        :type subjob_id: int
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        :return: The text to return in the API response.
        :rtype: dict[str, int]
        """
        if build_id != self._current_build_id:
            raise BadRequestError('Attempted to start subjob {} for build {}, '
                                  'but current build id is {}.'.format(subjob_id, build_id, self._current_build_id))

        # get idle executor from queue to claim it as in-use (or block until one is available)
        executor = self._idle_executors.get()

        # Start a thread to execute the job (after waiting for setup to complete)
        SafeThread(
            target=self._execute_subjob,
            args=(build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands),
            name='Build{}-Sub{}'.format(build_id, subjob_id),
        ).start()

        self._logger.info('Slave ({}:{}) has received subjob. (Build {}, Subjob {})', self.host, self.port, build_id,
                          subjob_id)
        return {'executor_id': executor.id}

    def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        """
        self._logger.debug('Waiting for setup to complete (Build {}, Subjob {})...', build_id, subjob_id)
        self._setup_complete_event.wait()  # block until setup completes
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }
        files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)

    def kill(self):
        # TODO(dtran): Kill the threads and this server more gracefully
        sys.exit(0)
コード例 #4
0
ファイル: slave.py プロジェクト: drobertduke/ClusterRunner
class Slave(object):

    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self.is_alive = True
        self._slave_api = UrlBuilder(slave_url, app.master.cluster_master.ClusterMaster.API_VERSION)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build(),
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception('Trying to mark slave idle while {} executors still in use.',
                            self._num_executors_in_use.value())

        self.current_build_id = None

    def setup(self, build_id, project_type_params):
        """
        Execute a setup command on the slave for the specified build. The command is executed asynchronously from the
        perspective of this method, but any subjobs will block until the slave finishes executing the setup command.

        :param build_id: The build id that this setup command is for.
        :type build_id: int

        :param project_type_params: The parameters that define the project type this build will execute in
        :typeproject_type_paramss: dict
        """
        setup_url = self._slave_api.url('build', build_id, 'setup')
        post_data = {
            'project_type_params': project_type_params,
        }
        self._network.post_with_digest(setup_url, post_data, Secret.get())

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        teardown_url = self._slave_api.url('build', self.current_build_id, 'teardown')
        self._network.post(teardown_url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive:
            raise RuntimeError('Tried to start a subjob on a dead slave! ({}, id: {})'.format(self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob,)).start()
        self.current_build_id = subjob.build_id()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {
            'subjob_artifact_dir': subjob.artifact_dir(),
            'atomic_commands': subjob.atomic_commands(),
        }
        response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_url=self.url)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception('Cannot claim executor on slave {}. No executors left.'.format(self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception('Cannot free executor on slave {}. All are free.'.format(self.url))
        return new_count

    def current_build(self):
        """
        :return:
        :rtype: int|None
        """
        return self.current_build_id
コード例 #5
0
ファイル: slave.py プロジェクト: thinhnd8752/ClusterRunner
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors, slave_session_id=None):
        """
        :type slave_url: str
        :type num_executors: int
        :type slave_session_id: str
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._is_in_shutdown_mode = False
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._session_id = slave_session_id
        self._logger = log.get_logger(__name__)

    def __str__(self):
        return '<slave #{} - {}>'.format(self.id, self.url)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'session_id': self._session_id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
            'is_in_shutdown_mode': self._is_in_shutdown_mode,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        If the slave is in shutdown mode, clear the build_id, kill the slave, and raise an error.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception(
                'Trying to mark slave idle while {} executors still in use.',
                self._num_executors_in_use.value())

        self.current_build_id = None

        if self._is_in_shutdown_mode:
            self.kill()
            raise SlaveMarkedForShutdownError

    def setup(self, build: Build, executor_start_index: int) -> bool:
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :param executor_start_index: The index the slave should number its executors from for this build
        :return: Whether or not the call to start setup on the slave was successful
        """
        slave_project_type_params = build.build_request.build_parameters(
        ).copy()
        slave_project_type_params.update(
            build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': executor_start_index,
        }

        self.current_build_id = build.build_id()
        try:
            self._network.post_with_digest(setup_url, post_data, Secret.get())
        except (requests.ConnectionError, requests.Timeout) as ex:
            self._logger.warning('Setup call to {} failed with {}: {}.', self,
                                 ex.__class__.__name__, str(ex))
            self.mark_dead()
            return False
        return True

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if not self.is_alive():
            self._logger.notice(
                'Teardown request to slave {} was not sent since slave is disconnected.',
                self.url)
            return

        teardown_url = self._slave_api.url('build', self.current_build_id,
                                           'teardown')
        try:
            self._network.post(teardown_url)
        except (requests.ConnectionError, requests.Timeout):
            self._logger.warning(
                'Teardown request to slave failed because slave is unresponsive.'
            )
            self.mark_dead()

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise DeadSlaveError(
                'Tried to start a subjob on a dead slave! ({}, id: {})'.format(
                    self.url, self.id))

        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError(
                'Tried to start a subjob on a slave in shutdown mode. ({}, id: {})'
                .format(self.url, self.id))

        # todo: This should not be a SafeThread. https://github.com/box/ClusterRunner/issues/337
        SafeThread(target=self._async_start_subjob, args=(subjob, )).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        response = self._network.post_with_digest(execution_url,
                                                  post_data,
                                                  Secret.get(),
                                                  error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception(
                'Cannot claim executor on slave {}. No executors left.'.format(
                    self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception(
                'Cannot free executor on slave {}. All are free.'.format(
                    self.url))
        return new_count

    def is_alive(self, use_cached: bool = True) -> bool:
        """
        Is the slave API responsive?

        Note that if the slave API responds but its session id does not match the one we've stored in this
        instance, then this method will still return false.

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :return: Whether or not the slave is alive
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(
                self._slave_api.url(), headers=self._expected_session_header())

            if not response.ok:
                self.mark_dead()
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data[
                        'slave']:
                    self._logger.warning(
                        '{}\'s API is missing key slave[\'is_alive\'].',
                        self.url)
                    self.mark_dead()
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning(
                        '{}\'s API key \'is_alive\' is not a boolean.',
                        self.url)
                    self.mark_dead()
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except (requests.ConnectionError, requests.Timeout):
            self.mark_dead()

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value

    def set_shutdown_mode(self):
        """
        Mark this slave as being in shutdown mode.  Slaves in shutdown mode will not get new subjobs and will be
        killed when they finish teardown, or killed immediately if they are not processing a build.
        """
        self._is_in_shutdown_mode = True
        if self.current_build_id is None:
            self.kill()

    def is_shutdown(self):
        """
        Whether the slave is in shutdown mode.
        """
        return self._is_in_shutdown_mode

    def kill(self):
        """
        Instruct the slave process to kill itself.
        """
        self._logger.notice('Killing {}', self)
        kill_url = self._slave_api.url('kill')
        try:
            self._network.post_with_digest(kill_url, {}, Secret.get())
        except (requests.ConnectionError, requests.Timeout):
            pass
        self.mark_dead()

    def mark_dead(self):
        """
        Mark the slave dead.
        """
        self._logger.warning('{} has gone offline. Last build: {}', self,
                             self.current_build_id)
        self._is_alive = False
        self.current_build_id = None
        self._network.reset_session(
        )  # Close any pooled connections for this slave.

    def _expected_session_header(self):
        """
        Return headers that should be sent with slave requests to verify that the master is still talking to
        the same slave service that it originally connected to.

        Note that adding these headers to existing requests may add new failure cases (e.g., slave API would
        start returning a 412) so we should make sure all potential 412 errors are handled appropriately when
        adding these headers to existing requests.

        :rtype: dict
        """
        headers = {}
        if self._session_id:
            headers[SessionId.EXPECTED_SESSION_HEADER_KEY] = self._session_id

        return headers
コード例 #6
0
ファイル: slave.py プロジェクト: box/ClusterRunner
class Slave:
    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors, slave_session_id=None):
        """
        :type slave_url: str
        :type num_executors: int
        :type slave_session_id: str
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._last_heartbeat_time = datetime.now()
        self._is_alive = True
        self._is_in_shutdown_mode = False
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._session_id = slave_session_id
        self._logger = log.get_logger(__name__)

    def __str__(self):
        return '<slave #{} - {}>'.format(self.id, self.url)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'session_id': self._session_id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
            'is_in_shutdown_mode': self._is_in_shutdown_mode,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        If the slave is in shutdown mode, clear the build_id, kill the slave, and raise an error.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception('Trying to mark slave idle while {} executors still in use.',
                            self._num_executors_in_use.value())

        self.current_build_id = None

        if self._is_in_shutdown_mode:
            self.kill()
            self._remove_slave_from_registry()
            raise SlaveMarkedForShutdownError

    def setup(self, build: Build, executor_start_index: int) -> bool:
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :param executor_start_index: The index the slave should number its executors from for this build
        :return: Whether or not the call to start setup on the slave was successful
        """
        slave_project_type_params = build.build_request.build_parameters().copy()
        slave_project_type_params.update(build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': executor_start_index,
        }

        self.current_build_id = build.build_id()
        try:
            self._network.post_with_digest(setup_url, post_data, Secret.get())
        except (requests.ConnectionError, requests.Timeout) as ex:
            self._logger.warning('Setup call to {} failed with {}: {}.', self, ex.__class__.__name__, str(ex))
            self.mark_dead()
            return False
        return True

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if not self.is_alive():
            self._logger.notice('Teardown request to slave {} was not sent since slave is disconnected.', self.url)
            return

        teardown_url = self._slave_api.url('build', self.current_build_id, 'teardown')
        try:
            self._network.post(teardown_url)
        except (requests.ConnectionError, requests.Timeout):
            self._logger.warning('Teardown request to slave failed because slave is unresponsive.')
            self.mark_dead()

    def start_subjob(self, subjob: Subjob):
        """
        Send a subjob of a build to this slave. The slave must have already run setup for the corresponding build.
        :param subjob: The subjob to send to this slave
        """
        if not self.is_alive():
            raise DeadSlaveError('Tried to start a subjob on a dead slave.')
        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError('Tried to start a subjob on a slave in shutdown mode.')

        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        try:
            response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)
        except (requests.ConnectionError, requests.Timeout, RequestFailedError) as ex:
            raise SlaveCommunicationError('Call to slave service failed: {}.'.format(repr(ex))) from ex

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception('Cannot claim executor on slave {}. No executors left.'.format(self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception('Cannot free executor on slave {}. All are free.'.format(self.url))
        return new_count

    def is_alive(self, use_cached: bool=True) -> bool:
        """
        Is the slave API responsive?

        Note that if the slave API responds but its session id does not match the one we've stored in this
        instance, then this method will still return false.

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :return: Whether or not the slave is alive
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url(), headers=self._expected_session_header())

            if not response.ok:
                self.mark_dead()
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data['slave']:
                    self._logger.warning('{}\'s API is missing key slave[\'is_alive\'].', self.url)
                    self.mark_dead()
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning('{}\'s API key \'is_alive\' is not a boolean.', self.url)
                    self.mark_dead()
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except (requests.ConnectionError, requests.Timeout):
            self.mark_dead()

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value

    def set_shutdown_mode(self):
        """
        Mark this slave as being in shutdown mode.  Slaves in shutdown mode will not get new subjobs and will be
        killed and removed from slave registry when they finish teardown, or
        killed and removed from slave registry immediately if they are not processing a build.
        """
        self._is_in_shutdown_mode = True
        if self.current_build_id is None:
            self.kill()
            self._remove_slave_from_registry()

    def is_shutdown(self):
        """
        Whether the slave is in shutdown mode.
        """
        return self._is_in_shutdown_mode

    def kill(self):
        """
        Instruct the slave process to kill itself.
        """
        self._logger.notice('Killing {}', self)
        kill_url = self._slave_api.url('kill')
        try:
            self._network.post_with_digest(kill_url, {}, Secret.get())
        except (requests.ConnectionError, requests.Timeout):
            pass
        self.mark_dead()

    def mark_dead(self):
        """
        Mark the slave dead.
        """
        self._logger.warning('{} has gone offline. Last build: {}', self, self.current_build_id)
        self._is_alive = False
        self.current_build_id = None
        self._network.reset_session()  # Close any pooled connections for this slave.

    def _expected_session_header(self):
        """
        Return headers that should be sent with slave requests to verify that the master is still talking to
        the same slave service that it originally connected to.

        Note that adding these headers to existing requests may add new failure cases (e.g., slave API would
        start returning a 412) so we should make sure all potential 412 errors are handled appropriately when
        adding these headers to existing requests.

        :rtype: dict
        """
        headers = {}
        if self._session_id:
            headers[SessionId.EXPECTED_SESSION_HEADER_KEY] = self._session_id

        return headers

    def update_last_heartbeat_time(self):
        self._last_heartbeat_time = datetime.now()

    def get_last_heartbeat_time(self) -> datetime:
        return self._last_heartbeat_time

    def _remove_slave_from_registry(self):
        """
        Remove shutdown-ed slave from SlaveRegistry.
        """
        self._logger.info('Removing slave (url={}; id={}) from Slave Registry.'.format(self.url, self.id))
        SlaveRegistry.singleton().remove_slave(slave_url=self.url)
コード例 #7
0
ファイル: slave.py プロジェクト: OspreyX/ClusterRunner
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._logger = log.get_logger(__name__)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception('Trying to mark slave idle while {} executors still in use.',
                            self._num_executors_in_use.value())

        self.current_build_id = None

    def setup(self, build):
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :type build: Build
        """
        slave_project_type_params = build.build_request.build_parameters().copy()
        slave_project_type_params.update(build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': build.num_executors_allocated,
        }
        self._network.post_with_digest(setup_url, post_data, Secret.get())
        self.current_build_id = build.build_id()

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if self.is_alive():
            teardown_url = self._slave_api.url('build', self.current_build_id, 'teardown')
            self._network.post(teardown_url)
        else:
            self._logger.notice('Teardown request to slave {} was not sent since slave is disconnected.', self.url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise RuntimeError('Tried to start a subjob on a dead slave! ({}, id: {})'.format(self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob,)).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {
            'subjob_artifact_dir': subjob.artifact_dir(),
            'atomic_commands': subjob.atomic_commands(),
        }
        response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception('Cannot claim executor on slave {}. No executors left.'.format(self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception('Cannot free executor on slave {}. All are free.'.format(self.url))
        return new_count

    def is_alive(self, use_cached=True):
        """
        Is the slave API responsive?

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :type use_cached: bool
        :rtype: bool
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url())

            if not response.ok:
                self._is_alive = False
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data['slave']:
                    self._logger.warning('{}\'s API is missing key slave[\'is_alive\'].', self.url)
                    self._is_alive = False
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning('{}\'s API key \'is_alive\' is not a boolean.', self.url)
                    self._is_alive = False
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except requests.exceptions.ConnectionError:
            self._logger.warning('Slave with url {} is offline.', self.url)
            self._is_alive = False

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value
コード例 #8
0
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._logger = log.get_logger(__name__)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception(
                'Trying to mark slave idle while {} executors still in use.',
                self._num_executors_in_use.value())

        self.current_build_id = None

    def setup(self, build_id, project_type_params):
        """
        Execute a setup command on the slave for the specified build. The command is executed asynchronously from the
        perspective of this method, but any subjobs will block until the slave finishes executing the setup command.

        :param build_id: The build id that this setup command is for.
        :type build_id: int

        :param project_type_params: The parameters that define the project type this build will execute in
        :type project_type_params: dict
        """
        setup_url = self._slave_api.url('build', build_id, 'setup')
        slave_project_type_params = util.project_type_params_for_slave(
            project_type_params)
        post_data = {
            'project_type_params': slave_project_type_params,
        }
        self._network.post_with_digest(setup_url, post_data, Secret.get())
        self.current_build_id = build_id

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if self.is_alive():
            teardown_url = self._slave_api.url('build', self.current_build_id,
                                               'teardown')
            self._network.post(teardown_url)
        else:
            self._logger.notice(
                'Teardown request to slave {} was not sent since slave is disconnected.',
                self.url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise RuntimeError(
                'Tried to start a subjob on a dead slave! ({}, id: {})'.format(
                    self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob, )).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {
            'subjob_artifact_dir': subjob.artifact_dir(),
            'atomic_commands': subjob.atomic_commands(),
        }
        response = self._network.post_with_digest(execution_url,
                                                  post_data,
                                                  Secret.get(),
                                                  error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception(
                'Cannot claim executor on slave {}. No executors left.'.format(
                    self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception(
                'Cannot free executor on slave {}. All are free.'.format(
                    self.url))
        return new_count

    def is_alive(self, use_cached=True):
        """
        Is the slave API responsive?

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :type use_cached: bool
        :rtype: bool
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url())

            if not response.ok:
                self._is_alive = False
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data[
                        'slave']:
                    self._logger.warning(
                        '{}\'s API is missing key slave[\'is_alive\'].',
                        self.url)
                    self._is_alive = False
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning(
                        '{}\'s API key \'is_alive\' is not a boolean.',
                        self.url)
                    self._is_alive = False
                else:
                    self._is_alive = response_data['slave']['is_alive']

        except ConnectionError:
            self._logger.warning('Slave with url {} is offline.', self.url)
            self._is_alive = False

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value