def is_up(self, service_url, timeout=0.1):
        """
        Checks if the service is up
        :type service_url: string
        :type timeout: float
        :rtype: bool
        """
        network = Network()
        timeout_time = time.time() + timeout
        while True:
            try:
                resp = network.get('http://{}'.format(service_url), timeout=timeout)
                if resp and resp.ok:
                    return True
            except (requests.RequestException, ConnectionError):
                pass
            if time.time() > timeout_time:
                break
            time.sleep(0.5)

        return False
    def is_up(self, service_url, timeout=0.1):
        """
        Checks if the service is up
        :type service_url: string
        :type timeout: float
        :rtype: bool
        """
        network = Network()
        timeout_time = time.time() + timeout
        while True:
            try:
                resp = network.get('http://{}'.format(service_url), timeout=timeout)
                if resp and resp.ok:
                    return True
            except (requests.RequestException, ConnectionError):
                pass
            if time.time() > timeout_time:
                break
            time.sleep(0.5)

        return False
Beispiel #3
0
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._is_in_shutdown_mode = False
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._logger = log.get_logger(__name__)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
            'is_in_shutdown_mode': self._is_in_shutdown_mode,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        If the slave is in shutdown mode, clear the build_id, kill the slave, and raise an error.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception(
                'Trying to mark slave idle while {} executors still in use.',
                self._num_executors_in_use.value())

        self.current_build_id = None

        if self._is_in_shutdown_mode:
            self.kill()
            raise SlaveMarkedForShutdownError

    def setup(self, build, executor_start_index):
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :type build: Build
        :param executor_start_index: The index the slave should number its executors from for this build
        :type executor_start_index: int
        """
        slave_project_type_params = build.build_request.build_parameters(
        ).copy()
        slave_project_type_params.update(
            build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': executor_start_index,
        }

        self.current_build_id = build.build_id()
        self._network.post_with_digest(setup_url, post_data, Secret.get())

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if self.is_alive():
            teardown_url = self._slave_api.url('build', self.current_build_id,
                                               'teardown')
            self._network.post(teardown_url)
        else:
            self._logger.notice(
                'Teardown request to slave {} was not sent since slave is disconnected.',
                self.url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise DeadSlaveError(
                'Tried to start a subjob on a dead slave! ({}, id: {})'.format(
                    self.url, self.id))

        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError(
                'Tried to start a subjob on a slave in shutdown mode. ({}, id: {})'
                .format(self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob, )).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        response = self._network.post_with_digest(execution_url,
                                                  post_data,
                                                  Secret.get(),
                                                  error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception(
                'Cannot claim executor on slave {}. No executors left.'.format(
                    self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception(
                'Cannot free executor on slave {}. All are free.'.format(
                    self.url))
        return new_count

    def is_alive(self, use_cached=True):
        """
        Is the slave API responsive?

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :type use_cached: bool
        :rtype: bool
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url())

            if not response.ok:
                self._is_alive = False
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data[
                        'slave']:
                    self._logger.warning(
                        '{}\'s API is missing key slave[\'is_alive\'].',
                        self.url)
                    self._is_alive = False
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning(
                        '{}\'s API key \'is_alive\' is not a boolean.',
                        self.url)
                    self._is_alive = False
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except requests.exceptions.ConnectionError:
            self._logger.warning('Slave with url {} is offline.', self.url)
            self._is_alive = False

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value

    def set_shutdown_mode(self):
        """
        Mark this slave as being in shutdown mode.  Slaves in shutdown mode will not get new subjobs and will be
        killed when they finish teardown, or killed immediately if they are not processing a build.
        """
        self._is_in_shutdown_mode = True
        if self.current_build_id is None:
            self.kill()

    def is_shutdown(self):
        """
        Whether the slave is in shutdown mode.
        """
        return self._is_in_shutdown_mode

    def kill(self):
        """
        Instructs the slave process to kill itself.
        """
        kill_url = self._slave_api.url('kill')
        self._network.post_with_digest(kill_url, {}, Secret.get())
        self.mark_dead()

    def mark_dead(self):
        """
        Marks the slave dead.
        """
        self.set_is_alive(False)
        self.current_build_id = None
class ClusterSlave(object):

    API_VERSION = 'v1'

    def __init__(self, port, host, num_executors=10):
        """
        :param port: The port number the slave service is running on
        :type port: int
        :param host: The hostname at which the slave is reachable
        :type host: str
        :param num_executors: The number of executors this slave should operate with -- this determines how many
            concurrent subjobs the slave can execute.
        :type num_executors: int
        """
        self.port = port
        self.host = host
        self._slave_id = None
        self._num_executors = num_executors
        self._logger = log.get_logger(__name__)

        self._idle_executors = Queue(maxsize=num_executors)
        self.executors = {}
        for executor_id in range(num_executors):
            executor = SubjobExecutor(executor_id)
            self._idle_executors.put(executor)
            self.executors[executor_id] = executor

        self._setup_complete_event = Event()
        self._master_url = None
        self._network = Network(min_connection_poolsize=num_executors)
        self._master_api = None  # wait until we connect to a master first

        self._project_type = None  # this will be instantiated during build setup
        self._current_build_id = None

        UnhandledExceptionHandler.singleton().add_teardown_callback(self._async_teardown_build,
                                                                    should_disconnect_from_master=True)

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        executors_representation = [executor.api_representation() for executor in self.executors.values()]
        return {
            'connected': str(self._is_connected()),
            'master_url': self._master_url,
            'setup_complete': str(self._setup_complete_event.isSet()),
            'slave_id': self._slave_id,
            'executors': executors_representation,
        }

    def _is_connected(self):
        return self._master_url is not None

    def get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        """
        return 'Slave service is up. <Port: {}>'.format(self.port)

    def setup_build(self, build_id, project_type_params):
        """
        Usually called once per build to do build-specific setup. Will block any subjobs from executing until setup
        completes. The actual setup is performed on another thread and will unblock subjobs (via an Event) once it
        finishes.

        :param build_id: The id of the build to run setup on
        :type build_id: int
        :param project_type_params: The parameters that define the project_type this build will execute in
        :type project_type_params: dict
        """
        self._logger.info('Executing setup for build {} (type: {}).', build_id, project_type_params.get('type'))
        self._setup_complete_event.clear()
        self._current_build_id = build_id

        # create an project_type instance for build-level operations
        self._project_type = util.create_project_type(project_type_params)

        # verify all executors are idle
        if not self._idle_executors.full():
            raise RuntimeError('Slave tried to setup build but not all executors are idle. ({}/{} executors idle.)'
                               .format(self._idle_executors.qsize(), self._num_executors))

        # Collect all the executors to pass to project_type.setup_build(). This will create a new project_type for
        # each executor (for subjob-level operations).
        executors = list(self._idle_executors.queue)
        SafeThread(target=self._async_setup_build, args=(executors, project_type_params)).start()

    def _async_setup_build(self, executors, project_type_params):
        """
        Called from setup_build(). Do asynchronous setup for the build so that we can make the call to setup_build()
        non-blocking.
        """
        # todo(joey): It's strange that the project_type is setting up the executors, which in turn set up projects.
        # todo(joey): I think this can be untangled a bit -- we should call executor.configure_project_type() here.
        self._project_type.setup_build(executors, project_type_params)

        self._logger.info('Build setup complete for build {}.', self._current_build_id)
        self._setup_complete_event.set()  # free any subjob threads that are waiting for setup to complete

    def teardown_build(self, build_id=None):
        """
        Called at the end of each build on each slave before it reports back to the master that it is idle again.

        :param build_id: The build id to teardown -- this parameter is used solely for correctness checking of the
            master, to make sure that the master is not erroneously sending teardown commands for other builds.
        :type build_id: int | None
        """
        if self._current_build_id is None:
            raise BadRequestError('Tried to teardown a build but no build is active on this slave.')

        if build_id is not None and build_id != self._current_build_id:
            raise BadRequestError('Tried to teardown build {}, '
                                  'but slave is running build {}!'.format(build_id, self._current_build_id))

        self._logger.info('Executing teardown for build {}.', self._current_build_id)

        SafeThread(target=self._async_teardown_build).start()

    def _async_teardown_build(self, should_disconnect_from_master=False):
        """
        Called from teardown_build(). Do asynchronous teardown for the build so that we can make the call to
        teardown_build() non-blocking. Also take care of posting back to the master when teardown is complete.
        """
        if self._project_type:
            self._project_type.teardown_build()
            self._logger.info('Build teardown complete for build {}.', self._current_build_id)
            self._current_build_id = None
            self._project_type = None

        if not should_disconnect_from_master:
            # report back to master that this slave is finished with teardown and ready for a new build
            self._logger.info('Notifying master that this slave is ready for new builds.')
            idle_url = self._master_api.url('slave', self._slave_id, 'idle')
            response = self._network.post(idle_url)
            if response.status_code != http.client.OK:
                raise RuntimeError("Could not post teardown completion to master at {}".format(idle_url))

        elif self._is_master_responsive():
            # report back to master that this slave is shutting down and should not receive new builds
            self._logger.info('Notifying master to disconnect this slave.')
            disconnect_url = self._master_api.url('slave', self._slave_id, 'disconnect')
            response = self._network.post(disconnect_url)
            if response.status_code != http.client.OK:
                self._logger.error('Could not post disconnect notification to master at {}'.format(disconnect_url))

    def connect_to_master(self, master_url=None):
        """
        Notify the master that this slave exists.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str
        """
        self._master_url = master_url or 'localhost:43000'
        self._master_api = UrlBuilder(self._master_url)
        connect_url = self._master_api.url('slave')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'num_executors': self._num_executors,
        }
        response = self._network.post(connect_url, data)
        self._slave_id = int(response.json().get('slave_id'))
        self._logger.info('Slave {}:{} connected to master on {}.', self.host, self.port, self._master_url)

    def _is_master_responsive(self):
        """
        Ping the master to check if it is still alive. Code using this method should treat the return value as a
        *probable* truth since the state of the master can change at any time. This method is not a replacement for
        error handling.

        :return: Whether the master is responsive or not
        :rtype: bool
        """
        # todo: This method repeats some logic we have in the deployment code (checking a service). We should DRY it up.
        is_responsive = True
        try:
            self._network.get(self._master_api.url())
        except requests.ConnectionError:
            is_responsive = False

        return is_responsive

    def start_working_on_subjob(self, build_id, subjob_id, subjob_artifact_dir, atomic_commands):
        """
        Begin working on a subjob with the given build id and subjob id. This just starts the subjob execution
        asynchronously on a separate thread.

        :type build_id: int
        :type subjob_id: int
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        :return: The text to return in the API response.
        :rtype: dict[str, int]
        """
        if build_id != self._current_build_id:
            raise BadRequestError('Attempted to start subjob {} for build {}, '
                                  'but current build id is {}.'.format(subjob_id, build_id, self._current_build_id))

        # get idle executor from queue to claim it as in-use (or block until one is available)
        executor = self._idle_executors.get()

        # Start a thread to execute the job (after waiting for setup to complete)
        SafeThread(
            target=self._execute_subjob,
            args=(build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands),
            name='Build{}-Sub{}'.format(build_id, subjob_id),
        ).start()

        self._logger.info('Slave ({}:{}) has received subjob. (Build {}, Subjob {})', self.host, self.port, build_id,
                          subjob_id)
        return {'executor_id': executor.id}

    def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        """
        self._logger.debug('Waiting for setup to complete (Build {}, Subjob {})...', build_id, subjob_id)
        self._setup_complete_event.wait()  # block until setup completes
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }
        files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)

    def kill(self):
        # TODO(dtran): Kill the threads and this server more gracefully
        sys.exit(0)
Beispiel #5
0
class ClusterSlave(object):

    API_VERSION = 'v1'

    def __init__(self, port, host, num_executors=10):
        """
        :param port: The port number the slave service is running on
        :type port: int
        :param host: The hostname at which the slave is reachable
        :type host: str
        :param num_executors: The number of executors this slave should operate with -- this determines how many
            concurrent subjobs the slave can execute.
        :type num_executors: int
        """
        self.port = port
        self.host = host
        self.is_alive = True
        self._slave_id = None
        self._num_executors = num_executors
        self._logger = log.get_logger(__name__)

        self._idle_executors = Queue(maxsize=num_executors)
        self.executors_by_id = {}
        for executor_id in range(num_executors):
            executor = SubjobExecutor(executor_id)
            self._idle_executors.put(executor)
            self.executors_by_id[executor_id] = executor

        self._master_url = None
        self._network = Network(min_connection_poolsize=num_executors)
        self._master_api = None  # wait until we connect to a master first

        self._project_type = None  # this will be instantiated during build setup
        self._current_build_id = None
        self._build_teardown_coin = None

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        executors_representation = [executor.api_representation() for executor in self.executors_by_id.values()]
        return {
            'is_alive': self.is_alive,
            'master_url': self._master_url,
            'current_build_id': self._current_build_id,
            'slave_id': self._slave_id,
            'executors': executors_representation,
        }

    def get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        """
        return 'Slave service is up. <Port: {}>'.format(self.port)

    def setup_build(self, build_id, project_type_params, build_executor_start_index):
        """
        Usually called once per build to do build-specific setup. Will block any subjobs from executing until setup
        completes. The actual setup is performed on another thread and will unblock subjobs (via an Event) once it
        finishes.

        :param build_id: The id of the build to run setup on
        :type build_id: int
        :param project_type_params: The parameters that define the project_type this build will execute in
        :type project_type_params: dict
        :param build_executor_start_index: How many executors have alreayd been allocated on other slaves for
        this build
        :type build_executor_start_index: int
        """
        self._logger.info('Executing setup for build {} (type: {}).', build_id, project_type_params.get('type'))
        self._current_build_id = build_id
        self._build_teardown_coin = SingleUseCoin()  # protects against build_teardown being executed multiple times

        # create an project_type instance for build-level operations
        self._project_type = util.create_project_type(project_type_params)

        # verify all executors are idle
        if not self._idle_executors.full():
            raise RuntimeError('Slave tried to setup build but not all executors are idle. ({}/{} executors idle.)'
                               .format(self._idle_executors.qsize(), self._num_executors))

        # Collect all the executors to pass to project_type.fetch_project(). This will create a new project_type for
        # each executor (for subjob-level operations).
        executors = list(self._idle_executors.queue)
        SafeThread(
            target=self._async_setup_build,
            name='Bld{}-Setup'.format(build_id),
            args=(executors, project_type_params, build_executor_start_index)
        ).start()

    def _async_setup_build(self, executors, project_type_params, build_executor_start_index):
        """
        Called from setup_build(). Do asynchronous setup for the build so that we can make the call to setup_build()
        non-blocking.

        :type executors: list[SubjobExecutor]
        :type project_type_params: dict
        :type build_executor_start_index: int
        """
        self._base_executor_index = build_executor_start_index
        try:
            self._project_type.fetch_project()
            for executor in executors:
                executor.configure_project_type(project_type_params)
            self._project_type.run_job_config_setup()

        except SetupFailureError as ex:
            self._logger.error(ex)
            self._logger.info('Notifying master that build setup has failed for build {}.', self._current_build_id)
            self._notify_master_of_state_change(SlaveState.SETUP_FAILED)

        else:
            self._logger.info('Notifying master that build setup is complete for build {}.', self._current_build_id)
            self._notify_master_of_state_change(SlaveState.SETUP_COMPLETED)

    def teardown_build(self, build_id=None):
        """
        Called at the end of each build on each slave before it reports back to the master that it is idle again.

        :param build_id: The build id to teardown -- this parameter is used solely for correctness checking of the
            master, to make sure that the master is not erroneously sending teardown commands for other builds.
        :type build_id: int | None
        """
        if self._current_build_id is None:
            raise BadRequestError('Tried to teardown a build but no build is active on this slave.')

        if build_id is not None and build_id != self._current_build_id:
            raise BadRequestError('Tried to teardown build {}, '
                                  'but slave is running build {}!'.format(build_id, self._current_build_id))
        SafeThread(
            target=self._async_teardown_build,
            name='Bld{}-Teardwn'.format(build_id)
        ).start()

    def _async_teardown_build(self):
        """
        Called from teardown_build(). Do asynchronous teardown for the build so that we can make the call to
        teardown_build() non-blocking. Also take care of posting back to the master when teardown is complete.
        """
        self._do_build_teardown_and_reset()
        while not self._idle_executors.full():
            time.sleep(1)
        self._send_master_idle_notification()

    def _do_build_teardown_and_reset(self, timeout=None):
        """
        Kill any currently running subjobs. Run the teardown_build commands for the current build (with an optional
        timeout). Clear attributes related to the currently running build.

        :param timeout: A maximum time in seconds to allow the teardown process to run before killing
        :type timeout: int | None
        """
        # Kill all subjob executors' processes. This only has an effect if we are tearing down before a build completes.
        for executor in self.executors_by_id.values():
            executor.kill()

        # Order matters! Spend the coin if it has been initialized.
        if not self._build_teardown_coin or not self._build_teardown_coin.spend() or not self._project_type:
            return  # There is no build to tear down or teardown is already in progress.

        self._logger.info('Executing teardown for build {}.', self._current_build_id)
        # todo: Catch exceptions raised during teardown_build so we don't skip notifying master of idle/disconnect.
        self._project_type.teardown_build(timeout=timeout)
        self._logger.info('Build teardown complete for build {}.', self._current_build_id)
        self._current_build_id = None
        self._project_type = None

    def _send_master_idle_notification(self):
        if not self._is_master_responsive():
            self._logger.notice('Could not post idle notification to master because master is unresponsive.')
            return

        # Notify master that this slave is finished with teardown and ready for a new build.
        self._logger.info('Notifying master that this slave is ready for new builds.')
        self._notify_master_of_state_change(SlaveState.IDLE)

    def _disconnect_from_master(self):
        """
        Perform internal bookkeeping, as well as notify the master, that this slave is disconnecting itself
        from the slave pool.
        """
        self.is_alive = False

        if not self._is_master_responsive():
            self._logger.notice('Could not post disconnect notification to master because master is unresponsive.')
            return

        # Notify master that this slave is shutting down and should not receive new builds.
        self._logger.info('Notifying master that this slave is disconnecting.')
        self._notify_master_of_state_change(SlaveState.DISCONNECTED)

    def connect_to_master(self, master_url=None):
        """
        Notify the master that this slave exists.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str | None
        """
        self.is_alive = True
        self._master_url = master_url or 'localhost:43000'
        self._master_api = UrlBuilder(self._master_url)
        connect_url = self._master_api.url('slave')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'num_executors': self._num_executors,
        }
        response = self._network.post(connect_url, data=data)
        self._slave_id = int(response.json().get('slave_id'))
        self._logger.info('Slave {}:{} connected to master on {}.', self.host, self.port, self._master_url)

        # We disconnect from the master before build_teardown so that the master stops sending subjobs. (Teardown
        # callbacks are executed in the reverse order that they're added, so we add the build_teardown callback first.)
        UnhandledExceptionHandler.singleton().add_teardown_callback(self._do_build_teardown_and_reset, timeout=30)
        UnhandledExceptionHandler.singleton().add_teardown_callback(self._disconnect_from_master)

    def _is_master_responsive(self):
        """
        Ping the master to check if it is still alive. Code using this method should treat the return value as a
        *probable* truth since the state of the master can change at any time. This method is not a replacement for
        error handling.

        :return: Whether the master is responsive or not
        :rtype: bool
        """
        # todo: This method repeats some logic we have in the deployment code (checking a service). We should DRY it up.
        is_responsive = True
        try:
            self._network.get(self._master_api.url())
        except requests.ConnectionError:
            is_responsive = False

        return is_responsive

    def start_working_on_subjob(self, build_id, subjob_id, subjob_artifact_dir, atomic_commands):
        """
        Begin working on a subjob with the given build id and subjob id. This just starts the subjob execution
        asynchronously on a separate thread.

        :type build_id: int
        :type subjob_id: int
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        :return: The text to return in the API response.
        :rtype: dict[str, int]
        """
        if build_id != self._current_build_id:
            raise BadRequestError('Attempted to start subjob {} for build {}, '
                                  'but current build id is {}.'.format(subjob_id, build_id, self._current_build_id))

        # get idle executor from queue to claim it as in-use (or block until one is available)
        executor = self._idle_executors.get()

        # Start a thread to execute the job (after waiting for setup to complete)
        SafeThread(
            target=self._execute_subjob,
            args=(build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands),
            name='Bld{}-Sub{}'.format(build_id, subjob_id),
        ).start()

        self._logger.info('Slave ({}:{}) has received subjob. (Build {}, Subjob {})', self.host, self.port, build_id,
                          subjob_id)
        return {'executor_id': executor.id}

    def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        """
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands,
                                               self._base_executor_index)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }
        files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)

    def _notify_master_of_state_change(self, new_state):
        """
        Send a state notification to the master. This is used to notify the master of events occurring on the slave
        related to build execution progress.

        :type new_state: SlaveState
        """
        state_url = self._master_api.url('slave', self._slave_id)
        self._network.put_with_digest(state_url, request_params={'slave': {'state': new_state}},
                                      secret=Secret.get(), error_on_failure=True)

    def kill(self):
        """
        Exits without error.
        """
        sys.exit(0)
Beispiel #6
0
class BuildRunner(object):
    """
    BuildRunner is a procedure-oriented class intended to be used in the context of a script. This class provides
    functionality to synchronously execute a build on the ClusterRunner, wait for it to complete, and collect the
    build results.

    Example usage pattern:
    >>> runner = BuildRunner('http://mymaster.net:123', {'type':'git', 'url':'https://github.com/box/StatusWolf.git'})
    >>> runner.run()
    """

    API_VERSION = 'v1'

    def __init__(self, master_url, request_params, secret):
        """
        :param master_url: The url of the master which the build will be executed on
        :type master_url: str
        :param request_params: A dict of request params that will be json-encoded and sent in the build request
        :type request_params: dict
        :type secret: str
        """
        self._master_url = self._ensure_url_has_scheme(master_url)
        self._request_params = request_params
        self._secret = secret
        self._build_id = None
        self._network = Network()
        self._logger = get_logger(__name__)
        self._last_build_status_details = None
        self._master_api = UrlBuilder(master_url, self.API_VERSION)
        self._cluster_master_api_client = ClusterMasterAPIClient(master_url)

    def run(self):
        """
        Send the build request to the master, wait for the build to finish, then download the build artifacts.

        :return: Whether or not we were successful in running the build. (Note this does *not* indicate the success or
            faulure of the build itself; that is determined by the contents of the build artifacts which should be
            parsed elsewhere.)
        :rtype: bool
        """
        try:
            self._start_build()
            result = self._block_until_finished()
            self._download_and_extract_results()
            return result

        except _BuildRunnerError as ex:
            self._logger.error(str(ex))
            self._logger.warning('Script aborted due to error!')
            self._cancel_build()
            return False

    def _cancel_build(self):
        """
        Request the master cancels the build.
        """
        if self._build_id is not None:
            self._logger.warning('Cancelling build {}'.format(self._build_id))
            self._cluster_master_api_client.cancel_build(self._build_id)

    def _start_build(self):
        """
        Send the build request to the master for execution.
        """
        build_url = self._master_api.url('build')
        # todo: catch connection error
        response = self._network.post_with_digest(build_url, self._request_params, self._secret, error_on_failure=True)
        response_data = response.json()

        if 'error' in response_data:
            error_message = response_data['error']
            raise _BuildRunnerError('Error starting build: ' + error_message)

        self._build_id = response_data['build_id']

        UnhandledExceptionHandler.singleton().add_teardown_callback(self._cancel_build)
        self._logger.info('Build is running. (Build id: {})', self._build_id)

    def _block_until_finished(self, timeout=None):
        """
        Poll the build status endpoint until the build is finished or until the timeout is reached.

        :param timeout: The maximum number of seconds to wait until giving up, or None for no timeout
        :type timeout: int|None
        """
        timeout_time = time.time() + timeout if timeout else sys.maxsize
        build_status_url = self._master_api.url('build', self._build_id)
        self._logger.debug('Polling build status url: {}', build_status_url)

        while time.time() <= timeout_time:
            response = self._network.get(build_status_url)
            response_data = response.json()

            if 'build' not in response_data or 'status' not in response_data['build']:
                raise _BuildRunnerError('Status response does not contain a "build" object with a "status" value.'
                                        'URL: {}, Content:{}'.format(build_status_url, response_data))

            build_data = response_data['build']
            if build_data['status'] == BuildStatus.FINISHED:
                self._logger.info('Build is finished. (Build id: {})', self._build_id)
                completion_message = 'Build {} result was {}'.format(self._build_id, build_data['result'])
                is_success = build_data['result'] == BuildResult.NO_FAILURES
                if is_success:
                    self._logger.info(completion_message)
                else:
                    self._logger.error(completion_message)
                    if build_data['failed_atoms']:
                        self._logger.error('These atoms had non-zero exit codes (failures):')
                        for failure in build_data['failed_atoms']:
                            self._logger.error(failure)
                    return False

                return True

            if build_data['status'] == BuildStatus.ERROR:
                message = 'Build aborted due to error: {}'.format(build_data.get('error_message'))
                raise _BuildRunnerError(message)

            if build_data['status'] == BuildStatus.BUILDING:
                if build_data['details'] != self._last_build_status_details:
                    self._last_build_status_details = build_data['details']
                    self._logger.info(build_data['details'])

            time.sleep(1)

        raise _BuildRunnerError('Build timed out after {} seconds.'.format(timeout))

    def _download_and_extract_results(self, timeout=None):
        """
        Download the result files for the build.
        """
        timeout_time = time.time() + timeout if timeout else sys.maxsize

        download_artifacts_url = self._master_api.url('build', self._build_id, 'result')
        download_filepath = 'build_results/artifacts.tar.gz'
        download_dir, _ = os.path.split(download_filepath)

        # remove any previous build artifacts
        if os.path.exists(download_dir):
            shutil.rmtree(download_dir)

        while time.time() <= timeout_time:
            response = self._network.get(download_artifacts_url)
            if response.status_code == http.client.OK:
                # save tar file to disk, decompress, and delete
                app.util.fs.create_dir(download_dir)
                with open(download_filepath, 'wb') as file:
                    chunk_size = 500 * 1024
                    for chunk in response.iter_content(chunk_size):
                        file.write(chunk)

                app.util.fs.extract_tar(download_filepath, delete=True)
                return

            time.sleep(1)

        raise _BuildRunnerError('Build timed out after {} seconds.'.format(timeout))

    def _ensure_url_has_scheme(self, url):
        """
        If url does not start with 'http' or 'https', add 'http://' to the beginning.
        :type url: str
        :rtype: str
        """
        url = url.strip()
        if not url.startswith('http'):
            url = 'http://' + url
        return url
Beispiel #7
0
class BuildRunner(object):
    """
    BuildRunner is a procedure-oriented class intended to be used in the context of a script. This class provides
    functionality to synchronously execute a build on the ClusterRunner, wait for it to complete, and collect the
    build results.

    Example usage pattern:
    >>> runner = BuildRunner('http://mymaster.net:123', {'type':'git', 'url':'https://github.com/box/StatusWolf.git'})
    >>> runner.run()
    """

    API_VERSION = 'v1'

    def __init__(self, master_url, request_params, secret):
        """
        :param master_url: The url of the master which the build will be executed on
        :type master_url: str
        :param request_params: A dict of request params that will be json-encoded and sent in the build request
        :type request_params: dict
        :type secret: str
        """
        self._master_url = self._ensure_url_has_scheme(master_url)
        self._request_params = request_params
        self._secret = secret
        self._build_id = None
        self._network = Network()
        self._logger = get_logger(__name__)
        self._last_build_status_details = None
        self._master_api = UrlBuilder(master_url, self.API_VERSION)
        self._cluster_master_api_client = ClusterMasterAPIClient(master_url)

    def run(self):
        """
        Send the build request to the master, wait for the build to finish, then download the build artifacts.

        :return: Whether or not we were successful in running the build. (Note this does *not* indicate the success or
            faulure of the build itself; that is determined by the contents of the build artifacts which should be
            parsed elsewhere.)
        :rtype: bool
        """
        try:
            self._start_build()
            result = self._block_until_finished()
            self._download_and_extract_results()
            return result

        except _BuildRunnerError as ex:
            self._logger.error(str(ex))
            self._logger.warning('Script aborted due to error!')
            self._cancel_build()
            return False

    def _cancel_build(self):
        """
        Request the master cancels the build.
        """
        if self._build_id is not None:
            self._logger.warning('Cancelling build {}'.format(self._build_id))
            self._cluster_master_api_client.cancel_build(self._build_id)

    def _start_build(self):
        """
        Send the build request to the master for execution.
        """
        build_url = self._master_api.url('build')
        # todo: catch connection error
        response = self._network.post_with_digest(build_url,
                                                  self._request_params,
                                                  self._secret,
                                                  error_on_failure=True)
        response_data = response.json()

        if 'error' in response_data:
            error_message = response_data['error']
            raise _BuildRunnerError('Error starting build: ' + error_message)

        self._build_id = response_data['build_id']

        UnhandledExceptionHandler.singleton().add_teardown_callback(
            self._cancel_build)
        self._logger.info('Build is running. (Build id: {})', self._build_id)

    def _block_until_finished(self, timeout=None):
        """
        Poll the build status endpoint until the build is finished or until the timeout is reached.

        :param timeout: The maximum number of seconds to wait until giving up, or None for no timeout
        :type timeout: int|None
        """
        timeout_time = time.time() + timeout if timeout else sys.maxsize
        build_status_url = self._master_api.url('build', self._build_id)
        self._logger.debug('Polling build status url: {}', build_status_url)

        while time.time() <= timeout_time:
            response = self._network.get(build_status_url)
            response_data = response.json()

            if 'build' not in response_data or 'status' not in response_data[
                    'build']:
                raise _BuildRunnerError(
                    'Status response does not contain a "build" object with a "status" value.'
                    'URL: {}, Content:{}'.format(build_status_url,
                                                 response_data))

            build_data = response_data['build']
            if build_data['status'] == BuildStatus.FINISHED:
                self._logger.info('Build is finished. (Build id: {})',
                                  self._build_id)
                completion_message = 'Build {} result was {}'.format(
                    self._build_id, build_data['result'])
                is_success = build_data['result'] == BuildResult.NO_FAILURES
                if is_success:
                    self._logger.info(completion_message)
                else:
                    self._logger.error(completion_message)
                    if build_data['failed_atoms']:
                        self._logger.error(
                            'These atoms had non-zero exit codes (failures):')
                        for failure in build_data['failed_atoms']:
                            self._logger.error(failure)
                    return False

                return True

            if build_data['status'] == BuildStatus.ERROR:
                message = 'Build aborted due to error: {}'.format(
                    build_data.get('error_message'))
                raise _BuildRunnerError(message)

            if build_data['status'] == BuildStatus.BUILDING:
                if build_data['details'] != self._last_build_status_details:
                    self._last_build_status_details = build_data['details']
                    self._logger.info(build_data['details'])

            time.sleep(1)

        raise _BuildRunnerError(
            'Build timed out after {} seconds.'.format(timeout))

    def _download_and_extract_results(self, timeout=None):
        """
        Download the result files for the build.
        """
        timeout_time = time.time() + timeout if timeout else sys.maxsize

        download_artifacts_url = self._master_api.url('build', self._build_id,
                                                      'result')
        download_filepath = 'build_results/artifacts.tar.gz'
        download_dir, _ = os.path.split(download_filepath)

        # remove any previous build artifacts
        if os.path.exists(download_dir):
            shutil.rmtree(download_dir)

        while time.time() <= timeout_time:
            response = self._network.get(download_artifacts_url)
            if response.status_code == http.client.OK:
                # save tar file to disk, decompress, and delete
                app.util.fs.create_dir(download_dir)
                with open(download_filepath, 'wb') as file:
                    chunk_size = 500 * 1024
                    for chunk in response.iter_content(chunk_size):
                        file.write(chunk)

                app.util.fs.extract_tar(download_filepath, delete=True)
                return

            time.sleep(1)

        raise _BuildRunnerError(
            'Build timed out after {} seconds.'.format(timeout))

    def _ensure_url_has_scheme(self, url):
        """
        If url does not start with 'http' or 'https', add 'http://' to the beginning.
        :type url: str
        :rtype: str
        """
        url = url.strip()
        if not url.startswith('http'):
            url = 'http://' + url
        return url
Beispiel #8
0
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors, slave_session_id=None):
        """
        :type slave_url: str
        :type num_executors: int
        :type slave_session_id: str
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._is_in_shutdown_mode = False
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._session_id = slave_session_id
        self._logger = log.get_logger(__name__)

    def __str__(self):
        return '<slave #{} - {}>'.format(self.id, self.url)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'session_id': self._session_id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
            'is_in_shutdown_mode': self._is_in_shutdown_mode,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        If the slave is in shutdown mode, clear the build_id, kill the slave, and raise an error.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception(
                'Trying to mark slave idle while {} executors still in use.',
                self._num_executors_in_use.value())

        self.current_build_id = None

        if self._is_in_shutdown_mode:
            self.kill()
            raise SlaveMarkedForShutdownError

    def setup(self, build: Build, executor_start_index: int) -> bool:
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :param executor_start_index: The index the slave should number its executors from for this build
        :return: Whether or not the call to start setup on the slave was successful
        """
        slave_project_type_params = build.build_request.build_parameters(
        ).copy()
        slave_project_type_params.update(
            build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': executor_start_index,
        }

        self.current_build_id = build.build_id()
        try:
            self._network.post_with_digest(setup_url, post_data, Secret.get())
        except (requests.ConnectionError, requests.Timeout) as ex:
            self._logger.warning('Setup call to {} failed with {}: {}.', self,
                                 ex.__class__.__name__, str(ex))
            self.mark_dead()
            return False
        return True

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if not self.is_alive():
            self._logger.notice(
                'Teardown request to slave {} was not sent since slave is disconnected.',
                self.url)
            return

        teardown_url = self._slave_api.url('build', self.current_build_id,
                                           'teardown')
        try:
            self._network.post(teardown_url)
        except (requests.ConnectionError, requests.Timeout):
            self._logger.warning(
                'Teardown request to slave failed because slave is unresponsive.'
            )
            self.mark_dead()

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise DeadSlaveError(
                'Tried to start a subjob on a dead slave! ({}, id: {})'.format(
                    self.url, self.id))

        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError(
                'Tried to start a subjob on a slave in shutdown mode. ({}, id: {})'
                .format(self.url, self.id))

        # todo: This should not be a SafeThread. https://github.com/box/ClusterRunner/issues/337
        SafeThread(target=self._async_start_subjob, args=(subjob, )).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        response = self._network.post_with_digest(execution_url,
                                                  post_data,
                                                  Secret.get(),
                                                  error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception(
                'Cannot claim executor on slave {}. No executors left.'.format(
                    self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception(
                'Cannot free executor on slave {}. All are free.'.format(
                    self.url))
        return new_count

    def is_alive(self, use_cached: bool = True) -> bool:
        """
        Is the slave API responsive?

        Note that if the slave API responds but its session id does not match the one we've stored in this
        instance, then this method will still return false.

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :return: Whether or not the slave is alive
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(
                self._slave_api.url(), headers=self._expected_session_header())

            if not response.ok:
                self.mark_dead()
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data[
                        'slave']:
                    self._logger.warning(
                        '{}\'s API is missing key slave[\'is_alive\'].',
                        self.url)
                    self.mark_dead()
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning(
                        '{}\'s API key \'is_alive\' is not a boolean.',
                        self.url)
                    self.mark_dead()
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except (requests.ConnectionError, requests.Timeout):
            self.mark_dead()

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value

    def set_shutdown_mode(self):
        """
        Mark this slave as being in shutdown mode.  Slaves in shutdown mode will not get new subjobs and will be
        killed when they finish teardown, or killed immediately if they are not processing a build.
        """
        self._is_in_shutdown_mode = True
        if self.current_build_id is None:
            self.kill()

    def is_shutdown(self):
        """
        Whether the slave is in shutdown mode.
        """
        return self._is_in_shutdown_mode

    def kill(self):
        """
        Instruct the slave process to kill itself.
        """
        self._logger.notice('Killing {}', self)
        kill_url = self._slave_api.url('kill')
        try:
            self._network.post_with_digest(kill_url, {}, Secret.get())
        except (requests.ConnectionError, requests.Timeout):
            pass
        self.mark_dead()

    def mark_dead(self):
        """
        Mark the slave dead.
        """
        self._logger.warning('{} has gone offline. Last build: {}', self,
                             self.current_build_id)
        self._is_alive = False
        self.current_build_id = None
        self._network.reset_session(
        )  # Close any pooled connections for this slave.

    def _expected_session_header(self):
        """
        Return headers that should be sent with slave requests to verify that the master is still talking to
        the same slave service that it originally connected to.

        Note that adding these headers to existing requests may add new failure cases (e.g., slave API would
        start returning a 412) so we should make sure all potential 412 errors are handled appropriately when
        adding these headers to existing requests.

        :rtype: dict
        """
        headers = {}
        if self._session_id:
            headers[SessionId.EXPECTED_SESSION_HEADER_KEY] = self._session_id

        return headers
Beispiel #9
0
class Slave:
    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors, slave_session_id=None):
        """
        :type slave_url: str
        :type num_executors: int
        :type slave_session_id: str
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._last_heartbeat_time = datetime.now()
        self._is_alive = True
        self._is_in_shutdown_mode = False
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._session_id = slave_session_id
        self._logger = log.get_logger(__name__)

    def __str__(self):
        return '<slave #{} - {}>'.format(self.id, self.url)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'session_id': self._session_id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
            'is_in_shutdown_mode': self._is_in_shutdown_mode,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        If the slave is in shutdown mode, clear the build_id, kill the slave, and raise an error.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception('Trying to mark slave idle while {} executors still in use.',
                            self._num_executors_in_use.value())

        self.current_build_id = None

        if self._is_in_shutdown_mode:
            self.kill()
            self._remove_slave_from_registry()
            raise SlaveMarkedForShutdownError

    def setup(self, build: Build, executor_start_index: int) -> bool:
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :param executor_start_index: The index the slave should number its executors from for this build
        :return: Whether or not the call to start setup on the slave was successful
        """
        slave_project_type_params = build.build_request.build_parameters().copy()
        slave_project_type_params.update(build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': executor_start_index,
        }

        self.current_build_id = build.build_id()
        try:
            self._network.post_with_digest(setup_url, post_data, Secret.get())
        except (requests.ConnectionError, requests.Timeout) as ex:
            self._logger.warning('Setup call to {} failed with {}: {}.', self, ex.__class__.__name__, str(ex))
            self.mark_dead()
            return False
        return True

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if not self.is_alive():
            self._logger.notice('Teardown request to slave {} was not sent since slave is disconnected.', self.url)
            return

        teardown_url = self._slave_api.url('build', self.current_build_id, 'teardown')
        try:
            self._network.post(teardown_url)
        except (requests.ConnectionError, requests.Timeout):
            self._logger.warning('Teardown request to slave failed because slave is unresponsive.')
            self.mark_dead()

    def start_subjob(self, subjob: Subjob):
        """
        Send a subjob of a build to this slave. The slave must have already run setup for the corresponding build.
        :param subjob: The subjob to send to this slave
        """
        if not self.is_alive():
            raise DeadSlaveError('Tried to start a subjob on a dead slave.')
        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError('Tried to start a subjob on a slave in shutdown mode.')

        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        try:
            response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)
        except (requests.ConnectionError, requests.Timeout, RequestFailedError) as ex:
            raise SlaveCommunicationError('Call to slave service failed: {}.'.format(repr(ex))) from ex

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception('Cannot claim executor on slave {}. No executors left.'.format(self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception('Cannot free executor on slave {}. All are free.'.format(self.url))
        return new_count

    def is_alive(self, use_cached: bool=True) -> bool:
        """
        Is the slave API responsive?

        Note that if the slave API responds but its session id does not match the one we've stored in this
        instance, then this method will still return false.

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :return: Whether or not the slave is alive
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url(), headers=self._expected_session_header())

            if not response.ok:
                self.mark_dead()
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data['slave']:
                    self._logger.warning('{}\'s API is missing key slave[\'is_alive\'].', self.url)
                    self.mark_dead()
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning('{}\'s API key \'is_alive\' is not a boolean.', self.url)
                    self.mark_dead()
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except (requests.ConnectionError, requests.Timeout):
            self.mark_dead()

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value

    def set_shutdown_mode(self):
        """
        Mark this slave as being in shutdown mode.  Slaves in shutdown mode will not get new subjobs and will be
        killed and removed from slave registry when they finish teardown, or
        killed and removed from slave registry immediately if they are not processing a build.
        """
        self._is_in_shutdown_mode = True
        if self.current_build_id is None:
            self.kill()
            self._remove_slave_from_registry()

    def is_shutdown(self):
        """
        Whether the slave is in shutdown mode.
        """
        return self._is_in_shutdown_mode

    def kill(self):
        """
        Instruct the slave process to kill itself.
        """
        self._logger.notice('Killing {}', self)
        kill_url = self._slave_api.url('kill')
        try:
            self._network.post_with_digest(kill_url, {}, Secret.get())
        except (requests.ConnectionError, requests.Timeout):
            pass
        self.mark_dead()

    def mark_dead(self):
        """
        Mark the slave dead.
        """
        self._logger.warning('{} has gone offline. Last build: {}', self, self.current_build_id)
        self._is_alive = False
        self.current_build_id = None
        self._network.reset_session()  # Close any pooled connections for this slave.

    def _expected_session_header(self):
        """
        Return headers that should be sent with slave requests to verify that the master is still talking to
        the same slave service that it originally connected to.

        Note that adding these headers to existing requests may add new failure cases (e.g., slave API would
        start returning a 412) so we should make sure all potential 412 errors are handled appropriately when
        adding these headers to existing requests.

        :rtype: dict
        """
        headers = {}
        if self._session_id:
            headers[SessionId.EXPECTED_SESSION_HEADER_KEY] = self._session_id

        return headers

    def update_last_heartbeat_time(self):
        self._last_heartbeat_time = datetime.now()

    def get_last_heartbeat_time(self) -> datetime:
        return self._last_heartbeat_time

    def _remove_slave_from_registry(self):
        """
        Remove shutdown-ed slave from SlaveRegistry.
        """
        self._logger.info('Removing slave (url={}; id={}) from Slave Registry.'.format(self.url, self.id))
        SlaveRegistry.singleton().remove_slave(slave_url=self.url)
Beispiel #10
0
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._logger = log.get_logger(__name__)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception('Trying to mark slave idle while {} executors still in use.',
                            self._num_executors_in_use.value())

        self.current_build_id = None

    def setup(self, build):
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :type build: Build
        """
        slave_project_type_params = build.build_request.build_parameters().copy()
        slave_project_type_params.update(build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': build.num_executors_allocated,
        }
        self._network.post_with_digest(setup_url, post_data, Secret.get())
        self.current_build_id = build.build_id()

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if self.is_alive():
            teardown_url = self._slave_api.url('build', self.current_build_id, 'teardown')
            self._network.post(teardown_url)
        else:
            self._logger.notice('Teardown request to slave {} was not sent since slave is disconnected.', self.url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise RuntimeError('Tried to start a subjob on a dead slave! ({}, id: {})'.format(self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob,)).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {
            'subjob_artifact_dir': subjob.artifact_dir(),
            'atomic_commands': subjob.atomic_commands(),
        }
        response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception('Cannot claim executor on slave {}. No executors left.'.format(self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception('Cannot free executor on slave {}. All are free.'.format(self.url))
        return new_count

    def is_alive(self, use_cached=True):
        """
        Is the slave API responsive?

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :type use_cached: bool
        :rtype: bool
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url())

            if not response.ok:
                self._is_alive = False
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data['slave']:
                    self._logger.warning('{}\'s API is missing key slave[\'is_alive\'].', self.url)
                    self._is_alive = False
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning('{}\'s API key \'is_alive\' is not a boolean.', self.url)
                    self._is_alive = False
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except requests.exceptions.ConnectionError:
            self._logger.warning('Slave with url {} is offline.', self.url)
            self._is_alive = False

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value
Beispiel #11
0
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._logger = log.get_logger(__name__)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception(
                'Trying to mark slave idle while {} executors still in use.',
                self._num_executors_in_use.value())

        self.current_build_id = None

    def setup(self, build_id, project_type_params):
        """
        Execute a setup command on the slave for the specified build. The command is executed asynchronously from the
        perspective of this method, but any subjobs will block until the slave finishes executing the setup command.

        :param build_id: The build id that this setup command is for.
        :type build_id: int

        :param project_type_params: The parameters that define the project type this build will execute in
        :type project_type_params: dict
        """
        setup_url = self._slave_api.url('build', build_id, 'setup')
        slave_project_type_params = util.project_type_params_for_slave(
            project_type_params)
        post_data = {
            'project_type_params': slave_project_type_params,
        }
        self._network.post_with_digest(setup_url, post_data, Secret.get())
        self.current_build_id = build_id

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if self.is_alive():
            teardown_url = self._slave_api.url('build', self.current_build_id,
                                               'teardown')
            self._network.post(teardown_url)
        else:
            self._logger.notice(
                'Teardown request to slave {} was not sent since slave is disconnected.',
                self.url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise RuntimeError(
                'Tried to start a subjob on a dead slave! ({}, id: {})'.format(
                    self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob, )).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {
            'subjob_artifact_dir': subjob.artifact_dir(),
            'atomic_commands': subjob.atomic_commands(),
        }
        response = self._network.post_with_digest(execution_url,
                                                  post_data,
                                                  Secret.get(),
                                                  error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception(
                'Cannot claim executor on slave {}. No executors left.'.format(
                    self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception(
                'Cannot free executor on slave {}. All are free.'.format(
                    self.url))
        return new_count

    def is_alive(self, use_cached=True):
        """
        Is the slave API responsive?

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :type use_cached: bool
        :rtype: bool
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url())

            if not response.ok:
                self._is_alive = False
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data[
                        'slave']:
                    self._logger.warning(
                        '{}\'s API is missing key slave[\'is_alive\'].',
                        self.url)
                    self._is_alive = False
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning(
                        '{}\'s API key \'is_alive\' is not a boolean.',
                        self.url)
                    self._is_alive = False
                else:
                    self._is_alive = response_data['slave']['is_alive']

        except ConnectionError:
            self._logger.warning('Slave with url {} is offline.', self.url)
            self._is_alive = False

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value
class BaseFunctionalTestCase(TestCase):
    """
    This is the base class for all functional tests. This class has two main purposes:
        - Make available a `FunctionalTestCluster` object for use in functional tests (self.cluster)
        - Implement any helper assertion methods that might be useful for making our tests easier to read and write
    """
    def setUp(self):
        # Configure logging to go to stdout. This makes debugging easier by allowing us to see logs for failed tests.
        log.configure_logging('DEBUG')

        Secret.set('testsecret')

        self.cluster = FunctionalTestCluster(verbose=self._get_test_verbosity())
        self._network = Network()

    def _create_test_config_file(self, conf_values_to_set=None):
        """
        Create a temporary conf file just for this test.

        :return: The path to the conf file
        :rtype: str
        """
        # Copy default conf file to tmp location
        repo_dir = path.dirname(path.dirname(path.dirname(path.dirname(path.realpath(__file__)))))
        self._conf_template_path = path.join(repo_dir, 'conf', 'default_clusterrunner.conf')
        test_conf_file_path = tempfile.NamedTemporaryFile().name
        shutil.copy(self._conf_template_path, test_conf_file_path)
        os.chmod(test_conf_file_path, ConfigFile.CONFIG_FILE_MODE)
        conf_file = ConfigFile(test_conf_file_path)

        # Set custom conf file values for this test
        conf_values_to_set = conf_values_to_set or {}
        for conf_key, conf_value in conf_values_to_set.items():
            conf_file.write_value(conf_key, conf_value, BASE_CONFIG_FILE_SECTION)

        return test_conf_file_path

    def tearDown(self):
        # Give the cluster a bit of extra time to finish working (before forcefully killing it and failing the test)
        with suppress(TestClusterTimeoutError):
            self.cluster.block_until_build_queue_empty(timeout=5)

        # Kill processes and make sure all processes exited with 0 exit code
        services = self.cluster.kill()

        # only check the exit code if not on Windows as Popen.terminate kills the process on Windows and the exit
        # code is not zero.
        # TODO: remove the is_windows() check after we can handle exit on Windows gracefully.
        if not is_windows():
            for service in services:
                self.assertEqual(
                    service.return_code,
                    0,
                    'Service running on url: {} should exit with code 0, but exited with code {}.'.format(
                        service.url,
                        service.return_code,
                    ),
                )
        # Remove the temp dir. This will delete the log files, so should be run after cluster shuts down.
        self.cluster.master_app_base_dir.cleanup()
        [slave_app_base_dir.cleanup() for slave_app_base_dir in self.cluster.slaves_app_base_dirs]

    def _get_test_verbosity(self):
        """
        Get test verbosity from an env variable. We need to use an env var since Nose does not support specifying
        command-line test configuration natively. (But if we need more of these configuration paramaters, we should
        instead look at the 'nose-testconfig' plugin instead of adding tons of environment variables.)

        :return: Whether or not tests should be run verbosely
        :rtype: bool
        """
        is_verbose = os.getenv('CR_VERBOSE') not in ('0', '', None)  # default value of is_verbose is False
        return is_verbose

    def assert_build_status_contains_expected_data(self, build_id, expected_data):
        """
        Assert that the build status endpoint contains the expected fields and values. This assertion does an API
        request to the master service of self.cluster.

        :param build_id: The id of the build whose status to check
        :type build_id: int
        :param expected_data: A dict of expected keys and values in the build status response
        :type expected_data: dict
        """
        build_status = self.cluster.master_api_client.get_build_status(build_id).get('build')
        self.assertIsInstance(build_status, dict, 'Build status API request should return a dict.')
        self.assertDictContainsSubset(expected_data, build_status,
                                      'Build status API response should contain the expected status data.')

    def assert_build_has_successful_status(self, build_id):
        """
        Assert that the build status endpoint contains fields signifying the build was successful (had no failures).
        This assertion does an API request to the master service of self.cluster.

        :param build_id: The id of the build whose status to check
        :type build_id: int
        """
        expected_successful_build_params = {
            'result': 'NO_FAILURES',
            'status': 'FINISHED',
        }
        self.assert_build_status_contains_expected_data(build_id, expected_successful_build_params)

    def assert_build_has_failure_status(self, build_id):
        """
        Assert that the build status endpoint contains fields signifying the build was failed. This assertion does an
        API request to the master service of self.cluster.

        :param build_id: The id of the build whose status to check
        :type build_id: int
        """
        expected_failure_build_params = {
            'result': 'FAILURE',
            'status': 'FINISHED',
        }
        self.assert_build_status_contains_expected_data(build_id, expected_failure_build_params)

    def assert_build_has_canceled_status(self, build_id):
        """
        Assert that the build status endpoint contains fields signifying the build was failed. This assertion does an
        API request to the master service of self.cluster.

        :param build_id: The id of the build whose status to check
        :type build_id: int
        """
        expected_failure_build_params = {
            'result': 'FAILURE',
            'status': 'CANCELED',
            }
        self.assert_build_status_contains_expected_data(build_id, expected_failure_build_params)

    def assert_build_artifact_contents_match_expected(self, master_api, build_id, expected_build_artifact_contents):
        """
        Assert that artifact files for this build have the expected contents.

        :type master_api: app.util.url_builder.UrlBuilder
        :param build_id: The id of the build whose artifacts to check
        :type build_id: int
        :param expected_build_artifact_contents: A list of FSItems corresponding to the expected artifact dir contents
        :type expected_build_artifact_contents: list[FSItem]
        """
        with tempfile.TemporaryDirectory() as build_artifacts_dir_path:
            self._download_and_extract_results(master_api, build_id, build_artifacts_dir_path)
            self.assert_directory_contents_match_expected(build_artifacts_dir_path, expected_build_artifact_contents)

    def assert_directory_contents_match_expected(self, dir_path, expected_dir_contents):
        """
        Assert that the specified directory has the expected contents.

        :param dir_path: The path of the directory whose artifacts to check
        :type dir_path: string
        :param expected_dir_contents: A list of FSItems corresponding to the expected directory contents
        :type expected_dir_contents: list[FSItem]
        """
        if expected_dir_contents is not None:
            dir_path = os.path.abspath(dir_path)  # converts path to absolute, removes trailing slash if present
            expected_dir_name = os.path.basename(dir_path)
            expected_build_artifacts = Directory(expected_dir_name, expected_dir_contents)
            expected_build_artifacts.assert_matches_path(dir_path, allow_extra_items=False)

    def _download_and_extract_results(self, master_api, build_id, download_dir):
        """
        :type master_api: app.util.url_builder.UrlBuilder
        :type build_id: int
        :type download_dir: str
        """
        download_artifacts_url = master_api.url('build', build_id, 'result')
        download_filepath = os.path.join(download_dir, BuildArtifact.ARTIFACT_FILE_NAME)
        response = self._network.get(download_artifacts_url)

        if response.status_code == http.client.OK:
            # save tar file to disk, decompress, and delete
            with open(download_filepath, 'wb') as file:
                chunk_size = 500 * 1024
                for chunk in response.iter_content(chunk_size):
                    file.write(chunk)

            extract_tar(download_filepath, delete=True)
class BaseFunctionalTestCase(TestCase):
    """
    This is the base class for all functional tests. This class has two main purposes:
        - Make available a `FunctionalTestCluster` object for use in functional tests (self.cluster)
        - Implement any helper assertion methods that might be useful for making our tests easier to read and write
    """
    def setUp(self):
        # Configure logging to go to stdout. This makes debugging easier by allowing us to see logs for failed tests.
        log.configure_logging('DEBUG')

        Secret.set('testsecret')

        self.cluster = FunctionalTestCluster(
            verbose=self._get_test_verbosity())
        self._network = Network()

    def _create_test_config_file(self, conf_values_to_set=None):
        """
        Create a temporary conf file just for this test.

        :return: The path to the conf file
        :rtype: str
        """
        # Copy default conf file to tmp location
        repo_dir = path.dirname(
            path.dirname(path.dirname(path.dirname(path.realpath(__file__)))))
        self._conf_template_path = path.join(repo_dir, 'conf',
                                             'default_clusterrunner.conf')
        test_conf_file_path = tempfile.NamedTemporaryFile().name
        shutil.copy(self._conf_template_path, test_conf_file_path)
        os.chmod(test_conf_file_path, ConfigFile.CONFIG_FILE_MODE)
        conf_file = ConfigFile(test_conf_file_path)

        # Set custom conf file values for this test
        conf_values_to_set = conf_values_to_set or {}
        for conf_key, conf_value in conf_values_to_set.items():
            conf_file.write_value(conf_key, conf_value,
                                  BASE_CONFIG_FILE_SECTION)

        return test_conf_file_path

    def tearDown(self):
        # Give the cluster a bit of extra time to finish working (before forcefully killing it and failing the test)
        with suppress(TestClusterTimeoutError):
            self.cluster.block_until_build_queue_empty(timeout=5)

        # Kill processes and make sure all processes exited with 0 exit code
        services = self.cluster.kill()

        # only check the exit code if not on Windows as Popen.terminate kills the process on Windows and the exit
        # code is not zero.
        # TODO: remove the is_windows() check after we can handle exit on Windows gracefully.
        if not is_windows():
            for service in services:
                self.assertEqual(
                    service.return_code,
                    0,
                    'Service running on url: {} should exit with code 0, but exited with code {}.'
                    .format(
                        service.url,
                        service.return_code,
                    ),
                )
        # Remove the temp dir. This will delete the log files, so should be run after cluster shuts down.
        self.cluster.master_app_base_dir.cleanup()
        [
            slave_app_base_dir.cleanup()
            for slave_app_base_dir in self.cluster.slaves_app_base_dirs
        ]

    def _get_test_verbosity(self):
        """
        Get test verbosity from an env variable. We need to use an env var since Nose does not support specifying
        command-line test configuration natively. (But if we need more of these configuration paramaters, we should
        instead look at the 'nose-testconfig' plugin instead of adding tons of environment variables.)

        :return: Whether or not tests should be run verbosely
        :rtype: bool
        """
        is_verbose = os.getenv('CR_VERBOSE') not in (
            '0', '', None)  # default value of is_verbose is False
        return is_verbose

    def assert_build_status_contains_expected_data(self, build_id,
                                                   expected_data):
        """
        Assert that the build status endpoint contains the expected fields and values. This assertion does an API
        request to the master service of self.cluster.

        :param build_id: The id of the build whose status to check
        :type build_id: int
        :param expected_data: A dict of expected keys and values in the build status response
        :type expected_data: dict
        """
        build_status = self.cluster.master_api_client.get_build_status(
            build_id).get('build')
        self.assertIsInstance(
            build_status, dict,
            'Build status API request should return a dict.')
        self.assertDictContainsSubset(
            expected_data, build_status,
            'Build status API response should contain the expected status data.'
        )

    def assert_build_has_successful_status(self, build_id):
        """
        Assert that the build status endpoint contains fields signifying the build was successful (had no failures).
        This assertion does an API request to the master service of self.cluster.

        :param build_id: The id of the build whose status to check
        :type build_id: int
        """
        expected_successful_build_params = {
            'result': 'NO_FAILURES',
            'status': 'FINISHED',
        }
        self.assert_build_status_contains_expected_data(
            build_id, expected_successful_build_params)

    def assert_build_has_failure_status(self, build_id):
        """
        Assert that the build status endpoint contains fields signifying the build was failed. This assertion does an
        API request to the master service of self.cluster.

        :param build_id: The id of the build whose status to check
        :type build_id: int
        """
        expected_failure_build_params = {
            'result': 'FAILURE',
            'status': 'FINISHED',
        }
        self.assert_build_status_contains_expected_data(
            build_id, expected_failure_build_params)

    def assert_build_has_canceled_status(self, build_id):
        """
        Assert that the build status endpoint contains fields signifying the build was failed. This assertion does an
        API request to the master service of self.cluster.

        :param build_id: The id of the build whose status to check
        :type build_id: int
        """
        expected_failure_build_params = {
            'result': 'FAILURE',
            'status': 'CANCELED',
        }
        self.assert_build_status_contains_expected_data(
            build_id, expected_failure_build_params)

    def assert_build_artifact_contents_match_expected(
            self, master_api, build_id, expected_build_artifact_contents):
        """
        Assert that artifact files for this build have the expected contents.

        :type master_api: app.util.url_builder.UrlBuilder
        :param build_id: The id of the build whose artifacts to check
        :type build_id: int
        :param expected_build_artifact_contents: A list of FSItems corresponding to the expected artifact dir contents
        :type expected_build_artifact_contents: list[FSItem]
        """
        with tempfile.TemporaryDirectory() as build_artifacts_dir_path:
            self._download_and_extract_results(master_api, build_id,
                                               build_artifacts_dir_path)
            self.assert_directory_contents_match_expected(
                build_artifacts_dir_path, expected_build_artifact_contents)

    def assert_directory_contents_match_expected(self, dir_path,
                                                 expected_dir_contents):
        """
        Assert that the specified directory has the expected contents.

        :param dir_path: The path of the directory whose artifacts to check
        :type dir_path: string
        :param expected_dir_contents: A list of FSItems corresponding to the expected directory contents
        :type expected_dir_contents: list[FSItem]
        """
        if expected_dir_contents is not None:
            dir_path = os.path.abspath(
                dir_path
            )  # converts path to absolute, removes trailing slash if present
            expected_dir_name = os.path.basename(dir_path)
            expected_build_artifacts = Directory(expected_dir_name,
                                                 expected_dir_contents)
            expected_build_artifacts.assert_matches_path(
                dir_path, allow_extra_items=False)

    def _download_and_extract_results(self, master_api, build_id,
                                      download_dir):
        """
        :type master_api: app.util.url_builder.UrlBuilder
        :type build_id: int
        :type download_dir: str
        """
        download_artifacts_url = master_api.url('build', build_id, 'result')
        download_filepath = os.path.join(download_dir,
                                         BuildArtifact.ARTIFACT_FILE_NAME)
        response = self._network.get(download_artifacts_url)

        if response.status_code == http.client.OK:
            # save tar file to disk, decompress, and delete
            with open(download_filepath, 'wb') as file:
                chunk_size = 500 * 1024
                for chunk in response.iter_content(chunk_size):
                    file.write(chunk)

            extract_tar(download_filepath, delete=True)