    def __init__(self):
        self._logger = get_logger(__name__)
        self._master_results_path = Configuration['results_directory']
        self._slave_registry = SlaveRegistry.singleton()
        self._scheduler_pool = BuildSchedulerPool()
        self._build_request_handler = BuildRequestHandler(self._scheduler_pool)
        self._build_request_handler.start()
        self._slave_allocator = SlaveAllocator(self._scheduler_pool)
        self._slave_allocator.start()

        # A common guideline for sizing a thread pool is the number of threads
        # per core multiplied by the number of physical cores. For example,
        # with 10 cores per socket, 2 sockets, and 2 threads per core, the max
        # would be 40.
        #
        # Currently we use threads for incrementing/decrementing slave executor
        # counts (lock acquisition) and tearing down the slave (network IO). 32 threads should be
        # plenty for these tasks. Under heavy load, the bottleneck will be the number
        # of executors, not the time it takes to lock/unlock the executor counts or the number of
        # teardown requests. Tweak the number to find the sweet spot if you suspect this is the case.
        self._thread_pool_executor = ThreadPoolExecutor(max_workers=32)

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops
        if os.path.exists(self._master_results_path):
            fs.async_delete(self._master_results_path)
        fs.create_dir(self._master_results_path)

        # Configure heartbeat tracking
        self._unresponsive_slaves_cleanup_interval = Configuration[
            'unresponsive_slaves_cleanup_interval']
        self._hb_scheduler = sched.scheduler()

        SlavesCollector.register_slaves_metrics_collector(
            lambda: self._slave_registry.get_all_slaves_by_id().values())
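The heartbeat pieces in this version build on the stdlib sched module's self-rescheduling pattern: the cleanup function re-enters itself after each run, so the check repeats at a fixed interval. A minimal, stdlib-only sketch of that pattern (the interval and the check body are stand-ins, not the project's values):

import sched

CHECK_INTERVAL = 10  # seconds; stands in for unresponsive_slaves_cleanup_interval

hb_scheduler = sched.scheduler()

def disconnect_non_heartbeating_slaves():
    # ... inspect each slave's last heartbeat time here ...
    # Re-enter this function so the check repeats every CHECK_INTERVAL seconds.
    hb_scheduler.enter(CHECK_INTERVAL, 0, disconnect_non_heartbeating_slaves)

# Schedule the first check immediately; run() then drives the loop. In the
# real class, run() is invoked on a daemon thread so it never blocks startup.
hb_scheduler.enter(0, 0, disconnect_non_heartbeating_slaves)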
    def _create_slave_allocator(self, **kwargs):
        """
        Create a slave allocator for testing.
        :param kwargs: Any constructor parameters for the slave; if none are specified, test defaults will be used.
        :rtype: SlaveAllocator
        """
        return SlaveAllocator(Mock(spec_set=BuildSchedulerPool))
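A hedged sketch of how a test case might use this helper; the import paths are assumptions based on the class names above, not necessarily the project's real layout:

from unittest import TestCase
from unittest.mock import Mock

from app.master.build_scheduler_pool import BuildSchedulerPool  # assumed path
from app.master.slave_allocator import SlaveAllocator  # assumed path

class TestSlaveAllocator(TestCase):
    def _create_slave_allocator(self, **kwargs):
        # Same shape as the helper above: a real allocator wired to a mocked
        # scheduler pool, so no real scheduling happens during the test.
        return SlaveAllocator(Mock(spec_set=BuildSchedulerPool))

    def test_allocator_is_constructed_in_isolation(self):
        slave_allocator = self._create_slave_allocator()
        self.assertIsNotNone(slave_allocator)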
    def __init__(self):
        self._logger = get_logger(__name__)
        self._master_results_path = Configuration['results_directory']
        self._all_slaves_by_url = {}
        self._all_builds_by_id = OrderedDict()
        self._build_request_handler = BuildRequestHandler()
        self._build_request_handler.start()
        self._slave_allocator = SlaveAllocator(self._build_request_handler)
        self._slave_allocator.start()

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops
        if os.path.exists(self._master_results_path):
            fs.async_delete(self._master_results_path)

        fs.create_dir(self._master_results_path)
    def _create_slave_allocator(self, **kwargs):
        """
        Create a slave allocator for testing.
        :param kwargs: Any constructor parameters for the slave; if none are specified, test defaults will be used.
        :rtype: SlaveAllocator
        """
        kwargs.setdefault('build_request_handler', Mock())
        return SlaveAllocator(**kwargs)
    def __init__(self):
        self._logger = get_logger(__name__)
        self._master_results_path = Configuration['results_directory']
        self._all_slaves_by_url = {}
        self._scheduler_pool = BuildSchedulerPool()
        self._build_request_handler = BuildRequestHandler(self._scheduler_pool)
        self._build_request_handler.start()
        self._slave_allocator = SlaveAllocator(self._scheduler_pool)
        self._slave_allocator.start()

        # Initialize the database connection before we initialize a BuildStore
        Connection.create(Configuration['database_url'])
        UnhandledExceptionHandler.singleton().add_teardown_callback(
            BuildStore.clean_up)

        # A common guideline for sizing a thread pool is the number of threads
        # per core multiplied by the number of physical cores. For example,
        # with 10 cores per socket, 2 sockets, and 2 threads per core, the max
        # would be 40.
        #
        # Currently we use threads for incrementing/decrementing slave executor
        # counts (lock acquisition) and tearing down the slave (network IO). 32 threads should be
        # plenty for these tasks. Under heavy load, the bottleneck will be the number
        # of executors, not the time it takes to lock/unlock the executor counts or the number of
        # teardown requests. Tweak the number to find the sweet spot if you suspect this is the case.
        self._thread_pool_executor = ThreadPoolExecutor(max_workers=32)

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops
        # TODO: We can remove this code since we persist builds across master restarts
        # if os.path.exists(self._master_results_path):
        #     fs.async_delete(self._master_results_path)
        # fs.create_dir(self._master_results_path)

        SlavesCollector.register_slaves_metrics_collector(
            lambda: self.all_slaves_by_id().values())
    def __init__(self):
        self._logger = get_logger(__name__)
        self._master_results_path = Configuration["results_directory"]
        self._all_slaves_by_url = {}
        self._all_builds_by_id = OrderedDict()
        self._build_request_handler = BuildRequestHandler()
        self._build_request_handler.start()
        self._slave_allocator = SlaveAllocator(self._build_request_handler)
        self._slave_allocator.start()

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops
        if os.path.exists(self._master_results_path):
            fs.async_delete(self._master_results_path)

        fs.create_dir(self._master_results_path)
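The "asynchronously delete (but immediately rename)" comment describes a common trick: move the directory aside synchronously so the path is free right away, then do the slow recursive delete on a background thread. A stdlib-only sketch of what an fs.async_delete along those lines could look like (the project's real helper may differ in detail):

import os
import shutil
import tempfile
from threading import Thread

def async_delete(path):
    # Create the scratch dir next to the target so os.rename stays on the
    # same filesystem (renaming across filesystems raises OSError).
    doomed_path = tempfile.mkdtemp(prefix='deleting_', dir=os.path.dirname(os.path.abspath(path)))
    os.rename(path, os.path.join(doomed_path, os.path.basename(path)))
    # The original path is now free to recreate; reclaim disk space off-thread.
    Thread(target=shutil.rmtree, args=(doomed_path,), daemon=True).start()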
class ClusterMaster(object):
    """
    The ClusterRunner Master service: This is the main application class that the web framework/REST API sits on top of.
    """

    API_VERSION = 'v1'

    def __init__(self):
        self._logger = get_logger(__name__)
        self._master_results_path = Configuration['results_directory']
        self._all_slaves_by_url = {}
        self._all_builds_by_id = OrderedDict()
        self._build_request_handler = BuildRequestHandler()
        self._build_request_handler.start()
        self._slave_allocator = SlaveAllocator(self._build_request_handler)
        self._slave_allocator.start()

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops
        if os.path.exists(self._master_results_path):
            fs.async_delete(self._master_results_path)

        fs.create_dir(self._master_results_path)

    def _get_status(self):
        """
        Returns a simple message confirming that the master service is up.

        :rtype: str
        """
        return 'Master service is up.'

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        slaves_representation = [
            slave.api_representation()
            for slave in self.all_slaves_by_id().values()
        ]
        return {
            'status': self._get_status(),
            'slaves': slaves_representation,
        }

    def builds(self):
        """
        Returns a list of all builds
        :rtype: list[Build]
        """
        return self._all_builds_by_id.values()

    def active_builds(self):
        """
        Returns a list of incomplete builds
        :rtype: list[Build]
        """
        return [build for build in self.builds() if not build.is_finished]

    def all_slaves_by_id(self):
        """
        Retrieve all connected slaves
        :rtype: dict [int, Slave]
        """
        slaves_by_slave_id = {}
        for slave in self._all_slaves_by_url.values():
            slaves_by_slave_id[slave.id] = slave
        return slaves_by_slave_id

    def get_slave(self, slave_id=None, slave_url=None):
        """
        Get the instance of given slave by either the slave's id or url. Only one of slave_id or slave_url should be
        specified.

        :param slave_id: The id of the slave to return
        :type slave_id: int
        :param slave_url: The url of the slave to return
        :type slave_url: str
        :return: The instance of the slave
        :rtype: Slave
        """
        if (slave_id is None) == (slave_url is None):
            raise ValueError(
                'Only one of slave_id or slave_url should be specified to get_slave().'
            )

        if slave_id is not None:
            for slave in self._all_slaves_by_url.values():
                if slave.id == slave_id:
                    return slave
        else:
            if slave_url in self._all_slaves_by_url:
                return self._all_slaves_by_url[slave_url]

        raise ItemNotFoundError(
            'Requested slave ({}) does not exist.'.format(slave_id))

    def connect_slave(self, slave_url, num_executors):
        """
        Connect a slave to this master.

        :type slave_url: str
        :type num_executors: int
        :return: The response with the slave id of the slave.
        :rtype: dict[str, str]
        """
        # If a slave had previously been connected, and is now being reconnected, the cleanest way to resolve this
        # bookkeeping is for the master to forget about the previous slave instance and start with a fresh instance.
        if slave_url in self._all_slaves_by_url:
            self._logger.warning(
                'Slave on {} requested to connect to master, even though previously connected. '
                +
                'Removing existing slave instance from the master\'s bookkeeping.',
                slave_url)
            old_slave = self._all_slaves_by_url.get(slave_url)

            # If a slave has requested to reconnect, we have to assume that whatever build the dead slave was
            # working on no longer has valid results.
            if old_slave.current_build_id is not None:
                self._logger.info(
                    '{} has build [{}] running on it. Attempting to cancel build.',
                    slave_url, old_slave.current_build_id)
                try:
                    build = self.get_build(old_slave.current_build_id)
                    build.cancel()
                    self._logger.info(
                        'Cancelled build {} due to dead slave {}',
                        old_slave.current_build_id, slave_url)
                except ItemNotFoundError:
                    self._logger.info(
                        'Failed to find build {} that was running on {}',
                        old_slave.current_build_id, slave_url)

        slave = Slave(slave_url, num_executors)
        self._all_slaves_by_url[slave_url] = slave
        self._slave_allocator.add_idle_slave(slave)
        self._logger.info(
            'Slave on {} connected to master with {} executors. (id: {})',
            slave_url, num_executors, slave.id)
        return {'slave_id': str(slave.id)}

    def handle_slave_state_update(self, slave, new_slave_state):
        """
        Execute logic to transition the specified slave to the given state.

        :type slave: Slave
        :type new_slave_state: SlaveState
        """
        slave_transition_functions = {
            SlaveState.DISCONNECTED: self._disconnect_slave,
            SlaveState.SHUTDOWN: self._graceful_shutdown_slave,
            SlaveState.IDLE: self._slave_allocator.add_idle_slave,
            SlaveState.SETUP_COMPLETED: self._handle_setup_success_on_slave,
            SlaveState.SETUP_FAILED: self._handle_setup_failure_on_slave,
        }

        if new_slave_state not in slave_transition_functions:
            raise BadRequestError(
                'Invalid slave state "{}". Valid states are: {}.'.format(
                    new_slave_state,
                    ', '.join(slave_transition_functions.keys())))

        do_transition = slave_transition_functions.get(new_slave_state)
        do_transition(slave)

    def set_shutdown_mode_on_slaves(self, slave_ids):
        """
        :type slave_ids: list[int]
        """
        # Find all the slaves first so if an invalid slave_id is specified, we 404 before shutting any of them down.
        slaves = [self.get_slave(slave_id) for slave_id in slave_ids]
        for slave in slaves:
            self.handle_slave_state_update(slave, SlaveState.SHUTDOWN)

    def _graceful_shutdown_slave(self, slave):
        """
        Puts slave in shutdown mode so it cannot receive new builds. The slave will be killed when finished with any
        running builds.
        :type slave: Slave
        """
        slave.set_shutdown_mode()
        self._logger.info('Slave on {} was put in shutdown mode. (id: {})',
                          slave.url, slave.id)

    def _disconnect_slave(self, slave):
        """
        Mark a slave dead.

        :type slave: Slave
        """
        # Mark slave dead. We do not remove it from the list of all slaves. We also do not remove it from idle_slaves;
        # that will happen during slave allocation.
        slave.mark_dead()
        # todo: Fail/resend any currently executing subjobs still executing on this slave.
        self._logger.info('Slave on {} was disconnected. (id: {})', slave.url,
                          slave.id)

    def _handle_setup_success_on_slave(self, slave):
        """
        Respond to successful build setup on a slave. This starts subjob executions on the slave. This should be called
        once after the specified slave has already run build_setup commands for the specified build.

        :type slave: Slave
        """
        build = self.get_build(slave.current_build_id)
        build.begin_subjob_executions_on_slave(slave)

    def _handle_setup_failure_on_slave(self, slave):
        """
        Respond to failed build setup on a slave. This should put the slave back into a usable state.

        :type slave: Slave
        """
        raise BadRequestError(
            'Setup failure handling on the master is not yet implemented.')

    def handle_request_for_new_build(self, build_params):
        """
        Creates a new Build object and adds it to the request queue to be processed.

        :param build_params:
        :type build_params: dict[str, str]
        :rtype: tuple [bool, dict [str, str]]
        """
        build_request = BuildRequest(build_params)
        success = False

        if build_request.is_valid():
            build = Build(build_request)
            self._all_builds_by_id[build.build_id()] = build
            build.generate_project_type()
            self._build_request_handler.handle_build_request(build)
            response = {'build_id': build.build_id()}
            success = True
        elif not build_request.is_valid_type():
            response = {'error': 'Invalid build request type.'}
        else:
            required_params = build_request.required_parameters()
            response = {
                'error':
                'Missing required parameter. Required parameters: {}'.format(
                    required_params)
            }

        return success, response  # todo: refactor to use exception instead of boolean

    def handle_request_to_update_build(self, build_id, update_params):
        """
        Updates the state of a build with the values passed in.  Used for cancelling running builds.

        :type build_id: int
        :param update_params: The fields that should be updated and their new values
        :type update_params: dict [str, str]
        :return: The success/failure and the response we want to send to the requestor
        :rtype: tuple [bool, dict [str, str]]
        """
        build = self._all_builds_by_id.get(int(build_id))
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        success, response = build.validate_update_params(update_params)
        if not success:
            return success, response
        return build.update_state(update_params), {}

    def handle_result_reported_from_slave(self,
                                          slave_url,
                                          build_id,
                                          subjob_id,
                                          payload=None):
        """
        Process the result and dispatch the next subjob
        :type slave_url: str
        :type build_id: int
        :type subjob_id: int
        :type payload: dict
        :rtype: str
        """
        self._logger.info(
            'Results received from {} for subjob. (Build {}, Subjob {})',
            slave_url, build_id, subjob_id)
        build = self._all_builds_by_id[int(build_id)]
        slave = self._all_slaves_by_url[slave_url]
        # If the build has been canceled, don't work on the next subjob.
        if not build.is_finished:
            try:
                build.complete_subjob(subjob_id, payload)
            finally:
                build.execute_next_subjob_or_teardown_slave(slave)

    def get_build(self, build_id):
        """
        Returns a build by id
        :param build_id: The id for the build whose status we are getting
        :type build_id: int
        :rtype: Build
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        return build

    def get_path_for_build_results_archive(self, build_id):
        """
        Given a build id, get the absolute file path for the archive file containing the build results.

        :param build_id: The build id for which to retrieve the artifacts archive file
        :type build_id: int
        :return: The path to the archived results file
        :rtype: str
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        archive_file = build.artifacts_archive_file
        if archive_file is None:
            raise ItemNotReadyError(
                'Build artifact file is not yet ready. Try again later.')

        return archive_file
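handle_slave_state_update above is a small dispatch-table state machine: a dict maps each SlaveState to its transition handler, and unknown states are rejected before any transition runs. The same shape reduced to a runnable toy (the enum values and handlers here are stand-ins, not the project's own):

from enum import Enum

class State(Enum):
    DISCONNECTED = 'DISCONNECTED'
    IDLE = 'IDLE'

def handle_state_update(slave, new_state):
    # Map each state to the function that performs that transition.
    transitions = {
        State.DISCONNECTED: lambda s: print('disconnecting', s),
        State.IDLE: lambda s: print('marking idle', s),
    }
    if new_state not in transitions:
        raise ValueError('Invalid slave state "{}".'.format(new_state))
    transitions[new_state](slave)

handle_state_update('slave-1', State.IDLE)  # prints: marking idle slave-1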
class ClusterMaster(ClusterService):
    """
    The ClusterRunner Master service: This is the main application class that the web framework/REST API sits on top of.
    """

    API_VERSION = 'v1'

    def __init__(self):
        self._logger = get_logger(__name__)
        self._master_results_path = Configuration['results_directory']
        self._slave_registry = SlaveRegistry.singleton()
        self._scheduler_pool = BuildSchedulerPool()
        self._build_request_handler = BuildRequestHandler(self._scheduler_pool)
        self._build_request_handler.start()
        self._slave_allocator = SlaveAllocator(self._scheduler_pool)
        self._slave_allocator.start()

        # A common guideline for sizing a thread pool is the number of threads
        # per core multiplied by the number of physical cores. For example,
        # with 10 cores per socket, 2 sockets, and 2 threads per core, the max
        # would be 40.
        #
        # Currently we use threads for incrementing/decrementing slave executor
        # counts (lock acquisition) and tearing down the slave (network IO). 32 threads should be
        # plenty for these tasks. Under heavy load, the bottleneck will be the number
        # of executors, not the time it takes to lock/unlock the executor counts or the number of
        # teardown requests. Tweak the number to find the sweet spot if you suspect this is the case.
        self._thread_pool_executor = ThreadPoolExecutor(max_workers=32)

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops
        if os.path.exists(self._master_results_path):
            fs.async_delete(self._master_results_path)
        fs.create_dir(self._master_results_path)

        # Configure heartbeat tracking
        self._unresponsive_slaves_cleanup_interval = Configuration[
            'unresponsive_slaves_cleanup_interval']
        self._hb_scheduler = sched.scheduler()

        SlavesCollector.register_slaves_metrics_collector(
            lambda: self._slave_registry.get_all_slaves_by_id().values())

    def start_heartbeat_tracker_thread(self):
        self._logger.info('Heartbeat tracker will run every {} seconds'.format(
            self._unresponsive_slaves_cleanup_interval))
        Thread(target=self._start_heartbeat_tracker,
               name='HeartbeatTrackerThread',
               daemon=True).start()

    def _start_heartbeat_tracker(self):
        self._hb_scheduler.enter(0, 0,
                                 self._disconnect_non_heartbeating_slaves)
        self._hb_scheduler.run()

    def _disconnect_non_heartbeating_slaves(self):
        slaves_to_disconnect = [
            slave
            for slave in self._slave_registry.get_all_slaves_by_url().values()
            if slave.is_alive() and not self._is_slave_responsive(slave)
        ]

        for slave in slaves_to_disconnect:
            self._disconnect_slave(slave)
            self._logger.error(
                'Slave {} marked offline as it is not sending heartbeats.'.
                format(slave.id))

        self._hb_scheduler.enter(self._unresponsive_slaves_cleanup_interval, 0,
                                 self._disconnect_non_heartbeating_slaves)

    def _is_slave_responsive(self, slave: ClusterSlave) -> bool:
        # Use total_seconds() rather than .seconds, which returns only the
        # seconds component of the timedelta and wraps around after one day.
        time_since_last_heartbeat = (datetime.now() -
                                     slave.get_last_heartbeat_time()).total_seconds()
        return time_since_last_heartbeat < self._unresponsive_slaves_cleanup_interval

    def _get_status(self):
        """
        Returns a simple message confirming that the master service is up.

        :rtype: str
        """
        return 'Master service is up.'

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        slaves_representation = [
            slave.api_representation()
            for slave in self._slave_registry.get_all_slaves_by_id().values()
        ]
        return {
            'status': self._get_status(),
            'slaves': slaves_representation,
        }

    def get_builds(self,
                   offset: int = None,
                   limit: int = None) -> List['Build']:
        """
        Returns a list of all builds.
        :param offset: The starting index of the requested build
        :param limit: The number of builds requested
        """
        num_builds = BuildStore.size()
        start, end = get_paginated_indices(offset, limit, num_builds)
        return BuildStore.get_range(start, end)

    def active_builds(self):
        """
        Returns a list of incomplete builds
        :rtype: list[Build]
        """
        return [build for build in self.get_builds() if not build.is_finished]

    def connect_slave(self, slave_url, num_executors, slave_session_id=None):
        """
        Connect a slave to this master.

        :type slave_url: str
        :type num_executors: int
        :type slave_session_id: str | None
        :return: The response with the slave id of the slave.
        :rtype: dict[str, str]
        """
        # todo: Validate arg types for this and other methods called via API.
        # If a slave had previously been connected, and is now being reconnected, the cleanest way to resolve this
        # bookkeeping is for the master to forget about the previous slave instance and start with a fresh instance.
        try:
            old_slave = self._slave_registry.get_slave(slave_url=slave_url)
        except ItemNotFoundError:
            pass
        else:
            self._logger.warning(
                'Slave requested to connect to master, even though previously connected as {}. '
                +
                'Removing existing slave instance from the master\'s bookkeeping.',
                old_slave)
            # If a slave has requested to reconnect, we have to assume that whatever build the dead slave was
            # working on no longer has valid results.
            if old_slave.current_build_id is not None:
                self._logger.info(
                    '{} has build [{}] running on it. Attempting to cancel build.',
                    old_slave, old_slave.current_build_id)
                try:
                    build = self.get_build(old_slave.current_build_id)
                    build.cancel()
                    self._logger.info(
                        'Cancelled build {} due to dead slave {}',
                        old_slave.current_build_id, old_slave)
                except ItemNotFoundError:
                    self._logger.info(
                        'Failed to find build {} that was running on {}',
                        old_slave.current_build_id, old_slave)

        slave = Slave(slave_url, num_executors, slave_session_id)
        self._slave_registry.add_slave(slave)
        self._slave_allocator.add_idle_slave(slave)
        self._logger.info(
            'Slave on {} connected to master with {} executors. (id: {})',
            slave_url, num_executors, slave.id)
        return {'slave_id': str(slave.id)}

    def handle_slave_state_update(self, slave, new_slave_state):
        """
        Execute logic to transition the specified slave to the given state.

        :type slave: Slave
        :type new_slave_state: SlaveState
        """
        slave_transition_functions = {
            SlaveState.DISCONNECTED: self._disconnect_slave,
            SlaveState.SHUTDOWN: self._graceful_shutdown_slave,
            SlaveState.IDLE: self._slave_allocator.add_idle_slave,
            SlaveState.SETUP_COMPLETED: self._handle_setup_success_on_slave,
            SlaveState.SETUP_FAILED: self._handle_setup_failure_on_slave,
        }

        if new_slave_state not in slave_transition_functions:
            raise BadRequestError(
                'Invalid slave state "{}". Valid states are: {}.'.format(
                    new_slave_state,
                    ', '.join(slave_transition_functions.keys())))

        do_transition = slave_transition_functions.get(new_slave_state)
        do_transition(slave)

    def update_slave_last_heartbeat_time(self, slave):
        slave.update_last_heartbeat_time()

    def set_shutdown_mode_on_slaves(self, slave_ids):
        """
        :type slave_ids: list[int]
        """
        # Find all the slaves first so if an invalid slave_id is specified, we 404 before shutting any of them down.
        slaves = [
            self._slave_registry.get_slave(slave_id=slave_id)
            for slave_id in slave_ids
        ]
        for slave in slaves:
            self.handle_slave_state_update(slave, SlaveState.SHUTDOWN)

    def _graceful_shutdown_slave(self, slave):
        """
        Puts slave in shutdown mode so it cannot receive new builds. The slave will be killed when finished with any
        running builds.
        :type slave: Slave
        """
        slave.set_shutdown_mode()
        self._logger.info('Slave on {} was put in shutdown mode. (id: {})',
                          slave.url, slave.id)

    def _disconnect_slave(self, slave):
        """
        Mark a slave dead.

        :type slave: Slave
        """
        # Mark slave dead. We do not remove it from the list of all slaves. We also do not remove it from idle_slaves;
        # that will happen during slave allocation.
        slave.mark_dead()
        # todo: Fail/resend any currently executing subjobs still executing on this slave.
        self._logger.info('Slave on {} was disconnected. (id: {})', slave.url,
                          slave.id)

    def _handle_setup_success_on_slave(self, slave: Slave):
        """
        Respond to successful build setup on a slave. This starts subjob executions on the slave. This should be called
        once after the specified slave has already run build_setup commands for the specified build.
        """
        build = self.get_build(slave.current_build_id)
        scheduler = self._scheduler_pool.get(build)
        self._thread_pool_executor.submit(
            scheduler.begin_subjob_executions_on_slave, slave=slave)

    def _handle_setup_failure_on_slave(self, slave):
        """
        Respond to failed build setup on a slave. This should put the slave back into a usable state.

        :type slave: Slave
        """
        build = self.get_build(slave.current_build_id)
        build.setup_failures += 1
        if build.setup_failures >= MAX_SETUP_FAILURES:
            build.cancel()
            build.mark_failed(
                'Setup failed on this build more than {} times. Failing the build.'
                .format(MAX_SETUP_FAILURES))
        slave.teardown()

    def handle_request_for_new_build(self, build_params):
        """
        Creates a new Build object and adds it to the request queue to be processed.

        :param build_params:
        :type build_params: dict[str, str]
        :rtype: tuple [bool, dict [str, str]]
        """
        build_request = BuildRequest(build_params)
        success = False

        if build_request.is_valid():
            build = Build(build_request)
            BuildStore.add(build)
            build.generate_project_type()  # WIP(joey): This should be internal to the Build object.
            self._build_request_handler.handle_build_request(build)
            response = {'build_id': build.build_id()}
            success = True
        elif not build_request.is_valid_type():
            response = {'error': 'Invalid build request type.'}
        else:
            required_params = build_request.required_parameters()
            response = {
                'error':
                'Missing required parameter. Required parameters: {}'.format(
                    required_params)
            }

        return success, response  # todo: refactor to use exception instead of boolean

    def handle_request_to_update_build(self, build_id, update_params):
        """
        Updates the state of a build with the values passed in.  Used for cancelling running builds.

        :type build_id: int
        :param update_params: The fields that should be updated and their new values
        :type update_params: dict [str, str]
        :return: The success/failure and the response we want to send to the requestor
        :rtype: tuple [bool, dict [str, str]]
        """
        build = BuildStore.get(int(build_id))
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        success, response = build.validate_update_params(update_params)
        if not success:
            return success, response
        return build.update_state(update_params), {}

    def handle_result_reported_from_slave(self,
                                          slave_url,
                                          build_id,
                                          subjob_id,
                                          payload=None):
        """
        Process the result and dispatch the next subjob
        :type slave_url: str
        :type build_id: int
        :type subjob_id: int
        :type payload: dict
        :rtype: str
        """
        self._logger.info(
            'Results received from {} for subjob. (Build {}, Subjob {})',
            slave_url, build_id, subjob_id)
        build = BuildStore.get(int(build_id))
        slave = self._slave_registry.get_slave(slave_url=slave_url)
        try:
            build.complete_subjob(subjob_id, payload)
        finally:
            scheduler = self._scheduler_pool.get(build)
            self._thread_pool_executor.submit(
                scheduler.execute_next_subjob_or_free_executor, slave=slave)

    def get_build(self, build_id):
        """
        Returns a build by id
        :param build_id: The id for the build whose status we are getting
        :type build_id: int
        :rtype: Build
        """
        build = BuildStore.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id: {}.'.format(build_id))

        return build

    def get_path_for_build_results_archive(self,
                                           build_id: int,
                                           is_tar_request: bool = False
                                           ) -> str:
        """
        Given a build id, get the absolute file path for the archive file containing the build results.

        :param build_id: The build id for which to retrieve the artifacts archive file
        :param is_tar_request: If true, download the tar.gz archive instead of a zip.
        :return: The path to the archived results file
        """
        build = BuildStore.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        archive_file = build.artifacts_tar_file if is_tar_request else build.artifacts_zip_file
        if archive_file is None:
            raise ItemNotReadyError(
                'Build artifact file is not yet ready. Try again later.')

        return archive_file
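get_builds in this version delegates the index arithmetic to a get_paginated_indices helper defined elsewhere in the project. Purely as an inference from the call site, it plausibly clamps (offset, offset + limit) to the valid range; a hypothetical sketch:

def get_paginated_indices(offset, limit, length):
    # Hypothetical reimplementation inferred from the call site above; the
    # real helper may apply project defaults for a missing offset or limit.
    offset = 0 if offset is None else max(0, offset)
    start = min(offset, length)
    end = length if limit is None else min(start + limit, length)
    return start, end

assert get_paginated_indices(2, 5, 4) == (2, 4)       # window clipped to the end
assert get_paginated_indices(None, None, 4) == (0, 4)  # defaults: everything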
class ClusterMaster(ClusterService):
    """
    The ClusterRunner Master service: This is the main application class that the web framework/REST API sits on top of.
    """

    API_VERSION = 'v1'

    def __init__(self):
        self._logger = get_logger(__name__)
        self._master_results_path = Configuration['results_directory']
        self._all_slaves_by_url = {}
        self._all_builds_by_id = OrderedDict()
        self._scheduler_pool = BuildSchedulerPool()
        self._build_request_handler = BuildRequestHandler(self._scheduler_pool)
        self._build_request_handler.start()
        self._slave_allocator = SlaveAllocator(self._build_request_handler)
        self._slave_allocator.start()

        # Asynchronously delete (but immediately rename) all old builds when master starts.
        # Remove this if/when build numbers are unique across master starts/stops
        if os.path.exists(self._master_results_path):
            fs.async_delete(self._master_results_path)

        fs.create_dir(self._master_results_path)

    def _get_status(self):
        """
        Returns a simple message confirming that the master service is up.

        :rtype: str
        """
        return 'Master service is up.'

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        slaves_representation = [slave.api_representation() for slave in self.all_slaves_by_id().values()]
        return {
            'status': self._get_status(),
            'slaves': slaves_representation,
        }

    def builds(self):
        """
        Returns a list of all builds
        :rtype: list[Build]
        """
        return self._all_builds_by_id.values()

    def active_builds(self):
        """
        Returns a list of incomplete builds
        :rtype: list[Build]
        """
        return [build for build in self.builds() if not build.is_finished]

    def all_slaves_by_id(self):
        """
        Retrieve all connected slaves
        :rtype: dict [int, Slave]
        """
        slaves_by_slave_id = {}
        for slave in self._all_slaves_by_url.values():
            slaves_by_slave_id[slave.id] = slave
        return slaves_by_slave_id

    def get_slave(self, slave_id=None, slave_url=None):
        """
        Get the instance of given slave by either the slave's id or url. Only one of slave_id or slave_url should be
        specified.

        :param slave_id: The id of the slave to return
        :type slave_id: int
        :param slave_url: The url of the slave to return
        :type slave_url: str
        :return: The instance of the slave
        :rtype: Slave
        """
        if (slave_id is None) == (slave_url is None):
            raise ValueError('Only one of slave_id or slave_url should be specified to get_slave().')

        if slave_id is not None:
            for slave in self._all_slaves_by_url.values():
                if slave.id == slave_id:
                    return slave
        else:
            if slave_url in self._all_slaves_by_url:
                return self._all_slaves_by_url[slave_url]

        raise ItemNotFoundError('Requested slave ({}) does not exist.'.format(slave_id))

    def connect_slave(self, slave_url, num_executors):
        """
        Connect a slave to this master.

        :type slave_url: str
        :type num_executors: int
        :return: The response with the slave id of the slave.
        :rtype: dict[str, str]
        """
        # If a slave had previously been connected, and is now being reconnected, the cleanest way to resolve this
        # bookkeeping is for the master to forget about the previous slave instance and start with a fresh instance.
        if slave_url in self._all_slaves_by_url:
            self._logger.warning('Slave on {} requested to connect to master, even though previously connected. ' +
                                 'Removing existing slave instance from the master\'s bookkeeping.', slave_url)
            old_slave = self._all_slaves_by_url.get(slave_url)

            # If a slave has requested to reconnect, we have to assume that whatever build the dead slave was
            # working on no longer has valid results.
            if old_slave.current_build_id is not None:
                self._logger.info('{} has build [{}] running on it. Attempting to cancel build.', slave_url,
                                  old_slave.current_build_id)
                try:
                    build = self.get_build(old_slave.current_build_id)
                    build.cancel()
                    self._logger.info('Cancelled build {} due to dead slave {}', old_slave.current_build_id,
                                      slave_url)
                except ItemNotFoundError:
                    self._logger.info('Failed to find build {} that was running on {}', old_slave.current_build_id,
                                      slave_url)

        slave = Slave(slave_url, num_executors)
        self._all_slaves_by_url[slave_url] = slave
        self._slave_allocator.add_idle_slave(slave)
        self._logger.info('Slave on {} connected to master with {} executors. (id: {})',
                          slave_url, num_executors, slave.id)
        return {'slave_id': str(slave.id)}

    def handle_slave_state_update(self, slave, new_slave_state):
        """
        Execute logic to transition the specified slave to the given state.

        :type slave: Slave
        :type new_slave_state: SlaveState
        """
        slave_transition_functions = {
            SlaveState.DISCONNECTED: self._disconnect_slave,
            SlaveState.SHUTDOWN: self._graceful_shutdown_slave,
            SlaveState.IDLE: self._slave_allocator.add_idle_slave,
            SlaveState.SETUP_COMPLETED: self._handle_setup_success_on_slave,
            SlaveState.SETUP_FAILED: self._handle_setup_failure_on_slave,
        }

        if new_slave_state not in slave_transition_functions:
            raise BadRequestError('Invalid slave state "{}". Valid states are: {}.'
                                  .format(new_slave_state, ', '.join(slave_transition_functions.keys())))

        do_transition = slave_transition_functions.get(new_slave_state)
        do_transition(slave)

    def set_shutdown_mode_on_slaves(self, slave_ids):
        """
        :type slave_ids: list[int]
        """
        # Find all the slaves first so if an invalid slave_id is specified, we 404 before shutting any of them down.
        slaves = [self.get_slave(slave_id) for slave_id in slave_ids]
        for slave in slaves:
            self.handle_slave_state_update(slave, SlaveState.SHUTDOWN)

    def _graceful_shutdown_slave(self, slave):
        """
        Puts slave in shutdown mode so it cannot receive new builds. The slave will be killed when finished with any
        running builds.
        :type slave: Slave
        """
        slave.set_shutdown_mode()
        self._logger.info('Slave on {} was put in shutdown mode. (id: {})', slave.url, slave.id)

    def _disconnect_slave(self, slave):
        """
        Mark a slave dead.

        :type slave: Slave
        """
        # Mark slave dead. We do not remove it from the list of all slaves. We also do not remove it from idle_slaves;
        # that will happen during slave allocation.
        slave.mark_dead()
        # todo: Fail/resend any currently executing subjobs still executing on this slave.
        self._logger.info('Slave on {} was disconnected. (id: {})', slave.url, slave.id)

    def _handle_setup_success_on_slave(self, slave):
        """
        Respond to successful build setup on a slave. This starts subjob executions on the slave. This should be called
        once after the specified slave has already run build_setup commands for the specified build.

        :type slave: Slave
        """
        build = self.get_build(slave.current_build_id)
        scheduler = self._scheduler_pool.get(build)
        scheduler.begin_subjob_executions_on_slave(slave)

    def _handle_setup_failure_on_slave(self, slave):
        """
        Respond to failed build setup on a slave. This should put the slave back into a usable state.

        :type slave: Slave
        """
        raise BadRequestError('Setup failure handling on the master is not yet implemented.')

    def handle_request_for_new_build(self, build_params):
        """
        Creates a new Build object and adds it to the request queue to be processed.

        :param build_params:
        :type build_params: dict[str, str]
        :rtype: tuple [bool, dict [str, str]]
        """
        build_request = BuildRequest(build_params)
        success = False

        if build_request.is_valid():
            build = Build(build_request)
            self._all_builds_by_id[build.build_id()] = build
            build.generate_project_type()  # WIP(joey): This should be internal to the Build object.
            self._build_request_handler.handle_build_request(build)
            response = {'build_id': build.build_id()}
            success = True
        elif not build_request.is_valid_type():
            response = {'error': 'Invalid build request type.'}
        else:
            required_params = build_request.required_parameters()
            response = {'error': 'Missing required parameter. Required parameters: {}'.format(required_params)}

        return success, response  # todo: refactor to use exception instead of boolean

    def handle_request_to_update_build(self, build_id, update_params):
        """
        Updates the state of a build with the values passed in.  Used for cancelling running builds.

        :type build_id: int
        :param update_params: The fields that should be updated and their new values
        :type update_params: dict [str, str]
        :return: The success/failure and the response we want to send to the requestor
        :rtype: tuple [bool, dict [str, str]]
        """
        build = self._all_builds_by_id.get(int(build_id))
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        success, response = build.validate_update_params(update_params)
        if not success:
            return success, response
        return build.update_state(update_params), {}

    def handle_result_reported_from_slave(self, slave_url, build_id, subjob_id, payload=None):
        """
        Process the result and dispatch the next subjob
        :type slave_url: str
        :type build_id: int
        :type subjob_id: int
        :type payload: dict
        :rtype: str
        """
        self._logger.info('Results received from {} for subjob. (Build {}, Subjob {})', slave_url, build_id, subjob_id)
        build = self._all_builds_by_id[int(build_id)]
        slave = self._all_slaves_by_url[slave_url]
        # If the build has been canceled, don't work on the next subjob.
        if not build.is_finished:  # WIP(joey): This check should be internal to the Build object.
            try:
                build.complete_subjob(subjob_id, payload)
            finally:
                scheduler = self._scheduler_pool.get(build)
                scheduler.execute_next_subjob_or_free_executor(slave)

    def get_build(self, build_id):
        """
        Returns a build by id
        :param build_id: The id for the build whose status we are getting
        :type build_id: int
        :rtype: Build
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id: {}.'.format(build_id))

        return build

    def get_path_for_build_results_archive(self, build_id):
        """
        Given a build id, get the absolute file path for the archive file containing the build results.

        :param build_id: The build id for which to retrieve the artifacts archive file
        :type build_id: int
        :return: The path to the archived results file
        :rtype: str
        """
        build = self._all_builds_by_id.get(build_id)
        if build is None:
            raise ItemNotFoundError('Invalid build id.')

        archive_file = build.artifacts_archive_file
        if archive_file is None:
            raise ItemNotReadyError('Build artifact file is not yet ready. Try again later.')

        return archive_file
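A notable difference between the variants above: the ClusterService versions submit scheduler work to the thread pool (self._thread_pool_executor.submit(...)) instead of calling it inline, so executor-count locking and teardown IO never block the request handler. That pattern in isolation, as a minimal stdlib sketch:

import time
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=32)

def begin_subjob_executions_on_slave(slave):
    time.sleep(0.1)  # stands in for lock acquisition / network IO
    print('started work on', slave)

# submit() returns a Future immediately; the work runs on a pool thread.
future = pool.submit(begin_subjob_executions_on_slave, slave='slave-1')
future.result()  # only for this demo; the master fires and forgets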