Example #1
0
 def test_url_should_generate_correct_url(self):
     host = 'master:9000'
     first, second, third = 'first', 'second', 'third'
     builder = UrlBuilder(host)
     url = builder.url(first, second, third)
     self.assertEqual('http://{}/v1/{}/{}/{}'.format(host, first, second, third), url,
                      'Url generated did not match expectation')
    def _validate_successful_deployment(self, master_service_url, slaves_to_validate):
        """
        Poll the master's /slaves endpoint until either timeout or until all of the slaves have registered with
        the master.

        Throws exception upon timeout or API response error.

        :param master_service_url: the hostname:port for the running master service
        :type master_service_url: str
        :param slaves_to_validate: the list of slave hostnames (no ports) to deploy to
        :type slaves_to_validate: list[str]
        """
        master_api = UrlBuilder(master_service_url, BuildRunner.API_VERSION)
        slave_api_url = master_api.url('slave')
        network = Network()

        def all_slaves_registered():
            return len(self._non_registered_slaves(slave_api_url, slaves_to_validate, network)) == 0

        if not wait_for(
                boolean_predicate=all_slaves_registered,
                timeout_seconds=self._SLAVE_REGISTRY_TIMEOUT_SEC,
                poll_period=1,
                exceptions_to_swallow=(requests.RequestException, requests.ConnectionError)
        ):
            try:
                non_registered_slaves = self._non_registered_slaves(slave_api_url, slaves_to_validate, network)
            except ConnectionError:
                self._logger.error('Error contacting {} on the master.'.format(slave_api_url))
                raise SystemExit(1)

            self._logger.error('Slave registration timed out after {} sec, with slaves {} missing.'.format(
                self._SLAVE_REGISTRY_TIMEOUT_SEC, ','.join(non_registered_slaves)))
            raise SystemExit(1)
Example #3
0
    def connect_to_master(self, master_url=None):
        """
        Notify the master that this slave exists.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str | None
        """
        self.is_alive = True
        self._master_url = master_url or 'localhost:43000'
        self._master_api = UrlBuilder(self._master_url)
        connect_url = self._master_api.url('slave')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'num_executors': self._num_executors,
        }
        response = self._network.post(connect_url, data=data)
        self._slave_id = int(response.json().get('slave_id'))
        self._logger.info('Slave {}:{} connected to master on {}.', self.host,
                          self.port, self._master_url)

        # We disconnect from the master before build_teardown so that the master stops sending subjobs. (Teardown
        # callbacks are executed in the reverse order that they're added, so we add the build_teardown callback first.)
        UnhandledExceptionHandler.singleton().add_teardown_callback(
            self._do_build_teardown_and_reset, timeout=30)
        UnhandledExceptionHandler.singleton().add_teardown_callback(
            self._disconnect_from_master)
    def _validate_successful_deployment(self, master_service_url, slaves_to_validate):
        """
        Poll the master's /slaves endpoint until either timeout or until all of the slaves have registered with
        the master.

        Throws exception upon timeout or API response error.

        :param master_service_url: the hostname:port for the running master service
        :type master_service_url: str
        :param slaves_to_validate: the list of slave hostnames (no ports) to deploy to
        :type slaves_to_validate: list[str]
        """
        master_api = UrlBuilder(master_service_url, BuildRunner.API_VERSION)
        slave_api_url = master_api.url('slave')
        network = Network()

        def all_slaves_registered():
            return len(self._registered_slave_hostnames(slave_api_url, network)) == len(slaves_to_validate)

        if not wait_for(
                boolean_predicate=all_slaves_registered,
                timeout_seconds=self._SLAVE_REGISTRY_TIMEOUT_SEC,
                poll_period=1,
                exceptions_to_swallow=(requests.RequestException, requests.ConnectionError)
        ):
            try:
                registered_slaves = self._registered_slave_hostnames(slave_api_url, network)
                non_registered_slaves = self._non_registered_slaves(registered_slaves, slaves_to_validate)
            except ConnectionError:
                self._logger.error('Error contacting {} on the master.'.format(slave_api_url))
                raise SystemExit(1)

            self._logger.error('Slave registration timed out after {} sec, with slaves {} missing.'.format(
                self._SLAVE_REGISTRY_TIMEOUT_SEC, ','.join(non_registered_slaves)))
            raise SystemExit(1)
 def test_url_should_generate_correct_url(self):
     host = 'master:9000'
     first, second, third = 'first', 'second', 'third'
     builder = UrlBuilder(host)
     url = builder.url(first, second, third)
     self.assertEqual(
         'http://{}/v1/{}/{}/{}'.format(host, first, second, third), url,
         'Url generated did not match expectation')
Example #6
0
 def __init__(self, slave_url, num_executors):
     """
     :type slave_url: str
     :type num_executors: int
     """
     self.url = slave_url
     self.num_executors = num_executors
     self.id = self._slave_id_counter.increment()
     self._num_executors_in_use = Counter()
     self._network = Network(min_connection_poolsize=num_executors)
     self.current_build_id = None
     self._is_alive = True
     self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
     self._logger = log.get_logger(__name__)
Example #7
0
 def __init__(self, master_url, request_params, secret):
     """
     :param master_url: The url of the master which the build will be executed on
     :type master_url: str
     :param request_params: A dict of request params that will be json-encoded and sent in the build request
     :type request_params: dict
     :type secret: str
     """
     self._master_url = self._ensure_url_has_scheme(master_url)
     self._request_params = request_params
     self._secret = secret
     self._build_id = None
     self._network = Network()
     self._logger = get_logger(__name__)
     self._last_build_status_details = None
     self._master_api = UrlBuilder(master_url, self.API_VERSION)
     self._cluster_master_api_client = ClusterMasterAPIClient(master_url)
 def __init__(self, base_api_url):
     """
     :param base_api_url: The base API url of the service (e.g., 'http://localhost:43000')
     :type base_api_url: str
     """
     self._api = UrlBuilder(base_api_url)
     self._network = Network()
     self._logger = log.get_logger(__name__)
Example #9
0
 def __init__(self, slave_url, num_executors, slave_session_id=None):
     """
     :type slave_url: str
     :type num_executors: int
     :type slave_session_id: str
     """
     self.url = slave_url
     self.num_executors = num_executors
     self.id = self._slave_id_counter.increment()
     self._num_executors_in_use = Counter()
     self._network = Network(min_connection_poolsize=num_executors)
     self.current_build_id = None
     self._last_heartbeat_time = datetime.now()
     self._is_alive = True
     self._is_in_shutdown_mode = False
     self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
     self._session_id = slave_session_id
     self._logger = log.get_logger(__name__)
    def get(self, build_id, subjob_id, atom_id):
        """
        :type build_id: int
        :type subjob_id: int
        :type atom_id: int
        """
        max_lines = int(self.get_query_argument('max_lines', 50))
        offset_line = self.get_query_argument('offset_line', None)

        if offset_line is not None:
            offset_line = int(offset_line)

        try:
            response = self._cluster_master.get_console_output(
                build_id,
                subjob_id,
                atom_id,
                Configuration['results_directory'],
                max_lines,
                offset_line
            )
            self.write(response)
            return
        except ItemNotFoundError as e:
            # If the master doesn't have the atom's console output, it's possible it's currently being worked on,
            # in which case the slave that is working on it may be able to provide the in-progress console output.
            build = self._cluster_master.get_build(int(build_id))
            subjob = build.subjob(int(subjob_id))
            slave = subjob.slave

            if slave is None:
                raise e

            api_url_builder = UrlBuilder(slave.url)
            slave_console_url = api_url_builder.url('build', build_id, 'subjob', subjob_id, 'atom', atom_id, 'console')
            query = {'max_lines': max_lines}

            if offset_line is not None:
                query['offset_line'] = offset_line

            query_string = urllib.parse.urlencode(query)
            self.redirect('{}?{}'.format(slave_console_url, query_string))
Example #11
0
 def __init__(self, slave_url, num_executors):
     """
     :type slave_url: str
     :type num_executors: int
     """
     self.url = slave_url
     self.num_executors = num_executors
     self.id = self._slave_id_counter.increment()
     self._num_executors_in_use = Counter()
     self._network = Network(min_connection_poolsize=num_executors)
     self.current_build_id = None
     self.is_alive = True
     self._slave_api = UrlBuilder(slave_url, app.master.cluster_master.ClusterMaster.API_VERSION)
    def block_until_build_queue_empty(self, timeout=60):
        """
        This blocks until the master's build queue is empty. This data is exposed via the /queue endpoint and contains
        any jobs that are currently building or not yet started. If the queue is not empty before the timeout, this
        method raises an exception.

        :param timeout: The maximum number of seconds to block before raising an exception.
        :type timeout: int
        """
        master_api = UrlBuilder(self._master_url)
        queue_url = master_api.url('queue')

        def is_queue_empty():
            queue_resp = requests.get(queue_url)
            if queue_resp and queue_resp.ok:
                queue_data = queue_resp.json()
                if 'queue' in queue_data and len(queue_data['queue']) == 0:
                    return True
            return False

        if not poll.wait_for(is_queue_empty, timeout, 0.5):
            raise Exception('Master service did not become idle before timeout.')
    def block_until_build_queue_empty(self, timeout=60):
        """
        This blocks until the master's build queue is empty. This data is exposed via the /queue endpoint and contains
        any jobs that are currently building or not yet started. If the queue is not empty before the timeout, this
        method raises an exception.

        :param timeout: The maximum number of seconds to block before raising an exception.
        :type timeout: int
        """
        master_api = UrlBuilder(self._master_url)
        queue_url = master_api.url('queue')

        def is_queue_empty():
            queue_resp = requests.get(queue_url)
            if queue_resp and queue_resp.ok:
                queue_data = queue_resp.json()
                if 'queue' in queue_data and len(queue_data['queue']) == 0:
                    return True
            return False

        if not poll.wait_for(is_queue_empty, timeout, 0.5):
            raise Exception('Master service did not become idle before timeout.')
Example #14
0
    def _download_and_extract_zip_results(self, master_api: UrlBuilder, build_id: int, download_dir: str):
        """Download the artifacts.zip from the master and extract it."""
        download_artifacts_url = master_api.url('build', build_id, 'artifacts.zip')
        download_filepath = os.path.join(download_dir, BuildArtifact.ARTIFACT_ZIPFILE_NAME)
        response = self._network.get(download_artifacts_url)

        if response.status_code == http.client.OK:
            # save file to disk, decompress, and delete
            with open(download_filepath, 'wb') as file:
                chunk_size = 500 * 1024
                for chunk in response.iter_content(chunk_size):
                    file.write(chunk)

            fs.unzip_directory(download_filepath, delete=True)
    def connect_to_master(self, master_url=None):
        """
        Notify the master that this slave exists.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str
        """
        self._master_url = master_url or 'localhost:43000'
        self._master_api = UrlBuilder(self._master_url)
        connect_url = self._master_api.url('slave')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'num_executors': self._num_executors,
        }
        response = self._network.post(connect_url, data)
        self._slave_id = int(response.json().get('slave_id'))
        self._logger.info('Slave {}:{} connected to master on {}.', self.host, self.port, self._master_url)
Example #16
0
 def __init__(self, master_url, request_params, secret):
     """
     :param master_url: The url of the master which the build will be executed on
     :type master_url: str
     :param request_params: A dict of request params that will be json-encoded and sent in the build request
     :type request_params: dict
     :type secret: str
     """
     self._master_url = self._ensure_url_has_scheme(master_url)
     self._request_params = request_params
     self._secret = secret
     self._build_id = None
     self._network = Network()
     self._logger = get_logger(__name__)
     self._last_build_status_details = None
     self._master_api = UrlBuilder(master_url, self.API_VERSION)
     self._cluster_master_api_client = ClusterMasterAPIClient(master_url)
Example #17
0
    def connect_to_master(self, master_url=None):
        """
        Notify the master that this slave exists.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str | None
        """
        self.is_alive = True
        self._master_url = master_url or "localhost:43000"
        self._master_api = UrlBuilder(self._master_url)
        connect_url = self._master_api.url("slave")
        data = {"slave": "{}:{}".format(self.host, self.port), "num_executors": self._num_executors}
        response = self._network.post(connect_url, data=data)
        self._slave_id = int(response.json().get("slave_id"))
        self._logger.info("Slave {}:{} connected to master on {}.", self.host, self.port, self._master_url)

        # We disconnect from the master before build_teardown so that the master stops sending subjobs. (Teardown
        # callbacks are executed in the reverse order that they're added, so we add the build_teardown callback first.)
        UnhandledExceptionHandler.singleton().add_teardown_callback(self._do_build_teardown_and_reset, timeout=30)
        UnhandledExceptionHandler.singleton().add_teardown_callback(self._disconnect_from_master)
Example #18
0
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._is_in_shutdown_mode = False
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._logger = log.get_logger(__name__)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
            'is_in_shutdown_mode': self._is_in_shutdown_mode,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        If the slave is in shutdown mode, clear the build_id, kill the slave, and raise an error.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception(
                'Trying to mark slave idle while {} executors still in use.',
                self._num_executors_in_use.value())

        self.current_build_id = None

        if self._is_in_shutdown_mode:
            self.kill()
            raise SlaveMarkedForShutdownError

    def setup(self, build, executor_start_index):
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :type build: Build
        :param executor_start_index: The index the slave should number its executors from for this build
        :type executor_start_index: int
        """
        slave_project_type_params = build.build_request.build_parameters(
        ).copy()
        slave_project_type_params.update(
            build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': executor_start_index,
        }

        self.current_build_id = build.build_id()
        self._network.post_with_digest(setup_url, post_data, Secret.get())

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if self.is_alive():
            teardown_url = self._slave_api.url('build', self.current_build_id,
                                               'teardown')
            self._network.post(teardown_url)
        else:
            self._logger.notice(
                'Teardown request to slave {} was not sent since slave is disconnected.',
                self.url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise DeadSlaveError(
                'Tried to start a subjob on a dead slave! ({}, id: {})'.format(
                    self.url, self.id))

        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError(
                'Tried to start a subjob on a slave in shutdown mode. ({}, id: {})'
                .format(self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob, )).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        response = self._network.post_with_digest(execution_url,
                                                  post_data,
                                                  Secret.get(),
                                                  error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception(
                'Cannot claim executor on slave {}. No executors left.'.format(
                    self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception(
                'Cannot free executor on slave {}. All are free.'.format(
                    self.url))
        return new_count

    def is_alive(self, use_cached=True):
        """
        Is the slave API responsive?

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :type use_cached: bool
        :rtype: bool
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url())

            if not response.ok:
                self._is_alive = False
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data[
                        'slave']:
                    self._logger.warning(
                        '{}\'s API is missing key slave[\'is_alive\'].',
                        self.url)
                    self._is_alive = False
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning(
                        '{}\'s API key \'is_alive\' is not a boolean.',
                        self.url)
                    self._is_alive = False
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except requests.exceptions.ConnectionError:
            self._logger.warning('Slave with url {} is offline.', self.url)
            self._is_alive = False

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value

    def set_shutdown_mode(self):
        """
        Mark this slave as being in shutdown mode.  Slaves in shutdown mode will not get new subjobs and will be
        killed when they finish teardown, or killed immediately if they are not processing a build.
        """
        self._is_in_shutdown_mode = True
        if self.current_build_id is None:
            self.kill()

    def is_shutdown(self):
        """
        Whether the slave is in shutdown mode.
        """
        return self._is_in_shutdown_mode

    def kill(self):
        """
        Instructs the slave process to kill itself.
        """
        kill_url = self._slave_api.url('kill')
        self._network.post_with_digest(kill_url, {}, Secret.get())
        self.mark_dead()

    def mark_dead(self):
        """
        Marks the slave dead.
        """
        self.set_is_alive(False)
        self.current_build_id = None
Example #19
0
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._logger = log.get_logger(__name__)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception('Trying to mark slave idle while {} executors still in use.',
                            self._num_executors_in_use.value())

        self.current_build_id = None

    def setup(self, build):
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :type build: Build
        """
        slave_project_type_params = build.build_request.build_parameters().copy()
        slave_project_type_params.update(build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': build.num_executors_allocated,
        }
        self._network.post_with_digest(setup_url, post_data, Secret.get())
        self.current_build_id = build.build_id()

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if self.is_alive():
            teardown_url = self._slave_api.url('build', self.current_build_id, 'teardown')
            self._network.post(teardown_url)
        else:
            self._logger.notice('Teardown request to slave {} was not sent since slave is disconnected.', self.url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise RuntimeError('Tried to start a subjob on a dead slave! ({}, id: {})'.format(self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob,)).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {
            'subjob_artifact_dir': subjob.artifact_dir(),
            'atomic_commands': subjob.atomic_commands(),
        }
        response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception('Cannot claim executor on slave {}. No executors left.'.format(self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception('Cannot free executor on slave {}. All are free.'.format(self.url))
        return new_count

    def is_alive(self, use_cached=True):
        """
        Is the slave API responsive?

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :type use_cached: bool
        :rtype: bool
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url())

            if not response.ok:
                self._is_alive = False
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data['slave']:
                    self._logger.warning('{}\'s API is missing key slave[\'is_alive\'].', self.url)
                    self._is_alive = False
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning('{}\'s API key \'is_alive\' is not a boolean.', self.url)
                    self._is_alive = False
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except requests.exceptions.ConnectionError:
            self._logger.warning('Slave with url {} is offline.', self.url)
            self._is_alive = False

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value
class ClusterSlave(object):

    API_VERSION = 'v1'

    def __init__(self, port, host, num_executors=10):
        """
        :param port: The port number the slave service is running on
        :type port: int
        :param host: The hostname at which the slave is reachable
        :type host: str
        :param num_executors: The number of executors this slave should operate with -- this determines how many
            concurrent subjobs the slave can execute.
        :type num_executors: int
        """
        self.port = port
        self.host = host
        self._slave_id = None
        self._num_executors = num_executors
        self._logger = log.get_logger(__name__)

        self._idle_executors = Queue(maxsize=num_executors)
        self.executors = {}
        for executor_id in range(num_executors):
            executor = SubjobExecutor(executor_id)
            self._idle_executors.put(executor)
            self.executors[executor_id] = executor

        self._setup_complete_event = Event()
        self._master_url = None
        self._network = Network(min_connection_poolsize=num_executors)
        self._master_api = None  # wait until we connect to a master first

        self._project_type = None  # this will be instantiated during build setup
        self._current_build_id = None

        UnhandledExceptionHandler.singleton().add_teardown_callback(self._async_teardown_build,
                                                                    should_disconnect_from_master=True)

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        executors_representation = [executor.api_representation() for executor in self.executors.values()]
        return {
            'connected': str(self._is_connected()),
            'master_url': self._master_url,
            'setup_complete': str(self._setup_complete_event.isSet()),
            'slave_id': self._slave_id,
            'executors': executors_representation,
        }

    def _is_connected(self):
        return self._master_url is not None

    def get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        """
        return 'Slave service is up. <Port: {}>'.format(self.port)

    def setup_build(self, build_id, project_type_params):
        """
        Usually called once per build to do build-specific setup. Will block any subjobs from executing until setup
        completes. The actual setup is performed on another thread and will unblock subjobs (via an Event) once it
        finishes.

        :param build_id: The id of the build to run setup on
        :type build_id: int
        :param project_type_params: The parameters that define the project_type this build will execute in
        :type project_type_params: dict
        """
        self._logger.info('Executing setup for build {} (type: {}).', build_id, project_type_params.get('type'))
        self._setup_complete_event.clear()
        self._current_build_id = build_id

        # create an project_type instance for build-level operations
        self._project_type = util.create_project_type(project_type_params)

        # verify all executors are idle
        if not self._idle_executors.full():
            raise RuntimeError('Slave tried to setup build but not all executors are idle. ({}/{} executors idle.)'
                               .format(self._idle_executors.qsize(), self._num_executors))

        # Collect all the executors to pass to project_type.setup_build(). This will create a new project_type for
        # each executor (for subjob-level operations).
        executors = list(self._idle_executors.queue)
        SafeThread(target=self._async_setup_build, args=(executors, project_type_params)).start()

    def _async_setup_build(self, executors, project_type_params):
        """
        Called from setup_build(). Do asynchronous setup for the build so that we can make the call to setup_build()
        non-blocking.
        """
        # todo(joey): It's strange that the project_type is setting up the executors, which in turn set up projects.
        # todo(joey): I think this can be untangled a bit -- we should call executor.configure_project_type() here.
        self._project_type.setup_build(executors, project_type_params)

        self._logger.info('Build setup complete for build {}.', self._current_build_id)
        self._setup_complete_event.set()  # free any subjob threads that are waiting for setup to complete

    def teardown_build(self, build_id=None):
        """
        Called at the end of each build on each slave before it reports back to the master that it is idle again.

        :param build_id: The build id to teardown -- this parameter is used solely for correctness checking of the
            master, to make sure that the master is not erroneously sending teardown commands for other builds.
        :type build_id: int | None
        """
        if self._current_build_id is None:
            raise BadRequestError('Tried to teardown a build but no build is active on this slave.')

        if build_id is not None and build_id != self._current_build_id:
            raise BadRequestError('Tried to teardown build {}, '
                                  'but slave is running build {}!'.format(build_id, self._current_build_id))

        self._logger.info('Executing teardown for build {}.', self._current_build_id)

        SafeThread(target=self._async_teardown_build).start()

    def _async_teardown_build(self, should_disconnect_from_master=False):
        """
        Called from teardown_build(). Do asynchronous teardown for the build so that we can make the call to
        teardown_build() non-blocking. Also take care of posting back to the master when teardown is complete.
        """
        if self._project_type:
            self._project_type.teardown_build()
            self._logger.info('Build teardown complete for build {}.', self._current_build_id)
            self._current_build_id = None
            self._project_type = None

        if not should_disconnect_from_master:
            # report back to master that this slave is finished with teardown and ready for a new build
            self._logger.info('Notifying master that this slave is ready for new builds.')
            idle_url = self._master_api.url('slave', self._slave_id, 'idle')
            response = self._network.post(idle_url)
            if response.status_code != http.client.OK:
                raise RuntimeError("Could not post teardown completion to master at {}".format(idle_url))

        elif self._is_master_responsive():
            # report back to master that this slave is shutting down and should not receive new builds
            self._logger.info('Notifying master to disconnect this slave.')
            disconnect_url = self._master_api.url('slave', self._slave_id, 'disconnect')
            response = self._network.post(disconnect_url)
            if response.status_code != http.client.OK:
                self._logger.error('Could not post disconnect notification to master at {}'.format(disconnect_url))

    def connect_to_master(self, master_url=None):
        """
        Notify the master that this slave exists.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str
        """
        self._master_url = master_url or 'localhost:43000'
        self._master_api = UrlBuilder(self._master_url)
        connect_url = self._master_api.url('slave')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'num_executors': self._num_executors,
        }
        response = self._network.post(connect_url, data)
        self._slave_id = int(response.json().get('slave_id'))
        self._logger.info('Slave {}:{} connected to master on {}.', self.host, self.port, self._master_url)

    def _is_master_responsive(self):
        """
        Ping the master to check if it is still alive. Code using this method should treat the return value as a
        *probable* truth since the state of the master can change at any time. This method is not a replacement for
        error handling.

        :return: Whether the master is responsive or not
        :rtype: bool
        """
        # todo: This method repeats some logic we have in the deployment code (checking a service). We should DRY it up.
        is_responsive = True
        try:
            self._network.get(self._master_api.url())
        except requests.ConnectionError:
            is_responsive = False

        return is_responsive

    def start_working_on_subjob(self, build_id, subjob_id, subjob_artifact_dir, atomic_commands):
        """
        Begin working on a subjob with the given build id and subjob id. This just starts the subjob execution
        asynchronously on a separate thread.

        :type build_id: int
        :type subjob_id: int
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        :return: The text to return in the API response.
        :rtype: dict[str, int]
        """
        if build_id != self._current_build_id:
            raise BadRequestError('Attempted to start subjob {} for build {}, '
                                  'but current build id is {}.'.format(subjob_id, build_id, self._current_build_id))

        # get idle executor from queue to claim it as in-use (or block until one is available)
        executor = self._idle_executors.get()

        # Start a thread to execute the job (after waiting for setup to complete)
        SafeThread(
            target=self._execute_subjob,
            args=(build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands),
            name='Build{}-Sub{}'.format(build_id, subjob_id),
        ).start()

        self._logger.info('Slave ({}:{}) has received subjob. (Build {}, Subjob {})', self.host, self.port, build_id,
                          subjob_id)
        return {'executor_id': executor.id}

    def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        """
        self._logger.debug('Waiting for setup to complete (Build {}, Subjob {})...', build_id, subjob_id)
        self._setup_complete_event.wait()  # block until setup completes
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }
        files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)

    def kill(self):
        # TODO(dtran): Kill the threads and this server more gracefully
        sys.exit(0)
Example #21
0
class ClusterSlave(object):

    API_VERSION = 'v1'

    def __init__(self, port, host, num_executors=10):
        """
        :param port: The port number the slave service is running on
        :type port: int
        :param host: The hostname at which the slave is reachable
        :type host: str
        :param num_executors: The number of executors this slave should operate with -- this determines how many
            concurrent subjobs the slave can execute.
        :type num_executors: int
        """
        self.port = port
        self.host = host
        self.is_alive = True
        self._slave_id = None
        self._num_executors = num_executors
        self._logger = log.get_logger(__name__)

        self._idle_executors = Queue(maxsize=num_executors)
        self.executors_by_id = {}
        for executor_id in range(num_executors):
            executor = SubjobExecutor(executor_id)
            self._idle_executors.put(executor)
            self.executors_by_id[executor_id] = executor

        self._master_url = None
        self._network = Network(min_connection_poolsize=num_executors)
        self._master_api = None  # wait until we connect to a master first

        self._project_type = None  # this will be instantiated during build setup
        self._current_build_id = None
        self._build_teardown_coin = None

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        executors_representation = [executor.api_representation() for executor in self.executors_by_id.values()]
        return {
            'is_alive': self.is_alive,
            'master_url': self._master_url,
            'current_build_id': self._current_build_id,
            'slave_id': self._slave_id,
            'executors': executors_representation,
        }

    def get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        """
        return 'Slave service is up. <Port: {}>'.format(self.port)

    def setup_build(self, build_id, project_type_params, build_executor_start_index):
        """
        Usually called once per build to do build-specific setup. Will block any subjobs from executing until setup
        completes. The actual setup is performed on another thread and will unblock subjobs (via an Event) once it
        finishes.

        :param build_id: The id of the build to run setup on
        :type build_id: int
        :param project_type_params: The parameters that define the project_type this build will execute in
        :type project_type_params: dict
        :param build_executor_start_index: How many executors have alreayd been allocated on other slaves for
        this build
        :type build_executor_start_index: int
        """
        self._logger.info('Executing setup for build {} (type: {}).', build_id, project_type_params.get('type'))
        self._current_build_id = build_id
        self._build_teardown_coin = SingleUseCoin()  # protects against build_teardown being executed multiple times

        # create an project_type instance for build-level operations
        self._project_type = util.create_project_type(project_type_params)

        # verify all executors are idle
        if not self._idle_executors.full():
            raise RuntimeError('Slave tried to setup build but not all executors are idle. ({}/{} executors idle.)'
                               .format(self._idle_executors.qsize(), self._num_executors))

        # Collect all the executors to pass to project_type.fetch_project(). This will create a new project_type for
        # each executor (for subjob-level operations).
        executors = list(self._idle_executors.queue)
        SafeThread(
            target=self._async_setup_build,
            name='Bld{}-Setup'.format(build_id),
            args=(executors, project_type_params, build_executor_start_index)
        ).start()

    def _async_setup_build(self, executors, project_type_params, build_executor_start_index):
        """
        Called from setup_build(). Do asynchronous setup for the build so that we can make the call to setup_build()
        non-blocking.

        :type executors: list[SubjobExecutor]
        :type project_type_params: dict
        :type build_executor_start_index: int
        """
        self._base_executor_index = build_executor_start_index
        try:
            self._project_type.fetch_project()
            for executor in executors:
                executor.configure_project_type(project_type_params)
            self._project_type.run_job_config_setup()

        except SetupFailureError as ex:
            self._logger.error(ex)
            self._logger.info('Notifying master that build setup has failed for build {}.', self._current_build_id)
            self._notify_master_of_state_change(SlaveState.SETUP_FAILED)

        else:
            self._logger.info('Notifying master that build setup is complete for build {}.', self._current_build_id)
            self._notify_master_of_state_change(SlaveState.SETUP_COMPLETED)

    def teardown_build(self, build_id=None):
        """
        Called at the end of each build on each slave before it reports back to the master that it is idle again.

        :param build_id: The build id to teardown -- this parameter is used solely for correctness checking of the
            master, to make sure that the master is not erroneously sending teardown commands for other builds.
        :type build_id: int | None
        """
        if self._current_build_id is None:
            raise BadRequestError('Tried to teardown a build but no build is active on this slave.')

        if build_id is not None and build_id != self._current_build_id:
            raise BadRequestError('Tried to teardown build {}, '
                                  'but slave is running build {}!'.format(build_id, self._current_build_id))
        SafeThread(
            target=self._async_teardown_build,
            name='Bld{}-Teardwn'.format(build_id)
        ).start()

    def _async_teardown_build(self):
        """
        Called from teardown_build(). Do asynchronous teardown for the build so that we can make the call to
        teardown_build() non-blocking. Also take care of posting back to the master when teardown is complete.
        """
        self._do_build_teardown_and_reset()
        while not self._idle_executors.full():
            time.sleep(1)
        self._send_master_idle_notification()

    def _do_build_teardown_and_reset(self, timeout=None):
        """
        Kill any currently running subjobs. Run the teardown_build commands for the current build (with an optional
        timeout). Clear attributes related to the currently running build.

        :param timeout: A maximum time in seconds to allow the teardown process to run before killing
        :type timeout: int | None
        """
        # Kill all subjob executors' processes. This only has an effect if we are tearing down before a build completes.
        for executor in self.executors_by_id.values():
            executor.kill()

        # Order matters! Spend the coin if it has been initialized.
        if not self._build_teardown_coin or not self._build_teardown_coin.spend() or not self._project_type:
            return  # There is no build to tear down or teardown is already in progress.

        self._logger.info('Executing teardown for build {}.', self._current_build_id)
        # todo: Catch exceptions raised during teardown_build so we don't skip notifying master of idle/disconnect.
        self._project_type.teardown_build(timeout=timeout)
        self._logger.info('Build teardown complete for build {}.', self._current_build_id)
        self._current_build_id = None
        self._project_type = None

    def _send_master_idle_notification(self):
        if not self._is_master_responsive():
            self._logger.notice('Could not post idle notification to master because master is unresponsive.')
            return

        # Notify master that this slave is finished with teardown and ready for a new build.
        self._logger.info('Notifying master that this slave is ready for new builds.')
        self._notify_master_of_state_change(SlaveState.IDLE)

    def _disconnect_from_master(self):
        """
        Perform internal bookkeeping, as well as notify the master, that this slave is disconnecting itself
        from the slave pool.
        """
        self.is_alive = False

        if not self._is_master_responsive():
            self._logger.notice('Could not post disconnect notification to master because master is unresponsive.')
            return

        # Notify master that this slave is shutting down and should not receive new builds.
        self._logger.info('Notifying master that this slave is disconnecting.')
        self._notify_master_of_state_change(SlaveState.DISCONNECTED)

    def connect_to_master(self, master_url=None):
        """
        Notify the master that this slave exists.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str | None
        """
        self.is_alive = True
        self._master_url = master_url or 'localhost:43000'
        self._master_api = UrlBuilder(self._master_url)
        connect_url = self._master_api.url('slave')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'num_executors': self._num_executors,
        }
        response = self._network.post(connect_url, data=data)
        self._slave_id = int(response.json().get('slave_id'))
        self._logger.info('Slave {}:{} connected to master on {}.', self.host, self.port, self._master_url)

        # We disconnect from the master before build_teardown so that the master stops sending subjobs. (Teardown
        # callbacks are executed in the reverse order that they're added, so we add the build_teardown callback first.)
        UnhandledExceptionHandler.singleton().add_teardown_callback(self._do_build_teardown_and_reset, timeout=30)
        UnhandledExceptionHandler.singleton().add_teardown_callback(self._disconnect_from_master)

    def _is_master_responsive(self):
        """
        Ping the master to check if it is still alive. Code using this method should treat the return value as a
        *probable* truth since the state of the master can change at any time. This method is not a replacement for
        error handling.

        :return: Whether the master is responsive or not
        :rtype: bool
        """
        # todo: This method repeats some logic we have in the deployment code (checking a service). We should DRY it up.
        is_responsive = True
        try:
            self._network.get(self._master_api.url())
        except requests.ConnectionError:
            is_responsive = False

        return is_responsive

    def start_working_on_subjob(self, build_id, subjob_id, subjob_artifact_dir, atomic_commands):
        """
        Begin working on a subjob with the given build id and subjob id. This just starts the subjob execution
        asynchronously on a separate thread.

        :type build_id: int
        :type subjob_id: int
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        :return: The text to return in the API response.
        :rtype: dict[str, int]
        """
        if build_id != self._current_build_id:
            raise BadRequestError('Attempted to start subjob {} for build {}, '
                                  'but current build id is {}.'.format(subjob_id, build_id, self._current_build_id))

        # get idle executor from queue to claim it as in-use (or block until one is available)
        executor = self._idle_executors.get()

        # Start a thread to execute the job (after waiting for setup to complete)
        SafeThread(
            target=self._execute_subjob,
            args=(build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands),
            name='Bld{}-Sub{}'.format(build_id, subjob_id),
        ).start()

        self._logger.info('Slave ({}:{}) has received subjob. (Build {}, Subjob {})', self.host, self.port, build_id,
                          subjob_id)
        return {'executor_id': executor.id}

    def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        """
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands,
                                               self._base_executor_index)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }
        files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)

    def _notify_master_of_state_change(self, new_state):
        """
        Send a state notification to the master. This is used to notify the master of events occurring on the slave
        related to build execution progress.

        :type new_state: SlaveState
        """
        state_url = self._master_api.url('slave', self._slave_id)
        self._network.put_with_digest(state_url, request_params={'slave': {'state': new_state}},
                                      secret=Secret.get(), error_on_failure=True)

    def kill(self):
        """
        Exits without error.
        """
        sys.exit(0)
Example #22
0
class Slave:
    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors, slave_session_id=None):
        """
        :type slave_url: str
        :type num_executors: int
        :type slave_session_id: str
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._last_heartbeat_time = datetime.now()
        self._is_alive = True
        self._is_in_shutdown_mode = False
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._session_id = slave_session_id
        self._logger = log.get_logger(__name__)

    def __str__(self):
        return '<slave #{} - {}>'.format(self.id, self.url)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'session_id': self._session_id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
            'is_in_shutdown_mode': self._is_in_shutdown_mode,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        If the slave is in shutdown mode, clear the build_id, kill the slave, and raise an error.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception('Trying to mark slave idle while {} executors still in use.',
                            self._num_executors_in_use.value())

        self.current_build_id = None

        if self._is_in_shutdown_mode:
            self.kill()
            self._remove_slave_from_registry()
            raise SlaveMarkedForShutdownError

    def setup(self, build: Build, executor_start_index: int) -> bool:
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :param executor_start_index: The index the slave should number its executors from for this build
        :return: Whether or not the call to start setup on the slave was successful
        """
        slave_project_type_params = build.build_request.build_parameters().copy()
        slave_project_type_params.update(build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': executor_start_index,
        }

        self.current_build_id = build.build_id()
        try:
            self._network.post_with_digest(setup_url, post_data, Secret.get())
        except (requests.ConnectionError, requests.Timeout) as ex:
            self._logger.warning('Setup call to {} failed with {}: {}.', self, ex.__class__.__name__, str(ex))
            self.mark_dead()
            return False
        return True

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if not self.is_alive():
            self._logger.notice('Teardown request to slave {} was not sent since slave is disconnected.', self.url)
            return

        teardown_url = self._slave_api.url('build', self.current_build_id, 'teardown')
        try:
            self._network.post(teardown_url)
        except (requests.ConnectionError, requests.Timeout):
            self._logger.warning('Teardown request to slave failed because slave is unresponsive.')
            self.mark_dead()

    def start_subjob(self, subjob: Subjob):
        """
        Send a subjob of a build to this slave. The slave must have already run setup for the corresponding build.
        :param subjob: The subjob to send to this slave
        """
        if not self.is_alive():
            raise DeadSlaveError('Tried to start a subjob on a dead slave.')
        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError('Tried to start a subjob on a slave in shutdown mode.')

        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        try:
            response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)
        except (requests.ConnectionError, requests.Timeout, RequestFailedError) as ex:
            raise SlaveCommunicationError('Call to slave service failed: {}.'.format(repr(ex))) from ex

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception('Cannot claim executor on slave {}. No executors left.'.format(self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception('Cannot free executor on slave {}. All are free.'.format(self.url))
        return new_count

    def is_alive(self, use_cached: bool=True) -> bool:
        """
        Is the slave API responsive?

        Note that if the slave API responds but its session id does not match the one we've stored in this
        instance, then this method will still return false.

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :return: Whether or not the slave is alive
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url(), headers=self._expected_session_header())

            if not response.ok:
                self.mark_dead()
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data['slave']:
                    self._logger.warning('{}\'s API is missing key slave[\'is_alive\'].', self.url)
                    self.mark_dead()
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning('{}\'s API key \'is_alive\' is not a boolean.', self.url)
                    self.mark_dead()
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except (requests.ConnectionError, requests.Timeout):
            self.mark_dead()

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value

    def set_shutdown_mode(self):
        """
        Mark this slave as being in shutdown mode.  Slaves in shutdown mode will not get new subjobs and will be
        killed and removed from slave registry when they finish teardown, or
        killed and removed from slave registry immediately if they are not processing a build.
        """
        self._is_in_shutdown_mode = True
        if self.current_build_id is None:
            self.kill()
            self._remove_slave_from_registry()

    def is_shutdown(self):
        """
        Whether the slave is in shutdown mode.
        """
        return self._is_in_shutdown_mode

    def kill(self):
        """
        Instruct the slave process to kill itself.
        """
        self._logger.notice('Killing {}', self)
        kill_url = self._slave_api.url('kill')
        try:
            self._network.post_with_digest(kill_url, {}, Secret.get())
        except (requests.ConnectionError, requests.Timeout):
            pass
        self.mark_dead()

    def mark_dead(self):
        """
        Mark the slave dead.
        """
        self._logger.warning('{} has gone offline. Last build: {}', self, self.current_build_id)
        self._is_alive = False
        self.current_build_id = None
        self._network.reset_session()  # Close any pooled connections for this slave.

    def _expected_session_header(self):
        """
        Return headers that should be sent with slave requests to verify that the master is still talking to
        the same slave service that it originally connected to.

        Note that adding these headers to existing requests may add new failure cases (e.g., slave API would
        start returning a 412) so we should make sure all potential 412 errors are handled appropriately when
        adding these headers to existing requests.

        :rtype: dict
        """
        headers = {}
        if self._session_id:
            headers[SessionId.EXPECTED_SESSION_HEADER_KEY] = self._session_id

        return headers

    def update_last_heartbeat_time(self):
        self._last_heartbeat_time = datetime.now()

    def get_last_heartbeat_time(self) -> datetime:
        return self._last_heartbeat_time

    def _remove_slave_from_registry(self):
        """
        Remove shutdown-ed slave from SlaveRegistry.
        """
        self._logger.info('Removing slave (url={}; id={}) from Slave Registry.'.format(self.url, self.id))
        SlaveRegistry.singleton().remove_slave(slave_url=self.url)
Example #23
0
class BuildRunner(object):
    """
    BuildRunner is a procedure-oriented class intended to be used in the context of a script. This class provides
    functionality to synchronously execute a build on the ClusterRunner, wait for it to complete, and collect the
    build results.

    Example usage pattern:
    >>> runner = BuildRunner('http://mymaster.net:123', {'type':'git', 'url':'https://github.com/box/StatusWolf.git'})
    >>> runner.run()
    """

    API_VERSION = 'v1'

    def __init__(self, master_url, request_params, secret):
        """
        :param master_url: The url of the master which the build will be executed on
        :type master_url: str
        :param request_params: A dict of request params that will be json-encoded and sent in the build request
        :type request_params: dict
        :type secret: str
        """
        self._master_url = self._ensure_url_has_scheme(master_url)
        self._request_params = request_params
        self._secret = secret
        self._build_id = None
        self._network = Network()
        self._logger = get_logger(__name__)
        self._last_build_status_details = None
        self._master_api = UrlBuilder(master_url, self.API_VERSION)
        self._cluster_master_api_client = ClusterMasterAPIClient(master_url)

    def run(self):
        """
        Send the build request to the master, wait for the build to finish, then download the build artifacts.

        :return: Whether or not we were successful in running the build. (Note this does *not* indicate the success or
            faulure of the build itself; that is determined by the contents of the build artifacts which should be
            parsed elsewhere.)
        :rtype: bool
        """
        try:
            self._start_build()
            result = self._block_until_finished()
            self._download_and_extract_results()
            return result

        except _BuildRunnerError as ex:
            self._logger.error(str(ex))
            self._logger.warning('Script aborted due to error!')
            self._cancel_build()
            return False

    def _cancel_build(self):
        """
        Request the master cancels the build.
        """
        if self._build_id is not None:
            self._logger.warning('Cancelling build {}'.format(self._build_id))
            self._cluster_master_api_client.cancel_build(self._build_id)

    def _start_build(self):
        """
        Send the build request to the master for execution.
        """
        build_url = self._master_api.url('build')
        # todo: catch connection error
        response = self._network.post_with_digest(build_url, self._request_params, self._secret, error_on_failure=True)
        response_data = response.json()

        if 'error' in response_data:
            error_message = response_data['error']
            raise _BuildRunnerError('Error starting build: ' + error_message)

        self._build_id = response_data['build_id']

        UnhandledExceptionHandler.singleton().add_teardown_callback(self._cancel_build)
        self._logger.info('Build is running. (Build id: {})', self._build_id)

    def _block_until_finished(self, timeout=None):
        """
        Poll the build status endpoint until the build is finished or until the timeout is reached.

        :param timeout: The maximum number of seconds to wait until giving up, or None for no timeout
        :type timeout: int|None
        """
        timeout_time = time.time() + timeout if timeout else sys.maxsize
        build_status_url = self._master_api.url('build', self._build_id)
        self._logger.debug('Polling build status url: {}', build_status_url)

        while time.time() <= timeout_time:
            response = self._network.get(build_status_url)
            response_data = response.json()

            if 'build' not in response_data or 'status' not in response_data['build']:
                raise _BuildRunnerError('Status response does not contain a "build" object with a "status" value.'
                                        'URL: {}, Content:{}'.format(build_status_url, response_data))

            build_data = response_data['build']
            if build_data['status'] == BuildStatus.FINISHED:
                self._logger.info('Build is finished. (Build id: {})', self._build_id)
                completion_message = 'Build {} result was {}'.format(self._build_id, build_data['result'])
                is_success = build_data['result'] == BuildResult.NO_FAILURES
                if is_success:
                    self._logger.info(completion_message)
                else:
                    self._logger.error(completion_message)
                    if build_data['failed_atoms']:
                        self._logger.error('These atoms had non-zero exit codes (failures):')
                        for failure in build_data['failed_atoms']:
                            self._logger.error(failure)
                    return False

                return True

            if build_data['status'] == BuildStatus.ERROR:
                message = 'Build aborted due to error: {}'.format(build_data.get('error_message'))
                raise _BuildRunnerError(message)

            if build_data['status'] == BuildStatus.BUILDING:
                if build_data['details'] != self._last_build_status_details:
                    self._last_build_status_details = build_data['details']
                    self._logger.info(build_data['details'])

            time.sleep(1)

        raise _BuildRunnerError('Build timed out after {} seconds.'.format(timeout))

    def _download_and_extract_results(self, timeout=None):
        """
        Download the result files for the build.
        """
        timeout_time = time.time() + timeout if timeout else sys.maxsize

        download_artifacts_url = self._master_api.url('build', self._build_id, 'result')
        download_filepath = 'build_results/artifacts.tar.gz'
        download_dir, _ = os.path.split(download_filepath)

        # remove any previous build artifacts
        if os.path.exists(download_dir):
            shutil.rmtree(download_dir)

        while time.time() <= timeout_time:
            response = self._network.get(download_artifacts_url)
            if response.status_code == http.client.OK:
                # save tar file to disk, decompress, and delete
                app.util.fs.create_dir(download_dir)
                with open(download_filepath, 'wb') as file:
                    chunk_size = 500 * 1024
                    for chunk in response.iter_content(chunk_size):
                        file.write(chunk)

                app.util.fs.extract_tar(download_filepath, delete=True)
                return

            time.sleep(1)

        raise _BuildRunnerError('Build timed out after {} seconds.'.format(timeout))

    def _ensure_url_has_scheme(self, url):
        """
        If url does not start with 'http' or 'https', add 'http://' to the beginning.
        :type url: str
        :rtype: str
        """
        url = url.strip()
        if not url.startswith('http'):
            url = 'http://' + url
        return url
Example #24
0
class BuildRunner(object):
    """
    BuildRunner is a procedure-oriented class intended to be used in the context of a script. This class provides
    functionality to synchronously execute a build on the ClusterRunner, wait for it to complete, and collect the
    build results.

    Example usage pattern:
    >>> runner = BuildRunner('http://mymaster.net:123', {'type':'git', 'url':'https://github.com/box/StatusWolf.git'})
    >>> runner.run()
    """

    API_VERSION = 'v1'

    def __init__(self, master_url, request_params, secret):
        """
        :param master_url: The url of the master which the build will be executed on
        :type master_url: str
        :param request_params: A dict of request params that will be json-encoded and sent in the build request
        :type request_params: dict
        :type secret: str
        """
        self._master_url = self._ensure_url_has_scheme(master_url)
        self._request_params = request_params
        self._secret = secret
        self._build_id = None
        self._network = Network()
        self._logger = get_logger(__name__)
        self._last_build_status_details = None
        self._master_api = UrlBuilder(master_url, self.API_VERSION)
        self._cluster_master_api_client = ClusterMasterAPIClient(master_url)

    def run(self):
        """
        Send the build request to the master, wait for the build to finish, then download the build artifacts.

        :return: Whether or not we were successful in running the build. (Note this does *not* indicate the success or
            faulure of the build itself; that is determined by the contents of the build artifacts which should be
            parsed elsewhere.)
        :rtype: bool
        """
        try:
            self._start_build()
            result = self._block_until_finished()
            self._download_and_extract_results()
            return result

        except _BuildRunnerError as ex:
            self._logger.error(str(ex))
            self._logger.warning('Script aborted due to error!')
            self._cancel_build()
            return False

    def _cancel_build(self):
        """
        Request the master cancels the build.
        """
        if self._build_id is not None:
            self._logger.warning('Cancelling build {}'.format(self._build_id))
            self._cluster_master_api_client.cancel_build(self._build_id)

    def _start_build(self):
        """
        Send the build request to the master for execution.
        """
        build_url = self._master_api.url('build')
        # todo: catch connection error
        response = self._network.post_with_digest(build_url,
                                                  self._request_params,
                                                  self._secret,
                                                  error_on_failure=True)
        response_data = response.json()

        if 'error' in response_data:
            error_message = response_data['error']
            raise _BuildRunnerError('Error starting build: ' + error_message)

        self._build_id = response_data['build_id']

        UnhandledExceptionHandler.singleton().add_teardown_callback(
            self._cancel_build)
        self._logger.info('Build is running. (Build id: {})', self._build_id)

    def _block_until_finished(self, timeout=None):
        """
        Poll the build status endpoint until the build is finished or until the timeout is reached.

        :param timeout: The maximum number of seconds to wait until giving up, or None for no timeout
        :type timeout: int|None
        """
        timeout_time = time.time() + timeout if timeout else sys.maxsize
        build_status_url = self._master_api.url('build', self._build_id)
        self._logger.debug('Polling build status url: {}', build_status_url)

        while time.time() <= timeout_time:
            response = self._network.get(build_status_url)
            response_data = response.json()

            if 'build' not in response_data or 'status' not in response_data[
                    'build']:
                raise _BuildRunnerError(
                    'Status response does not contain a "build" object with a "status" value.'
                    'URL: {}, Content:{}'.format(build_status_url,
                                                 response_data))

            build_data = response_data['build']
            if build_data['status'] == BuildStatus.FINISHED:
                self._logger.info('Build is finished. (Build id: {})',
                                  self._build_id)
                completion_message = 'Build {} result was {}'.format(
                    self._build_id, build_data['result'])
                is_success = build_data['result'] == BuildResult.NO_FAILURES
                if is_success:
                    self._logger.info(completion_message)
                else:
                    self._logger.error(completion_message)
                    if build_data['failed_atoms']:
                        self._logger.error(
                            'These atoms had non-zero exit codes (failures):')
                        for failure in build_data['failed_atoms']:
                            self._logger.error(failure)
                    return False

                return True

            if build_data['status'] == BuildStatus.ERROR:
                message = 'Build aborted due to error: {}'.format(
                    build_data.get('error_message'))
                raise _BuildRunnerError(message)

            if build_data['status'] == BuildStatus.BUILDING:
                if build_data['details'] != self._last_build_status_details:
                    self._last_build_status_details = build_data['details']
                    self._logger.info(build_data['details'])

            time.sleep(1)

        raise _BuildRunnerError(
            'Build timed out after {} seconds.'.format(timeout))

    def _download_and_extract_results(self, timeout=None):
        """
        Download the result files for the build.
        """
        timeout_time = time.time() + timeout if timeout else sys.maxsize

        download_artifacts_url = self._master_api.url('build', self._build_id,
                                                      'result')
        download_filepath = 'build_results/artifacts.tar.gz'
        download_dir, _ = os.path.split(download_filepath)

        # remove any previous build artifacts
        if os.path.exists(download_dir):
            shutil.rmtree(download_dir)

        while time.time() <= timeout_time:
            response = self._network.get(download_artifacts_url)
            if response.status_code == http.client.OK:
                # save tar file to disk, decompress, and delete
                app.util.fs.create_dir(download_dir)
                with open(download_filepath, 'wb') as file:
                    chunk_size = 500 * 1024
                    for chunk in response.iter_content(chunk_size):
                        file.write(chunk)

                app.util.fs.extract_tar(download_filepath, delete=True)
                return

            time.sleep(1)

        raise _BuildRunnerError(
            'Build timed out after {} seconds.'.format(timeout))

    def _ensure_url_has_scheme(self, url):
        """
        If url does not start with 'http' or 'https', add 'http://' to the beginning.
        :type url: str
        :rtype: str
        """
        url = url.strip()
        if not url.startswith('http'):
            url = 'http://' + url
        return url
Example #25
0
class Slave(object):

    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self.is_alive = True
        self._slave_api = UrlBuilder(slave_url, app.master.cluster_master.ClusterMaster.API_VERSION)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build(),
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception('Trying to mark slave idle while {} executors still in use.',
                            self._num_executors_in_use.value())

        self.current_build_id = None

    def setup(self, build_id, project_type_params):
        """
        Execute a setup command on the slave for the specified build. The command is executed asynchronously from the
        perspective of this method, but any subjobs will block until the slave finishes executing the setup command.

        :param build_id: The build id that this setup command is for.
        :type build_id: int

        :param project_type_params: The parameters that define the project type this build will execute in
        :typeproject_type_paramss: dict
        """
        setup_url = self._slave_api.url('build', build_id, 'setup')
        post_data = {
            'project_type_params': project_type_params,
        }
        self._network.post_with_digest(setup_url, post_data, Secret.get())

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        teardown_url = self._slave_api.url('build', self.current_build_id, 'teardown')
        self._network.post(teardown_url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive:
            raise RuntimeError('Tried to start a subjob on a dead slave! ({}, id: {})'.format(self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob,)).start()
        self.current_build_id = subjob.build_id()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {
            'subjob_artifact_dir': subjob.artifact_dir(),
            'atomic_commands': subjob.atomic_commands(),
        }
        response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_url=self.url)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception('Cannot claim executor on slave {}. No executors left.'.format(self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception('Cannot free executor on slave {}. All are free.'.format(self.url))
        return new_count

    def current_build(self):
        """
        :return:
        :rtype: int|None
        """
        return self.current_build_id
Example #26
0
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors, slave_session_id=None):
        """
        :type slave_url: str
        :type num_executors: int
        :type slave_session_id: str
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._is_in_shutdown_mode = False
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._session_id = slave_session_id
        self._logger = log.get_logger(__name__)

    def __str__(self):
        return '<slave #{} - {}>'.format(self.id, self.url)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'session_id': self._session_id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
            'is_alive': self.is_alive(),
            'is_in_shutdown_mode': self._is_in_shutdown_mode,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        If the slave is in shutdown mode, clear the build_id, kill the slave, and raise an error.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception(
                'Trying to mark slave idle while {} executors still in use.',
                self._num_executors_in_use.value())

        self.current_build_id = None

        if self._is_in_shutdown_mode:
            self.kill()
            raise SlaveMarkedForShutdownError

    def setup(self, build: Build, executor_start_index: int) -> bool:
        """
        Execute a setup command on the slave for the specified build. The setup process executes asynchronously on the
        slave and the slave will alert the master when setup is complete and it is ready to start working on subjobs.

        :param build: The build to set up this slave to work on
        :param executor_start_index: The index the slave should number its executors from for this build
        :return: Whether or not the call to start setup on the slave was successful
        """
        slave_project_type_params = build.build_request.build_parameters(
        ).copy()
        slave_project_type_params.update(
            build.project_type.slave_param_overrides())

        setup_url = self._slave_api.url('build', build.build_id(), 'setup')
        post_data = {
            'project_type_params': slave_project_type_params,
            'build_executor_start_index': executor_start_index,
        }

        self.current_build_id = build.build_id()
        try:
            self._network.post_with_digest(setup_url, post_data, Secret.get())
        except (requests.ConnectionError, requests.Timeout) as ex:
            self._logger.warning('Setup call to {} failed with {}: {}.', self,
                                 ex.__class__.__name__, str(ex))
            self.mark_dead()
            return False
        return True

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if not self.is_alive():
            self._logger.notice(
                'Teardown request to slave {} was not sent since slave is disconnected.',
                self.url)
            return

        teardown_url = self._slave_api.url('build', self.current_build_id,
                                           'teardown')
        try:
            self._network.post(teardown_url)
        except (requests.ConnectionError, requests.Timeout):
            self._logger.warning(
                'Teardown request to slave failed because slave is unresponsive.'
            )
            self.mark_dead()

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise DeadSlaveError(
                'Tried to start a subjob on a dead slave! ({}, id: {})'.format(
                    self.url, self.id))

        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError(
                'Tried to start a subjob on a slave in shutdown mode. ({}, id: {})'
                .format(self.url, self.id))

        # todo: This should not be a SafeThread. https://github.com/box/ClusterRunner/issues/337
        SafeThread(target=self._async_start_subjob, args=(subjob, )).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        response = self._network.post_with_digest(execution_url,
                                                  post_data,
                                                  Secret.get(),
                                                  error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception(
                'Cannot claim executor on slave {}. No executors left.'.format(
                    self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception(
                'Cannot free executor on slave {}. All are free.'.format(
                    self.url))
        return new_count

    def is_alive(self, use_cached: bool = True) -> bool:
        """
        Is the slave API responsive?

        Note that if the slave API responds but its session id does not match the one we've stored in this
        instance, then this method will still return false.

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :return: Whether or not the slave is alive
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(
                self._slave_api.url(), headers=self._expected_session_header())

            if not response.ok:
                self.mark_dead()
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data[
                        'slave']:
                    self._logger.warning(
                        '{}\'s API is missing key slave[\'is_alive\'].',
                        self.url)
                    self.mark_dead()
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning(
                        '{}\'s API key \'is_alive\' is not a boolean.',
                        self.url)
                    self.mark_dead()
                else:
                    self._is_alive = response_data['slave']['is_alive']
        except (requests.ConnectionError, requests.Timeout):
            self.mark_dead()

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value

    def set_shutdown_mode(self):
        """
        Mark this slave as being in shutdown mode.  Slaves in shutdown mode will not get new subjobs and will be
        killed when they finish teardown, or killed immediately if they are not processing a build.
        """
        self._is_in_shutdown_mode = True
        if self.current_build_id is None:
            self.kill()

    def is_shutdown(self):
        """
        Whether the slave is in shutdown mode.
        """
        return self._is_in_shutdown_mode

    def kill(self):
        """
        Instruct the slave process to kill itself.
        """
        self._logger.notice('Killing {}', self)
        kill_url = self._slave_api.url('kill')
        try:
            self._network.post_with_digest(kill_url, {}, Secret.get())
        except (requests.ConnectionError, requests.Timeout):
            pass
        self.mark_dead()

    def mark_dead(self):
        """
        Mark the slave dead.
        """
        self._logger.warning('{} has gone offline. Last build: {}', self,
                             self.current_build_id)
        self._is_alive = False
        self.current_build_id = None
        self._network.reset_session(
        )  # Close any pooled connections for this slave.

    def _expected_session_header(self):
        """
        Return headers that should be sent with slave requests to verify that the master is still talking to
        the same slave service that it originally connected to.

        Note that adding these headers to existing requests may add new failure cases (e.g., slave API would
        start returning a 412) so we should make sure all potential 412 errors are handled appropriately when
        adding these headers to existing requests.

        :rtype: dict
        """
        headers = {}
        if self._session_id:
            headers[SessionId.EXPECTED_SESSION_HEADER_KEY] = self._session_id

        return headers
Example #27
0
class Slave(object):

    API_VERSION = 'v1'
    _slave_id_counter = Counter()

    def __init__(self, slave_url, num_executors):
        """
        :type slave_url: str
        :type num_executors: int
        """
        self.url = slave_url
        self.num_executors = num_executors
        self.id = self._slave_id_counter.increment()
        self._num_executors_in_use = Counter()
        self._network = Network(min_connection_poolsize=num_executors)
        self.current_build_id = None
        self._is_alive = True
        self._slave_api = UrlBuilder(slave_url, self.API_VERSION)
        self._logger = log.get_logger(__name__)

    def api_representation(self):
        return {
            'url': self.url,
            'id': self.id,
            'num_executors': self.num_executors,
            'num_executors_in_use': self.num_executors_in_use(),
            'current_build_id': self.current_build_id,
        }

    def mark_as_idle(self):
        """
        Do bookkeeping when this slave becomes idle.  Error if the slave cannot be idle.
        """
        if self._num_executors_in_use.value() != 0:
            raise Exception(
                'Trying to mark slave idle while {} executors still in use.',
                self._num_executors_in_use.value())

        self.current_build_id = None

    def setup(self, build_id, project_type_params):
        """
        Execute a setup command on the slave for the specified build. The command is executed asynchronously from the
        perspective of this method, but any subjobs will block until the slave finishes executing the setup command.

        :param build_id: The build id that this setup command is for.
        :type build_id: int

        :param project_type_params: The parameters that define the project type this build will execute in
        :type project_type_params: dict
        """
        setup_url = self._slave_api.url('build', build_id, 'setup')
        slave_project_type_params = util.project_type_params_for_slave(
            project_type_params)
        post_data = {
            'project_type_params': slave_project_type_params,
        }
        self._network.post_with_digest(setup_url, post_data, Secret.get())
        self.current_build_id = build_id

    def teardown(self):
        """
        Tell the slave to run the build teardown
        """
        if self.is_alive():
            teardown_url = self._slave_api.url('build', self.current_build_id,
                                               'teardown')
            self._network.post(teardown_url)
        else:
            self._logger.notice(
                'Teardown request to slave {} was not sent since slave is disconnected.',
                self.url)

    def start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        if not self.is_alive():
            raise RuntimeError(
                'Tried to start a subjob on a dead slave! ({}, id: {})'.format(
                    self.url, self.id))

        SafeThread(target=self._async_start_subjob, args=(subjob, )).start()

    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {
            'subjob_artifact_dir': subjob.artifact_dir(),
            'atomic_commands': subjob.atomic_commands(),
        }
        response = self._network.post_with_digest(execution_url,
                                                  post_data,
                                                  Secret.get(),
                                                  error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)

    def num_executors_in_use(self):
        return self._num_executors_in_use.value()

    def claim_executor(self):
        new_count = self._num_executors_in_use.increment()
        if new_count > self.num_executors:
            raise Exception(
                'Cannot claim executor on slave {}. No executors left.'.format(
                    self.url))
        return new_count

    def free_executor(self):
        new_count = self._num_executors_in_use.decrement()
        if new_count < 0:
            raise Exception(
                'Cannot free executor on slave {}. All are free.'.format(
                    self.url))
        return new_count

    def is_alive(self, use_cached=True):
        """
        Is the slave API responsive?

        :param use_cached: Should we use the last returned value of the network check to the slave? If True,
            will return cached value. If False, this method will perform an actual network call to the slave.
        :type use_cached: bool
        :rtype: bool
        """
        if use_cached:
            return self._is_alive

        try:
            response = self._network.get(self._slave_api.url())

            if not response.ok:
                self._is_alive = False
            else:
                response_data = response.json()

                if 'slave' not in response_data or 'is_alive' not in response_data[
                        'slave']:
                    self._logger.warning(
                        '{}\'s API is missing key slave[\'is_alive\'].',
                        self.url)
                    self._is_alive = False
                elif not isinstance(response_data['slave']['is_alive'], bool):
                    self._logger.warning(
                        '{}\'s API key \'is_alive\' is not a boolean.',
                        self.url)
                    self._is_alive = False
                else:
                    self._is_alive = response_data['slave']['is_alive']

        except ConnectionError:
            self._logger.warning('Slave with url {} is offline.', self.url)
            self._is_alive = False

        return self._is_alive

    def set_is_alive(self, value):
        """
        Setter for the self._is_alive attribute.

        :type value: bool
        """
        self._is_alive = value