def _prepare_build_async(self, build, project_lock):
        """
        :type build: Build
        :type project_lock: Lock
        """
        self._logger.info('Build {} is waiting for the project lock',
                          build.build_id())

        with project_lock:
            self._logger.info('Build {} has acquired project lock',
                              build.build_id())
            analytics.record_event(
                analytics.BUILD_PREPARE_START,
                build_id=build.build_id(),
                log_msg='Build preparation loop is handling request for build {build_id}.')
            try:
                self._prepare_build(build)
                if not build.has_error:
                    analytics.record_event(
                        analytics.BUILD_PREPARE_FINISH,
                        build_id=build.build_id(),
                        log_msg='Build {build_id} successfully prepared and waiting for slaves.')
                    self._builds_waiting_for_slaves.put(build)
            except Exception as ex:  # pylint: disable=broad-except
                build.mark_failed(str(ex))
                self._logger.exception(
                    'Could not handle build request for build {}.'.format(
                        build.build_id()))
Example #2
    def _execute_subjob(self, build_id, subjob_id, executor, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type atomic_commands: list[str]
        """
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, atomic_commands, self._base_executor_index)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }
        files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        resp = self._network.post(results_url, data=data, files=files)
        if resp.ok:
            self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)
        else:
            self._logger.error(
                ('Build {}, Subjob {} encountered an error when sending results to master.'
                 '\n\tStatus Code {}\n\t{}').format(build_id, subjob_id, resp.status_code, resp.text))
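The upload above opens results_file without ever closing the handle. A minimal sketch of the same upload step (it would live inside the method above; the self._network.post interface is assumed unchanged) that scopes the handle with a context manager:

# Sketch only (not the original code): same upload as above, with the results file handle closed deterministically.
with open(results_file, 'rb') as payload_file:
    files = {'file': ('payload', payload_file, 'application/x-compressed')}
    resp = self._network.post(results_url, data=data, files=files)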
    def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        """
        self._logger.debug('Waiting for setup to complete (Build {}, Subjob {})...', build_id, subjob_id)
        self._setup_complete_event.wait()  # block until setup completes
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }
        files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)
    def handle_request_for_new_build(self, build_params):
        """
        Creates a new Build object and adds it to the request queue to be processed.

        :param build_params:
        :type build_params: dict[str, str]
        :rtype: tuple[bool, dict[str, str]]
        """
        build_request = BuildRequest(build_params)

        success = False
        if build_request.is_valid():
            build = Build(build_request)
            self._all_builds_by_id[build.build_id()] = build
            self._request_queue.put(build)
            analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id())
            response = {'build_id': build.build_id()}
            success = True

        elif not build_request.is_valid_type():
            response = {'error': 'Invalid build request type.'}

        else:
            required_params = build_request.required_parameters()
            response = {'error': 'Missing required parameter. Required parameters: {}'.format(required_params)}

        return success, response
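A hypothetical caller sketch (the cluster_master variable and build_params dict are assumed, not taken from the original source) showing how the (success, response) tuple returned above might be consumed:

# Sketch only: consuming the (success, response) contract of handle_request_for_new_build.
success, response = cluster_master.handle_request_for_new_build(build_params)
if success:
    print('Queued build {}'.format(response['build_id']))
else:
    print('Build request rejected: {}'.format(response['error']))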
    def async_run(self, port, log_level, eventlog_file):
        """
        Run a ClusterRunner master service.

        :param port: the port on which to run the master service
        :type port: int | None
        :param log_level: the log level at which to do application logging (or None for default log level)
        :type log_level: str | None
        :param eventlog_file: an optional alternate file in which to write event logs
        :type eventlog_file: str | None
        """
        port = port or Configuration['port']
        log_level = log_level or Configuration['log_level']
        eventlog_file = eventlog_file or Configuration['eventlog_file']

        log.configure_logging(log_level=log_level, log_file=Configuration['log_file'])
        analytics.initialize(eventlog_file)
        analytics.record_event(analytics.SERVICE_STARTED, service='master')

        cluster_master = ClusterMaster()
        application = ClusterMasterApplication(cluster_master)

        ioloop = self._start_application(application, port)

        self._write_pid_file(Configuration['master_pid_file'])

        # log startup message once ioloop is running
        hostname = Configuration['hostname']
        log_startup = functools.partial(self._logger.info, 'Master service is running on {}:{}.'.format(hostname, port))
        ioloop.add_callback(log_startup)

        ioloop.start()  # this call blocks until the server is stopped
        ioloop.close(all_fds=True)  # all_fds=True is necessary here to make sure connections don't hang
        self._logger.notice('Master server was stopped.')
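The functools.partial call above only binds the startup message so the callback can be invoked with no arguments once the ioloop is running; an equivalent, purely illustrative form using a lambda:

# Illustrative equivalent of the functools.partial startup callback above.
ioloop.add_callback(lambda: self._logger.info('Master service is running on {}:{}.'.format(hostname, port)))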
Example #6
    def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        """
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands,
                                               self._base_executor_index)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }
        files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)
    def handle_request_for_new_build(self, build_params):
        """
        Creates a new Build object and adds it to the request queue to be processed.

        :param build_params:
        :type build_params: dict[str, str]
        :rtype: tuple[bool, dict[str, str]]
        """
        build_request = BuildRequest(build_params)

        success = False
        if build_request.is_valid():
            build = Build(build_request)
            self._all_builds_by_id[build.build_id()] = build
            self._request_queue.put(build)
            analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id())
            response = {'build_id': build.build_id()}
            success = True

        elif not build_request.is_valid_type():
            response = {'error': 'Invalid build request type.'}

        else:
            required_params = build_request.required_parameters()
            response = {'error': 'Missing required parameter. Required parameters: {}'.format(required_params)}

        return success, response
Example #8
    def start_subjob(self, subjob: Subjob):
        """
        Send a subjob of a build to this slave. The slave must have already run setup for the corresponding build.
        :param subjob: The subjob to send to this slave
        """
        if not self.is_alive():
            raise DeadSlaveError('Tried to start a subjob on a dead slave.')
        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError(
                'Tried to start a subjob on a slave in shutdown mode.')

        execution_url = self._slave_api.url('build', subjob.build_id(),
                                            'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        try:
            response = self._network.post_with_digest(execution_url,
                                                      post_data,
                                                      Secret.get(),
                                                      error_on_failure=True)
        except (requests.ConnectionError, requests.Timeout,
                RequestFailedError) as ex:
            raise SlaveCommunicationError(
                'Call to slave service failed: {}.'.format(repr(ex))) from ex

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB,
                               executor_id=subjob_executor_id,
                               build_id=subjob.build_id(),
                               subjob_id=subjob.subjob_id(),
                               slave_id=self.id)
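A hypothetical caller sketch (the handling shown is assumed) for the exception types raised by start_subjob above:

# Sketch only: the exception types are exactly those raised by start_subjob above.
try:
    slave.start_subjob(subjob)
except (DeadSlaveError, SlaveMarkedForShutdownError):
    pass  # assumed handling: this slave cannot take new work, so schedule the subjob elsewhere
except SlaveCommunicationError:
    pass  # assumed handling: the call itself failed, so retry or mark the slave unresponsive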
    def handle_build_request(self, build):
        """
        :param build: the requested build
        :type build: Build
        """
        self._request_queue.put(build)
        analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=build.build_id(),
                               log_msg='Queued request for build {build_id}.')
Example #11
    def allocate_slave(self, slave):
        """
        Allocate a slave to this build. This tells the slave to execute setup commands for this build.

        :type slave: Slave
        """
        self._slaves_allocated.append(slave)
        slave.setup(self)
        self._num_executors_allocated += min(slave.num_executors, self._max_executors_per_slave)
        analytics.record_event(analytics.BUILD_SETUP_START, build_id=self.build_id(), slave_id=slave.id)
Example #12
    def async_run(self, port, master_url, num_executors, log_level,
                  eventlog_file):
        """
        Run a ClusterRunner slave service.

        :param port: the port on which to run the slave service
        :type port: int | None
        :param master_url: the url of the master to which this slave should attach
        :type master_url: str | None
        :param num_executors: the number of executors the slave service should use
        :type num_executors: int | None
        :param log_level: the log level at which to do application logging (or None for default log level)
        :type log_level: str | None
        :param eventlog_file: an optional alternate file in which to write event logs
        :type eventlog_file: str | None
        """
        num_executors = num_executors or Configuration['num_executors']
        master_url = master_url or '{}:{}'.format(
            Configuration['master_hostname'], Configuration['master_port'])
        port = port or Configuration['port']
        log_level = log_level or Configuration['log_level']
        eventlog_file = eventlog_file or Configuration['eventlog_file']

        log.configure_logging(log_level=log_level,
                              log_file=Configuration['log_file'].format(port))
        analytics.initialize(eventlog_file)
        analytics.record_event(analytics.SERVICE_STARTED, service='slave')

        cluster_slave = ClusterSlave(
            port=port,
            num_executors=num_executors,
            host=Configuration['hostname'],
        )

        application = ClusterSlaveApplication(cluster_slave)

        ioloop = self._start_application(application, port)

        self._write_pid_file(Configuration['slave_pid_file'])

        # connect to master once tornado ioloop is running
        connect_slave_to_master = functools.partial(
            cluster_slave.connect_to_master, master_url=master_url)
        ioloop.add_callback(connect_slave_to_master)

        # start sending heartbeat after connecting to master
        start_slave_heartbeat = functools.partial(
            cluster_slave.start_heartbeat_thread)
        ioloop.add_callback(start_slave_heartbeat)

        ioloop.start()  # this call blocks until the server is stopped
        ioloop.close(
            all_fds=True
        )  # all_fds=True is necessary here to make sure connections don't hang
        self._logger.notice('Slave server was stopped.')
Example #13
    def _async_start_subjob(self, subjob):
        """
        :type subjob: Subjob
        """
        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_id=self.id)
Example #14
    def allocate_slave(self, slave):
        """
        Allocate a slave to this build. This tells the slave to execute setup commands for this build.

        :type slave: Slave
        """
        self._slaves_allocated.append(slave)
        slave.setup(self)
        self._num_executors_allocated += min(slave.num_executors,
                                             self._max_executors_per_slave)
        analytics.record_event(analytics.BUILD_SETUP_START,
                               build_id=self.build_id(),
                               slave_id=slave.id)
    def allocate_slave(self, slave):
        """
        Allocate a slave to this build. This tells the slave to execute setup commands for this build.

        :type slave: Slave
        """
        if not self._slaves_allocated:
            # If this is the first slave to be allocated, update the build state.
            self._build.mark_started()

        self._slaves_allocated.append(slave)
        slave.setup(self._build, executor_start_index=self._num_executors_allocated)
        self._num_executors_allocated += min(slave.num_executors, self._max_executors_per_slave)
        analytics.record_event(analytics.BUILD_SETUP_START, build_id=self._build.build_id(), slave_id=slave.id)
    def post(self, build_id, subjob_id):
        slave_url = self.decoded_body.get('slave')
        slave = self._cluster_master.get_slave(slave_url=slave_url)
        file_payload = self.request.files.get('file')
        if not file_payload:
            raise RuntimeError('Result file not provided')

        slave_executor_id = self.decoded_body.get('metric_data', {}).get('executor_id')
        analytics.record_event(analytics.MASTER_RECEIVED_RESULT, executor_id=slave_executor_id, build_id=int(build_id),
                               subjob_id=int(subjob_id), slave_id=slave.id)

        self._cluster_master.handle_result_reported_from_slave(
            slave_url, int(build_id), int(subjob_id), file_payload[0])
        self._write_status()
    def begin_subjob_executions_on_slave(self, slave):
        """
        Begin subjob executions on a slave. This should be called once after the specified slave has already run
        build_setup commands for this build.

        :type slave: Slave
        """
        analytics.record_event(analytics.BUILD_SETUP_FINISH, build_id=self._build.build_id(), slave_id=slave.id)
        for slave_executor_count in range(slave.num_executors):
            if (self._num_executors_in_use >= self._max_executors
                    or slave_executor_count >= self._max_executors_per_slave):
                break
            slave.claim_executor()
            self._num_executors_in_use += 1
            self.execute_next_subjob_or_free_executor(slave)
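A small worked example of the claim loop above, with all values assumed purely for illustration: with 8 executors on the slave, a per-slave cap of 5, a cluster-wide cap of 10 and 7 executors already in use, the loop claims executors until one of the caps is hit.

# Standalone illustration of the two break conditions above (values assumed).
num_executors, max_executors_per_slave, max_executors, num_executors_in_use = 8, 5, 10, 7
claimed = 0
for slave_executor_count in range(num_executors):
    if num_executors_in_use >= max_executors or slave_executor_count >= max_executors_per_slave:
        break
    claimed += 1
    num_executors_in_use += 1
print(claimed)  # 3 -- the cluster-wide cap of 10 executors is reached first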
Example #18
    def begin_subjob_executions_on_slave(self, slave):
        """
        Begin subjob executions on a slave. This should be called once after the specified slave has already run
        build_setup commands for this build.

        :type slave: Slave
        """
        analytics.record_event(analytics.BUILD_SETUP_FINISH, build_id=self._build.build_id(), slave_id=slave.id)
        for slave_executor_count in range(slave.num_executors):
            if (self._num_executors_in_use >= self._max_executors
                    or slave_executor_count >= self._max_executors_per_slave):
                break
            slave.claim_executor()
            self._num_executors_in_use += 1
            self.execute_next_subjob_or_free_executor(slave)
Example #19
    def async_run(self, port, master_url, num_executors, log_level, eventlog_file):
        """
        Run a ClusterRunner slave service.

        :param port: the port on which to run the slave service
        :type port: int | None
        :param master_url: the url of the master to which this slave should attach
        :type master_url: str | None
        :param num_executors: the number of executors the slave service should use
        :type num_executors: int | None
        :param log_level: the log level at which to do application logging (or None for default log level)
        :type log_level: str | None
        :param eventlog_file: an optional alternate file in which to write event logs
        :type eventlog_file: str | None
        """
        num_executors = num_executors or Configuration['num_executors']
        master_url = master_url or '{}:{}'.format(Configuration['master_hostname'], Configuration['master_port'])
        port = port or Configuration['port']
        log_level = log_level or Configuration['log_level']
        eventlog_file = eventlog_file or Configuration['eventlog_file']

        log.configure_logging(log_level=log_level, log_file=Configuration['log_file'].format(port))
        analytics.initialize(eventlog_file)
        analytics.record_event(analytics.SERVICE_STARTED, service='slave')

        cluster_slave = ClusterSlave(
            port=port,
            num_executors=num_executors,
            host=Configuration['hostname'],
        )

        application = ClusterSlaveApplication(cluster_slave)

        ioloop = self._start_application(application, port)

        self._write_pid_file(Configuration['slave_pid_file'])

        # connect to master once tornado ioloop is running
        connect_slave_to_master = functools.partial(cluster_slave.connect_to_master, master_url=master_url)
        ioloop.add_callback(connect_slave_to_master)

        # start sending heartbeat after connecting to master
        start_slave_heartbeat = functools.partial(cluster_slave.start_heartbeat_thread)
        ioloop.add_callback(start_slave_heartbeat)

        ioloop.start()  # this call blocks until the server is stopped
        ioloop.close(all_fds=True)  # all_fds=True is necessary here to make sure connections don't hang
        self._logger.notice('Slave server was stopped.')
Example #20
    def allocate_slave(self, slave):
        """
        Allocate a slave to this build. This tells the slave to execute setup commands for this build.

        :type slave: Slave
        """
        if not self._slaves_allocated:
            # If this is the first slave to be allocated, update the build state.
            self._build.mark_started()

        self._slaves_allocated.append(slave)
        slave.setup(self._build,
                    executor_start_index=self._num_executors_allocated)
        self._num_executors_allocated += min(slave.num_executors,
                                             self._max_executors_per_slave)
        analytics.record_event(analytics.BUILD_SETUP_START,
                               build_id=self._build.build_id(),
                               slave_id=slave.id)
Example #21
    def allocate_slave(self, slave: Slave) -> bool:
        """
        Allocate a slave to this build. This tells the slave to execute setup commands for this build.
        :param slave: The slave to allocate
        :return: Whether slave allocation was successful; this can fail if the slave is unresponsive
        """
        if not self._build_started:
            self._build_started = True
            self._build.mark_started()

        # Increment executors before triggering setup. This helps make sure the build won't take down
        # every slave in the cluster if setup calls fail because of a problem with the build.
        next_executor_index = self._num_executors_allocated
        self._num_executors_allocated += min(slave.num_executors, self._max_executors_per_slave)
        analytics.record_event(analytics.BUILD_SETUP_START, build_id=self._build.build_id(), slave_id=slave.id)
        self._slaves_allocated.append(slave)

        return slave.setup(self._build, executor_start_index=next_executor_index)
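A hypothetical allocator-side sketch (the build_scheduler name and surrounding logic are assumed) that checks the boolean returned above, since setup can fail if the slave is unresponsive:

# Sketch only: consuming the return value of allocate_slave above.
if not build_scheduler.allocate_slave(slave):
    pass  # assumed handling: setup failed (e.g. unresponsive slave), so do not wait on this slave's executors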
Example #22
    def start_subjob(self, subjob: Subjob):
        """
        Send a subjob of a build to this slave. The slave must have already run setup for the corresponding build.
        :param subjob: The subjob to send to this slave
        """
        if not self.is_alive():
            raise DeadSlaveError('Tried to start a subjob on a dead slave.')
        if self._is_in_shutdown_mode:
            raise SlaveMarkedForShutdownError('Tried to start a subjob on a slave in shutdown mode.')

        execution_url = self._slave_api.url('build', subjob.build_id(), 'subjob', subjob.subjob_id())
        post_data = {'atomic_commands': subjob.atomic_commands()}
        try:
            response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)
        except (requests.ConnectionError, requests.Timeout, RequestFailedError) as ex:
            raise SlaveCommunicationError('Call to slave service failed: {}.'.format(repr(ex))) from ex

        subjob_executor_id = response.json().get('executor_id')
        analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, executor_id=subjob_executor_id,
                               build_id=subjob.build_id(), subjob_id=subjob.subjob_id(), slave_id=self.id)
    def _prepare_build_async(self, build, project_lock):
        """
        :type build: Build
        :type project_lock: Lock
        """
        self._logger.info('Build {} is waiting for the project lock', build.build_id())

        with project_lock:
            self._logger.info('Build {} has acquired project lock', build.build_id())
            analytics.record_event(analytics.BUILD_PREPARE_START, build_id=build.build_id(),
                                   log_msg='Build preparation loop is handling request for build {build_id}.')
            try:
                self._prepare_build(build)
                if not build.has_error:
                    analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(),
                                           log_msg='Build {build_id} successfully prepared and waiting for slaves.')
                    self._builds_waiting_for_slaves.put(build)
            except Exception as ex:  # pylint: disable=broad-except
                build.mark_failed(str(ex))
                self._logger.exception('Could not handle build request for build {}.'.format(build.build_id()))
    def _prepare_build_async(self, build, project_lock):
        """
        :type build: Build
        :type project_lock: Lock
        """
        self._logger.info('Build {} is waiting for the project lock', build.build_id())

        with project_lock:
            self._logger.info('Build {} has acquired project lock', build.build_id())
            analytics.record_event(analytics.BUILD_PREPARE_START, build_id=build.build_id(),
                                   log_msg='Build preparation loop is handling request for build {build_id}.')
            try:
                build.prepare(self._subjob_calculator)
                if not build.has_error:
                    analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(), is_success=True,
                                           log_msg='Build {build_id} successfully prepared.')
                    # If the atomizer found no work to do, perform build cleanup and skip the slave allocation.
                    if len(build.all_subjobs()) == 0:
                        self._logger.info('Build {} has no work to perform and is exiting.', build.build_id())
                        build.finish()
                    # If there is work to be done, this build must queue to be allocated slaves.
                    else:
                        self._logger.info('Build {} is waiting for slaves.', build.build_id())
                        self._builds_waiting_for_slaves.put(build)

            except Exception as ex:  # pylint: disable=broad-except
                build.mark_failed(str(ex))  # WIP(joey): Build should do this internally.
                self._logger.exception('Could not handle build request for build {}.'.format(build.build_id()))
                analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(), is_success=False)
Example #25
    def async_run(self, port, log_level, eventlog_file):
        """
        Run a ClusterRunner master service.

        :param port: the port on which to run the master service
        :type port: int | None
        :param log_level: the log level at which to do application logging (or None for default log level)
        :type log_level: str | None
        :param eventlog_file: an optional alternate file in which to write event logs
        :type eventlog_file: str | None
        """
        port = port or Configuration['port']
        log_level = log_level or Configuration['log_level']
        eventlog_file = eventlog_file or Configuration['eventlog_file']

        log.configure_logging(log_level=log_level,
                              log_file=Configuration['log_file'])
        analytics.initialize(eventlog_file)
        analytics.record_event(analytics.SERVICE_STARTED, service='master')

        cluster_master = ClusterMaster()
        application = ClusterMasterApplication(cluster_master)

        ioloop = self._start_application(application, port)

        self._write_pid_file(Configuration['master_pid_file'])

        # log startup message once ioloop is running
        hostname = Configuration['hostname']
        log_startup = functools.partial(
            self._logger.info,
            'Master service is running on {}:{}.'.format(hostname, port))
        ioloop.add_callback(log_startup)

        ioloop.start()  # this call blocks until the server is stopped
        ioloop.close(
            all_fds=True
        )  # all_fds=True is necessary here to make sure connections don't hang
        self._logger.notice('Master server was stopped.')
    def _execute_subjob(self, build_id, subjob_id, executor, atomic_commands):
        """
        This is the method for executing a subjob asynchronously. This performs the work required by executing the
        specified command, then does a post back to the master results endpoint to signal that the work is done.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type atomic_commands: list[str]
        """
        subjob_event_data = {"build_id": build_id, "subjob_id": subjob_id, "executor_id": executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, atomic_commands, self._base_executor_index)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url("build", build_id, "subjob", subjob_id, "result")
        data = {"slave": "{}:{}".format(self.host, self.port), "metric_data": {"executor_id": executor.id}}
        files = {"file": ("payload", open(results_file, "rb"), "application/x-compressed")}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info("Build {}, Subjob {} completed and sent results to master.", build_id, subjob_id)
Example #27
    def _prepare_build_async(self, build, project_lock):
        """
        :type build: app.master.build.Build
        :type project_lock: Lock
        """
        self._logger.info('Build {} is waiting for the project lock',
                          build.build_id())

        with project_lock:
            self._logger.info('Build {} has acquired project lock',
                              build.build_id())
            analytics.record_event(
                analytics.BUILD_PREPARE_START,
                build_id=build.build_id(),
                log_msg='Build preparation loop is handling request for build {build_id}.')
            try:
                build.prepare()
                if not build.is_stopped:
                    analytics.record_event(
                        analytics.BUILD_PREPARE_FINISH,
                        build_id=build.build_id(),
                        is_success=True,
                        log_msg='Build {build_id} successfully prepared.')
                    # If the atomizer found no work to do, perform build cleanup and skip the slave allocation.
                    if len(build.get_subjobs()) == 0:
                        self._logger.info(
                            'Build {} has no work to perform and is exiting.',
                            build.build_id())
                        build.finish()
                    # If there is work to be done, this build must queue to be allocated slaves.
                    else:
                        self._logger.info('Build {} is waiting for slaves.',
                                          build.build_id())
                        self._scheduler_pool.add_build_waiting_for_slaves(
                            build)

            except Exception as ex:  # pylint: disable=broad-except
                if not build.is_canceled:
                    build.mark_failed(
                        str(ex))  # WIP(joey): Build should do this internally.
                    self._logger.exception(
                        'Could not handle build request for build {}.'.format(
                            build.build_id()))
                analytics.record_event(analytics.BUILD_PREPARE_FINISH,
                                       build_id=build.build_id(),
                                       is_success=False)
    def execute_subjob(self, build_id, subjob_id, atomic_commands, base_executor_index):
        """
        This is the method for executing a subjob. This performs the work required by executing the specified command,
        then archives the results into a single file and returns the filename.

        :type build_id: int
        :type subjob_id: int
        :type atomic_commands: list[str]
        :type base_executor_index: int
        :rtype: str
        """
        self._logger.info('Executing subjob (Build {}, Subjob {})...', build_id, subjob_id)

        # Set the current task
        self._current_build_id = build_id
        self._current_subjob_id = subjob_id

        # Maintain a list of atom artifact directories for compression and sending back to master
        atom_artifact_dirs = []

        # execute every atom and keep track of time elapsed for each
        for atom_id, atomic_command in enumerate(atomic_commands):
            atom_artifact_dir = BuildArtifact.atom_artifact_directory(
                build_id,
                subjob_id,
                atom_id,
                result_root=Configuration['artifact_directory']
            )

            # remove and recreate the atom artifact dir
            shutil.rmtree(atom_artifact_dir, ignore_errors=True)
            fs_util.create_dir(atom_artifact_dir)

            atom_environment_vars = {
                'ARTIFACT_DIR': atom_artifact_dir,
                'ATOM_ID': atom_id,
                'EXECUTOR_INDEX': self.id,  # Deprecated, use MACHINE_EXECUTOR_INDEX
                'MACHINE_EXECUTOR_INDEX': self.id,
                'BUILD_EXECUTOR_INDEX': base_executor_index + self.id,
            }

            atom_artifact_dirs.append(atom_artifact_dir)

            job_name = self._project_type.job_name
            atom_event_data = {'build_id': build_id, 'atom_id': atom_id, 'job_name': job_name, 'subjob_id': subjob_id}
            analytics.record_event(analytics.ATOM_START, **atom_event_data)

            exit_code = self._execute_atom_command(atomic_command, atom_environment_vars, atom_artifact_dir)

            atom_event_data['exit_code'] = exit_code
            analytics.record_event(analytics.ATOM_FINISH, **atom_event_data)

        # Generate mapping of atom directories (for archiving) to paths in the archive file
        targets_to_archive_paths = {atom_dir: os.path.basename(os.path.normpath(atom_dir))
                                    for atom_dir in atom_artifact_dirs}

        # zip file names must be unique for a build, so we append the subjob_id to the compressed file
        subjob_artifact_dir = BuildArtifact.build_artifact_directory(build_id,
                                                                     result_root=Configuration['artifact_directory'])
        tarfile_path = os.path.join(subjob_artifact_dir, 'results_{}.tar.gz'.format(subjob_id))
        fs_util.compress_directories(targets_to_archive_paths, tarfile_path)

        # Reset the current task
        self._current_build_id = None
        self._current_subjob_id = None

        return tarfile_path
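A tiny illustration of the archive-path mapping built above: the dict comprehension maps each atom artifact directory to its base name inside the tar file (the directory paths here are made up for the example).

# Illustrative only: the same basename mapping as targets_to_archive_paths above.
import os
atom_artifact_dirs = ['/var/clusterrunner/results/3/artifact_3_0', '/var/clusterrunner/results/3/artifact_3_1']
targets_to_archive_paths = {atom_dir: os.path.basename(os.path.normpath(atom_dir))
                            for atom_dir in atom_artifact_dirs}
# -> {'.../artifact_3_0': 'artifact_3_0', '.../artifact_3_1': 'artifact_3_1'}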
Example #29
    def execute_subjob(self, build_id, subjob_id, subjob_artifact_dir,
                       atomic_commands, base_executor_index):
        """
        This is the method for executing a subjob. This performs the work required by executing the specified command,
        then archives the results into a single file and returns the filename.

        :type build_id: int
        :type subjob_id: int
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        :type base_executor_index: int
        :rtype: str
        """
        self._logger.info('Executing subjob (Build {}, Subjob {})...',
                          build_id, subjob_id)

        # Set the current task
        self._current_build_id = build_id
        self._current_subjob_id = subjob_id

        # Maintain a list of atom artifact directories for compression and sending back to master
        atom_artifact_dirs = []

        # execute every atom and keep track of time elapsed for each
        for atom_id, atomic_command in enumerate(atomic_commands):

            atom_artifact_dir = os.path.join(
                subjob_artifact_dir,
                Subjob.ATOM_DIR_FORMAT.format(subjob_id, atom_id))

            # remove and recreate the atom artifact dir
            shutil.rmtree(atom_artifact_dir, ignore_errors=True)
            fs_util.create_dir(atom_artifact_dir)

            atom_environment_vars = {
                'ARTIFACT_DIR': atom_artifact_dir,
                'ATOM_ID': atom_id,
                'EXECUTOR_INDEX': self.id,  # Deprecated, use MACHINE_EXECUTOR_INDEX
                'MACHINE_EXECUTOR_INDEX': self.id,
                'BUILD_EXECUTOR_INDEX': base_executor_index + self.id,
            }

            atom_artifact_dirs.append(atom_artifact_dir)

            job_name = self._project_type.job_name
            atom_event_data = {
                'build_id': build_id,
                'atom_id': atom_id,
                'job_name': job_name,
                'subjob_id': subjob_id
            }
            analytics.record_event(analytics.ATOM_START, **atom_event_data)

            exit_code = self._execute_atom_command(atomic_command,
                                                   atom_environment_vars,
                                                   atom_artifact_dir)

            atom_event_data['exit_code'] = exit_code
            analytics.record_event(analytics.ATOM_FINISH, **atom_event_data)

        # Generate mapping of atom directories (for archiving) to paths in the archive file
        targets_to_archive_paths = {
            atom_dir: os.path.basename(os.path.normpath(atom_dir))
            for atom_dir in atom_artifact_dirs
        }

        # zip file names must be unique for a build, so we append the subjob_id to the compressed file
        tarfile_path = os.path.join(subjob_artifact_dir,
                                    'results_{}.tar.gz'.format(subjob_id))
        fs_util.compress_directories(targets_to_archive_paths, tarfile_path)

        # Reset the current task
        self._current_build_id = None
        self._current_subjob_id = None

        return tarfile_path