def _prepare_build_async(self, build, project_lock):
    """
    Prepare a build while holding its project lock and, if it has no error, queue it for slaves.

    Exceptions raised inside the preparation step are not propagated: the build is marked as
    failed with the exception text and the exception is logged.

    :type build: Build
    :type project_lock: Lock
    """
    build_id = build.build_id()
    self._logger.info('Build {} is waiting for the project lock', build_id)

    with project_lock:
        self._logger.info('Build {} has acquired project lock', build_id)
        start_msg = 'Build preparation loop is handling request for build {build_id}.'
        analytics.record_event(analytics.BUILD_PREPARE_START, build_id=build_id, log_msg=start_msg)

        try:
            self._prepare_build(build)
            prepared_without_error = not build.has_error
            if prepared_without_error:
                finish_msg = 'Build {build_id} successfully prepared and waiting for slaves.'
                analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build_id, log_msg=finish_msg)
                self._builds_waiting_for_slaves.put(build)

        except Exception as ex:  # pylint: disable=broad-except
            build.mark_failed(str(ex))
            failure_msg = 'Could not handle build request for build {}.'.format(build_id)
            self._logger.exception(failure_msg)
def _execute_subjob(self, build_id, subjob_id, executor, atomic_commands):
    """
    Execute a subjob asynchronously and report its results to the master.

    Runs the given atomic commands on the executor, then posts the resulting archive to the
    master results endpoint to signal that the work is done. The outcome of the post is logged.

    :type build_id: int
    :type subjob_id: int
    :type executor: SubjobExecutor
    :type atomic_commands: list[str]
    """
    subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

    analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
    results_file = executor.execute_subjob(build_id, subjob_id, atomic_commands, self._base_executor_index)
    analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

    results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
    data = {
        'slave': '{}:{}'.format(self.host, self.port),
        'metric_data': {'executor_id': executor.id},
    }

    # Open the archive in a context manager so the file handle is closed once the upload has
    # finished (or failed) instead of being leaked.
    with open(results_file, 'rb') as results_payload:
        files = {'file': ('payload', results_payload, 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        resp = self._network.post(results_url, data=data, files=files)

    if resp.ok:
        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)
    else:
        self._logger.error(
            ('Build {}, Subjob {} encountered an error when sending results to master.'
             '\n\tStatus Code {}\n\t{}').format(build_id, subjob_id, resp.status_code, resp.text))
def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
    """
    Execute a subjob asynchronously and report its results to the master.

    Waits for setup to complete, runs the given atomic commands on the executor, then posts the
    resulting archive to the master results endpoint to signal that the work is done. The outcome
    of the post is logged.

    :type build_id: int
    :type subjob_id: int
    :type executor: SubjobExecutor
    :type subjob_artifact_dir: str
    :type atomic_commands: list[str]
    """
    self._logger.debug('Waiting for setup to complete (Build {}, Subjob {})...', build_id, subjob_id)
    self._setup_complete_event.wait()  # block until setup completes

    subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

    analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
    results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands)
    analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

    results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
    data = {
        'slave': '{}:{}'.format(self.host, self.port),
        'metric_data': {'executor_id': executor.id},
    }

    # Open the archive in a context manager so the file handle is closed once the upload has
    # finished (or failed) instead of being leaked.
    with open(results_file, 'rb') as results_payload:
        files = {'file': ('payload', results_payload, 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        resp = self._network.post(results_url, data=data, files=files)

    # Only report success when the master accepted the results.
    if resp.ok:
        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)
    else:
        self._logger.error(
            ('Build {}, Subjob {} encountered an error when sending results to master.'
             '\n\tStatus Code {}\n\t{}').format(build_id, subjob_id, resp.status_code, resp.text))
def handle_request_for_new_build(self, build_params):
    """
    Validate the given parameters and, if they are valid, create a Build and queue it for processing.

    :param build_params: the parameters used to construct the build request
    :type build_params: dict[str, str]
    :return: a success flag and a response dict; the dict holds 'build_id' on success or 'error' otherwise
    :rtype: tuple [bool, dict [str, str]]
    """
    build_request = BuildRequest(build_params)

    if build_request.is_valid():
        new_build = Build(build_request)
        self._all_builds_by_id[new_build.build_id()] = new_build
        self._request_queue.put(new_build)
        analytics.record_event(analytics.BUILD_REQUEST_QUEUED, build_id=new_build.build_id())
        return True, {'build_id': new_build.build_id()}

    if not build_request.is_valid_type():
        return False, {'error': 'Invalid build request type.'}

    required_params = build_request.required_parameters()
    error_text = 'Missing required parameter. Required parameters: {}'.format(required_params)
    return False, {'error': error_text}
def async_run(self, port, log_level, eventlog_file):
    """
    Run a ClusterRunner master service.

    Any argument that is None (or otherwise falsy) is replaced by the matching Configuration value.

    :param port: the port on which to run the master service
    :type port: int | None
    :param log_level: the log level at which to do application logging (or None for default log level)
    :type log_level: str | None
    :param eventlog_file: an optional alternate file in which to write event logs
    :type eventlog_file: str | None
    """
    if not port:
        port = Configuration['port']
    if not log_level:
        log_level = Configuration['log_level']
    if not eventlog_file:
        eventlog_file = Configuration['eventlog_file']

    log.configure_logging(log_level=log_level, log_file=Configuration['log_file'])
    analytics.initialize(eventlog_file)
    analytics.record_event(analytics.SERVICE_STARTED, service='master')

    application = ClusterMasterApplication(ClusterMaster())
    ioloop = self._start_application(application, port)
    self._write_pid_file(Configuration['master_pid_file'])

    # log startup message once ioloop is running
    startup_message = 'Master service is running on {}:{}.'.format(Configuration['hostname'], port)
    ioloop.add_callback(functools.partial(self._logger.info, startup_message))

    ioloop.start()  # this call blocks until the server is stopped
    ioloop.close(all_fds=True)  # all_fds=True is necessary here to make sure connections don't hang
    self._logger.notice('Master server was stopped.')
def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
    """
    Execute a subjob asynchronously and report its results to the master.

    Runs the given atomic commands on the executor, then posts the resulting archive to the
    master results endpoint to signal that the work is done. The outcome of the post is logged.

    :type build_id: int
    :type subjob_id: int
    :type executor: SubjobExecutor
    :type subjob_artifact_dir: str
    :type atomic_commands: list[str]
    """
    subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

    analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
    results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands,
                                           self._base_executor_index)
    analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

    results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
    data = {
        'slave': '{}:{}'.format(self.host, self.port),
        'metric_data': {'executor_id': executor.id},
    }

    # Open the archive in a context manager so the file handle is closed once the upload has
    # finished (or failed) instead of being leaked.
    with open(results_file, 'rb') as results_payload:
        files = {'file': ('payload', results_payload, 'application/x-compressed')}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        resp = self._network.post(results_url, data=data, files=files)

    # Only report success when the master accepted the results.
    if resp.ok:
        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)
    else:
        self._logger.error(
            ('Build {}, Subjob {} encountered an error when sending results to master.'
             '\n\tStatus Code {}\n\t{}').format(build_id, subjob_id, resp.status_code, resp.text))
def start_subjob(self, subjob: Subjob):
    """
    Send a subjob of a build to this slave. The slave must have already run setup for the
    corresponding build.

    :param subjob: The subjob to send to this slave
    :raises DeadSlaveError: if this slave is not alive
    :raises SlaveMarkedForShutdownError: if this slave is in shutdown mode
    :raises SlaveCommunicationError: if the request to the slave service fails
    """
    if not self.is_alive():
        raise DeadSlaveError('Tried to start a subjob on a dead slave.')
    if self._is_in_shutdown_mode:
        raise SlaveMarkedForShutdownError('Tried to start a subjob on a slave in shutdown mode.')

    build_id = subjob.build_id()
    subjob_id = subjob.subjob_id()
    execution_url = self._slave_api.url('build', build_id, 'subjob', subjob_id)
    post_data = {'atomic_commands': subjob.atomic_commands()}

    try:
        response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)
    except (requests.ConnectionError, requests.Timeout, RequestFailedError) as ex:
        failure_message = 'Call to slave service failed: {}.'.format(repr(ex))
        raise SlaveCommunicationError(failure_message) from ex

    # The executor id reported back by the slave is attached to the analytics event.
    subjob_executor_id = response.json().get('executor_id')
    analytics.record_event(
        analytics.MASTER_TRIGGERED_SUBJOB,
        executor_id=subjob_executor_id,
        build_id=build_id,
        subjob_id=subjob_id,
        slave_id=self.id,
    )
def handle_build_request(self, build):
    """
    Put the requested build on the request queue and record that it was queued.

    :param build: the requested build
    :type build: Build
    """
    self._request_queue.put(build)

    event_fields = {
        'build_id': build.build_id(),
        'log_msg': 'Queued request for build {build_id}.',
    }
    analytics.record_event(analytics.BUILD_REQUEST_QUEUED, **event_fields)
def allocate_slave(self, slave):
    """
    Allocate a slave to this build. This tells the slave to execute setup commands for this build.

    The allocated executor count grows by the slave's executor count, capped at the per-slave maximum.

    :type slave: Slave
    """
    self._slaves_allocated.append(slave)
    slave.setup(self)

    executors_gained = min(slave.num_executors, self._max_executors_per_slave)
    self._num_executors_allocated += executors_gained

    event_fields = {'build_id': self.build_id(), 'slave_id': slave.id}
    analytics.record_event(analytics.BUILD_SETUP_START, **event_fields)
def async_run(self, port, master_url, num_executors, log_level, eventlog_file):
    """
    Run a ClusterRunner slave service.

    Any argument that is None (or otherwise falsy) is replaced by a value taken from Configuration.

    :param port: the port on which to run the slave service
    :type port: int | None
    :param master_url: the url of the master to which this slave should attach
    :type master_url: string | None
    :param num_executors: the number of executors the slave service should use
    :type num_executors: int | None
    :param log_level: the log level at which to do application logging (or None for default log level)
    :type log_level: str | None
    :param eventlog_file: an optional alternate file in which to write event logs
    :type eventlog_file: str | None
    """
    if not num_executors:
        num_executors = Configuration['num_executors']
    if not master_url:
        master_url = '{}:{}'.format(Configuration['master_hostname'], Configuration['master_port'])
    if not port:
        port = Configuration['port']
    if not log_level:
        log_level = Configuration['log_level']
    if not eventlog_file:
        eventlog_file = Configuration['eventlog_file']

    # The configured log file value is a format string that takes the port.
    slave_log_file = Configuration['log_file'].format(port)
    log.configure_logging(log_level=log_level, log_file=slave_log_file)
    analytics.initialize(eventlog_file)
    analytics.record_event(analytics.SERVICE_STARTED, service='slave')

    cluster_slave = ClusterSlave(
        port=port,
        num_executors=num_executors,
        host=Configuration['hostname'],
    )
    ioloop = self._start_application(ClusterSlaveApplication(cluster_slave), port)
    self._write_pid_file(Configuration['slave_pid_file'])

    # connect to master once tornado ioloop is running
    ioloop.add_callback(functools.partial(cluster_slave.connect_to_master, master_url=master_url))

    # start sending heartbeat after connecting to master
    ioloop.add_callback(functools.partial(cluster_slave.start_heartbeat_thread))

    ioloop.start()  # this call blocks until the server is stopped
    ioloop.close(all_fds=True)  # all_fds=True is necessary here to make sure connections don't hang
    self._logger.notice('Slave server was stopped.')
def _async_start_subjob(self, subjob):
    """
    Post the subjob's atomic commands to the slave service and record the triggering event.

    :type subjob: Subjob
    """
    build_id = subjob.build_id()
    subjob_id = subjob.subjob_id()

    execution_url = self._slave_api.url('build', build_id, 'subjob', subjob_id)
    post_data = {'atomic_commands': subjob.atomic_commands()}
    response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)

    # The executor id reported back by the slave is attached to the analytics event.
    subjob_executor_id = response.json().get('executor_id')
    analytics.record_event(
        analytics.MASTER_TRIGGERED_SUBJOB,
        executor_id=subjob_executor_id,
        build_id=build_id,
        subjob_id=subjob_id,
        slave_id=self.id,
    )
def allocate_slave(self, slave):
    """
    Allocate a slave to this build. This tells the slave to execute setup commands for this build.

    The build is marked as started when the first slave is allocated.

    :type slave: Slave
    """
    is_first_slave = not self._slaves_allocated
    if is_first_slave:
        # If this is the first slave to be allocated, update the build state.
        self._build.mark_started()

    self._slaves_allocated.append(slave)
    slave.setup(self._build, executor_start_index=self._num_executors_allocated)

    executors_gained = min(slave.num_executors, self._max_executors_per_slave)
    self._num_executors_allocated += executors_gained

    event_fields = {'build_id': self._build.build_id(), 'slave_id': slave.id}
    analytics.record_event(analytics.BUILD_SETUP_START, **event_fields)
def post(self, build_id, subjob_id):
    """
    Receive a subjob result file reported by a slave and hand it to the cluster master.

    :param build_id: the id of the build; converted with int() before use
    :param subjob_id: the id of the subjob; converted with int() before use
    :raises RuntimeError: if the request carries no 'file' payload
    """
    reporting_slave_url = self.decoded_body.get('slave')
    reporting_slave = self._cluster_master.get_slave(slave_url=reporting_slave_url)

    uploaded_files = self.request.files.get('file')
    if not uploaded_files:
        raise RuntimeError('Result file not provided')

    metric_data = self.decoded_body.get('metric_data', {})
    slave_executor_id = metric_data.get('executor_id')
    numeric_build_id = int(build_id)
    numeric_subjob_id = int(subjob_id)

    analytics.record_event(
        analytics.MASTER_RECEIVED_RESULT,
        executor_id=slave_executor_id,
        build_id=numeric_build_id,
        subjob_id=numeric_subjob_id,
        slave_id=reporting_slave.id,
    )

    # Only the first entry of the 'file' payload is passed on.
    self._cluster_master.handle_result_reported_from_slave(
        reporting_slave_url, numeric_build_id, numeric_subjob_id, uploaded_files[0])
    self._write_status()
def begin_subjob_executions_on_slave(self, slave):
    """
    Begin subjob executions on a slave. This should be called once after the specified slave has
    already run build_setup commands for this build.

    Executors are claimed one at a time until the slave's executors, the build-wide executor limit
    or the per-slave executor limit is exhausted.

    :type slave: Slave
    """
    analytics.record_event(analytics.BUILD_SETUP_FINISH, build_id=self._build.build_id(), slave_id=slave.id)

    for executor_index in range(slave.num_executors):
        # Both limits are re-checked on every pass, before another executor is claimed.
        if self._num_executors_in_use >= self._max_executors:
            break
        if executor_index >= self._max_executors_per_slave:
            break

        slave.claim_executor()
        self._num_executors_in_use += 1
        self.execute_next_subjob_or_free_executor(slave)
def async_run(self, port, master_url, num_executors, log_level, eventlog_file):
    """
    Run a ClusterRunner slave service.

    Arguments left as None (or otherwise falsy) fall back to values taken from Configuration.

    :param port: the port on which to run the slave service
    :type port: int | None
    :param master_url: the url of the master to which this slave should attach
    :type master_url: string | None
    :param num_executors: the number of executors the slave service should use
    :type num_executors: int | None
    :param log_level: the log level at which to do application logging (or None for default log level)
    :type log_level: str | None
    :param eventlog_file: an optional alternate file in which to write event logs
    :type eventlog_file: str | None
    """
    num_executors = num_executors if num_executors else Configuration['num_executors']
    master_url = master_url if master_url else '{}:{}'.format(
        Configuration['master_hostname'], Configuration['master_port'])
    port = port if port else Configuration['port']
    log_level = log_level if log_level else Configuration['log_level']
    eventlog_file = eventlog_file if eventlog_file else Configuration['eventlog_file']

    log.configure_logging(log_level=log_level, log_file=Configuration['log_file'].format(port))
    analytics.initialize(eventlog_file)
    analytics.record_event(analytics.SERVICE_STARTED, service='slave')

    slave_settings = {
        'port': port,
        'num_executors': num_executors,
        'host': Configuration['hostname'],
    }
    cluster_slave = ClusterSlave(**slave_settings)
    application = ClusterSlaveApplication(cluster_slave)

    ioloop = self._start_application(application, port)
    self._write_pid_file(Configuration['slave_pid_file'])

    # Both callbacks are registered before the loop starts:
    # connect to master once tornado ioloop is running, then
    # start sending heartbeat after connecting to master.
    on_loop_start = (
        functools.partial(cluster_slave.connect_to_master, master_url=master_url),
        functools.partial(cluster_slave.start_heartbeat_thread),
    )
    for callback in on_loop_start:
        ioloop.add_callback(callback)

    ioloop.start()  # this call blocks until the server is stopped
    ioloop.close(all_fds=True)  # all_fds=True is necessary here to make sure connections don't hang
    self._logger.notice('Slave server was stopped.')
def allocate_slave(self, slave: Slave) -> bool:
    """
    Allocate a slave to this build. This tells the slave to execute setup commands for this build.

    :param slave: The slave to allocate
    :return: Whether slave allocation was successful; this can fail if the slave is unresponsive
    """
    if not self._build_started:
        self._build_started = True
        self._build.mark_started()

    # Increment executors before triggering setup. This helps make sure the build won't take down
    # every slave in the cluster if setup calls fail because of a problem with the build.
    first_executor_index = self._num_executors_allocated
    executor_quota = min(slave.num_executors, self._max_executors_per_slave)
    self._num_executors_allocated = first_executor_index + executor_quota

    event_fields = {'build_id': self._build.build_id(), 'slave_id': slave.id}
    analytics.record_event(analytics.BUILD_SETUP_START, **event_fields)

    self._slaves_allocated.append(slave)
    setup_succeeded = slave.setup(self._build, executor_start_index=first_executor_index)
    return setup_succeeded
def start_subjob(self, subjob: Subjob):
    """
    Send a subjob of a build to this slave. The slave must have already run setup for the
    corresponding build.

    :param subjob: The subjob to send to this slave
    :raises DeadSlaveError: if this slave is not alive
    :raises SlaveMarkedForShutdownError: if this slave is in shutdown mode
    :raises SlaveCommunicationError: if the request to the slave service fails
    """
    slave_is_dead = not self.is_alive()
    if slave_is_dead:
        raise DeadSlaveError('Tried to start a subjob on a dead slave.')

    if self._is_in_shutdown_mode:
        raise SlaveMarkedForShutdownError('Tried to start a subjob on a slave in shutdown mode.')

    url_parts = ('build', subjob.build_id(), 'subjob', subjob.subjob_id())
    execution_url = self._slave_api.url(*url_parts)
    post_data = {'atomic_commands': subjob.atomic_commands()}
    communication_errors = (requests.ConnectionError, requests.Timeout, RequestFailedError)

    try:
        response = self._network.post_with_digest(execution_url, post_data, Secret.get(), error_on_failure=True)
    except communication_errors as ex:
        raise SlaveCommunicationError('Call to slave service failed: {}.'.format(repr(ex))) from ex

    event_fields = {
        'executor_id': response.json().get('executor_id'),
        'build_id': subjob.build_id(),
        'subjob_id': subjob.subjob_id(),
        'slave_id': self.id,
    }
    analytics.record_event(analytics.MASTER_TRIGGERED_SUBJOB, **event_fields)
def _prepare_build_async(self, build, project_lock):
    """
    Acquire the project lock, prepare the build and queue it for slaves when it has no error.

    An exception raised during preparation is logged and recorded on the build via mark_failed
    instead of being re-raised.

    :type build: Build
    :type project_lock: Lock
    """
    logger = self._logger
    logger.info('Build {} is waiting for the project lock', build.build_id())

    with project_lock:
        logger.info('Build {} has acquired project lock', build.build_id())

        start_fields = dict(
            build_id=build.build_id(),
            log_msg='Build preparation loop is handling request for build {build_id}.',
        )
        analytics.record_event(analytics.BUILD_PREPARE_START, **start_fields)

        try:
            self._prepare_build(build)
            if not build.has_error:
                finish_fields = dict(
                    build_id=build.build_id(),
                    log_msg='Build {build_id} successfully prepared and waiting for slaves.',
                )
                analytics.record_event(analytics.BUILD_PREPARE_FINISH, **finish_fields)
                self._builds_waiting_for_slaves.put(build)

        except Exception as ex:  # pylint: disable=broad-except
            build.mark_failed(str(ex))
            logger.exception('Could not handle build request for build {}.'.format(build.build_id()))
def _prepare_build_async(self, build, project_lock):
    """
    Prepare a build while holding its project lock, then finish it or queue it for slaves.

    A build prepared without error is finished immediately when it has no subjobs; otherwise it is
    put on the queue of builds waiting for slaves. An exception during preparation marks the build
    as failed, is logged, and is recorded as an unsuccessful prepare-finish event.

    :type build: Build
    :type project_lock: Lock
    """
    build_id = build.build_id()
    self._logger.info('Build {} is waiting for the project lock', build_id)

    with project_lock:
        self._logger.info('Build {} has acquired project lock', build_id)
        analytics.record_event(
            analytics.BUILD_PREPARE_START,
            build_id=build_id,
            log_msg='Build preparation loop is handling request for build {build_id}.',
        )

        try:
            build.prepare(self._subjob_calculator)

            if not build.has_error:
                analytics.record_event(
                    analytics.BUILD_PREPARE_FINISH,
                    build_id=build_id,
                    is_success=True,
                    log_msg='Build {build_id} successfully prepared.',
                )

                has_work = len(build.all_subjobs()) != 0
                if has_work:
                    # If there is work to be done, this build must queue to be allocated slaves.
                    self._logger.info('Build {} is waiting for slaves.', build_id)
                    self._builds_waiting_for_slaves.put(build)
                else:
                    # If the atomizer found no work to do, perform build cleanup and skip the slave allocation.
                    self._logger.info('Build {} has no work to perform and is exiting.', build_id)
                    build.finish()

        except Exception as ex:  # pylint: disable=broad-except
            build.mark_failed(str(ex))  # WIP(joey): Build should do this internally.
            self._logger.exception('Could not handle build request for build {}.'.format(build_id))
            analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build_id, is_success=False)
def async_run(self, port, log_level, eventlog_file):
    """
    Run a ClusterRunner master service.

    Arguments left as None (or otherwise falsy) fall back to the matching Configuration value.

    :param port: the port on which to run the master service
    :type port: int | None
    :param log_level: the log level at which to do application logging (or None for default log level)
    :type log_level: str | None
    :param eventlog_file: an optional alternate file in which to write event logs
    :type eventlog_file: str | None
    """
    port = port if port else Configuration['port']
    log_level = log_level if log_level else Configuration['log_level']
    eventlog_file = eventlog_file if eventlog_file else Configuration['eventlog_file']

    master_log_file = Configuration['log_file']
    log.configure_logging(log_level=log_level, log_file=master_log_file)
    analytics.initialize(eventlog_file)
    analytics.record_event(analytics.SERVICE_STARTED, service='master')

    cluster_master = ClusterMaster()
    ioloop = self._start_application(ClusterMasterApplication(cluster_master), port)
    self._write_pid_file(Configuration['master_pid_file'])

    # log startup message once ioloop is running
    hostname = Configuration['hostname']
    startup_message = 'Master service is running on {}:{}.'.format(hostname, port)
    announce_startup = functools.partial(self._logger.info, startup_message)
    ioloop.add_callback(announce_startup)

    ioloop.start()  # this call blocks until the server is stopped
    ioloop.close(all_fds=True)  # all_fds=True is necessary here to make sure connections don't hang
    self._logger.notice('Master server was stopped.')
def _execute_subjob(self, build_id, subjob_id, executor, atomic_commands):
    """
    Execute a subjob asynchronously and report its results to the master.

    Runs the given atomic commands on the executor, then posts the resulting archive to the
    master results endpoint to signal that the work is done. The outcome of the post is logged.

    :type build_id: int
    :type subjob_id: int
    :type executor: SubjobExecutor
    :type atomic_commands: list[str]
    """
    subjob_event_data = {"build_id": build_id, "subjob_id": subjob_id, "executor_id": executor.id}

    analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
    results_file = executor.execute_subjob(build_id, subjob_id, atomic_commands, self._base_executor_index)
    analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

    results_url = self._master_api.url("build", build_id, "subjob", subjob_id, "result")
    data = {"slave": "{}:{}".format(self.host, self.port), "metric_data": {"executor_id": executor.id}}

    # Open the archive in a context manager so the file handle is closed once the upload has
    # finished (or failed) instead of being leaked.
    with open(results_file, "rb") as results_payload:
        files = {"file": ("payload", results_payload, "application/x-compressed")}

        self._idle_executors.put(executor)  # work is done; mark executor as idle
        resp = self._network.post(results_url, data=data, files=files)

    # Only report success when the master accepted the results.
    if resp.ok:
        self._logger.info("Build {}, Subjob {} completed and sent results to master.", build_id, subjob_id)
    else:
        self._logger.error(
            ("Build {}, Subjob {} encountered an error when sending results to master."
             "\n\tStatus Code {}\n\t{}").format(build_id, subjob_id, resp.status_code, resp.text))
def _prepare_build_async(self, build, project_lock): """ Prepare a build while holding the project lock. When the build is not stopped after preparation, it is either finished right away (no subjobs) or handed to the scheduler pool to wait for slaves. If preparation raises, the build is marked as failed unless it was canceled. NOTE(review): the indentation of this block has been lost, so it is unclear whether the exception log and the unsuccessful BUILD_PREPARE_FINISH event are also skipped for canceled builds -- confirm against the original file. :type build: app.master.build.Build :type project_lock: Lock """ self._logger.info('Build {} is waiting for the project lock', build.build_id()) with project_lock: self._logger.info('Build {} has acquired project lock', build.build_id()) analytics.record_event( analytics.BUILD_PREPARE_START, build_id=build.build_id(), log_msg= 'Build preparation loop is handling request for build {build_id}.' ) try: build.prepare() if not build.is_stopped: analytics.record_event( analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(), is_success=True, log_msg='Build {build_id} successfully prepared.') # If the atomizer found no work to do, perform build cleanup and skip the slave allocation. if len(build.get_subjobs()) == 0: self._logger.info( 'Build {} has no work to perform and is exiting.', build.build_id()) build.finish() # If there is work to be done, this build must queue to be allocated slaves. else: self._logger.info('Build {} is waiting for slaves.', build.build_id()) self._scheduler_pool.add_build_waiting_for_slaves( build) except Exception as ex: # pylint: disable=broad-except if not build.is_canceled: build.mark_failed( str(ex)) # WIP(joey): Build should do this internally. self._logger.exception( 'Could not handle build request for build {}.'.format( build.build_id())) analytics.record_event(analytics.BUILD_PREPARE_FINISH, build_id=build.build_id(), is_success=False)
def execute_subjob(self, build_id, subjob_id, atomic_commands, base_executor_index):
    """
    Execute a subjob: run every atomic command, archive the atom artifact directories into a
    single file and return that file's path.

    The executor's current build and subjob ids are set for the duration of the call and are
    always cleared again, including when an atom or the archiving step raises.

    :type build_id: int
    :type subjob_id: int
    :type atomic_commands: list[str]
    :type base_executor_index: int
    :return: the path of the compressed results archive
    :rtype: str
    """
    self._logger.info('Executing subjob (Build {}, Subjob {})...', build_id, subjob_id)

    # Set the current task
    self._current_build_id = build_id
    self._current_subjob_id = subjob_id

    try:
        # Maintain a list of atom artifact directories for compression and sending back to master
        atom_artifact_dirs = []

        # execute every atom, recording a start and a finish event for each
        for atom_id, atomic_command in enumerate(atomic_commands):
            atom_artifact_dir = BuildArtifact.atom_artifact_directory(
                build_id,
                subjob_id,
                atom_id,
                result_root=Configuration['artifact_directory']
            )

            # remove and recreate the atom artifact dir
            shutil.rmtree(atom_artifact_dir, ignore_errors=True)
            fs_util.create_dir(atom_artifact_dir)

            atom_environment_vars = {
                'ARTIFACT_DIR': atom_artifact_dir,
                'ATOM_ID': atom_id,
                'EXECUTOR_INDEX': self.id,  # Deprecated, use MACHINE_EXECUTOR_INDEX
                'MACHINE_EXECUTOR_INDEX': self.id,
                'BUILD_EXECUTOR_INDEX': base_executor_index + self.id,
            }

            atom_artifact_dirs.append(atom_artifact_dir)

            job_name = self._project_type.job_name
            atom_event_data = {'build_id': build_id, 'atom_id': atom_id, 'job_name': job_name,
                               'subjob_id': subjob_id}
            analytics.record_event(analytics.ATOM_START, **atom_event_data)

            exit_code = self._execute_atom_command(atomic_command, atom_environment_vars, atom_artifact_dir)

            # The finish event carries the same data as the start event plus the exit code.
            atom_event_data['exit_code'] = exit_code
            analytics.record_event(analytics.ATOM_FINISH, **atom_event_data)

        # Generate mapping of atom directories (for archiving) to paths in the archive file
        targets_to_archive_paths = {atom_dir: os.path.basename(os.path.normpath(atom_dir))
                                    for atom_dir in atom_artifact_dirs}

        # zip file names must be unique for a build, so we append the subjob_id to the compressed file
        subjob_artifact_dir = BuildArtifact.build_artifact_directory(
            build_id, result_root=Configuration['artifact_directory'])
        tarfile_path = os.path.join(subjob_artifact_dir, 'results_{}.tar.gz'.format(subjob_id))
        fs_util.compress_directories(targets_to_archive_paths, tarfile_path)

        return tarfile_path

    finally:
        # Reset the current task, even on failure, so stale ids do not outlive this call
        self._current_build_id = None
        self._current_subjob_id = None
def execute_subjob(self, build_id, subjob_id, subjob_artifact_dir, atomic_commands, base_executor_index):
    """
    Execute a subjob: run every atomic command, archive the atom artifact directories into a
    single file inside subjob_artifact_dir and return that file's path.

    The executor's current build and subjob ids are set for the duration of the call and are
    always cleared again, including when an atom or the archiving step raises.

    :type build_id: int
    :type subjob_id: int
    :type subjob_artifact_dir: str
    :type atomic_commands: list[str]
    :type base_executor_index: int
    :return: the path of the compressed results archive
    :rtype: str
    """
    self._logger.info('Executing subjob (Build {}, Subjob {})...', build_id, subjob_id)

    # Set the current task
    self._current_build_id = build_id
    self._current_subjob_id = subjob_id

    try:
        # Maintain a list of atom artifact directories for compression and sending back to master
        atom_artifact_dirs = []

        # execute every atom, recording a start and a finish event for each
        for atom_id, atomic_command in enumerate(atomic_commands):
            atom_artifact_dir = os.path.join(
                subjob_artifact_dir,
                Subjob.ATOM_DIR_FORMAT.format(subjob_id, atom_id))

            # remove and recreate the atom artifact dir
            shutil.rmtree(atom_artifact_dir, ignore_errors=True)
            fs_util.create_dir(atom_artifact_dir)

            atom_environment_vars = {
                'ARTIFACT_DIR': atom_artifact_dir,
                'ATOM_ID': atom_id,
                'EXECUTOR_INDEX': self.id,  # Deprecated, use MACHINE_EXECUTOR_INDEX
                'MACHINE_EXECUTOR_INDEX': self.id,
                'BUILD_EXECUTOR_INDEX': base_executor_index + self.id,
            }

            atom_artifact_dirs.append(atom_artifact_dir)

            job_name = self._project_type.job_name
            atom_event_data = {
                'build_id': build_id,
                'atom_id': atom_id,
                'job_name': job_name,
                'subjob_id': subjob_id
            }
            analytics.record_event(analytics.ATOM_START, **atom_event_data)

            exit_code = self._execute_atom_command(atomic_command, atom_environment_vars, atom_artifact_dir)

            # The finish event carries the same data as the start event plus the exit code.
            atom_event_data['exit_code'] = exit_code
            analytics.record_event(analytics.ATOM_FINISH, **atom_event_data)

        # Generate mapping of atom directories (for archiving) to paths in the archive file
        targets_to_archive_paths = {
            atom_dir: os.path.basename(os.path.normpath(atom_dir))
            for atom_dir in atom_artifact_dirs
        }

        # zip file names must be unique for a build, so we append the subjob_id to the compressed file
        tarfile_path = os.path.join(subjob_artifact_dir, 'results_{}.tar.gz'.format(subjob_id))
        fs_util.compress_directories(targets_to_archive_paths, tarfile_path)

        return tarfile_path

    finally:
        # Reset the current task, even on failure, so stale ids do not outlive this call
        self._current_build_id = None
        self._current_subjob_id = None