def _create_build_artifact(self):
    """
    Assemble the artifact for this build from its results directory.

    A BuildArtifact is created for the results directory and stored on self._build_artifact. It is then asked to
    generate its failures file and to write the timing data gathered from the subjobs to self._timing_file_path.
    Finally the results directory is handed to app.util.fs.compress_directory together with the archive name
    'results.tar.gz', and the value that call returns is stored on self._artifacts_archive_file.
    """
    artifact = BuildArtifact(self._build_results_dir())
    self._build_artifact = artifact
    artifact.generate_failures_file()

    subjob_timings = self._read_subjob_timings_from_results()
    artifact.write_timing_data(self._timing_file_path, subjob_timings)

    self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')
def test_update_timing_file(self, existing_timing_data, new_timing_data, expected_final_timing_data):
    # Seed the timing file with the pre-existing data, serialized as JSON.
    timing_path = self._timing_file_path
    fs.write_file(json.dumps(existing_timing_data), timing_path)

    # The directory handed to BuildArtifact is irrelevant here; only the timing file is touched.
    artifact = BuildArtifact('/some/dir/doesnt/matter')
    artifact._update_timing_file(timing_path, new_timing_data)

    # Read the file back and compare it with the expected contents.
    with open(timing_path, 'r') as timing_file:
        timing_data_on_disk = json.load(timing_file)
    self.assertDictEqual(timing_data_on_disk, expected_final_timing_data)
def test_get_failed_subjob_and_atom_ids_returns_correct_ids(self):
    # Layout of the build artifact directory used by this test:
    #   artifact_1_0/clusterrunner_exit_code -> 0
    #   artifact_1_1/clusterrunner_exit_code -> 1
    #   artifact_2_0/clusterrunner_exit_code -> 0
    #   artifact_2_1/clusterrunner_exit_code -> 1
    # Only the (subjob, atom) pairs with a non-zero exit code should come back: [(1,1), (2,1)]
    expected_failures = [(1, 1), (2, 1)]

    artifact = BuildArtifact(self._artifact_directory_path)
    actual_failures = artifact.get_failed_subjob_and_atom_ids()

    # Ordering of the returned pairs is not part of the contract, so compare as unordered collections.
    self.assertCountEqual(actual_failures, expected_failures)
def test_get_failed_subjob_and_atom_ids_returns_correct_ids(self):
    # The artifact directory fixture contains one exit code file per atom:
    #   artifact_1_0/clusterrunner_exit_code -> 0
    #   artifact_1_1/clusterrunner_exit_code -> 1
    #   artifact_2_0/clusterrunner_exit_code -> 0
    #   artifact_2_1/clusterrunner_exit_code -> 1
    # Expected result: [(1,1), (2,1)]
    failed_ids = BuildArtifact(self._artifact_directory_path).get_failed_subjob_and_atom_ids()

    # assertCountEqual ignores ordering, which this test does not care about.
    self.assertCountEqual(failed_ids, [(1, 1), (2, 1)])
def read_timings(self):
    """
    Parse the timing file found in each atom's artifact directory and map the result to the atom's command.

    As a side effect, every atom whose timing file exists gets its actual_time attribute set to the parsed value.
    A warning is logged for each atom that has no timing file, and one more if no timing data was found at all.

    :return: the parsed time for every atom that had a timing file, keyed by the atom's command string
    :rtype: dict [str, float]
    """
    timings_by_command = {}
    for atom_id, atom in enumerate(self._atoms):
        atom_dir = BuildArtifact.atom_artifact_directory(
            self.build_id(),
            self.subjob_id(),
            atom_id,
            result_root=Configuration['results_directory'],
        )
        timing_path = os.path.join(atom_dir, BuildArtifact.TIMING_FILE)

        if not os.path.exists(timing_path):
            self._logger.warning('No timing data for subjob {} atom {}.', self._subjob_id, atom_id)
            continue

        # Only the first line of the timing file is read; it is parsed as a float.
        with open(timing_path, 'r') as timing_file:
            atom.actual_time = float(timing_file.readline())
        timings_by_command[atom.command_string] = atom.actual_time

    if not timings_by_command:
        self._logger.warning('No timing data for subjob {}.', self._subjob_id)
    return timings_by_command
def test_artifact_directory_returns_proper_artifact_path(self, expected_path, build_id, subjob_id=None,
                                                         atom_id=None, result_root=None):
    # Parameterized check: BuildArtifact._artifact_directory must produce expected_path for the given
    # combination of build, subjob and atom ids and result root.
    actual_path = BuildArtifact._artifact_directory(build_id, subjob_id, atom_id, result_root=result_root)

    # assertEquals is a deprecated alias that no longer exists in Python 3.12+; assertEqual is the supported name.
    self.assertEqual(
        expected_path,
        actual_path,
        'The generated artifact directory is incorrect.'
    )
def _parse_payload_for_atom_exit_code(self, subjob_id):
    """
    Read the exit code file from each atom's artifact directory and record the value on the atom.

    The subjob is looked up via self.subjob(subjob_id). For every atom of that subjob, the exit code file inside
    the atom's artifact directory is read and its contents, parsed as an int, are stored on the atom's exit_code
    attribute.

    :param subjob_id: id passed to self.subjob() to look up the subjob whose atoms are updated
    """
    subjob = self.subjob(subjob_id)
    for atom_id, atom in enumerate(subjob.atoms):
        atom_dir = BuildArtifact.atom_artifact_directory(
            self.build_id(),
            subjob.subjob_id(),
            atom_id,
            result_root=Configuration['results_directory'],
        )
        exit_code_path = os.path.join(atom_dir, BuildArtifact.EXIT_CODE_FILE)
        with open(exit_code_path, 'r') as exit_code_file:
            atom.exit_code = int(exit_code_file.read())
def get_console_output(self, build_id, subjob_id, atom_id, result_root, max_lines=50, offset_line=None):
    """
    Return a segment of an atom's console output, or raise an ItemNotFoundError if the output file is missing.

    On success, the response contains keys: offset_line, num_lines, total_num_lines, and content. e.g.:
    {
        'offset_line': 0,
        'num_lines': 50,
        'total_num_lines': 167,
        'content': 'Lorem ipsum dolor sit amet,\\nconsectetur adipiscing elit,\\n...',
    }

    A BadRequestError is raised when offset_line is negative, when max_lines is not positive, or when reading
    the requested segment raises a ValueError.

    :type build_id: int
    :type subjob_id: int
    :type atom_id: int
    :param result_root: the sys path to either the results or artifacts directory where results are stored.
    :type result_root: str
    :param max_lines: The maximum total number of lines to return. If this max_lines + offset_line lines do not
        exist in the output file, just return what there is.
    :type max_lines: int
    :param offset_line: The line number (0-indexed) to start reading content for. If none is specified, we will
        return the console output starting from the end of the file.
    :type offset_line: int | None
    """
    # Validate the paging arguments before touching the filesystem.
    offset_is_negative = offset_line is not None and offset_line < 0
    if offset_is_negative:
        raise BadRequestError('\'offset_line\' must be greater than or equal to zero.')
    if max_lines <= 0:
        raise BadRequestError('\'max_lines\' must be greater than zero.')

    output_file = os.path.join(
        BuildArtifact.atom_artifact_directory(build_id, subjob_id, atom_id, result_root=result_root),
        BuildArtifact.OUTPUT_FILE,
    )
    if not os.path.isfile(output_file):
        not_found_message = 'Output file doesn\'t exist for build_id: {} subjob_id: {} atom_id: {}'
        raise ItemNotFoundError(not_found_message.format(build_id, subjob_id, atom_id))

    try:
        segment = ConsoleOutput(output_file).segment(max_lines, offset_line)
    except ValueError as e:
        raise BadRequestError(e)

    response_keys = ('offset_line', 'num_lines', 'total_num_lines', 'content')
    return {key: getattr(segment, key) for key in response_keys}
def test_artifact_directory_returns_proper_artifact_path(
        self, expected_path, build_id, subjob_id=None, atom_id=None, result_root=None):
    # Parameterized check that BuildArtifact._artifact_directory builds expected_path from the given ids and
    # result root.
    # assertEqual replaces assertEquals, a deprecated alias that was removed from unittest in Python 3.12.
    self.assertEqual(
        expected_path,
        BuildArtifact._artifact_directory(build_id, subjob_id, atom_id, result_root=result_root),
        'The generated artifact directory is incorrect.')
def _parse_payload_for_atom_exit_code(self, subjob_id):
    """
    Populate the exit_code attribute of every atom in a subjob from the exit code files on disk.

    Each atom's artifact directory (under the configured results directory) is expected to contain the file named
    by BuildArtifact.EXIT_CODE_FILE; its contents are parsed as an int and assigned to the atom.

    :param subjob_id: id passed to self.subjob() to look up the subjob being processed
    """
    subjob = self.subjob(subjob_id)
    for atom_index, atom in enumerate(subjob.atoms):
        artifact_dir = BuildArtifact.atom_artifact_directory(
            self.build_id(), subjob.subjob_id(), atom_index, result_root=Configuration['results_directory'])
        code_file_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
        with open(code_file_path, 'r') as code_file:
            raw_exit_code = code_file.read()
        atom.exit_code = int(raw_exit_code)
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        Create an unprepared build and assign it the next id from the class-level counter.

        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None
        """ :type : BuildArtifact"""
        self._error_message = None
        self.is_prepared = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._is_canceled = False
        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish
        self._slaves_allocated = []
        self._num_executors_allocated = 0
        self._num_executors_in_use = 0
        # No executor limits until prepare() copies them from the job config.
        self._max_executors = float('inf')
        self._max_executors_per_slave = float('inf')
        self._all_subjobs_by_id = {}
        # Both queues are created in prepare(); None means the build has not been prepared yet.
        self._unstarted_subjobs = None
        self._finished_subjobs = None
        self._postbuild_tasks_are_finished = False
        self._teardowns_finished = False
        self._timing_file_path = None

    def api_representation(self):
        """
        Return a dict describing this build: id, status, artifacts, details, error message, atom and subjob
        counts, failed atoms and result.

        :rtype: dict
        """
        return {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': self._failed_atoms(),  # todo: print the file contents instead of paths
            'result': self._result(),
        }

    def prepare(self, subjobs, project_type, job_config):
        """
        Register the subjobs and job settings for this build and mark it as prepared.

        :type subjobs: list[Subjob]
        :type project_type: project_type.project_type.ProjectType
        :type job_config: JobConfig
        :raises RuntimeError: if the preparation coin has already been spent, i.e. prepare() was called before
        """
        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._project_type = project_type
        # NOTE(review): assuming Queue is queue.Queue, maxsize=0 (an empty subjobs list) means "unbounded", so
        # full() would never be True for such a build -- confirm empty subjob lists are rejected upstream.
        self._unstarted_subjobs = Queue(maxsize=len(subjobs))
        self._finished_subjobs = Queue(maxsize=len(subjobs))

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._max_executors = job_config.max_executors
        self._max_executors_per_slave = job_config.max_executors_per_slave
        self._timing_file_path = project_type.timing_file_path(job_config.name)
        self.is_prepared = True

    def finish(self):
        """
        Called when all slaves are done with this build (and any teardown is complete)

        :raises RuntimeError: if the subjobs are not finished yet
        """
        if self._subjobs_are_finished:
            self._teardowns_finished = True
        else:
            raise RuntimeError('Tried to finish build {} but not all subjobs are complete'.format(self._build_id))

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    def needs_more_slaves(self):
        """
        Determine whether or not this build should have more slaves allocated to it.

        True while fewer executors than the maximum are allocated and unstarted subjobs remain in the queue.

        :rtype: bool
        """
        return self._num_executors_allocated < self._max_executors and not self._unstarted_subjobs.empty()

    def allocate_slave(self, slave):
        """
        Allocate a slave to this build. This tells the slave to execute setup commands for this build.

        :type slave: Slave
        """
        self._slaves_allocated.append(slave)
        # A slave contributes at most _max_executors_per_slave executors to the allocation count.
        self._num_executors_allocated += min(slave.num_executors, self._max_executors_per_slave)
        slave.setup(self.build_id(), project_type_params=self.build_request.build_parameters())

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build

        :rtype: list[Subjob]
        """
        return [subjob for subjob in self._all_subjobs_by_id.values()]

    def subjob(self, subjob_id):
        """
        Returns a single subjob

        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob is registered under subjob_id
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def begin_subjob_executions_on_slave(self, slave):
        """
        Begin subjob executions on a slave. This should be called once after the specified slave has already run
        build_setup commands for this build.

        :type slave: Slave
        """
        for slave_executor_count in range(slave.num_executors):
            # Stop claiming executors once the build-wide or the per-slave executor limit is reached.
            if (self._num_executors_in_use >= self._max_executors
                    or slave_executor_count >= self._max_executors_per_slave):
                break
            slave.claim_executor()
            self._num_executors_in_use += 1
            self.execute_next_subjob_on_slave(slave)

    def execute_next_subjob_on_slave(self, slave):
        """
        Grabs an unstarted subjob off the queue and sends it to the specified slave to be executed. If the
        unstarted subjob queue is empty, we mark the slave as idle.

        :type slave: Slave
        """
        try:
            subjob = self._unstarted_subjobs.get(block=False)
            self._logger.debug('Sending subjob {} (build {}) to slave {}.',
                               subjob.subjob_id(), subjob.build_id(), slave.url)
            slave.start_subjob(subjob)

        except Empty:
            # Nothing left to run: release this executor, and tear the slave down once its last one is free.
            num_executors_in_use = slave.free_executor()
            if num_executors_in_use == 0:
                try:
                    self._slaves_allocated.remove(slave)
                except ValueError:
                    pass  # We have already deallocated this slave, no need to teardown
                else:
                    slave.teardown()

    def handle_subjob_payload(self, subjob_id, payload=None):
        """
        Write a subjob's result payload into the build results directory and extract it.

        A missing or empty payload is logged and ignored. Any failure while writing or extracting is logged and
        then re-raised.

        :param subjob_id: id of the subjob the payload belongs to (only used in log messages here)
        :param payload: mapping that provides the 'filename' and 'body' entries read below
        """
        if not payload:
            self._logger.warning('No payload for subjob {}.', subjob_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._logger.debug('Payload for subjob {} written.', subjob_id)
        except:
            # Bare except is only used to log; the exception is always re-raised.
            self._logger.warning('Writing payload for subjob {} FAILED.', subjob_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs

        :rtype: dict [str, float]
        """
        timings = {}
        for _, subjob in self._all_subjobs_by_id.items():
            timings.update(subjob.read_timings())

        return timings

    def mark_subjob_complete(self, subjob_id):
        """
        Record a subjob as finished and, once the subjobs are finished, start the postbuild tasks on a new thread.

        :type subjob_id: int
        """
        subjob = self._all_subjobs_by_id[int(subjob_id)]
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to
        the end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        self._error_message = failure_reason

    def cancel(self):
        """
        Cancel a running build
        """
        # Early exit if build is not running
        if self._status() in [BuildStatus.FINISHED, BuildStatus.ERROR, BuildStatus.CANCELED]:
            return

        self._is_canceled = True

        # Deplete the unstarted subjob queue.
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        # The only accepted update is {'status': 'canceled'}.
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed.keys():
                message = 'Key ({}) is not in list of allowed keys ({})'.\
                    format(key, ",".join(keys_and_values_allowed.keys()))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.\
                    format(value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if a 'status': 'canceled' update was applied, False otherwise
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status':
                if value == 'canceled':
                    self.cancel()
                    success = True
        return success

    @property
    def artifacts_archive_file(self):
        """The value stored by _create_build_artifact(), or None before that has run."""
        return self._artifacts_archive_file

    @property
    def _num_subjobs_total(self):
        """Number of subjobs registered with this build."""
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """Current size of the finished subjob queue, or 0 if that queue is not set (falsy)."""
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        Total number of atomic commands across all subjobs, or None unless the status is BUILDING or FINISHED.

        :rtype: int | None
        """
        if self._status() not in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return None
        return sum([len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values()])

    @property
    def _subjobs_are_finished(self):
        """True if the build was canceled, or if it is prepared and the finished subjob queue is full."""
        return self._is_canceled or (self.is_prepared and self._finished_subjobs.full())

    @property
    def is_finished(self):
        """True if the build was canceled, or if both the postbuild tasks and the teardowns are finished."""
        # TODO: Clean up this logic or move everything into a state machine
        build_fully_completed = self._postbuild_tasks_are_finished and self._teardowns_finished
        return self._is_canceled or build_fully_completed

    @property
    def is_unstarted(self):
        """True if the build is prepared, has no executors allocated and its unstarted subjob queue is full."""
        return self.is_prepared and self._num_executors_allocated == 0 and self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """True once an error message has been set (see mark_failed)."""
        return self._error_message is not None

    @property
    def _detail_message(self):
        """
        Progress message with the finished and total subjob counts and a percentage, or None without subjobs.

        :rtype: str | None
        """
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total)
        return None

    def _status(self):
        """
        Derive the build status. Checks are ordered: error, canceled, queued, finished, otherwise building.

        :rtype: BuildStatus
        """
        if self.has_error:
            return BuildStatus.ERROR
        elif self._is_canceled:
            return BuildStatus.CANCELED
        elif not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        elif self.is_finished:
            return BuildStatus.FINISHED
        else:
            return BuildStatus.BUILDING

    def _failed_atoms(self):
        """
        The commands which failed

        Returns an empty list for a canceled build and None while the build is not finished.

        :rtype: list [str] | None
        """
        if self._is_canceled:
            return []

        if self.is_finished:
            # dict.values() returns a view object in python 3, so wrapping values() in a list
            return list(self._build_artifact.get_failed_commands().values())
        return None

    def _result(self):
        """
        The build result: FAILURE for a canceled build, NO_FAILURES or FAILURE once finished, else None.

        :rtype: str | None
        """
        if self._is_canceled:
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_commands()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        # @TODO There is a race condition here where the build is marked finished before the results archive
        # is prepared.  If the user requests the build status before archival finishes, the 'artifacts'
        # value in the post body will be None.  self.is_finished should be conditional on whether archival
        # is finished.
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        self._postbuild_tasks_are_finished = True

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the results directory, generate its failures file, write the subjob timing
        data, and compress the results directory into 'results.tar.gz'.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(
            self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(
            self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """Path of this build's results: the configured results directory joined with the build id."""
        return os.path.join(
            Configuration['results_directory'],
            str(self.build_id()),
        )
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        Create an unprepared build and assign it the next id from the class-level counter.

        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None  # a BuildArtifact once _create_build_artifact() has run
        self._error_message = None
        self.is_prepared = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._project_type = None
        self._num_slaves_in_use = 0
        # Created exactly once: protects against more than one thread detecting the build's finish.
        self._build_completion_lock = Lock()
        self._num_allocated_executors = 0
        self._max_executors = float('inf')  # no limit until prepare() copies it from the job config
        self._all_subjobs_by_id = {}
        # Both queues are created in prepare(); None means the build has not been prepared yet.
        self._unstarted_subjobs = None
        self._finished_subjobs = None
        self._postbuild_tasks_are_finished = False
        self._teardowns_finished = False
        # Assigned in prepare(); initialized here so the attribute exists on every instance.
        self._timing_file_path = None

    def api_representation(self):
        """
        Return a dict describing this build: id, status, artifacts, details, error message, atom and subjob
        counts, failed atoms and result.

        :rtype: dict
        """
        return {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': self._failed_atoms(),  # todo: print the file contents instead of paths
            'result': self._result(),
        }

    def prepare(self, subjobs, project_type, job_config):
        """
        Register the subjobs and job settings for this build and mark it as prepared.

        :type subjobs: list[Subjob]
        :type project_type: project_type.project_type.ProjectType
        :type job_config: master.job_config.JobConfig
        :raises RuntimeError: if the preparation coin has already been spent, i.e. prepare() was called before
        """
        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._project_type = project_type
        # NOTE(review): assuming Queue is queue.Queue, maxsize=0 (an empty subjobs list) means "unbounded", so
        # full() would never be True for such a build -- confirm empty subjob lists are rejected upstream.
        self._unstarted_subjobs = Queue(maxsize=len(subjobs))
        self._finished_subjobs = Queue(maxsize=len(subjobs))

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._max_executors = job_config.max_executors
        self._timing_file_path = project_type.timing_file_path(job_config.name)
        self.is_prepared = True

    def finish(self):
        """
        Called when all slaves are done with this build (and any teardown is complete)

        :raises RuntimeError: if the subjobs are not finished yet
        """
        if not self._subjobs_are_finished:
            raise RuntimeError('Tried to finish build {} but not all subjobs are complete'.format(self._build_id))
        self._teardowns_finished = True

    def build_id(self):
        """
        :return: the id assigned to this build at construction time
        :rtype: int
        """
        return self._build_id

    def needs_more_slaves(self):
        """
        True while fewer executors than the maximum are allocated and unstarted subjobs remain in the queue.

        :rtype: bool
        """
        return self._num_allocated_executors < self._max_executors and not self._unstarted_subjobs.empty()

    def allocate_slave(self, slave):
        """
        Allocate a slave to this build: run setup on it, then claim executors and start subjobs on them until
        either the slave has no more executors or the build's executor limit is reached.

        :type slave: master.Slave
        """
        self._num_slaves_in_use += 1
        slave.setup(self.build_id(), project_type_params=self.build_request.build_parameters())

        for _ in range(slave.num_executors):
            if self._num_allocated_executors >= self._max_executors:
                break
            slave.claim_executor()
            self._num_allocated_executors += 1
            self.execute_next_subjob_on_slave(slave)

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build

        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob

        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob is registered under subjob_id
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def execute_next_subjob_on_slave(self, slave):
        """
        Grabs an unstarted subjob off the queue and sends it to the specified slave to be executed. If the
        unstarted subjob queue is empty, we mark the slave as idle.

        :type slave: master.Slave
        """
        try:
            subjob = self._unstarted_subjobs.get(block=False)
            self._logger.debug('Sending subjob {} (build {}) to slave {}.',
                               subjob.subjob_id(), subjob.build_id(), slave.url)
            slave.start_subjob(subjob)

        except Empty:
            # Nothing left to run: release this executor, and tear the slave down once its last one is free.
            num_executors_in_use = slave.free_executor()
            if num_executors_in_use == 0:
                slave.teardown()

    def handle_subjob_payload(self, subjob_id, payload=None):
        """
        Write a subjob's result payload into the build results directory and extract it.

        A missing or empty payload is logged and ignored. Any error while writing or extracting is logged and
        then re-raised.

        :param subjob_id: id of the subjob the payload belongs to (only used in log messages here)
        :param payload: mapping that provides the 'filename' and 'body' entries read below
        """
        if not payload:
            self._logger.warning('No payload for subjob {}.', subjob_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._logger.debug('Payload for subjob {} written.', subjob_id)
        except Exception:
            # Log for diagnosis only; the error still propagates to the caller.
            self._logger.warning('Writing payload for subjob {} FAILED.', subjob_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs

        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())
        return timings

    def mark_subjob_complete(self, subjob_id):
        """
        Record a subjob as finished and, once the subjobs are finished, start the postbuild tasks on a new thread.

        :type subjob_id: int
        """
        subjob = self._all_subjobs_by_id[int(subjob_id)]
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to
        the end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        self._error_message = failure_reason

    @property
    def artifacts_archive_file(self):
        """The value stored by _create_build_artifact(), or None before that has run."""
        return self._artifacts_archive_file

    @property
    def _num_subjobs_total(self):
        """Number of subjobs registered with this build."""
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """Current size of the finished subjob queue, or 0 if that queue is not set (falsy)."""
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        Total number of atomic commands across all subjobs, or None unless the status is BUILDING or FINISHED.

        :rtype: int | None
        """
        if self._status() not in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    @property
    def _subjobs_are_finished(self):
        """True if the build is prepared and the finished subjob queue is full."""
        return self.is_prepared and self._finished_subjobs.full()

    @property
    def is_finished(self):
        """True once the subjobs, the postbuild tasks and the teardowns are all finished."""
        return self._subjobs_are_finished and self._postbuild_tasks_are_finished and self._teardowns_finished

    @property
    def is_unstarted(self):
        """True if the build is prepared and its unstarted subjob queue is still full."""
        return self.is_prepared and self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """True once an error message has been set (see mark_failed)."""
        return self._error_message is not None

    @property
    def _detail_message(self):
        """
        Progress message with the finished and total subjob counts and a percentage, or None without subjobs.

        :rtype: str | None
        """
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):
        """
        Derive the build status. Checks are ordered: error, queued, finished, otherwise building.

        :return: one of the BuildStatus constants
        :rtype: str
        """
        if self.has_error:
            return BuildStatus.ERROR
        elif not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        elif self.is_finished:
            return BuildStatus.FINISHED
        else:
            return BuildStatus.BUILDING

    def _failed_atoms(self):
        """
        The commands which failed, or None while the build is not finished

        :rtype: list [str] | None
        """
        if self.is_finished:
            # dict.values() returns a view object in python 3, so wrapping values() in a list
            return list(self._build_artifact.get_failed_commands().values())
        return None

    def _result(self):
        """
        The build result once finished (NO_FAILURES if no command failed, otherwise FAILURE), else None.

        :rtype: str | None
        """
        if self.is_finished:
            if len(self._build_artifact.get_failed_commands()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        # @TODO There is a race condition here where the build is marked finished before the results archive
        # is prepared.  If the user requests the build status before archival finishes, the 'artifacts'
        # value in the post body will be None.  self.is_finished should be conditional on whether archival
        # is finished.
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        self._postbuild_tasks_are_finished = True

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the results directory, generate its failures file, write the subjob timing
        data, and compress the results directory into 'results.tar.gz'.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """Path of this build's results: the configured results directory joined with the build id."""
        return os.path.join(
            Configuration['results_directory'],
            str(self.build_id()),
        )
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None  # :type: BuildArtifact | None (set by _create_build_artifact)
        self._error_message = None
        self.is_prepared = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._is_canceled = False
        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish
        self._subjob_assignment_lock = Lock()  # prevents subjobs from being skipped
        self._slaves_allocated = []
        self._num_executors_allocated = 0
        self._num_executors_in_use = 0
        self._max_executors = float('inf')
        self._max_executors_per_slave = float('inf')

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # Queue of subjobs; created in prepare()
        self._finished_subjobs = None  # Queue of subjobs; created in prepare()
        self._postbuild_tasks_are_finished = False
        self._timing_file_path = None

    def api_representation(self):
        """
        Build a dictionary summarizing this build's id, status, progress, failures and request parameters.

        :rtype: dict
        """
        return {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': self._failed_atoms(),  # todo: print the file contents instead of paths
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build, populating the self._project_type instance variable.

        As a side effect, this method also updates the build request's build_parameters dictionary
        with the unique workspace directory path for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Generate a unique project build directory name that will be symlinked to the actual project directory
        # later on when the project gets fetched.
        build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo()

        # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a
        # build-unique generated symlink), we must manually add it to the project_type_params
        project_type_params = self.build_request.build_parameters()
        project_type_params.update({'build_project_directory': build_specific_project_directory})
        self._project_type = util.create_project_type(project_type_params)

        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjobs, job_config):
        """
        Load the given subjobs into this build's queues and record the executor limits from the job config.
        May only be called once, and only after generate_project_type().

        :type subjobs: list[Subjob]
        :type job_config: JobConfig
        :raises RuntimeError: if no project type has been generated yet, or if called more than once
        """
        if self.project_type is None:
            raise RuntimeError('prepare() was called before generate_project_type() on build {}.'
                               .format(self._build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        # Both queues are bounded by the subjob count so that full() can be used as a completeness check.
        self._unstarted_subjobs = Queue(maxsize=len(subjobs))
        self._finished_subjobs = Queue(maxsize=len(subjobs))

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._max_executors = job_config.max_executors
        self._max_executors_per_slave = job_config.max_executors_per_slave
        self._timing_file_path = self.project_type.timing_file_path(job_config.name)
        self.is_prepared = True

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    def needs_more_slaves(self):
        """
        Determine whether or not this build should have more slaves allocated to it.

        :rtype: bool
        """
        return self._num_executors_allocated < self._max_executors and not self._unstarted_subjobs.empty()

    def allocate_slave(self, slave):
        """
        Allocate a slave to this build. This tells the slave to execute setup commands for this build.

        :type slave: Slave
        """
        self._slaves_allocated.append(slave)
        slave.setup(self)
        # A slave never contributes more executors than the per-slave cap allows.
        self._num_executors_allocated += min(slave.num_executors, self._max_executors_per_slave)
        analytics.record_event(analytics.BUILD_SETUP_START, build_id=self.build_id(), slave_id=slave.id)

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build

        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob

        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob with the given id belongs to this build
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def begin_subjob_executions_on_slave(self, slave):
        """
        Begin subjob executions on a slave. This should be called once after the specified slave has already run
        build_setup commands for this build.

        :type slave: Slave
        """
        analytics.record_event(analytics.BUILD_SETUP_FINISH, build_id=self.build_id(), slave_id=slave.id)
        for slave_executor_count in range(slave.num_executors):
            # Stop claiming executors once either the build-wide or the per-slave limit has been reached.
            if (self._num_executors_in_use >= self._max_executors
                    or slave_executor_count >= self._max_executors_per_slave):
                break
            slave.claim_executor()
            self._num_executors_in_use += 1
            self.execute_next_subjob_or_teardown_slave(slave)

    def execute_next_subjob_or_teardown_slave(self, slave):
        """
        Grabs an unstarted subjob off the queue and sends it to the specified slave to be executed. If the unstarted
        subjob queue is empty, we teardown the slave to free it up for other builds.

        :type slave: Slave
        """
        try:
            # This lock prevents the scenario where a subjob is pulled from the queue but cannot be assigned to this
            # slave because it is shutdown, so we put it back on the queue but in the meantime another slave enters
            # this method, finds the subjob queue empty, and is torn down.  If that was the last 'living' slave, the
            # build would be stuck.
            with self._subjob_assignment_lock:
                subjob = self._unstarted_subjobs.get(block=False)
                self._logger.debug('Sending subjob {} (build {}) to slave {}.',
                                   subjob.subjob_id(), subjob.build_id(), slave.url)
                try:
                    slave.start_subjob(subjob)
                except SlaveMarkedForShutdownError:
                    self._unstarted_subjobs.put(subjob)
                    # An executor is currently allocated for this subjob in begin_subjob_executions_on_slave.
                    # Since the slave has been marked for shutdown, we need to free the executor.
                    self._free_slave_executor(slave)

        except Empty:
            self._free_slave_executor(slave)

    def _free_slave_executor(self, slave):
        """
        Release one executor on the slave; once none remain in use, deallocate the slave and tear it down.

        :type slave: Slave
        """
        num_executors_in_use = slave.free_executor()
        if num_executors_in_use == 0:
            try:
                self._slaves_allocated.remove(slave)
            except ValueError:
                pass  # We have already deallocated this slave, no need to teardown
            else:
                slave.teardown()

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete.

        Any exception is logged, recorded on the build via mark_failed(), and then re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write the payload body into the build results directory and extract it as a tar archive.
        A missing or empty payload is logged and otherwise ignored.

        :type subjob_id: int
        :param payload: reads the 'filename' and 'body' keys
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        # NOTE(review): payload['filename'] is joined into the results path unchecked -- confirm that payloads
        # can only originate from trusted slaves.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
        except Exception:
            # Log for context, then let the caller decide how to fail the build.
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs

        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Move the subjob to the finished queue and, if it was the last one, start the postbuild tasks in a
        separate thread.

        :type subjob_id: int
        """
        subjob = self._all_subjobs_by_id[int(subjob_id)]
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        self._error_message = failure_reason

    def cancel(self):
        """
        Cancel a running build
        """
        # Early exit if build is not running
        if self._status() in [BuildStatus.FINISHED, BuildStatus.ERROR, BuildStatus.CANCELED]:
            return

        self._is_canceled = True

        # Deplete the unstarted subjob queue.
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed:
                message = 'Key ({}) is not in list of allowed keys ({})'.\
                    format(key, ",".join(keys_and_values_allowed))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.\
                    format(value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if at least one update was applied
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status' and value == 'canceled':
                self.cancel()
                success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def num_executors_allocated(self):
        """
        :rtype: int
        """
        return self._num_executors_allocated

    @property
    def artifacts_archive_file(self):
        """
        The value returned by compress_directory() for the results directory, or None if not yet archived.
        """
        return self._artifacts_archive_file

    @property
    def _num_subjobs_total(self):
        """
        :rtype: int
        """
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """
        :rtype: int
        """
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        Total number of atomic commands across all subjobs, or None unless the build is BUILDING or FINISHED.

        :rtype: int | None
        """
        if self._status() not in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    @property
    def _subjobs_are_finished(self):
        """
        :rtype: bool
        """
        return self._is_canceled or (self.is_prepared and self._finished_subjobs.full())

    @property
    def is_finished(self):
        """
        :rtype: bool
        """
        # TODO: Clean up this logic or move everything into a state machine
        return self._is_canceled or self._postbuild_tasks_are_finished

    @property
    def is_unstarted(self):
        """
        :rtype: bool
        """
        return self.is_prepared and self._num_executors_allocated == 0 and self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """
        :rtype: bool
        """
        return self._error_message is not None

    @property
    def _detail_message(self):
        """
        A progress summary of finished vs. total subjobs, or None if the build has no subjobs.

        :rtype: str | None
        """
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):
        """
        Derive the build status. Checks are ordered: error, canceled, queued, finished, then building.

        :rtype: BuildStatus
        """
        if self.has_error:
            return BuildStatus.ERROR
        elif self._is_canceled:
            return BuildStatus.CANCELED
        elif not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        elif self.is_finished:
            return BuildStatus.FINISHED
        else:
            return BuildStatus.BUILDING

    def _failed_atoms(self):
        """
        The commands which failed. Empty list for a canceled build; None while the build is unfinished.

        :rtype: list [str] | None
        """
        if self._is_canceled:
            return []

        if self.is_finished:
            # dict.values() returns a view object in python 3, so wrapping values() in a list
            return list(self._build_artifact.get_failed_commands().values())
        return None

    def _result(self):
        """
        FAILURE for a canceled build or one with failed commands, NO_FAILURES for a clean finished build,
        and None while the build is unfinished.

        :rtype: str | None
        """
        if self._is_canceled:
            return BuildResult.FAILURE

        if self.is_finished:
            if not self._build_artifact.get_failed_commands():
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        # Set last: is_finished reads this flag, so the artifact exists by the time the build reports finished.
        self._postbuild_tasks_are_finished = True

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the results directory, write the failures file and timing data, then
        compress the results directory into 'results.tar.gz'.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """
        Path of this build's results: the configured results directory joined with the build id.

        :rtype: str
        """
        return os.path.join(
            Configuration['results_directory'],
            str(self.build_id()),
        )

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.

        :rtype: str
        """
        return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))
def test_subjob_and_atom_ids_raises_value_error_with_incorrect_format(self, incorrect_artifact_directory):
    """A directory name that _subjob_and_atom_ids rejects must surface as a ValueError."""
    parse_ids = BuildArtifact._subjob_and_atom_ids
    self.assertRaises(ValueError, parse_ids, incorrect_artifact_directory)
def test_subjob_and_atom_ids_parses_for_properly_formatted_directory(
        self, artifact_directory, expected_subjob_id, expected_atom_id):
    """
    _subjob_and_atom_ids should return the expected (subjob id, atom id) pair for the given directory.

    :param artifact_directory: directory name passed to BuildArtifact._subjob_and_atom_ids
    :param expected_subjob_id: subjob id the parse is expected to yield
    :param expected_atom_id: atom id the parse is expected to yield
    """
    subjob_id, atom_id = BuildArtifact._subjob_and_atom_ids(artifact_directory)
    # assertEquals is a deprecated alias that was removed in Python 3.12; assertEqual is the supported name.
    self.assertEqual(subjob_id, expected_subjob_id)
    self.assertEqual(atom_id, expected_atom_id)
def test_artifact_directory_raises_value_error_if_subjob_id_or_atom_id_specified(self, subjob_id, atom_id):
    """_artifact_directory must raise ValueError for the supplied subjob id / atom id combination."""
    build_id = 1
    self.assertRaises(ValueError, BuildArtifact._artifact_directory, build_id, subjob_id, atom_id)
def test_subjob_and_atom_ids_raises_value_error_with_incorrect_format(self, incorrect_artifact_directory):
    """Parsing a badly formatted artifact directory name is expected to fail with ValueError."""
    self.assertRaises(
        ValueError,
        BuildArtifact._subjob_and_atom_ids,
        incorrect_artifact_directory,
    )
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None  # :type: BuildArtifact | None (set by _create_build_artifact)
        self._error_message = None
        self.is_prepared = False
        self._setup_is_started = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._is_canceled = False
        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP: Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None
        self._failed_atoms = None  # cache filled lazily by _get_failed_atoms()
        self._postbuild_tasks_are_finished = False
        self._timing_file_path = None
        self._state_timestamps = {status: None for status in BuildStatus}  # initialize all timestamps to None
        self._record_state_timestamp(BuildStatus.QUEUED)

    def api_representation(self):
        """
        Build a dictionary summarizing this build's id, status, progress, failures, request parameters and
        per-state timestamps.

        :rtype: dict
        """
        # Fetch the failed atoms once and reuse the result for both the None check and the conversion.
        failed_atoms = self._get_failed_atoms()
        failed_atoms_api_representation = None
        if failed_atoms is not None:
            failed_atoms_api_representation = [failed_atom.api_representation() for failed_atom in failed_atoms]

        return {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            # Convert self._state_timestamps to OrderedDict to make raw API response more readable. Sort the entries
            # by numerically increasing dict value, with None values sorting highest.
            'state_timestamps': OrderedDict(sorted(
                [(state.lower(), timestamp) for state, timestamp in self._state_timestamps.items()],
                key=lambda item: item[1] or float('inf'))),
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build, populating the self._project_type instance variable.

        As a side effect, this method also updates the build request's build_parameters dictionary
        with the unique workspace directory path for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Generate a unique project build directory name that will be symlinked to the actual project directory
        # later on when the project gets fetched.
        build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo()

        # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a
        # build-unique generated symlink), we must manually add it to the project_type_params
        project_type_params = self.build_request.build_parameters()
        project_type_params.update({'build_project_directory': build_specific_project_directory})
        self._project_type = util.create_project_type(project_type_params)

        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, read its job config, compute the subjobs and load them into this build's queues.
        May only be called once.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: if the build has no request or project type, if called more than once, or if the
            project type returns no job config
        """
        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(self._build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(self._build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._logger.info('Fetching project for build {}.', self._build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', self._build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(self._build_id, job_config, self.project_type)

        # Both queues are bounded by the subjob count so that full() can be used as a completeness check.
        self._unstarted_subjobs = Queue(maxsize=len(subjobs))
        self._finished_subjobs = Queue(maxsize=len(subjobs))

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        self.is_prepared = True
        self._record_state_timestamp(BuildStatus.PREPARED)

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build

        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob

        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob with the given id belongs to this build
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete.

        Any exception is logged, recorded on the build via mark_failed(), and then re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read the exit code file from each atom's artifact directory and store it on the atom as exit_code.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        for atom_id, atom in enumerate(subjob.atoms):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory']
            )
            atom_exit_code_file_sys_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(atom_exit_code_file_sys_path, 'r') as atom_exit_code_file:
                atom.exit_code = int(atom_exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write the payload body into the build results directory, extract it as a tar archive, and record each
        atom's exit code. A missing or empty payload is logged and otherwise ignored.

        :type subjob_id: int
        :param payload: reads the 'filename' and 'body' keys
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        # NOTE(review): payload['filename'] is joined into the results path unchecked -- confirm that payloads
        # can only originate from trusted slaves.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except Exception:
            # Log for context, then let the caller decide how to fail the build.
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs

        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Mark the subjob completed, move it to the finished queue and, if it was the last one, start the
        postbuild tasks in a separate thread.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """
        Flag that setup has started and record the BUILDING timestamp.
        """
        self._setup_is_started = True
        self._record_state_timestamp(BuildStatus.BUILDING)

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        self._error_message = failure_reason
        self._record_state_timestamp(BuildStatus.ERROR)

    def cancel(self):
        """
        Cancel a running build
        """
        # Early exit if build is not running
        if self._status() in [BuildStatus.FINISHED, BuildStatus.ERROR, BuildStatus.CANCELED]:
            self._logger.notice('Ignoring cancel request for build {}. Build is already in state {}.',
                                self._build_id, self._status())
            return

        self._logger.notice('Canceling build {}.', self._build_id)
        self._is_canceled = True
        self._record_state_timestamp(BuildStatus.CANCELED)

        # Deplete the unstarted subjob queue.
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed:
                message = 'Key ({}) is not in list of allowed keys ({})'.\
                    format(key, ",".join(keys_and_values_allowed))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.\
                    format(value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if at least one update was applied
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status' and value == 'canceled':
                self.cancel()
                success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def artifacts_archive_file(self):
        """
        The value returned by compress_directory() for the results directory, or None if not yet archived.
        """
        return self._artifacts_archive_file

    @property
    def _num_subjobs_total(self):
        """
        :rtype: int
        """
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """
        :rtype: int
        """
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        Total number of atomic commands across all subjobs, or None unless the build is BUILDING or FINISHED.

        :rtype: int | None
        """
        if self._status() not in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    @property
    def _subjobs_are_finished(self):
        """
        :rtype: bool
        """
        return self._is_canceled or (self.is_prepared and self._finished_subjobs.full())

    @property
    def is_finished(self):
        """
        :rtype: bool
        """
        # TODO: Clean up this logic or move everything into a state machine
        return self._is_canceled or self._postbuild_tasks_are_finished

    @property
    def is_unstarted(self):
        """
        :rtype: bool
        """
        return self.is_prepared and not self._setup_is_started and self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """
        :rtype: bool
        """
        return self._error_message is not None

    @property
    def _detail_message(self):
        """
        A progress summary of finished vs. total subjobs, or None if the build has no subjobs.

        :rtype: str | None
        """
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):
        """
        Derive the build status. Checks are ordered: error, canceled, queued, finished, then building.

        :rtype: BuildStatus
        """
        if self.has_error:
            return BuildStatus.ERROR
        elif self._is_canceled:
            return BuildStatus.CANCELED
        elif not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        elif self.is_finished:
            return BuildStatus.FINISHED
        else:
            return BuildStatus.BUILDING

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns an empty list if the build
        was canceled, or has completed and no atoms have failed.

        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled:
                return []

            # Build the whole list before caching it, so that an exception part-way through cannot leave a
            # partially populated list behind for later calls to return.
            self._failed_atoms = [
                self.subjob(subjob_id).atoms[atom_id]
                for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids()
            ]

        return self._failed_atoms

    def _result(self):
        """
        FAILURE for a canceled build or one with failed atoms, NO_FAILURES for a clean finished build,
        and None while the build is unfinished.

        :rtype: str | None
        """
        if self._is_canceled:
            return BuildResult.FAILURE

        if self.is_finished:
            if not self._build_artifact.get_failed_subjob_and_atom_ids():
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        # Set after the artifact is created: is_finished reads this flag.
        self._postbuild_tasks_are_finished = True
        self._record_state_timestamp(BuildStatus.FINISHED)

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the results directory, write the failures file and timing data, then
        compress the results directory into 'results.tar.gz'.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """
        The artifact directory for this build under the configured results directory.
        """
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=Configuration['results_directory'])

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.

        :rtype: str
        """
        return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))

    def get_state_timestamp(self, build_status):
        """
        Get the recorded timestamp for a given build status. This may be None if the build has not yet reached
        the specified state.

        :param build_status: The build status for which to retrieve the corresponding timestamp
        :type build_status: BuildStatus
        :return: The timestamp for the specified status
        :rtype: float | None
        """
        return self._state_timestamps.get(build_status)

    def _record_state_timestamp(self, build_status):
        """
        Record a timestamp for a given build status. This is used to record the timing of the various build phases
        and is exposed via the Build object's API representation.

        :param build_status: The build status for which to record a timestamp
        :type build_status: BuildStatus
        """
        if self._state_timestamps.get(build_status) is not None:
            # Pass the values as logger arguments, matching every other logger call in this class.
            self._logger.warning('Overwriting timestamp for build {}, status {}', self.build_id(), build_status)
        self._state_timestamps[build_status] = time.time()
def _create_build_artifact(self):
    """
    Create this build's BuildArtifact, write its failures file and timing data, then compress the results
    directory into the archive named by BuildArtifact.ARTIFACT_FILE_NAME.
    """
    artifact = BuildArtifact(self._build_results_dir())
    self._build_artifact = artifact
    artifact.generate_failures_file()
    artifact.write_timing_data(
        self._timing_file_path,
        self._read_subjob_timings_from_results(),
    )

    # The results directory is looked up again here rather than reused from above.
    archive_file = app.util.fs.compress_directory(
        self._build_results_dir(),
        BuildArtifact.ARTIFACT_FILE_NAME,
    )
    self._artifacts_archive_file = archive_file
def test_artifact_directory_raises_value_error_if_subjob_id_or_atom_id_specified(self, subjob_id, atom_id):
    """Build id 1 combined with the supplied subjob id / atom id must make _artifact_directory raise ValueError."""
    directory_args = (1, subjob_id, atom_id)
    self.assertRaises(ValueError, BuildArtifact._artifact_directory, *directory_args)
def _build_results_dir(self):
    """Return the artifact directory for this build under the configured results directory."""
    directory_kwargs = {'result_root': Configuration['results_directory']}
    return BuildArtifact.build_artifact_directory(self.build_id(), **directory_kwargs)
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs

    :type _build_id: int
    :type _build_request: BuildRequest
    :type _build_artifact: None | BuildArtifact
    :type _error_message: None | str
    :type _project_type: None | ProjectType
    :type _timing_file_path: None | str
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None

        self._error_message = None
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once

        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP(joey): Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None
        self._failed_atoms = None
        self._postbuild_tasks_are_finished = False  # WIP(joey): Remove and use build state.
        self._timing_file_path = None

        self._state_machine = BuildFsm(
            build_id=self._build_id,
            enter_state_callbacks={
                BuildState.ERROR: self._on_enter_error_state,
                BuildState.CANCELED: self._on_enter_canceled_state,
            }
        )

    def api_representation(self):
        """
        Assemble the dictionary describing this build (id, status, result, failed atoms, timestamps, ...).

        :rtype: dict
        """
        # Fetch the failed atoms once so the None-check and the iteration see the same value.
        failed_atoms = self._get_failed_atoms()
        failed_atoms_api_representation = None
        if failed_atoms is not None:
            failed_atoms_api_representation = [failed_atom.api_representation() for failed_atom in failed_atoms]

        build_state = self._status()
        # todo: PREPARING/PREPARED are new states -- make sure clients can handle them before exposing.
        if build_state in (BuildState.PREPARING, BuildState.PREPARED):
            build_state = BuildState.QUEUED

        return {
            'id': self._build_id,
            'status': build_state,
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            # Convert the state machine's transition timestamps to an OrderedDict to make the raw API response
            # more readable. Sort the entries by numerically increasing value, with None values sorting highest.
            'state_timestamps': OrderedDict(sorted(
                [(state.lower(), timestamp)
                 for state, timestamp in self._state_machine.transition_timestamps.items()],
                key=lambda item: item[1] or float('inf'))),
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build, populating the self._project_type instance variable.

        As a side effect, this method also updates the build request's build_parameters dictionary
        with the unique workspace directory path for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Generate a unique project build directory name that will be symlinked to the actual project directory
        # later on when the project gets fetched.
        build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo()

        # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a
        # build-unique generated symlink), we must manually add it to the project_type_params
        project_type_params = self.build_request.build_parameters()
        project_type_params.update({'build_project_directory': build_specific_project_directory})
        self._project_type = util.create_project_type(project_type_params)
        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, compute the subjobs for this build and set up the subjob queues.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: if the build has no request or project type, if prepare() was already called,
            or if the job config could not be obtained
        """
        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(self._build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(self._build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._state_machine.trigger(BuildEvent.START_PREPARE)
        # WIP(joey): Move the following code into a PREPARING state callback
        #  (so that it won't execute if the build has already been canceled.)

        self._logger.info('Fetching project for build {}.', self._build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', self._build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(self._build_id, job_config, self.project_type)

        self._unstarted_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Move this into BuildScheduler?
        self._finished_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Remove this and just record finished count.

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        app.util.fs.create_dir(self._build_results_dir())
        self._state_machine.trigger(BuildEvent.FINISH_PREPARE)

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    @property
    def build_request(self):
        """
        :rtype: BuildRequest
        """
        return self._build_request

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build
        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob
        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob with the given id belongs to this build
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete.

        Any exception is logged, the build is marked as failed, and the exception is re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read the exit code file of every atom in the given subjob and store the value on the atom.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        for atom_id, atom in enumerate(subjob.atoms):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory']
            )
            atom_exit_code_file_sys_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(atom_exit_code_file_sys_path, 'r') as atom_exit_code_file:
                atom.exit_code = int(atom_exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write the payload into the build results directory, extract it and record the atom exit codes.

        :type subjob_id: int
        :param payload: mapping that is read for the keys 'filename' and 'body'; a falsy payload is skipped
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except:
            # Log for context only; the original exception is always re-raised to the caller.
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Mark the subjob as completed and, once all subjobs have finished, start the postbuild tasks in a thread.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            should_trigger_postbuild_tasks = self._all_subjobs_are_finished() and not self._is_stopped()

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if should_trigger_postbuild_tasks:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """
        Mark the build as started.
        """
        self._state_machine.trigger(BuildEvent.START_BUILDING)

    def finish(self):
        """
        Perform postbuild task and mark this build as finished.
        """
        # This method also transitions the FSM to finished after the postbuild tasks are complete.
        self._perform_async_postbuild_tasks()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.
        :type failure_reason: str
        """
        self._state_machine.trigger(BuildEvent.FAIL, error_msg=failure_reason)

    def _on_enter_error_state(self, event):
        """
        Store an error message for the build and log the failure. This method is triggered by
        a state machine transition to the ERROR state.
        :param event: The Fysom event object
        """
        # WIP(joey): Should this be a reenter_state callback also? Should it check for previous error message?
        default_error_msg = 'An unspecified error occurred.'
        self._error_message = getattr(event, 'error_msg', default_error_msg)
        self._logger.warning('Build {} failed: {}', self.build_id(), self._error_message)

    def cancel(self):
        """
        Cancel a running build.
        """
        self._logger.notice('Request received to cancel build {}.', self._build_id)
        self._state_machine.trigger(BuildEvent.CANCEL)

    def _on_enter_canceled_state(self, event):
        """
        Drain the unstarted subjob queue. Registered as the enter-state callback for the CANCELED state.
        :param event: The event object passed by the state machine (unused)
        """
        # Deplete the unstarted subjob queue.
        # WIP(joey): Just remove this completely and adjust behavior of other methods based on self._is_canceled().
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not
        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed:
                message = 'Key ({}) is not in list of allowed keys ({})'.format(
                    key, ",".join(keys_and_values_allowed.keys()))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.format(
                    value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params
        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if at least one update was applied (currently only canceling), False otherwise
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status':
                if value == 'canceled':
                    self.cancel()
                    success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def artifacts_archive_file(self):
        return self._artifacts_archive_file

    # WIP(joey): Change some of these private @properties to methods.
    @property
    def _num_subjobs_total(self):
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        # todo: blacklist states instead of whitelist, or just check _all_subjobs_by_id directly
        if self._status() not in [BuildState.BUILDING, BuildState.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    def _all_subjobs_are_finished(self):
        # Falsy (None) until prepare() has created the finished-subjobs queue.
        return self._finished_subjobs and self._finished_subjobs.full()

    @property
    def is_finished(self):
        # WIP(joey): Calling logic should check _is_canceled if it needs to instead of including the check here.
        return self._is_canceled() or self._postbuild_tasks_are_finished

    @property
    def _detail_message(self):
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):  # WIP(joey): Rename to _state.
        """
        :rtype: BuildState
        """
        return self._state_machine.state

    @property
    def has_error(self):
        return self._status() is BuildState.ERROR

    def _is_canceled(self):
        return self._status() is BuildState.CANCELED

    def _is_stopped(self):
        return self._status() in (BuildState.ERROR, BuildState.CANCELED)

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns an empty list if the
        build was canceled, or if it has completed and no atoms have failed.
        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled():
                return []

            self._failed_atoms = []
            for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids():
                subjob = self.subjob(subjob_id)
                atom = subjob.atoms[atom_id]
                self._failed_atoms.append(atom)

        return self._failed_atoms

    def _result(self):
        """
        Can return three states:
            None: the build is neither canceled nor finished yet
            FAILURE: the build was canceled, or it finished with at least one failed atom
            NO_FAILURES: the build finished and the build artifact reports no failed atoms
        :rtype: BuildResult | None
        """
        if self._is_canceled():
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        self._create_build_artifact()
        self._delete_temporary_build_artifact_files()
        self._postbuild_tasks_are_finished = True
        self._state_machine.trigger(BuildEvent.POSTBUILD_TASKS_COMPLETE)

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the results directory, write the failures file and timing data,
        and compress the results directory into the artifact archive.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(),
                                                                      BuildArtifact.ARTIFACT_FILE_NAME)

    def _delete_temporary_build_artifact_files(self):
        """
        Delete the temporary build result files that are no longer needed, due to the creation of the
        build artifact tarball.

        ONLY call this method after _create_build_artifact() has completed. Otherwise we have lost the build results.
        """
        build_result_dir = self._build_results_dir()
        start_time = time.time()
        for path in os.listdir(build_result_dir):
            # The build result tar-ball is also stored in this same directory, so we must not delete it.
            if path == BuildArtifact.ARTIFACT_FILE_NAME:
                continue
            # Do NOT use app.util.fs.async_delete() here. That call will generate a temp directory for every
            # atom, which can be in the thousands per build, and can lead to running up against the ulimit -Hn.
            self._remove_path(os.path.join(build_result_dir, path))
        elapsed_seconds = time.time() - start_time
        self._logger.info('Completed deleting artifact files for {}, took {:.1f} seconds.', self._build_id,
                          elapsed_seconds)

    @staticmethod
    def _remove_path(path):
        """
        Delete a single filesystem entry: directories recursively, anything else as a plain file.

        :param path: full path of the entry to delete
        :type path: str
        """
        # os.path.isdir must be *called*; testing the bare function object is always truthy, which sent
        # regular files to rmtree(ignore_errors=True) and silently left them on disk.
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
        else:
            os.remove(path)

    def _build_results_dir(self):
        """
        :return: the artifact directory for this build under the configured results directory
        """
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=Configuration['results_directory'])

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.
        :rtype: str
        """
        return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs

    :type _build_id: int
    :type _build_request: BuildRequest
    :type _build_artifact: None | BuildArtifact
    :type _error_message: None | str
    :type _project_type: None | ProjectType
    :type _timing_file_path: None | str
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None

        self._error_message = None
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once

        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP(joey): Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None
        self._failed_atoms = None
        self._postbuild_tasks_are_finished = False  # WIP(joey): Remove and use build state.
        self._timing_file_path = None

        self._state_machine = BuildFsm(
            build_id=self._build_id,
            enter_state_callbacks={
                BuildState.ERROR: self._on_enter_error_state,
                BuildState.CANCELED: self._on_enter_canceled_state,
            }
        )

    def api_representation(self):
        """
        Assemble the dictionary describing this build (id, status, result, failed atoms, timestamps, ...).

        :rtype: dict
        """
        # Fetch the failed atoms once so the None-check and the iteration see the same value.
        failed_atoms = self._get_failed_atoms()
        failed_atoms_api_representation = None
        if failed_atoms is not None:
            failed_atoms_api_representation = [failed_atom.api_representation() for failed_atom in failed_atoms]

        build_state = self._status()
        # todo: PREPARING/PREPARED are new states -- make sure clients can handle them before exposing.
        if build_state in (BuildState.PREPARING, BuildState.PREPARED):
            build_state = BuildState.QUEUED

        return {
            'id': self._build_id,
            'status': build_state,
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            # Convert the state machine's transition timestamps to an OrderedDict to make the raw API response
            # more readable. Sort the entries by numerically increasing value, with None values sorting highest.
            'state_timestamps': OrderedDict(sorted(
                [(state.lower(), timestamp)
                 for state, timestamp in self._state_machine.transition_timestamps.items()],
                key=lambda item: item[1] or float('inf'))),
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build, populating the self._project_type instance variable.

        As a side effect, this method also updates the build request's build_parameters dictionary
        with the unique workspace directory path for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Generate a unique project build directory name that will be symlinked to the actual project directory
        # later on when the project gets fetched.
        build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo()

        # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a
        # build-unique generated symlink), we must manually add it to the project_type_params
        project_type_params = self.build_request.build_parameters()
        project_type_params.update({'build_project_directory': build_specific_project_directory})
        self._project_type = util.create_project_type(project_type_params)
        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, compute the subjobs for this build and set up the subjob queues.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: if the build has no request or project type, if prepare() was already called,
            or if the job config could not be obtained
        """
        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(self._build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(self._build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._state_machine.trigger(BuildEvent.START_PREPARE)
        # WIP(joey): Move the following code into a PREPARING state callback
        #  (so that it won't execute if the build has already been canceled.)

        self._logger.info('Fetching project for build {}.', self._build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', self._build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(self._build_id, job_config, self.project_type)

        self._unstarted_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Move this into BuildScheduler?
        self._finished_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Remove this and just record finished count.

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        app.util.fs.create_dir(self._build_results_dir())
        self._state_machine.trigger(BuildEvent.FINISH_PREPARE)

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    @property
    def build_request(self):
        """
        :rtype: BuildRequest
        """
        return self._build_request

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build
        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob
        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob with the given id belongs to this build
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete.

        Any exception is logged, the build is marked as failed, and the exception is re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read the exit code file of every atom in the given subjob and store the value on the atom.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        for atom_id, atom in enumerate(subjob.atoms):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory']
            )
            atom_exit_code_file_sys_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(atom_exit_code_file_sys_path, 'r') as atom_exit_code_file:
                atom.exit_code = int(atom_exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write the payload into the build results directory, extract it and record the atom exit codes.

        :type subjob_id: int
        :param payload: mapping that is read for the keys 'filename' and 'body'; a falsy payload is skipped
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except:
            # Log for context only; the original exception is always re-raised to the caller.
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Mark the subjob as completed and, once all subjobs have finished, start the postbuild tasks in a thread.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            should_trigger_postbuild_tasks = self._all_subjobs_are_finished() and not self._is_stopped()

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if should_trigger_postbuild_tasks:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """
        Mark the build as started.
        """
        self._state_machine.trigger(BuildEvent.START_BUILDING)

    def finish(self):
        """
        Perform postbuild task and mark this build as finished.
        """
        # This method also transitions the FSM to finished after the postbuild tasks are complete.
        self._perform_async_postbuild_tasks()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.
        :type failure_reason: str
        """
        self._state_machine.trigger(BuildEvent.FAIL, error_msg=failure_reason)

    def mark_setup_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. Because setup failures don't have any logs, we put
        the build_id in the setup_failed file for easier querying of worker logs.

        After writing that file, the build artifact is created so the failure is included in it.

        :type failure_reason: str
        """
        self._state_machine.trigger(BuildEvent.FAIL, error_msg='{} Build Id: {}.'.format(failure_reason,
                                                                                         self._build_id))
        setup_failure_file = os.path.join(self._build_results_dir(), BuildArtifact.SETUP_FAILED_FILE)
        app.util.fs.write_file(str(self._build_id), setup_failure_file)
        self._create_build_artifact()

    def _on_enter_error_state(self, event):
        """
        Store an error message for the build and log the failure. This method is triggered by
        a state machine transition to the ERROR state.
        :param event: The Fysom event object
        """
        # WIP(joey): Should this be a reenter_state callback also? Should it check for previous error message?
        default_error_msg = 'An unspecified error occurred.'
        self._error_message = getattr(event, 'error_msg', default_error_msg)
        self._logger.warning('Build {} failed: {}', self.build_id(), self._error_message)

    def cancel(self):
        """
        Cancel a running build.
        """
        self._logger.notice('Request received to cancel build {}.', self._build_id)
        self._state_machine.trigger(BuildEvent.CANCEL)

    def _on_enter_canceled_state(self, event):
        """
        Drain the unstarted subjob queue. Registered as the enter-state callback for the CANCELED state.
        :param event: The event object passed by the state machine (unused)
        """
        # Deplete the unstarted subjob queue.
        # WIP(joey): Just remove this completely and adjust behavior of other methods based on self._is_canceled().
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not
        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed:
                message = 'Key ({}) is not in list of allowed keys ({})'.format(
                    key, ",".join(keys_and_values_allowed.keys()))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.format(
                    value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params
        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if at least one update was applied (currently only canceling), False otherwise
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status':
                if value == 'canceled':
                    self.cancel()
                    success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def artifacts_archive_file(self):
        return self._artifacts_archive_file

    # WIP(joey): Change some of these private @properties to methods.
    @property
    def _num_subjobs_total(self):
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        # todo: blacklist states instead of whitelist, or just check _all_subjobs_by_id directly
        if self._status() not in [BuildState.BUILDING, BuildState.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    def _all_subjobs_are_finished(self):
        # Falsy (None) until prepare() has created the finished-subjobs queue.
        return self._finished_subjobs and self._finished_subjobs.full()

    @property
    def is_finished(self):
        # WIP(joey): Calling logic should check _is_canceled if it needs to instead of including the check here.
        return self._is_canceled() or self._postbuild_tasks_are_finished

    @property
    def _detail_message(self):
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):  # WIP(joey): Rename to _state.
        """
        :rtype: BuildState
        """
        return self._state_machine.state

    @property
    def has_error(self):
        return self._status() is BuildState.ERROR

    def _is_canceled(self):
        return self._status() is BuildState.CANCELED

    def _is_stopped(self):
        return self._status() in (BuildState.ERROR, BuildState.CANCELED)

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns an empty list if the
        build was canceled, or if it has completed and no atoms have failed.
        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled():
                return []

            self._failed_atoms = []
            for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids():
                subjob = self.subjob(subjob_id)
                atom = subjob.atoms[atom_id]
                self._failed_atoms.append(atom)

        return self._failed_atoms

    def _result(self):
        """
        Can return three states:
            None: the build is neither canceled nor finished yet
            FAILURE: the build was canceled, or it finished with at least one failed atom
            NO_FAILURES: the build finished and the build artifact reports no failed atoms
        :rtype: BuildResult | None
        """
        if self._is_canceled():
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        self._create_build_artifact()
        self._delete_temporary_build_artifact_files()
        self._postbuild_tasks_are_finished = True
        self._state_machine.trigger(BuildEvent.POSTBUILD_TASKS_COMPLETE)

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the results directory, write the failures file and timing data,
        and compress the results directory into the artifact archive.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(),
                                                                      BuildArtifact.ARTIFACT_FILE_NAME)

    def _delete_temporary_build_artifact_files(self):
        """
        Delete the temporary build result files that are no longer needed, due to the creation of the
        build artifact tarball.

        ONLY call this method after _create_build_artifact() has completed. Otherwise we have lost the build results.
        """
        build_result_dir = self._build_results_dir()
        start_time = time.time()
        for path in os.listdir(build_result_dir):
            # The build result tar-ball is also stored in this same directory, so we must not delete it.
            if path == BuildArtifact.ARTIFACT_FILE_NAME:
                continue
            # Do NOT use app.util.fs.async_delete() here. That call will generate a temp directory for every
            # atom, which can be in the thousands per build, and can lead to running up against the ulimit -Hn.
            self._remove_path(os.path.join(build_result_dir, path))
        elapsed_seconds = time.time() - start_time
        self._logger.info('Completed deleting artifact files for {}, took {:.1f} seconds.', self._build_id,
                          elapsed_seconds)

    @staticmethod
    def _remove_path(path):
        """
        Delete a single filesystem entry: directories recursively, anything else as a plain file.

        :param path: full path of the entry to delete
        :type path: str
        """
        # os.path.isdir must be *called*; testing the bare function object is always truthy, which sent
        # regular files to rmtree(ignore_errors=True) and silently left them on disk.
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
        else:
            os.remove(path)

    def _build_results_dir(self):
        """
        :return: the artifact directory for this build under the configured results directory
        """
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=Configuration['results_directory'])

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.
        :rtype: str
        """
        return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))
def test_subjob_and_atom_ids_parses_for_properly_formatted_directory(
        self, artifact_directory, expected_subjob_id, expected_atom_id):
    """
    Check that BuildArtifact._subjob_and_atom_ids() returns the expected (subjob id, atom id) pair for the
    given artifact directory.

    NOTE(review): the three arguments are presumably supplied by a parameterized-test decorator that is not
    visible here -- confirm against the test class.
    """
    subjob_id, atom_id = BuildArtifact._subjob_and_atom_ids(artifact_directory)
    # assertEqual rather than the deprecated assertEquals alias, which was removed in Python 3.12.
    self.assertEqual(subjob_id, expected_subjob_id)
    self.assertEqual(atom_id, expected_atom_id)
def _build_results_dir(self):
    """
    Return the result of BuildArtifact.build_artifact_directory() for this build's id, using
    Configuration['results_directory'] as the result root.
    """
    current_build_id = self.build_id()
    results_root = Configuration['results_directory']
    return BuildArtifact.build_artifact_directory(current_build_id, result_root=results_root)
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None
        """ :type : BuildArtifact"""

        self._error_message = None
        self.is_prepared = False
        self._setup_is_started = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._is_canceled = False

        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP: Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None
        self._failed_atoms = None
        self._postbuild_tasks_are_finished = False
        self._timing_file_path = None

        self._state_timestamps = {status: None for status in BuildStatus}  # initialize all timestamps to None
        self._record_state_timestamp(BuildStatus.QUEUED)

    def api_representation(self):
        """
        Build the dictionary describing this build (id, status, artifact path, progress details, error
        message, atom/subjob counts, failed atoms, result, request params and state timestamps).

        :rtype: dict
        """
        # Fetch the failed atoms once; they stay None until the build has finished.
        failed_atoms = self._get_failed_atoms()
        failed_atoms_api_representation = None
        if failed_atoms is not None:
            failed_atoms_api_representation = [failed_atom.api_representation() for failed_atom in failed_atoms]

        return {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            # Convert self._state_timestamps to OrderedDict to make raw API response more readable. Sort the entries
            # by numerically increasing dict value, with None values sorting highest.
            'state_timestamps': OrderedDict(sorted(
                [(state.lower(), timestamp) for state, timestamp in self._state_timestamps.items()],
                key=lambda item: item[1] or float('inf'))),
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build, populating the self._project_type instance variable.

        As a side effect, this method also updates the build request's build_parameters dictionary
        with the unique workspace directory path for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Generate a unique project build directory name that will be symlinked to the actual project directory
        # later on when the project gets fetched.
        build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo()

        # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a
        # build-unique generated symlink), we must manually add it to the project_type_params
        project_type_params = self.build_request.build_parameters()
        project_type_params.update({'build_project_directory': build_specific_project_directory})
        self._project_type = util.create_project_type(project_type_params)

        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, compute this build's subjobs and fill the unstarted-subjob queue.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: if the build has no request or project type, if prepare() was already called,
            or if the job config could not be obtained
        """
        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(self._build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(self._build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._logger.info('Fetching project for build {}.', self._build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', self._build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(self._build_id, job_config, self.project_type)

        # Both queues are bounded by the subjob count; the full() checks elsewhere in this class rely on that.
        self._unstarted_subjobs = Queue(maxsize=len(subjobs))
        self._finished_subjobs = Queue(maxsize=len(subjobs))

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        self.is_prepared = True
        self._record_state_timestamp(BuildStatus.PREPARED)

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build
        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob
        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob with the given id is registered on this build
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete.

        Any exception is logged, the build is marked as failed, and the exception is re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read the exit code file from each atom's artifact directory and store the parsed integer on the
        corresponding atom of the given subjob.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        for atom_id, atom in enumerate(subjob.atoms):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory']
            )
            atom_exit_code_file_sys_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(atom_exit_code_file_sys_path, 'r') as atom_exit_code_file:
                atom.exit_code = int(atom_exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write the payload body into the build results directory, extract it as a tar archive and parse the
        atom exit codes. A falsy payload only produces a warning. Failures are logged and re-raised.

        :type subjob_id: int
        :param payload: reads the 'filename' and 'body' keys
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except Exception:
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Mark the subjob completed, add it to the finished queue and, once all subjobs are finished, start
        the postbuild tasks on a separate thread.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """
        Flag that setup has started and record the BUILDING timestamp.
        """
        self._setup_is_started = True
        self._record_state_timestamp(BuildStatus.BUILDING)

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        self._error_message = failure_reason
        self._record_state_timestamp(BuildStatus.ERROR)

    def cancel(self):
        """
        Cancel a running build
        """
        # Early exit if build is not running
        if self._status() in [BuildStatus.FINISHED, BuildStatus.ERROR, BuildStatus.CANCELED]:
            self._logger.notice('Ignoring cancel request for build {}. Build is already in state {}.',
                                self._build_id, self._status())
            return

        self._logger.notice('Canceling build {}.', self._build_id)
        self._is_canceled = True
        self._record_state_timestamp(BuildStatus.CANCELED)

        # Deplete the unstarted subjob queue.
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not

        If several params are invalid, only the message for the last invalid one is reported.

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed:
                message = 'Key ({}) is not in list of allowed keys ({})'.format(
                    key, ",".join(keys_and_values_allowed))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.format(
                    value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if an update was applied (currently only status=canceled), False otherwise
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status':
                if value == 'canceled':
                    self.cancel()
                    success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def artifacts_archive_file(self):
        """
        The value stored by _create_build_artifact(); None until the artifact has been created.
        """
        return self._artifacts_archive_file

    @property
    def _num_subjobs_total(self):
        """
        :rtype: int
        """
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """
        Number of subjobs in the finished queue; 0 while the queue has not been created yet.
        :rtype: int
        """
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        Total number of atomic commands across all subjobs, or None unless the status is BUILDING or FINISHED.
        :rtype: int | None
        """
        if self._status() not in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    @property
    def _subjobs_are_finished(self):
        """
        True if the build was canceled, or it is prepared and the finished-subjob queue is full.
        """
        return self._is_canceled or (self.is_prepared and self._finished_subjobs.full())

    @property
    def is_finished(self):
        """
        True if the build was canceled or its postbuild tasks have finished.
        """
        # TODO: Clean up this logic or move everything into a state machine
        return self._is_canceled or self._postbuild_tasks_are_finished

    @property
    def is_unstarted(self):
        """
        True if the build is prepared, setup has not started, and the unstarted-subjob queue is still full.
        """
        return self.is_prepared and not self._setup_is_started and self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """
        True once an error message has been set on this build.
        :rtype: bool
        """
        return self._error_message is not None

    @property
    def _detail_message(self):
        """
        Progress summary of finished vs. total subjobs, or None when the build has no subjobs.
        :rtype: str | None
        """
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):
        """
        Derive the current status. Checks are ordered: error, canceled, queued, finished, otherwise building.

        :rtype: BuildStatus
        """
        if self.has_error:
            return BuildStatus.ERROR
        elif self._is_canceled:
            return BuildStatus.CANCELED
        elif not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        elif self.is_finished:
            return BuildStatus.FINISHED
        else:
            return BuildStatus.BUILDING

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns empty set if
        build has completed and no atoms have failed.

        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled:
                return []

            # Resolve each failed (subjob id, atom id) pair to its atom and cache the list on the instance.
            self._failed_atoms = []
            for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids():
                subjob = self.subjob(subjob_id)
                atom = subjob.atoms[atom_id]
                self._failed_atoms.append(atom)

        return self._failed_atoms

    def _result(self):
        """
        FAILURE if the build was canceled or finished with failed atoms, NO_FAILURES if it finished without
        any, and None while it is still unfinished.

        :rtype: str | None
        """
        if self._is_canceled:
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        self._postbuild_tasks_are_finished = True
        self._record_state_timestamp(BuildStatus.FINISHED)

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for this build's results directory, generate its failures file, write the
        timing data collected from the subjob results, and compress the results directory into
        'results.tar.gz'. The value returned by compress_directory() is stored in self._artifacts_archive_file.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """
        Return the result of BuildArtifact.build_artifact_directory() for this build's id, using
        Configuration['results_directory'] as the result root.
        """
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=Configuration['results_directory'])

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.
        :rtype: str
        """
        return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))

    def get_state_timestamp(self, build_status):
        """
        Get the recorded timestamp for a given build status. This may be None if the build has not yet reached
        the specified state.

        :param build_status: The build status for which to retrieve the corresponding timestamp
        :type build_status: BuildStatus
        :return: The timestamp for the specified status
        :rtype: float | None
        """
        return self._state_timestamps.get(build_status)

    def _record_state_timestamp(self, build_status):
        """
        Record a timestamp for a given build status. This is used to record the timing of the various build phases and
        is exposed via the Build object's API representation.

        An existing timestamp for the same status is overwritten, after logging a warning.

        :param build_status: The build status for which to record a timestamp
        :type build_status: BuildStatus
        """
        if self._state_timestamps.get(build_status) is not None:
            # Pass the values as logger arguments, as every other logger call in this class does.
            self._logger.warning('Overwriting timestamp for build {}, status {}', self.build_id(), build_status)
        self._state_timestamps[build_status] = time.time()