Ejemplo n.º 1
0
    def __init__(self, build_request):
        """
        Create a build for the given request, assign it the next build id and construct its state machine.

        :param build_request: the request this build was created from
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request

        # protects against separate threads calling prepare() more than once
        self._preparation_coin = SingleUseCoin()
        # protects against more than one thread detecting the build's finish
        self._build_completion_lock = Lock()

        # Everything below is unknown at construction time and starts out empty.
        self._artifacts_archive_file = None
        self._build_artifact = None
        self._error_message = None
        self._project_type = None
        self._timing_file_path = None
        self._failed_atoms = None

        self._all_subjobs_by_id = {}
        # WIP(joey): Move subjob queues to BuildScheduler class.
        self._unstarted_subjobs = None
        self._finished_subjobs = None
        # WIP(joey): Remove and use build state.
        self._postbuild_tasks_are_finished = False

        # Callbacks the state machine is given for entering the ERROR and CANCELED states.
        on_enter_callbacks = {
            BuildState.ERROR: self._on_enter_error_state,
            BuildState.CANCELED: self._on_enter_canceled_state,
        }
        self._state_machine = BuildFsm(build_id=self._build_id, enter_state_callbacks=on_enter_callbacks)
Ejemplo n.º 2
0
    def setup_build(self, build_id, project_type_params, build_executor_start_index):
        """
        Do the build-specific setup for a build; usually called once per build. The real work is handed off to a
        separate thread (running _async_setup_build), so this method returns as soon as that thread is started.
        Subjobs are blocked from executing until that setup finishes and unblocks them (via an Event).

        :param build_id: The id of the build to run setup on
        :type build_id: int
        :param project_type_params: The parameters that define the project_type this build will execute in
        :type project_type_params: dict
        :param build_executor_start_index: How many executors have already been allocated on other slaves for
            this build
        :type build_executor_start_index: int
        :raises RuntimeError: if at least one executor is not idle when setup is requested
        """
        self._logger.info('Executing setup for build {} (type: {}).', build_id, project_type_params.get('type'))
        self._current_build_id = build_id
        # protects against build_teardown being executed multiple times
        self._build_teardown_coin = SingleUseCoin()

        # This project_type instance is the one used for build-level operations.
        self._project_type = util.create_project_type(project_type_params)

        # Setup may only start when every executor is sitting in the idle queue.
        all_executors_idle = self._idle_executors.full()
        if not all_executors_idle:
            error_template = 'Slave tried to setup build but not all executors are idle. ({}/{} executors idle.)'
            raise RuntimeError(error_template.format(self._idle_executors.qsize(), self._num_executors))

        # Snapshot the executors to pass to project_type.fetch_project(). This will create a new project_type for
        # each executor (for subjob-level operations).
        idle_executor_snapshot = list(self._idle_executors.queue)
        setup_thread = SafeThread(
            target=self._async_setup_build,
            name='Bld{}-Setup'.format(build_id),
            args=(idle_executor_snapshot, project_type_params, build_executor_start_index),
        )
        setup_thread.start()
Ejemplo n.º 3
0
    def __init__(self, build_request):
        """
        Create a build for the given request, assign it the next build id and record it as QUEUED.

        :param build_request: the request this build was created from
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request

        # protects against separate threads calling prepare() more than once
        self._preparation_coin = SingleUseCoin()
        # protects against more than one thread detecting the build's finish
        self._build_completion_lock = Lock()

        # Progress flags; all start out False.
        self.is_prepared = False
        self._setup_is_started = False
        self._is_canceled = False
        self._postbuild_tasks_are_finished = False

        # Values that are filled in later.
        self._artifacts_archive_file = None
        self._build_artifact = None  # :type: BuildArtifact
        self._error_message = None
        self._project_type = None
        self._timing_file_path = None
        self._failed_atoms = None

        self._all_subjobs_by_id = {}
        # WIP: Move subjob queues to BuildScheduler class.
        self._unstarted_subjobs = None
        self._finished_subjobs = None

        # initialize all timestamps to None
        self._state_timestamps = dict.fromkeys(BuildStatus)
        self._record_state_timestamp(BuildStatus.QUEUED)
Ejemplo n.º 4
0
    def __init__(self, build_request):
        """
        Create a build for the given request and assign it the next build id. No slaves or executors are allocated
        yet and both executor limits start out unbounded.

        :param build_request: the request this build was created from
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request

        # protects against separate threads calling prepare() more than once
        self._preparation_coin = SingleUseCoin()
        # protects against more than one thread detecting the build's finish
        self._build_completion_lock = Lock()

        # Progress flags; all start out False.
        self.is_prepared = False
        self._is_canceled = False
        self._postbuild_tasks_are_finished = False
        self._teardowns_finished = False

        # Slave / executor accounting.
        self._slaves_allocated = []
        self._num_executors_allocated = 0
        self._num_executors_in_use = 0
        unbounded = float('inf')
        self._max_executors = unbounded
        self._max_executors_per_slave = unbounded

        # Values that are filled in later.
        self._artifacts_archive_file = None
        self._build_artifact = None  # :type: BuildArtifact
        self._error_message = None
        self._project_type = None
        self._timing_file_path = None

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None
        self._finished_subjobs = None
Ejemplo n.º 5
0
    def test_coin_spend_returns_true_only_once(self):
        """The first spend() of a SingleUseCoin must return True and each later spend() must return False."""
        coin = SingleUseCoin()

        first_spend_result = coin.spend()
        self.assertTrue(first_spend_result, 'First call to spend() should return True.')

        # Spend twice more to check that the coin stays spent.
        for _ in range(2):
            self.assertFalse(coin.spend(), 'Subsequent calls to spend() should return False.')
Ejemplo n.º 6
0
    def __init__(self, build_request):
        """
        Create a build for the given request and assign it the next build id.

        :param build_request: the request this build was created from
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None  # :type: BuildArtifact

        self._error_message = None
        self.is_prepared = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once

        self._project_type = None
        self._num_slaves_in_use = 0
        # Created exactly once: assigning a second Lock() here would only throw the first one away.
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish
        self._num_allocated_executors = 0
        self._max_executors = float('inf')  # no executor limit by default

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None
        self._finished_subjobs = None
        self._postbuild_tasks_are_finished = False
        self._teardowns_finished = False
Ejemplo n.º 7
0
    def __init__(self, build_request):
        """
        Set up the initial state of a build: store the request, take the next build id and build the state machine.

        :param build_request: the request this build was created from
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request
        self._artifacts_archive_file = self._build_artifact = None

        self._error_message = None
        # protects against separate threads calling prepare() more than once
        self._preparation_coin = SingleUseCoin()

        self._project_type = None
        # protects against more than one thread detecting the build's finish
        self._build_completion_lock = Lock()

        self._all_subjobs_by_id = {}
        # WIP(joey): Move subjob queues to BuildScheduler class.
        self._unstarted_subjobs = None
        self._finished_subjobs = self._failed_atoms = None
        # WIP(joey): Remove and use build state.
        self._postbuild_tasks_are_finished = False
        self._timing_file_path = None

        # The state machine is handed callbacks for entering the ERROR and CANCELED states.
        callbacks_on_enter = {}
        callbacks_on_enter[BuildState.ERROR] = self._on_enter_error_state
        callbacks_on_enter[BuildState.CANCELED] = self._on_enter_canceled_state
        self._state_machine = BuildFsm(build_id=self._build_id, enter_state_callbacks=callbacks_on_enter)
Ejemplo n.º 8
0
    def __init__(self, build_request):
        """
        Set up the initial state of a build: store the request, take the next build id and timestamp the QUEUED
        state.

        :param build_request: the request this build was created from
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None
        # :type: BuildArtifact
        self._build_artifact = None

        self._error_message = None
        self.is_prepared = self._setup_is_started = self._is_canceled = False
        # protects against separate threads calling prepare() more than once
        self._preparation_coin = SingleUseCoin()

        self._project_type = None
        # protects against more than one thread detecting the build's finish
        self._build_completion_lock = Lock()

        self._all_subjobs_by_id = {}
        # WIP: Move subjob queues to BuildScheduler class.
        self._unstarted_subjobs = None
        self._finished_subjobs = None
        self._failed_atoms = None
        self._postbuild_tasks_are_finished = False
        self._timing_file_path = None

        # Every status starts without a timestamp (None); QUEUED is then recorded right away.
        self._state_timestamps = dict.fromkeys(BuildStatus)
        self._record_state_timestamp(BuildStatus.QUEUED)
Ejemplo n.º 9
0
    def __init__(self, build_request):
        """
        Create a build for the given request, assign it the next build id and construct its state machine with
        enter- and leave-state callbacks.

        :param build_request: the request this build was created from
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request

        # protects against separate threads calling prepare() more than once
        self._preparation_coin = SingleUseCoin()
        # protects against more than one thread detecting the build's finish
        self._build_completion_lock = Lock()

        # Values that are filled in later.
        self._artifacts_tar_file = None  # DEPRECATED - Use zip file instead
        self._artifacts_zip_file = None
        self._build_artifact = None
        self._error_message = None
        self._project_type = None
        self._timing_file_path = None
        self._failed_atoms = None

        self._all_subjobs_by_id = OrderedDict()
        # WIP(joey): Move subjob queues to BuildScheduler class.
        self._unstarted_subjobs = None
        self._finished_subjobs = None
        # WIP(joey): Remove and use build state.
        self._postbuild_tasks_are_finished = False

        # Every state gets the same leave callback; only three states get an enter callback.
        on_leave_callbacks = {state: self._on_leave_state for state in BuildState}
        on_enter_callbacks = {
            BuildState.ERROR: self._on_enter_error_state,
            BuildState.CANCELED: self._on_enter_canceled_state,
            BuildState.PREPARING: self._on_enter_preparing_state,
        }
        self._state_machine = BuildFsm(
            build_id=self._build_id,
            enter_state_callbacks=on_enter_callbacks,
            leave_state_callbacks=on_leave_callbacks,
        )

        # Number of times build_setup has failed on this build. If
        # setup_failures increases beyond MAX_SETUP_FAILURES, the build is
        # cancelled
        self.setup_failures = 0
Ejemplo n.º 10
0
    def test_signal_shutdown_process_disconnects_from_master_before_killing_executors(self):
        """
        A graceful shutdown must send exactly one disconnect call to the master, kill each of the three executors,
        and issue the disconnect before the first kill.
        """
        disconnect_api_url = 'http://{}/v1/slave/1'.format(self._FAKE_MASTER_URL)
        mock_executor = self.patch('app.slave.cluster_slave.SubjobExecutor').return_value

        # create a parent mock so we can assert on the order of child mock calls.
        parent_mock = MagicMock()
        for child_mock, child_name in ((self.mock_network, 'mock_network'), (mock_executor, 'mock_executor')):
            parent_mock.attach_mock(child_mock, child_name)

        slave = self._create_cluster_slave(num_executors=3)
        slave.connect_to_master(self._FAKE_MASTER_URL)
        slave._build_teardown_coin = SingleUseCoin()
        self.trigger_graceful_app_shutdown()

        expected_disconnect_call = call.mock_network.put_with_digest(
            disconnect_api_url,
            request_params=ANY,
            secret=ANY,
            error_on_failure=ANY,
        )
        expected_kill_executor_call = call.mock_executor.kill()
        recorded_calls = parent_mock.method_calls

        self.assertEqual(
            1,
            recorded_calls.count(expected_disconnect_call),
            'Graceful shutdown should cause the slave to make a disconnect call to the master.',
        )
        self.assertEqual(
            3,
            recorded_calls.count(expected_kill_executor_call),
            'Graceful shutdown should cause the slave to kill all its executors.',
        )
        disconnect_position = recorded_calls.index(expected_disconnect_call)
        first_kill_position = recorded_calls.index(expected_kill_executor_call)
        self.assertLess(
            disconnect_position,
            first_kill_position,
            'Graceful shutdown should disconnect from the master before killing its executors.',
        )
Ejemplo n.º 11
0
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        Create an unprepared build for the given request and assign it the next build id.

        :param build_request: the request this build was created from
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None  # :type: BuildArtifact

        self._error_message = None
        self.is_prepared = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._is_canceled = False

        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish
        self._slaves_allocated = []
        self._num_executors_allocated = 0
        self._num_executors_in_use = 0

        # Both limits are unbounded until prepare() copies the real values from the job config.
        self._max_executors = float('inf')
        self._max_executors_per_slave = float('inf')

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # becomes a Queue in prepare()
        self._finished_subjobs = None  # becomes a Queue in prepare()
        self._postbuild_tasks_are_finished = False
        self._teardowns_finished = False
        self._timing_file_path = None

    def api_representation(self):
        """
        Gets a dict representing this build which can be returned in an API response.

        :rtype: dict
        """
        return {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': self._failed_atoms(),  # todo: print the file contents instead of paths
            'result': self._result(),
        }

    def prepare(self, subjobs, project_type, job_config):
        """
        Register the subjobs of this build and copy the executor limits from the job config. May only be called
        once per build.

        :type subjobs: list[Subjob]
        :type project_type: project_type.project_type.ProjectType
        :type job_config: JobConfig
        :raises RuntimeError: if prepare() has already been called on this build
        """
        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._project_type = project_type
        # NOTE(review): for an empty subjobs list maxsize is 0, which queue.Queue treats as "unbounded", so full()
        # never returns True on these queues -- confirm that callers never prepare a build without subjobs.
        self._unstarted_subjobs = Queue(maxsize=len(subjobs))
        self._finished_subjobs = Queue(maxsize=len(subjobs))

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._max_executors = job_config.max_executors
        self._max_executors_per_slave = job_config.max_executors_per_slave
        self._timing_file_path = project_type.timing_file_path(job_config.name)
        self.is_prepared = True

    def finish(self):
        """
        Called when all slaves are done with this build (and any teardown is complete)

        :raises RuntimeError: if not all subjobs are finished yet
        """
        if self._subjobs_are_finished:
            self._teardowns_finished = True
        else:
            raise RuntimeError('Tried to finish build {} but not all subjobs are complete'.format(self._build_id))

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    def needs_more_slaves(self):
        """
        Determine whether or not this build should have more slaves allocated to it. A build whose subjob queue
        has not been created yet (prepare() has not run) never needs slaves.

        :rtype: bool
        """
        if self._unstarted_subjobs is None:
            # Same guard that cancel() uses: the queue does not exist before prepare().
            return False
        return self._num_executors_allocated < self._max_executors and not self._unstarted_subjobs.empty()

    def allocate_slave(self, slave):
        """
        Allocate a slave to this build. This tells the slave to execute setup commands for this build.

        :type slave: Slave
        """
        self._slaves_allocated.append(slave)
        # A slave never contributes more executors than the per-slave limit allows.
        self._num_executors_allocated += min(slave.num_executors, self._max_executors_per_slave)

        slave.setup(self.build_id(), project_type_params=self.build_request.build_parameters())

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build
        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob
        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if this build has no subjob with the given id
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def begin_subjob_executions_on_slave(self, slave):
        """
        Begin subjob executions on a slave. This should be called once after the specified slave has already run
        build_setup commands for this build.

        :type slave: Slave
        """
        for slave_executor_count in range(slave.num_executors):
            # Stop claiming executors once either the build-wide or the per-slave limit is reached.
            if (self._num_executors_in_use >= self._max_executors
                    or slave_executor_count >= self._max_executors_per_slave):
                break
            slave.claim_executor()
            self._num_executors_in_use += 1
            self.execute_next_subjob_on_slave(slave)

    def execute_next_subjob_on_slave(self, slave):
        """
        Grabs an unstarted subjob off the queue and sends it to the specified slave to be executed. If the unstarted
        subjob queue is empty, we free one executor on the slave and, once the slave has no executors in use, tear
        it down.

        :type slave: Slave
        """
        try:
            subjob = self._unstarted_subjobs.get(block=False)
            self._logger.debug('Sending subjob {} (build {}) to slave {}.',
                               subjob.subjob_id(), subjob.build_id(), slave.url)
            slave.start_subjob(subjob)

        except Empty:
            num_executors_in_use = slave.free_executor()
            if num_executors_in_use == 0:
                try:
                    self._slaves_allocated.remove(slave)
                except ValueError:
                    pass  # We have already deallocated this slave, no need to teardown
                else:
                    slave.teardown()

    def handle_subjob_payload(self, subjob_id, payload=None):
        """
        Write the payload received for a subjob into this build's results directory and extract it there.

        :param subjob_id: id of the subjob the payload belongs to (only used for log messages)
        :type subjob_id: int
        :param payload: mapping with a 'filename' and a 'body' entry; a missing or empty payload is logged and
            ignored
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {}.', subjob_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._logger.debug('Payload for subjob {} written.', subjob_id)
        except Exception:
            # Log which subjob failed, then let the original error propagate to the caller.
            self._logger.warning('Writing payload for subjob {} FAILED.', subjob_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def mark_subjob_complete(self, subjob_id):
        """
        Record a subjob as finished and, if that was the last one, start the postbuild tasks on a separate thread.

        :type subjob_id: int
        """
        subjob = self._all_subjobs_by_id[int(subjob_id)]
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks,
                       name='PostBuild{}'.format(self._build_id)).start()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        self._error_message = failure_reason

    def cancel(self):
        """
        Cancel a running build
        """
        # Early exit if build is not running
        if self._status() in [BuildStatus.FINISHED, BuildStatus.ERROR, BuildStatus.CANCELED]:
            return

        self._is_canceled = True

        # Deplete the unstarted subjob queue.
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not
        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        # If several params are invalid, the message for the last invalid one is the one that is reported.
        for key, value in update_params.items():
            if key not in keys_and_values_allowed:
                message = 'Key ({}) is not in list of allowed keys ({})'.format(
                    key, ",".join(keys_and_values_allowed))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.format(
                    value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params
        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: Whether any of the params triggered an update
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status' and value == 'canceled':
                self.cancel()
                success = True
        return success

    @property
    def artifacts_archive_file(self):
        """The value produced by compressing the results directory, or None before the artifact is created."""
        return self._artifacts_archive_file

    @property
    def _num_subjobs_total(self):
        """Number of subjobs registered on this build."""
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """Number of subjobs in the finished queue (0 while that queue does not exist yet)."""
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """Total number of atomic commands over all subjobs, or None unless the build is BUILDING or FINISHED."""
        if self._status() not in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    @property
    def _subjobs_are_finished(self):
        """True if the build was canceled, or it is prepared and the finished-subjob queue is full."""
        return self._is_canceled or (self.is_prepared and self._finished_subjobs.full())

    @property
    def is_finished(self):
        """True if the build was canceled, or both the postbuild tasks and the teardowns have finished."""
        # TODO: Clean up this logic or move everything into a state machine
        build_fully_completed = self._postbuild_tasks_are_finished and self._teardowns_finished
        return self._is_canceled or build_fully_completed

    @property
    def is_unstarted(self):
        """True if the build is prepared, has no executors allocated and its unstarted-subjob queue is still full."""
        return self.is_prepared and self._num_executors_allocated == 0 and self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """True once an error message has been set on this build."""
        return self._error_message is not None

    @property
    def _detail_message(self):
        """Human-readable progress summary, or None while the build has no subjobs."""
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished, self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total)
        return None

    def _status(self):
        """
        Derive the status from the build's flags. The checks are ordered: an error wins over a cancellation, which
        wins over queued, finished and building.

        :rtype: BuildStatus
        """
        if self.has_error:
            return BuildStatus.ERROR
        elif self._is_canceled:
            return BuildStatus.CANCELED
        elif not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        elif self.is_finished:
            return BuildStatus.FINISHED
        else:
            return BuildStatus.BUILDING

    def _failed_atoms(self):
        """
        The commands which failed: an empty list for a canceled build, None while the build is unfinished.
        :rtype: list [str] | None
        """
        if self._is_canceled:
            return []

        if self.is_finished:
            # dict.values() returns a view object in python 3, so wrapping values() in a list
            return list(self._build_artifact.get_failed_commands().values())
        return None

    def _result(self):
        """
        The build result: FAILURE for a canceled build or one with failed commands, NO_FAILURES for a finished
        build without failed commands, None while the build is unfinished.

        :rtype: str | None
        """
        if self._is_canceled:
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_commands()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        # @TODO There is a race condition here where the build is marked finished before the results archive
        # is prepared.  If the user requests the build status before archival finishes, the 'artifacts'
        # value in the post body will be None.  self.is_finished should be conditional on whether archival
        # is finished.
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        self._postbuild_tasks_are_finished = True

    def _create_build_artifact(self):
        """
        Build the artifact for the results directory: generate the failures file, write the timing data and
        compress the directory into 'results.tar.gz'.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """
        Path of this build's results directory: the configured 'results_directory' joined with the build id.

        :rtype: str
        """
        return os.path.join(
            Configuration['results_directory'],
            str(self.build_id()),
        )
Ejemplo n.º 12
0
class ClusterSlave(object):

    API_VERSION = 'v1'

    def __init__(self, port, host, num_executors=10):
        """
        Create a slave with a fixed pool of subjob executors, all of which start out idle.

        :param port: The port number the slave service is running on
        :type port: int
        :param host: The hostname at which the slave is reachable
        :type host: str
        :param num_executors: The number of executors this slave should operate with -- this determines how many
            concurrent subjobs the slave can execute.
        :type num_executors: int
        """
        self._logger = log.get_logger(__name__)
        self.host = host
        self.port = port
        self.is_alive = True
        self._slave_id = None
        self._num_executors = num_executors

        # Every executor is registered by id and also placed in the idle queue.
        self.executors_by_id = {}
        self._idle_executors = Queue(maxsize=num_executors)
        for executor_id in range(num_executors):
            new_executor = SubjobExecutor(executor_id)
            self._idle_executors.put(new_executor)
            self.executors_by_id[executor_id] = new_executor

        self._master_url = None
        self._master_api = None  # stays unset until connect_to_master() is called
        self._network = Network(min_connection_poolsize=num_executors)

        # Per-build state; populated by setup_build().
        self._project_type = None
        self._current_build_id = None
        self._build_teardown_coin = None

    def api_representation(self):
        """
        Gets a dict representing this resource which can be returned in an API response.
        :rtype: dict [str, mixed]
        """
        executors_representation = [executor.api_representation() for executor in self.executors_by_id.values()]
        return {
            'is_alive': self.is_alive,
            'master_url': self._master_url,
            'current_build_id': self._current_build_id,
            'slave_id': self._slave_id,
            'executors': executors_representation,
        }

    def get_status(self):
        """
        Just returns a dumb message and prints it to the console.
        """
        return 'Slave service is up. <Port: {}>'.format(self.port)

    def setup_build(self, build_id, project_type_params, build_executor_start_index):
        """
        Usually called once per build to do build-specific setup. Records the build as current, creates the
        build-level project_type, verifies that every executor is idle, and then starts a separate thread
        that performs the actual setup so that this call does not block.

        :param build_id: The id of the build to run setup on
        :type build_id: int
        :param project_type_params: The parameters that define the project_type this build will execute in
        :type project_type_params: dict
        :param build_executor_start_index: How many executors have already been allocated on other slaves for
            this build
        :type build_executor_start_index: int
        :raises RuntimeError: if not all executors are idle
        """
        project_type_name = project_type_params.get('type')
        self._logger.info('Executing setup for build {} (type: {}).', build_id, project_type_name)
        self._current_build_id = build_id
        # A fresh coin per build protects against build teardown being executed multiple times.
        self._build_teardown_coin = SingleUseCoin()

        # This project_type instance is used for build-level operations.
        self._project_type = util.create_project_type(project_type_params)

        # Setup may only start when every executor is idle.
        if not self._idle_executors.full():
            idle_count = self._idle_executors.qsize()
            raise RuntimeError('Slave tried to setup build but not all executors are idle. ({}/{} executors idle.)'
                               .format(idle_count, self._num_executors))

        # Snapshot the idle executors for the setup thread, which configures a project_type on each of them
        # (for subjob-level operations).
        idle_executors = list(self._idle_executors.queue)
        setup_thread = SafeThread(
            target=self._async_setup_build,
            name='Bld{}-Setup'.format(build_id),
            args=(idle_executors, project_type_params, build_executor_start_index),
        )
        setup_thread.start()

    def _async_setup_build(self, executors, project_type_params, build_executor_start_index):
        """
        Thread target started by setup_build(). Fetches the project, configures a project_type on every
        executor, runs the job config setup, and then notifies the master whether setup completed or failed.

        :type executors: list[SubjobExecutor]
        :type project_type_params: dict
        :type build_executor_start_index: int
        """
        self._base_executor_index = build_executor_start_index
        try:
            self._project_type.fetch_project()
            for executor in executors:
                executor.configure_project_type(project_type_params)
            self._project_type.run_job_config_setup()

        except SetupFailureError as setup_error:
            self._logger.error(setup_error)
            self._logger.info('Notifying master that build setup has failed for build {}.', self._current_build_id)
            self._notify_master_of_state_change(SlaveState.SETUP_FAILED)
            return

        # Only reached when no SetupFailureError was raised.
        self._logger.info('Notifying master that build setup is complete for build {}.', self._current_build_id)
        self._notify_master_of_state_change(SlaveState.SETUP_COMPLETED)

    def teardown_build(self, build_id=None):
        """
        Called at the end of each build on each slave before it reports back to the master that it is idle again.
        The teardown itself runs on a separate thread so that this call does not block.

        :param build_id: The build id to teardown -- this parameter is used solely for correctness checking of the
            master, to make sure that the master is not erroneously sending teardown commands for other builds.
        :type build_id: int | None
        :raises BadRequestError: if no build is active, or if build_id does not match the active build
        """
        active_build_id = self._current_build_id
        if active_build_id is None:
            raise BadRequestError('Tried to teardown a build but no build is active on this slave.')

        if build_id is not None and build_id != active_build_id:
            raise BadRequestError('Tried to teardown build {}, '
                                  'but slave is running build {}!'.format(build_id, active_build_id))

        # Name the thread after the active build: build_id is optional, so formatting it directly would
        # produce 'BldNone-Teardwn' whenever the caller omits it.
        SafeThread(
            target=self._async_teardown_build,
            name='Bld{}-Teardwn'.format(active_build_id)
        ).start()

    def _async_teardown_build(self):
        """
        Called from teardown_build(). Do asynchronous teardown for the build so that we can make the call to
        teardown_build() non-blocking. Also take care of posting back to the master when teardown is complete.
        """
        self._do_build_teardown_and_reset()
        while not self._idle_executors.full():
            time.sleep(1)
        self._send_master_idle_notification()

    def _do_build_teardown_and_reset(self, timeout=None):
        """
        Kill any currently running subjobs. Run the teardown_build commands for the current build (with an optional
        timeout). Clear attributes related to the currently running build.

        :param timeout: A maximum time in seconds to allow the teardown process to run before killing
        :type timeout: int | None
        """
        # Kill all subjob executors' processes. This only has an effect if we are tearing down before a build completes.
        for executor in self.executors_by_id.values():
            executor.kill()

        # Order matters! Spend the coin if it has been initialized.
        if not self._build_teardown_coin or not self._build_teardown_coin.spend() or not self._project_type:
            return  # There is no build to tear down or teardown is already in progress.

        self._logger.info('Executing teardown for build {}.', self._current_build_id)
        # todo: Catch exceptions raised during teardown_build so we don't skip notifying master of idle/disconnect.
        self._project_type.teardown_build(timeout=timeout)
        self._logger.info('Build teardown complete for build {}.', self._current_build_id)
        self._current_build_id = None
        self._project_type = None

    def _send_master_idle_notification(self):
        if not self._is_master_responsive():
            self._logger.notice('Could not post idle notification to master because master is unresponsive.')
            return

        # Notify master that this slave is finished with teardown and ready for a new build.
        self._logger.info('Notifying master that this slave is ready for new builds.')
        self._notify_master_of_state_change(SlaveState.IDLE)

    def _disconnect_from_master(self):
        """
        Perform internal bookkeeping, as well as notify the master, that this slave is disconnecting itself
        from the slave pool.
        """
        self.is_alive = False

        if not self._is_master_responsive():
            self._logger.notice('Could not post disconnect notification to master because master is unresponsive.')
            return

        # Notify master that this slave is shutting down and should not receive new builds.
        self._logger.info('Notifying master that this slave is disconnecting.')
        self._notify_master_of_state_change(SlaveState.DISCONNECTED)

    def connect_to_master(self, master_url=None):
        """
        Register this slave with the master, store the slave id from the master's response, and register
        the teardown callbacks.

        :param master_url: The URL of the master service. If none specified, defaults to localhost:43000.
        :type master_url: str | None
        """
        self.is_alive = True
        self._master_url = master_url or 'localhost:43000'
        self._master_api = UrlBuilder(self._master_url)

        registration_payload = {
            'slave': '{}:{}'.format(self.host, self.port),
            'num_executors': self._num_executors,
        }
        response = self._network.post(self._master_api.url('slave'), data=registration_payload)
        assigned_slave_id = response.json().get('slave_id')
        self._slave_id = int(assigned_slave_id)
        self._logger.info('Slave {}:{} connected to master on {}.', self.host, self.port, self._master_url)

        # Teardown callbacks are executed in the reverse order that they're added. Registering the build
        # teardown first means the disconnect runs before it, so the master stops sending subjobs before the
        # build is torn down.
        exception_handler = UnhandledExceptionHandler.singleton()
        exception_handler.add_teardown_callback(self._do_build_teardown_and_reset, timeout=30)
        exception_handler.add_teardown_callback(self._disconnect_from_master)

    def _is_master_responsive(self):
        """
        Ping the master to check if it is still alive. Code using this method should treat the return value as a
        *probable* truth since the state of the master can change at any time. This method is not a replacement for
        error handling.

        :return: Whether the master is responsive or not
        :rtype: bool
        """
        # todo: This method repeats some logic we have in the deployment code (checking a service). We should DRY it up.
        is_responsive = True
        try:
            self._network.get(self._master_api.url())
        except requests.ConnectionError:
            is_responsive = False

        return is_responsive

    def start_working_on_subjob(self, build_id, subjob_id, subjob_artifact_dir, atomic_commands):
        """
        Claim an idle executor for the given subjob and start executing the subjob on a separate thread.

        :type build_id: int
        :type subjob_id: int
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        :return: The text to return in the API response.
        :rtype: dict[str, int]
        :raises BadRequestError: if build_id is not the build currently active on this slave
        """
        current_build_id = self._current_build_id
        if build_id != current_build_id:
            raise BadRequestError('Attempted to start subjob {} for build {}, '
                                  'but current build id is {}.'.format(subjob_id, build_id, current_build_id))

        # Taking an executor off the idle queue claims it as in-use; this blocks until one is available.
        claimed_executor = self._idle_executors.get()

        # The subjob itself runs on its own thread.
        subjob_thread = SafeThread(
            target=self._execute_subjob,
            args=(build_id, subjob_id, claimed_executor, subjob_artifact_dir, atomic_commands),
            name='Bld{}-Sub{}'.format(build_id, subjob_id),
        )
        subjob_thread.start()

        self._logger.info('Slave ({}:{}) has received subjob. (Build {}, Subjob {})', self.host, self.port, build_id,
                          subjob_id)
        return {'executor_id': claimed_executor.id}

    def _execute_subjob(self, build_id, subjob_id, executor, subjob_artifact_dir, atomic_commands):
        """
        Thread target for executing a subjob. Runs the subjob on the given executor, returns the executor to the
        idle queue, and posts the results file to the master's result endpoint for the subjob.

        :type build_id: int
        :type subjob_id: int
        :type executor: SubjobExecutor
        :type subjob_artifact_dir: str
        :type atomic_commands: list[str]
        """
        subjob_event_data = {'build_id': build_id, 'subjob_id': subjob_id, 'executor_id': executor.id}

        analytics.record_event(analytics.SUBJOB_EXECUTION_START, **subjob_event_data)
        results_file = executor.execute_subjob(build_id, subjob_id, subjob_artifact_dir, atomic_commands,
                                               self._base_executor_index)
        analytics.record_event(analytics.SUBJOB_EXECUTION_FINISH, **subjob_event_data)

        results_url = self._master_api.url('build', build_id, 'subjob', subjob_id, 'result')
        data = {
            'slave': '{}:{}'.format(self.host, self.port),
            'metric_data': {'executor_id': executor.id},
        }

        # Open the results file in a context manager so the handle is closed once the upload finishes or
        # fails; previously the handle was never closed, leaking one file descriptor per subjob.
        with open(results_file, 'rb') as results_payload:
            files = {'file': ('payload', results_payload, 'application/x-compressed')}

            self._idle_executors.put(executor)  # work is done; mark executor as idle
            self._network.post(results_url, data=data, files=files)  # todo: check return code

        self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id)

    def _notify_master_of_state_change(self, new_state):
        """
        Report a new slave state to the master with a digest-signed PUT to this slave's URL on the master.
        This is used to notify the master of events occurring on the slave related to build execution progress.

        :type new_state: SlaveState
        """
        state_payload = {'slave': {'state': new_state}}
        slave_url = self._master_api.url('slave', self._slave_id)
        self._network.put_with_digest(
            slave_url,
            request_params=state_payload,
            secret=Secret.get(),
            error_on_failure=True,
        )

    def kill(self):
        """
        Exits without error.
        """
        sys.exit(0)
Ejemplo n.º 13
0
    def test_coin_spend_returns_true_only_once(self):
        """Only the first spend() on a SingleUseCoin succeeds; the next two calls must return False."""
        coin = SingleUseCoin()

        first_spend = coin.spend()
        self.assertTrue(first_spend, 'First call to spend() should return True.')

        for _ in range(2):
            later_spend = coin.spend()
            self.assertFalse(later_spend, 'Subsequent calls to spend() should return False.')
Ejemplo n.º 14
0
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs

    :type _build_id: int
    :type _build_request: BuildRequest
    :type _build_artifact: None | BuildArtifact
    :type _error_message: None | str
    :type _project_type: None | ProjectType
    :type _timing_file_path: None | str
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        Create a build for the given request, assign it the next build id, and set up its state machine.

        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request
        self._build_artifact = None
        self._artifacts_zip_file = None
        self._artifacts_tar_file = None  # DEPRECATED - Use zip file instead

        self._error_message = None
        self._project_type = None
        self._timing_file_path = None

        # Protects against separate threads calling prepare() more than once.
        self._preparation_coin = SingleUseCoin()
        # Protects against more than one thread detecting the build's finish.
        self._build_completion_lock = Lock()

        self._all_subjobs_by_id = OrderedDict()
        self._unstarted_subjobs = None  # WIP(joey): Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None
        self._failed_atoms = None
        self._postbuild_tasks_are_finished = False  # WIP(joey): Remove and use build state.

        # The same leave-state callback is registered for every build state.
        self._state_machine = BuildFsm(
            build_id=self._build_id,
            enter_state_callbacks={
                BuildState.ERROR: self._on_enter_error_state,
                BuildState.CANCELED: self._on_enter_canceled_state,
                BuildState.PREPARING: self._on_enter_preparing_state,
            },
            leave_state_callbacks=dict.fromkeys(BuildState, self._on_leave_state),
        )

        # Number of times build_setup has failed on this build. If
        # setup_failures increases beyond MAX_SETUP_FAILURES, the build is
        # cancelled
        self.setup_failures = 0

    def api_representation(self):
        """
        Build the dict describing this build, suitable for returning in an API response.

        :rtype: dict [str, mixed]
        """
        failed_atoms = self._get_failed_atoms()
        failed_atoms_api_representation = None
        if failed_atoms is not None:
            failed_atoms_api_representation = [atom.api_representation() for atom in failed_atoms]

        # todo: PREPARING/PREPARED are new states -- make sure clients can handle them before exposing.
        build_state = self._status()
        if build_state in (BuildState.PREPARING, BuildState.PREPARED):
            build_state = BuildState.QUEUED  # reported as QUEUED until clients know the new states

        # Present the state timestamps as an OrderedDict to make the raw API response more readable. Entries
        # are sorted by numerically increasing timestamp, with None values sorting highest.
        timestamp_entries = [
            (state.lower(), timestamp)
            for state, timestamp in self._state_machine.transition_timestamps.items()
        ]
        timestamp_entries.sort(key=lambda entry: entry[1] or float('inf'))

        return {
            'id': self._build_id,
            'status': build_state,
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            'state_timestamps': OrderedDict(timestamp_entries),
        }

    def generate_project_type(self):
        """
        Create the project type for this build and store it in self._project_type.

        As a side effect, the build request's build_parameters dictionary is updated with the unique
        workspace directory path generated for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # The build-unique directory name is generated internally by ClusterRunner (it is a symlink that will
        # point at the actual project directory once the project has been fetched), so it has to be added to
        # the project type params by hand.
        project_type_params = self.build_request.build_parameters()
        project_type_params['build_project_directory'] = self._generate_unique_symlink_path_for_build_repo()

        created_project_type = util.create_project_type(project_type_params)
        self._project_type = created_project_type
        if created_project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self):
        """
        Validate that the build has a request and a project type, then trigger the START_PREPARE event.

        :raises RuntimeError: if the request or project type is missing, or if prepare() was already called
        """
        build_id = self._build_id

        request_is_valid = isinstance(self.build_request, BuildRequest)
        if not request_is_valid:
            raise RuntimeError('Build {} has no associated request object.'.format(build_id))

        project_is_valid = isinstance(self.project_type, ProjectType)
        if not project_is_valid:
            raise RuntimeError('Build {} has no project set.'.format(build_id))

        # The coin can only be spent once, which rejects any repeated call.
        first_call = self._preparation_coin.spend()
        if not first_call:
            raise RuntimeError('prepare() was called more than once on build {}.'.format(build_id))

        self._state_machine.trigger(BuildEvent.START_PREPARE)

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    @property
    def build_request(self):
        """
        :rtype: BuildRequest
        """
        return self._build_request

    def get_subjobs(self, offset: int=None, limit: int=None) -> List['Subjob']:
        """
        Return one page of this build's subjobs, in the iteration order of the subjob dict.
        :param offset: The starting index of the requested subjobs
        :param limit: The number of subjobs requested
        """
        all_subjobs = self._all_subjobs_by_id
        start, end = get_paginated_indices(offset, limit, len(all_subjobs))
        # islice avoids building a list of every subjob just to take one page of them.
        return list(islice(all_subjobs.values(), start, end))

    def subjob(self, subjob_id: int) -> Subjob:
        """Return the subjob for this build with the specified id."""
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete.
        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read the exit code file from each atom's artifact directory and store the value on the atom.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        for atom_id, atom in enumerate(subjob.atoms):
            atom_artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory']
            )
            exit_code_path = os.path.join(atom_artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(exit_code_path, 'r') as exit_code_file:
                atom.exit_code = int(exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except:
            internal_errors.labels(ErrorType.SubjobWriteFailure).inc()  # pylint: disable=no-member
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for _, subjob in self._all_subjobs_by_id.items():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            should_trigger_postbuild_tasks = self._all_subjobs_are_finished() and not self.is_stopped

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if should_trigger_postbuild_tasks:
            self._logger.info("All results received for build {}!", self._build_id)
            self.finish()

    def mark_started(self):
        """
        Trigger the START_BUILDING event on the build's state machine.
        """
        start_event = BuildEvent.START_BUILDING
        self._state_machine.trigger(start_event)

    def finish(self):
        """
        Perform postbuild task and mark this build as finished.
        """
        Thread(
            target=self._perform_async_postbuild_tasks,
            name='PostBuild{}'.format(self._build_id),
        ).start()

    def mark_failed(self, failure_reason):
        """
        Trigger the FAIL event on the build's state machine, passing the failure reason as the error message.
        The reason should be presentable to the end user of ClusterRunner, so try not to include detailed
        references to internal implementation.
        :type failure_reason: str
        """
        fail_event = BuildEvent.FAIL
        self._state_machine.trigger(fail_event, error_msg=failure_reason)

    def _on_enter_error_state(self, event):
        """
        Store an error message for the build and log the failure. This method is triggered by
        a state machine transition to the ERROR state.
        :param event: The Fysom event object
        """
        # WIP(joey): Should this be a reenter_state callback also? Should it check for previous error message?
        default_error_msg = 'An unspecified error occurred.'
        self._error_message = getattr(event, 'error_msg', default_error_msg)
        self._logger.warning('Build {} failed: {}', self.build_id(), self._error_message)

    def _on_enter_preparing_state(self, event):
        """
        State machine callback for entering the PREPARING state. Fetches the project, computes the build's
        subjobs, fills the subjob queues, creates the results directory, and triggers FINISH_PREPARE.
        :param event: The Fysom event object
        :type event: BuildEvent
        :raises RuntimeError: if the project type returns no job config
        """
        build_id = self._build_id
        project_type = self.project_type

        self._logger.info('Fetching project for build {}.', build_id)
        project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', build_id)

        job_config = project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = compute_subjobs_for_build(build_id, job_config, project_type)
        subjob_count = len(subjobs)

        # Both queues are sized to hold exactly the subjobs of this build.
        self._unstarted_subjobs = Queue(maxsize=subjob_count)  # WIP(joey): Move this into BuildScheduler?
        self._finished_subjobs = Queue(maxsize=subjob_count)  # WIP(joey): Remove this and just record finished count.

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        app.util.fs.create_dir(self._build_results_dir())
        self._state_machine.trigger(BuildEvent.FINISH_PREPARE)

    def _on_leave_state(self, event):
        start_time = self._state_machine.transition_timestamps.get(event.src)
        if start_time is not None:
            elapsed = time.time() - start_time
            build_state_duration_seconds.labels(event.src.value).observe(elapsed)  # pylint: disable=no-member
        else:
            self._logger.warn('Build {} transitioned from state {} to state {} but never marked started timestamp for {}',
                              self._build_id, event.src, event.dst, event.src)

    def cancel(self):
        """
        Trigger the CANCEL event on the build's state machine.
        """
        cancel_event = BuildEvent.CANCEL
        self._state_machine.trigger(cancel_event)

    def _on_enter_canceled_state(self, event):
        """
        :param event: The Fysom event object
        :type event: BuildEvent
        """
        self._logger.notice('Canceling build {}.', self._build_id)
        # Set the kill_event to kill the subprocesses for the build
        self.project_type.kill_subprocesses()

        # Deplete the unstarted subjob queue.
        # WIP(joey): Just remove this completely and adjust behavior of other methods based on self._is_canceled().
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not
        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed.keys():
                message = 'Key ({}) is not in list of allowed keys ({})'.\
                    format(key, ",".join(keys_and_values_allowed.keys()))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.\
                    format(value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params
        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        """
        success = False
        for key, value in update_params.items():
            if key == 'status':
                if value == 'canceled':
                    self.cancel()
                    success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def artifacts_zip_file(self):
        """Return the local path to the artifacts zip archive."""
        return self._artifacts_zip_file

    @property
    def artifacts_tar_file(self):
        """
        DEPRECATED: We are transitioning to zip files from tar.gz files for artifacts.
        Return the local path to the artifacts tar.gz archive.
        """
        self._logger.warning('The tar format for build artifact files is deprecated. File: {}',
                             self._artifacts_tar_file)
        return self._artifacts_tar_file

    # WIP(joey): Change some of these private @properties to methods.
    @property
    def _num_subjobs_total(self):
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        The total number of atomic commands across all subjobs, or None unless the build is in the BUILDING
        or FINISHED state.
        """
        # todo: blacklist states instead of whitelist, or just check _all_subjobs_by_id directly
        states_with_atoms = [BuildState.BUILDING, BuildState.FINISHED]
        if self._status() not in states_with_atoms:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    def _all_subjobs_are_finished(self):
        return self._finished_subjobs and self._finished_subjobs.full()

    @property
    def is_finished(self):
        """Whether the build is done: it was either canceled or its postbuild tasks have finished."""
        # WIP(joey): Calling logic should check _is_canceled if it needs to instead of including the check here.
        canceled = self.is_canceled
        return canceled or self._postbuild_tasks_are_finished

    @property
    def _detail_message(self):
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):  # WIP(joey): Rename to _state.
        """
        :rtype: BuildState
        """
        return self._state_machine.state

    @property
    def has_error(self):
        """Whether the build is currently in the ERROR state."""
        current_state = self._status()
        return current_state is BuildState.ERROR

    @property
    def is_canceled(self):
        """Whether the build is currently in the CANCELED state."""
        current_state = self._status()
        return current_state is BuildState.CANCELED

    @property
    def is_stopped(self):
        """Whether the build is in one of the stopped states (ERROR or CANCELED)."""
        stopped_states = (BuildState.ERROR, BuildState.CANCELED)
        return self._status() in stopped_states

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns empty set if
        build has completed and no atoms have failed.
        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self.is_canceled:
                return []

            self._failed_atoms = []
            for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids():
                subjob = self.subjob(subjob_id)
                atom = subjob.atoms[atom_id]
                self._failed_atoms.append(atom)

        return self._failed_atoms

    def _result(self):
        """
        Compute the overall result of the build. Can return three values:
            None: the build is neither canceled nor finished
            FAILURE: the build was canceled, or finished with at least one failed atom
            NO_FAILURES: the build finished and the build artifact reports no failed atoms

        :rtype: BuildResult | None
        """
        if self.is_canceled:
            return BuildResult.FAILURE

        if not self.is_finished:
            return None

        failed_ids = self._build_artifact.get_failed_subjob_and_atom_ids()
        return BuildResult.NO_FAILURES if len(failed_ids) == 0 else BuildResult.FAILURE

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, execute certain tasks like archiving the artifacts and writing timing
        data. This method also transitions the FSM to finished after the postbuild tasks are complete.

        Any exception raised along the way is counted in the internal error metric, logged, and turned
        into a build failure via mark_failed() instead of propagating to the caller.
        """
        try:
            timing_data = self._read_subjob_timings_from_results()
            self._create_build_artifact(timing_data)
            # Record the summed timing values of all subjobs as this build's serialized time.
            serialized_build_time_seconds.observe(sum(timing_data.values()))
            # Must run after _create_build_artifact(); see _delete_temporary_build_artifact_files().
            self._delete_temporary_build_artifact_files()
            # The flag is set before the FSM event is triggered and before the build is saved.
            self._postbuild_tasks_are_finished = True
            self._state_machine.trigger(BuildEvent.POSTBUILD_TASKS_COMPLETE)
            self._logger.notice('Completed build (id: {}), saving to database.'.format(self._build_id))
            self.save()

        except Exception as ex:  # pylint: disable=broad-except
            # Broad catch is deliberate: any failure here (including in save()) marks the build as failed.
            internal_errors.labels(ErrorType.PostBuildFailure).inc()  # pylint: disable=no-member
            self._logger.exception('Postbuild tasks failed for build {}.'.format(self._build_id))
            self.mark_failed('Postbuild tasks failed due to an internal error: "{}"'.format(ex))

    def _create_build_artifact(self, timing_data: Dict[str, float]):  # pylint: disable=unsubscriptable-object
        """
        Generate the failures file and timing data for this build, then archive the build results
        directory as both a tar file and a zip file.

        Failures while creating the zip file are counted and logged but otherwise ignored (issue #339).

        :param timing_data: The timing data to write to this build's timing file
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, timing_data)
        self._artifacts_tar_file = app.util.fs.tar_directory(self._build_results_dir(),
                                                             BuildArtifact.ARTIFACT_TARFILE_NAME)
        holding_dir = None
        temp_tar_path = None
        try:
            # Temporarily move aside tar file so we can create a zip file, then move it back.
            # This juggling can be removed once we're no longer creating tar artifacts.
            # The tar is parked inside a private directory from mkdtemp() rather than at a path from the
            # deprecated tempfile.mktemp(), whose name can be claimed by another process before it is used.
            holding_dir = tempfile.mkdtemp()
            temp_tar_path = shutil.move(self._artifacts_tar_file, holding_dir)
            self._artifacts_zip_file = app.util.fs.zip_directory(self._build_results_dir(),
                                                                 BuildArtifact.ARTIFACT_ZIPFILE_NAME)
        except Exception:  # pylint: disable=broad-except
            internal_errors.labels(ErrorType.ZipFileCreationFailure).inc()  # pylint: disable=no-member

            # Due to issue #339 we are ignoring exceptions in the zip file creation for now.
            self._logger.exception('Zipping of artifacts failed. This error will be ignored.')
        finally:
            if temp_tar_path:
                shutil.move(temp_tar_path, self._artifacts_tar_file)
            # Only reached once the tar is back in place (or was never moved), so nothing of value is lost.
            if holding_dir:
                shutil.rmtree(holding_dir, ignore_errors=True)

    def _delete_temporary_build_artifact_files(self):
        """
        Delete the temporary build result files that are no longer needed, due to the creation of the
        build artifact tarball.

        ONLY call this method after _create_build_artifact() has completed. Otherwise we have lost the build results.

        Deletion is best-effort: directories are removed with errors ignored, and a file that cannot be
        removed is logged and skipped.
        """
        build_result_dir = self._build_results_dir()
        start_time = time.time()
        for path in os.listdir(build_result_dir):
            # The build result archive is also stored in this same directory, so we must not delete it.
            if path in (BuildArtifact.ARTIFACT_TARFILE_NAME, BuildArtifact.ARTIFACT_ZIPFILE_NAME):
                continue
            full_path = os.path.join(build_result_dir, path)
            # Do NOT use app.util.fs.async_delete() here. That call will generate a temp directory for every
            # atom, which can be in the thousands per build, and can lead to running up against the ulimit -Hn.
            # os.path.isdir must be *called*: the bare function object is always truthy, which previously sent
            # plain files to rmtree(), where the failure was silently ignored and the file was left behind.
            # Symlinks are unlinked rather than traversed, since rmtree() refuses to operate on them.
            if os.path.isdir(full_path) and not os.path.islink(full_path):
                shutil.rmtree(full_path, ignore_errors=True)
            else:
                try:
                    os.remove(full_path)
                except OSError as ex:
                    # Keep cleanup best-effort, matching ignore_errors=True for directories above.
                    self._logger.warning('Could not delete artifact file {}: {}', full_path, ex)
        elapsed_seconds = time.time() - start_time
        self._logger.info('Completed deleting artifact files for {}, took {:.1f} seconds.',
                          self._build_id, elapsed_seconds)

    def _build_results_dir(self):
        """Return this build's artifact directory under the configured results directory."""
        results_root = Configuration['results_directory']
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=results_root)

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Build a unique path, inside the configured build symlink directory, for a build-specific repo.
        Only the path is produced here; the symlink itself is NOT created.

        :rtype: str
        """
        unique_name = str(uuid.uuid4())
        symlink_root = Configuration['build_symlink_directory']
        return os.path.join(symlink_root, unique_name)

    # pylint: disable=protected-access
    def save(self):
        """
        Serialize the Build object and update all of the parts to the database.

        The build row itself is updated in place. The failed artifact directory rows, failed
        subjob/atom pair rows, subjob rows and atom rows for this build are deleted and re-added.

        :raises ItemNotFoundError: if no row exists in the database for this build's id
        """
        with Connection.get() as session:
            build_schema = session.query(BuildSchema).filter(BuildSchema.build_id == self._build_id).first()

            # If this wasn't found, it's safe to assume that the build doesn't exist within the database
            if build_schema is None:
                raise ItemNotFoundError('Unable to find build (id: {}) in database.'.format(self._build_id))

            build_schema.artifacts_tar_file = self._artifacts_tar_file
            build_schema.artifacts_zip_file = self._artifacts_zip_file
            build_schema.error_message = self._error_message
            build_schema.postbuild_tasks_are_finished = self._postbuild_tasks_are_finished
            build_schema.setup_failures = self.setup_failures
            build_schema.timing_file_path = self._timing_file_path

            build_artifact = self._build_artifact
            build_schema.build_artifact_dir = None if build_artifact is None else build_artifact.build_artifact_dir

            if build_artifact is not None:
                # Clear all old directories
                session.query(FailedArtifactDirectoriesSchema) \
                    .filter(FailedArtifactDirectoriesSchema.build_id == self._build_id) \
                    .delete()

                # Commit changes so we don't delete the newly added rows later
                session.commit()

                # Add all the updated versions of the directories
                for directory in build_artifact._get_failed_artifact_directories():
                    session.add(FailedArtifactDirectoriesSchema(
                        build_id=self._build_id,
                        failed_artifact_directory=directory
                    ))

                # Clear all old failed subjob/atom pairs
                session.query(FailedSubjobAtomPairsSchema) \
                    .filter(FailedSubjobAtomPairsSchema.build_id == self._build_id) \
                    .delete()

                # Commit changes so we don't delete the newly added rows later
                session.commit()

                # Add all the updated versions of the data
                for subjob_id, atom_id in build_artifact.get_failed_subjob_and_atom_ids():
                    session.add(FailedSubjobAtomPairsSchema(
                        build_id=self._build_id,
                        subjob_id=subjob_id,
                        atom_id=atom_id
                    ))

            build_schema.build_parameters = json.dumps(self._build_request.build_parameters())

            # Timestamp keys are the lower-cased state names taken from the state machine.
            fsm_timestamps = {
                state.lower(): timestamp
                for state, timestamp in self._state_machine.transition_timestamps.items()
            }
            build_schema.state = self._status()
            build_schema.queued_ts = fsm_timestamps['queued']
            build_schema.finished_ts = fsm_timestamps['finished']
            build_schema.prepared_ts = fsm_timestamps['prepared']
            build_schema.preparing_ts = fsm_timestamps['preparing']
            build_schema.error_ts = fsm_timestamps['error']
            build_schema.canceled_ts = fsm_timestamps['canceled']
            build_schema.building_ts = fsm_timestamps['building']

            # Subjobs
            # Clear all old Subjobs and Atoms
            session.query(SubjobsSchema) \
                .filter(SubjobsSchema.build_id == self._build_id) \
                .delete()
            session.query(AtomsSchema) \
                .filter(AtomsSchema.build_id == self._build_id) \
                .delete()

            # Commit changes so we don't delete the newly added rows later
            session.commit()

            # Add all the updated versions of Subjobs and Atoms
            for subjob_id, subjob in self._all_subjobs_by_id.items():
                session.add(SubjobsSchema(
                    subjob_id=subjob_id,
                    build_id=self._build_id,
                    completed=subjob.completed
                ))

                # Atoms
                for atom in subjob._atoms:
                    session.add(AtomsSchema(
                        atom_id=atom.id,
                        build_id=self._build_id,
                        subjob_id=subjob_id,
                        command_string=atom.command_string,
                        expected_time=atom.expected_time,
                        actual_time=atom.actual_time,
                        exit_code=atom.exit_code,
                        state=atom.state
                    ))

    @classmethod
    def load_from_db(cls, build_id):
        """
        Given a build_id, fetch all the stored information from the database to reconstruct
        a Build object to represent that build.

        :param build_id: The id of the build to recreate.
        :return: The reconstructed build, or None if no build with that id is stored
        """
        with Connection.get() as session:
            build_schema = session.query(BuildSchema).filter(BuildSchema.build_id == build_id).first()

            # If the query returns nothing, the build wasn't found in the database; skip the other queries.
            if not build_schema:
                return None

            failed_artifact_directories_schema = session.query(FailedArtifactDirectoriesSchema) \
                .filter(FailedArtifactDirectoriesSchema.build_id == build_id) \
                .all()
            failed_subjob_atom_pairs_schema = session.query(FailedSubjobAtomPairsSchema) \
                .filter(FailedSubjobAtomPairsSchema.build_id == build_id) \
                .all()
            atoms_schema = session.query(AtomsSchema).filter(AtomsSchema.build_id == build_id).all()
            subjobs_schema = session.query(SubjobsSchema).filter(SubjobsSchema.build_id == build_id).all()

            build_parameters = json.loads(build_schema.build_parameters)

            # Generate a BuildRequest object with our query response
            build_request = BuildRequest(build_parameters)

            # Create initial Build object, we will be altering the state of this as we get more data.
            # cls is used instead of a hard-coded class name so a subclass reconstructs its own type.
            build = cls(build_request)
            build._build_id = build_id

            # Manually generate ProjectType object for build and create a `job_config` since this is usually done in `prepare()`
            build.generate_project_type()
            job_config = build.project_type.job_config()

            # Manually update build data
            build._artifacts_tar_file = build_schema.artifacts_tar_file
            build._artifacts_zip_file = build_schema.artifacts_zip_file
            build._error_message = build_schema.error_message
            build._postbuild_tasks_are_finished = bool(int(build_schema.postbuild_tasks_are_finished))
            build.setup_failures = build_schema.setup_failures
            build._timing_file_path = build_schema.timing_file_path

            # Manually set the state machine timestamps
            build._state_machine._transition_timestamps = {
                BuildState.QUEUED: build_schema.queued_ts,
                BuildState.FINISHED: build_schema.finished_ts,
                BuildState.PREPARED: build_schema.prepared_ts,
                BuildState.PREPARING: build_schema.preparing_ts,
                BuildState.ERROR: build_schema.error_ts,
                BuildState.CANCELED: build_schema.canceled_ts,
                BuildState.BUILDING: build_schema.building_ts
            }
            build._state_machine._fsm.current = BuildState[build_schema.state]

            build_artifact = BuildArtifact(build_schema.build_artifact_dir)
            build_artifact._failed_artifact_directories = [
                directory.failed_artifact_directory for directory in failed_artifact_directories_schema
            ]
            build_artifact._q_failed_subjob_atom_pairs = [
                (pair.subjob_id, pair.atom_id) for pair in failed_subjob_atom_pairs_schema
            ]
            build._build_artifact = build_artifact

            atoms_by_subjob_id = {}
            for atom in atoms_schema:
                atoms_by_subjob_id.setdefault(atom.subjob_id, []).append(Atom(
                    atom.command_string,
                    atom.expected_time,
                    atom.actual_time,
                    atom.exit_code,
                    atom.state,
                    atom.atom_id,
                    atom.subjob_id
                ))

            subjobs = OrderedDict()
            for subjob in subjobs_schema:
                # A subjob with no stored atoms gets an empty list instead of raising KeyError.
                atoms = atoms_by_subjob_id.get(subjob.subjob_id, [])
                # Add atoms after subjob is created so we don't alter their state on initialization
                subjob_to_add = Subjob(build_id, subjob.subjob_id, build.project_type, job_config, [])
                subjob_to_add._atoms = atoms
                subjob_to_add.completed = subjob.completed
                subjobs[subjob.subjob_id] = subjob_to_add
            build._all_subjobs_by_id = subjobs

            # Place subjobs into correct queues within the build
            build._unstarted_subjobs = Queue(maxsize=len(subjobs))
            build._finished_subjobs = Queue(maxsize=len(subjobs))
            for subjob in subjobs.values():
                if subjob.completed:
                    build._finished_subjobs.put(subjob)
                else:
                    build._unstarted_subjobs.put(subjob)

            return build
Ejemplo n.º 15
0
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs

    :type _build_id: int
    :type _build_request: BuildRequest
    :type _build_artifact: None | BuildArtifact
    :type _error_message: None | str
    :type _project_type: None | ProjectType
    :type _timing_file_path: None | str
    """
    _build_id_counter = Counter(
    )  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        Create a new build for the given request, assigning it the next id from the class-level counter.

        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None

        self._error_message = None
        # protects against separate threads calling prepare() more than once
        self._preparation_coin = SingleUseCoin()

        self._project_type = None
        # protects against more than one thread detecting the build's finish
        self._build_completion_lock = Lock()

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP(joey): Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None
        self._failed_atoms = None
        self._postbuild_tasks_are_finished = False  # WIP(joey): Remove and use build state.
        self._timing_file_path = None

        # Callbacks invoked by the state machine when the build enters the given state.
        enter_state_callbacks = {
            BuildState.ERROR: self._on_enter_error_state,
            BuildState.CANCELED: self._on_enter_canceled_state,
        }
        self._state_machine = BuildFsm(build_id=self._build_id, enter_state_callbacks=enter_state_callbacks)

    def api_representation(self):
        """
        Build the dictionary describing this build in API responses.

        The PREPARING and PREPARED states are reported as QUEUED. 'failed_atoms' is None when no
        failed-atom list is available yet.

        :rtype: dict
        """
        # Fetch the failed atoms once and reuse the result for both the None check and the conversion.
        failed_atoms = self._get_failed_atoms()
        failed_atoms_api_representation = None
        if failed_atoms is not None:
            failed_atoms_api_representation = [failed_atom.api_representation() for failed_atom in failed_atoms]

        build_state = self._status()
        # todo: PREPARING/PREPARED are new states -- make sure clients can handle them before exposing.
        if build_state in (BuildState.PREPARING, BuildState.PREPARED):
            build_state = BuildState.QUEUED

        return {
            'id': self._build_id,
            'status': build_state,
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            # Convert self._state_timestamps to OrderedDict to make raw API response more readable. Sort the entries
            # by numerically increasing dict value, with None values sorting highest.
            'state_timestamps': OrderedDict(
                sorted(
                    [(state.lower(), timestamp)
                     for state, timestamp in self._state_machine.transition_timestamps.items()],
                    key=lambda item: item[1] or float('inf'))),
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build and store it in self._project_type.

        As a side effect, the build request's build_parameters dictionary gains a
        'build_project_directory' entry holding the unique workspace path generated for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # The unique build directory name will be symlinked to the actual project directory later on,
        # when the project gets fetched.
        symlink_path = self._generate_unique_symlink_path_for_build_repo()

        # This path is entirely internal and generated by ClusterRunner (it is a build-unique generated
        # symlink), so it has to be added to the project type params by hand.
        project_type_params = self.build_request.build_parameters()
        project_type_params['build_project_directory'] = symlink_path

        self._project_type = util.create_project_type(project_type_params)
        if self._project_type is not None:
            return
        raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, compute this build's subjobs, queue them as unstarted, and create the build
        results directory. The state machine is moved through START_PREPARE and FINISH_PREPARE.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: if the build has no request object or project type, if prepare() was
            already called on this build, or if no job config could be obtained
        """
        build_id = self._build_id

        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(build_id))

        self._state_machine.trigger(BuildEvent.START_PREPARE)
        # WIP(joey): Move the following code into a PREPARING state callback
        #  (so that it won't execute if the build has already been canceled.)

        self._logger.info('Fetching project for build {}.', build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(build_id, job_config, self.project_type)

        # Both queues are sized to hold every subjob of this build.
        queue_capacity = len(subjobs)
        self._unstarted_subjobs = Queue(maxsize=queue_capacity)  # WIP(joey): Move this into BuildScheduler?
        self._finished_subjobs = Queue(maxsize=queue_capacity)  # WIP(joey): Remove this and just record finished count.

        for new_subjob in subjobs:
            self._all_subjobs_by_id[new_subjob.subjob_id()] = new_subjob
            self._unstarted_subjobs.put(new_subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        app.util.fs.create_dir(self._build_results_dir())
        self._state_machine.trigger(BuildEvent.FINISH_PREPARE)

    def build_id(self):
        """
        Return the id assigned to this build.

        :rtype: int
        """
        assigned_id = self._build_id
        return assigned_id

    @property
    def build_request(self):
        """
        The request object this build was created from.

        :rtype: BuildRequest
        """
        request = self._build_request
        return request

    def all_subjobs(self):
        """
        Return a new list containing every subjob registered on this build.

        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Look up a single subjob of this build by its id.

        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob is registered under the given id
        """
        found = self._all_subjobs_by_id.get(subjob_id)
        if found is not None:
            return found
        raise ItemNotFoundError('Invalid subjob id.')

    def complete_subjob(self, subjob_id, payload=None):
        """
        Process the subjob's payload and then mark the subjob as complete for this build. If either
        step raises, the error is logged, the build is marked as failed, and the exception is re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)
        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            failure_reason = 'Error occurred while completing subjob {}.'.format(subjob_id)
            self.mark_failed(failure_reason)
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read the exit code file from every atom's artifact directory of the given subjob and store the
        parsed integer on the corresponding atom.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        for atom_id, atom in enumerate(subjob.atoms):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory'],
            )
            exit_code_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(exit_code_path, 'r') as exit_code_file:
                atom.exit_code = int(exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.',
                                 subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(),
                                        payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except:
            self._logger.warning(
                'Writing payload for subjob {} of build {} FAILED.', subjob_id,
                self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for _, subjob in self._all_subjobs_by_id.items():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Mark the given subjob as completed and add it to the finished-subjobs queue. If that makes
        every subjob finished and the build is not stopped, the postbuild tasks are started on a
        separate thread.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        # The put and the "is everything finished?" check happen under one lock so that only a single
        # caller can observe the queue becoming full.
        with self._build_completion_lock:
            # block=False: fail immediately instead of waiting if the queue has no free slot.
            self._finished_subjobs.put(subjob, block=False)
            should_trigger_postbuild_tasks = self._all_subjobs_are_finished(
            ) and not self._is_stopped()

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if should_trigger_postbuild_tasks:
            self._logger.info("All results received for build {}!",
                              self._build_id)
            # The postbuild work runs on its own thread, outside of the completion lock.
            SafeThread(target=self._perform_async_postbuild_tasks,
                       name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """
        Mark the build as started by sending the START_BUILDING event to the state machine.
        """
        start_event = BuildEvent.START_BUILDING
        self._state_machine.trigger(start_event)

    def finish(self):
        """
        Run the postbuild tasks for this build. The call is made directly on the calling thread; the
        postbuild task method is also what moves the state machine to finished once it completes.
        """
        run_postbuild_tasks = self._perform_async_postbuild_tasks
        run_postbuild_tasks()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed by sending the FAIL event, with the failure reason attached, to the
        state machine. The reason should be presentable to the end user of ClusterRunner, so avoid
        detailed references to internal implementation.

        :type failure_reason: str
        """
        fail_event = BuildEvent.FAIL
        self._state_machine.trigger(fail_event, error_msg=failure_reason)

    def _on_enter_error_state(self, event):
        """
        Store an error message for the build and log the failure. This method is triggered by
        a state machine transition to the ERROR state.
        :param event: The Fysom event object
        """
        # WIP(joey): Should this be a reenter_state callback also? Should it check for previous error message?
        default_error_msg = 'An unspecified error occurred.'
        self._error_message = getattr(event, 'error_msg', default_error_msg)
        self._logger.warning('Build {} failed: {}', self.build_id(),
                             self._error_message)

    def cancel(self):
        """
        Log the cancellation request and send the CANCEL event to the state machine.
        """
        self._logger.notice('Request received to cancel build {}.', self._build_id)
        cancel_event = BuildEvent.CANCEL
        self._state_machine.trigger(cancel_event)

    def _on_enter_canceled_state(self, event):
        # Deplete the unstarted subjob queue.
        # WIP(joey): Just remove this completely and adjust behavior of other methods based on self._is_canceled().
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty(
        ):
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Check a dict of update params against the allowed keys and values. When several entries are
        invalid, the error message describes the last invalid entry encountered.

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        allowed = {'status': ['canceled']}
        error = None
        for key, value in update_params.items():
            allowed_values = allowed.get(key)
            if allowed_values is None:
                error = 'Key ({}) is not in list of allowed keys ({})'.format(key, ",".join(allowed))
            elif value not in allowed_values:
                error = 'Value ({}) is not in list of allowed values ({}) for {}'.format(
                    value, allowed_values, key)

        if error is None:
            return True, {}
        return False, {'error': error}

    def update_state(self, update_params):
        """
        Apply a set of update params to this build. Only a 'status' of 'canceled' has any effect: it
        cancels the build. All other entries are ignored.

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if the build was canceled by this call, False otherwise
        :rtype: bool
        """
        for key, value in update_params.items():
            if (key, value) == ('status', 'canceled'):
                self.cancel()
                return True
        return False

    @property
    def project_type(self):
        """
        The project type currently stored on this build (the value of self._project_type).

        :rtype: ProjectType
        """
        stored_project_type = self._project_type
        return stored_project_type

    @property
    def artifacts_archive_file(self):
        """
        The value stored in self._artifacts_archive_file (None until it has been assigned).
        """
        archive_file = self._artifacts_archive_file
        return archive_file

    # WIP(joey): Change some of these private @properties to methods.
    @property
    def _num_subjobs_total(self):
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize(
        )

    @property
    def _num_atoms(self):
        """
        Total number of atomic commands across all subjobs, or None unless the build is BUILDING or FINISHED.

        :rtype: int | None
        """
        # todo: blacklist states instead of whitelist, or just check _all_subjobs_by_id directly
        if self._status() in [BuildState.BUILDING, BuildState.FINISHED]:
            return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())
        return None

    def _all_subjobs_are_finished(self):
        return self._finished_subjobs and self._finished_subjobs.full()

    @property
    def is_finished(self):
        """
        True when the build has been canceled or its postbuild tasks have finished.
        """
        # WIP(joey): Calling logic should check _is_canceled if it needs to instead of including the check here.
        canceled = self._is_canceled()
        return canceled or self._postbuild_tasks_are_finished

    @property
    def _detail_message(self):
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished, self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total)
        return None

    def _status(self):  # WIP(joey): Rename to _state.
        """
        :rtype: BuildState
        """
        return self._state_machine.state

    @property
    def has_error(self):
        """
        Whether the build's current state is BuildState.ERROR.

        :rtype: bool
        """
        current_state = self._status()
        return current_state is BuildState.ERROR

    def _is_canceled(self):
        """
        Whether the build's current state is BuildState.CANCELED.

        :rtype: bool
        """
        current_state = self._status()
        return current_state is BuildState.CANCELED

    def _is_stopped(self):
        """
        Whether the build's current state is either ERROR or CANCELED.

        :rtype: bool
        """
        stopped_states = (BuildState.ERROR, BuildState.CANCELED)
        return self._status() in stopped_states

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns empty set if
        build has completed and no atoms have failed.
        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled():
                return []

            self._failed_atoms = []
            for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids(
            ):
                subjob = self.subjob(subjob_id)
                atom = subjob.atoms[atom_id]
                self._failed_atoms.append(atom)

        return self._failed_atoms

    def _result(self):
        """
        Can return three states:
            None:
            FAILURE:
            NO_FAILURES:
        :rtype: BuildResult | None
        """
        if self._is_canceled():
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.

        Creates the build artifact, deletes the temporary result files, flags postbuild tasks as finished, and
        then triggers the POSTBUILD_TASKS_COMPLETE event on the state machine.
        """
        self._create_build_artifact()
        # Must run only after the artifact exists -- it deletes the raw result files the artifact was built from.
        self._delete_temporary_build_artifact_files()
        self._postbuild_tasks_are_finished = True
        self._state_machine.trigger(BuildEvent.POSTBUILD_TASKS_COMPLETE)

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for this build's results directory and archive that directory.

        Generates the failures file, writes the timing data collected from all subjobs to the timing file path,
        and stores the value returned by compress_directory() in self._artifacts_archive_file.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(
            self._timing_file_path, self._read_subjob_timings_from_results())
        # The archive is named BuildArtifact.ARTIFACT_FILE_NAME and is built from the results directory.
        self._artifacts_archive_file = app.util.fs.compress_directory(
            self._build_results_dir(), BuildArtifact.ARTIFACT_FILE_NAME)

    def _delete_temporary_build_artifact_files(self):
        """
        Delete the temporary build result files that are no longer needed, due to the creation of the
        build artifact tarball.

        Every entry in the build results directory is removed except the artifact tarball itself. Deletion is
        best-effort: a failure to remove an entry does not abort the cleanup.

        ONLY call this method after _create_build_artifact() has completed. Otherwise we have lost the build results.
        """
        build_result_dir = self._build_results_dir()
        start_time = time.time()
        for path in os.listdir(build_result_dir):
            # The build result tar-ball is also stored in this same directory, so we must not delete it.
            if path == BuildArtifact.ARTIFACT_FILE_NAME:
                continue
            full_path = os.path.join(build_result_dir, path)
            # Do NOT use app.util.fs.async_delete() here. That call will generate a temp directory for every
            # atom, which can be in the thousands per build, and can lead to running up against the ulimit -Hn.
            #
            # isdir must be *called* on the path: the bare function object is always truthy, which would send
            # plain files to rmtree(ignore_errors=True) and silently leave them on disk.
            if os.path.isdir(full_path):
                shutil.rmtree(full_path, ignore_errors=True)
            else:
                try:
                    os.remove(full_path)
                except OSError:
                    # Keep file removal best-effort, matching ignore_errors=True for directories above.
                    self._logger.warning('Failed to delete temporary build result file {}.', full_path)
        elapsed_seconds = time.time() - start_time
        self._logger.info(
            'Completed deleting artifact files for {}, took {:.1f} seconds.',
            self._build_id, elapsed_seconds)

    def _build_results_dir(self):
        """
        Return the artifact directory for this build, rooted at the configured 'results_directory'.
        """
        results_root = Configuration['results_directory']
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=results_root)

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Build a path inside the configured 'build_symlink_directory' whose final component is a fresh UUID4.
        Only the path is produced; no symlink is created here.

        :rtype: str
        """
        unique_name = str(uuid.uuid4())
        symlink_root = Configuration['build_symlink_directory']
        return os.path.join(symlink_root, unique_name)
Ejemplo n.º 16
0
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        Initialize a build for the given request, take a build id from the class-level counter, and record the
        QUEUED timestamp.

        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None  # assigned in _create_build_artifact()
        self._build_artifact = None
        """ :type : BuildArtifact"""

        self._error_message = None  # assigned in mark_failed(); a non-None value means the build has an error
        self.is_prepared = False  # set to True at the end of prepare()
        self._setup_is_started = False  # set to True in mark_started()
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._is_canceled = False  # set to True in cancel()

        self._project_type = None  # assigned in generate_project_type()
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP: Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None  # both subjob queues are created in prepare()
        self._failed_atoms = None  # lazily computed and cached by _get_failed_atoms()
        self._postbuild_tasks_are_finished = False
        self._timing_file_path = None

        self._state_timestamps = {status: None
                                  for status in BuildStatus}   # initialize all timestamps to None
        self._record_state_timestamp(BuildStatus.QUEUED)

    def api_representation(self):
        """
        Build the dictionary that describes this build in API responses.

        :rtype: dict
        """
        failed_atoms = self._get_failed_atoms()
        if failed_atoms is None:
            failed_atoms_api_representation = None
        else:
            failed_atoms_api_representation = [atom.api_representation() for atom in failed_atoms]

        # Lower-case the state names and order entries by ascending timestamp, with unset (None) timestamps
        # sorting last. An OrderedDict keeps that order so the raw API response is easier to read.
        timestamp_pairs = [(state.lower(), timestamp) for state, timestamp in self._state_timestamps.items()]
        ordered_timestamps = OrderedDict(sorted(timestamp_pairs, key=lambda pair: pair[1] or float('inf')))

        representation = {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            'state_timestamps': ordered_timestamps,
        }
        return representation

    def generate_project_type(self):
        """
        Create the project type for this build and store it in self._project_type.

        A unique, build-specific symlink path is generated and added under 'build_project_directory' to the
        dict returned by the build request's build_parameters() before the project type is created from it.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Only the path is generated here; it gets symlinked to the actual project directory later on, when
        # the project is fetched.
        symlink_path = self._generate_unique_symlink_path_for_build_repo()

        # The path is internal to ClusterRunner, so it has to be injected into the params by hand.
        params = self.build_request.build_parameters()
        params['build_project_directory'] = symlink_path
        self._project_type = util.create_project_type(params)

        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, compute this build's subjobs, and queue them as unstarted. May only be called once
        per build.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: if the build has no request or project type, if prepare() was already called, or
            if the job config is None
        """
        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(self._build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(self._build_id))

        # The coin can be spent only once, so a second call (from any thread) is rejected here.
        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._logger.info('Fetching project for build {}.', self._build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', self._build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(self._build_id, job_config, self.project_type)

        # Both queues are bounded by the subjob count; other methods use full() on them to detect progress.
        self._unstarted_subjobs = Queue(maxsize=len(subjobs))
        self._finished_subjobs = Queue(maxsize=len(subjobs))

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        self.is_prepared = True
        self._record_state_timestamp(BuildStatus.PREPARED)

    def build_id(self):
        """
        Return the id assigned to this build.

        :rtype: int
        """
        assigned_id = self._build_id
        return assigned_id

    def all_subjobs(self):
        """
        Return a new list containing every subjob registered on this build.

        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Look up a single subjob of this build by id.

        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob is registered under the given id
        """
        found = self._all_subjobs_by_id.get(subjob_id)
        if found is not None:
            return found
        raise ItemNotFoundError('Invalid subjob id.')

    def complete_subjob(self, subjob_id, payload=None):
        """
        Process the payload of a subjob and then mark that subjob as complete. If either step raises, the build
        is marked as failed and the exception is re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            failure_reason = 'Error occurred while completing subjob {}.'.format(subjob_id)
            self.mark_failed(failure_reason)
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read the exit code file from each atom's artifact directory for the given subjob and store the parsed
        integer on the corresponding atom as exit_code.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        results_root = Configuration['results_directory']
        for atom_id, atom in enumerate(subjob.atoms):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(), subjob.subjob_id(), atom_id, result_root=results_root)
            exit_code_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(exit_code_path, 'r') as exit_code_file:
                atom.exit_code = int(exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write a subjob's payload into the build results directory, extract it, and record atom exit codes.
        A missing or empty payload is logged as a warning and otherwise ignored.

        :type subjob_id: int
        :param payload: Reads the 'filename' and 'body' keys
        :type payload: dict
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except:
            # Log for context only; the exception is always re-raised to the caller.
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for _, subjob in self._all_subjobs_by_id.items():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Mark the given subjob as completed and add it to the finished queue. If that makes all subjobs
        finished, start the postbuild tasks on a separate thread.

        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            # Non-blocking put: the queue was sized to the subjob count in prepare().
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """
        Flag the build's setup as started and record the BUILDING timestamp.
        """
        self._setup_is_started = True
        self._record_state_timestamp(BuildStatus.BUILDING)

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.

        The reason is logged as an error and stored as the build's error message, and the ERROR timestamp is
        recorded.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        # has_error is derived from this attribute being non-None.
        self._error_message = failure_reason
        self._record_state_timestamp(BuildStatus.ERROR)

    def cancel(self):
        """
        Cancel this build, unless it is already finished, errored, or canceled (in which case the request is
        logged and ignored).

        Flags the build as canceled, records the CANCELED timestamp, and discards all unstarted subjobs.
        """
        status = self._status()
        if status in (BuildStatus.FINISHED, BuildStatus.ERROR, BuildStatus.CANCELED):
            # Nothing left to cancel: the build is no longer running.
            self._logger.notice('Ignoring cancel request for build {}. Build is already in state {}.',
                                self._build_id, status)
            return

        self._logger.notice('Canceling build {}.', self._build_id)
        self._is_canceled = True
        self._record_state_timestamp(BuildStatus.CANCELED)

        # Drain the unstarted subjob queue.
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while True:
            pending = self._unstarted_subjobs
            if pending is None or pending.empty():
                break
            try:
                # Non-blocking get: a subjob may be pulled asynchronously between the check above and this call.
                pending.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Check a dict of update params against the allowed keys and values.

        When several entries are invalid, the reported error is for the last invalid entry iterated.

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: A (is_valid, response) pair; response is empty when valid, otherwise it holds an 'error' message
        :rtype: tuple [bool, dict [str, str]]
        """
        allowed_values_by_key = {'status': ['canceled']}
        error = None
        for param_name, param_value in update_params.items():
            allowed_values = allowed_values_by_key.get(param_name)
            if allowed_values is None:
                error = 'Key ({}) is not in list of allowed keys ({})'.format(
                    param_name, ",".join(allowed_values_by_key))
            elif param_value not in allowed_values:
                error = 'Value ({}) is not in list of allowed values ({}) for {}'.format(
                    param_value, allowed_values, param_name)

        if error is None:
            return True, {}
        return False, {'error': error}

    def update_state(self, update_params):
        """
        Apply a set of update params to this build. The only update acted on is 'status' set to 'canceled',
        which cancels the build.

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if an update was applied, False otherwise
        :rtype: bool
        """
        cancel_requested = any(
            name == 'status' and new_value == 'canceled'
            for name, new_value in update_params.items()
        )
        if cancel_requested:
            self.cancel()
        return cancel_requested

    @property
    def project_type(self):
        """
        The project type currently stored on this build (None until generate_project_type() assigns it).

        :rtype: ProjectType
        """
        stored_project_type = self._project_type
        return stored_project_type

    @property
    def artifacts_archive_file(self):
        """
        The value stored in self._artifacts_archive_file (None until _create_build_artifact() assigns it).
        """
        archive_file = self._artifacts_archive_file
        return archive_file

    @property
    def _num_subjobs_total(self):
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        Total number of atomic commands across all subjobs, or None unless the build is BUILDING or FINISHED.

        :rtype: int | None
        """
        if self._status() in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())
        return None

    @property
    def _subjobs_are_finished(self):
        return self._is_canceled or (self.is_prepared and self._finished_subjobs.full())

    @property
    def is_finished(self):
        """
        Truthy when the build has been canceled or its postbuild tasks have finished.
        """
        # TODO: Clean up this logic or move everything into a state machine
        canceled = self._is_canceled
        return canceled or self._postbuild_tasks_are_finished

    @property
    def is_unstarted(self):
        """
        Truthy when the build is prepared, setup has not been started, and the unstarted-subjobs queue is
        still full.
        """
        prepared = self.is_prepared
        if not prepared:
            return prepared
        if self._setup_is_started:
            return False
        return self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """
        Whether an error message has been recorded on this build.

        :rtype: bool
        """
        recorded_error = self._error_message
        return recorded_error is not None

    @property
    def _detail_message(self):
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):
        """
        Derive the build's status from its flags. The checks run in priority order: error, canceled, queued
        (not prepared, or prepared but unstarted), finished, and otherwise building.

        :rtype: BuildStatus
        """
        if self.has_error:
            return BuildStatus.ERROR
        if self._is_canceled:
            return BuildStatus.CANCELED
        if not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        if self.is_finished:
            return BuildStatus.FINISHED
        return BuildStatus.BUILDING

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns empty set if
        build has completed and no atoms have failed.
        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled:
                return []

            self._failed_atoms = []
            for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids():
                subjob = self.subjob(subjob_id)
                atom = subjob.atoms[atom_id]
                self._failed_atoms.append(atom)

        return self._failed_atoms

    def _result(self):
        """
        :rtype: str | None
        """
        if self._is_canceled:
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.

        Creates the build artifact, then flags postbuild tasks as finished and records the FINISHED timestamp.
        """
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        # is_finished reads this flag, so it is only set once the artifact has been created.
        self._postbuild_tasks_are_finished = True
        self._record_state_timestamp(BuildStatus.FINISHED)

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for this build's results directory and archive that directory.

        Generates the failures file, writes the timing data collected from all subjobs to the timing file path,
        and stores the value returned by compress_directory() in self._artifacts_archive_file.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        # The archive of the results directory is named 'results.tar.gz'.
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """
        Return the artifact directory for this build, rooted at the configured 'results_directory'.
        """
        results_root = Configuration['results_directory']
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=results_root)

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Build a path inside the configured 'build_symlink_directory' whose final component is a fresh UUID4.
        Only the path is produced; no symlink is created here.

        :rtype: str
        """
        unique_name = str(uuid.uuid4())
        symlink_root = Configuration['build_symlink_directory']
        return os.path.join(symlink_root, unique_name)

    def get_state_timestamp(self, build_status):
        """
        Look up the timestamp recorded for a build status. The result is None when the build has not reached
        that status yet (or when the status has no entry at all).

        :param build_status: The build status for which to retrieve the corresponding timestamp
        :type build_status: BuildStatus
        :return: The timestamp for the specified status
        :rtype: float | None
        """
        recorded_timestamps = self._state_timestamps
        return recorded_timestamps.get(build_status)

    def _record_state_timestamp(self, build_status):
        """
        Record a timestamp for a given build status. This is used to record the timing of the various build phases and
        is exposed via the Build object's API representation.
        :param build_status: The build status for which to record a timestamp
        :type build_status: BuildStatus
        """
        if self._state_timestamps.get(build_status) is not None:
            self._logger.warning(
                'Overwriting timestamp for build {}, status {}'.format(self.build_id(), build_status))
        self._state_timestamps[build_status] = time.time()
Ejemplo n.º 17
0
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs

    :type _build_id: int
    :type _build_request: BuildRequest
    :type _build_artifact: None | BuildArtifact
    :type _error_message: None | str
    :type _project_type: None | ProjectType
    :type _timing_file_path: None | str
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        Initialize a build for the given request, take a build id from the class-level counter, and create the
        build's state machine.

        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self._build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None

        self._error_message = None
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once

        self._project_type = None  # assigned in generate_project_type()
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP(joey): Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None  # both subjob queues are created in prepare()
        self._failed_atoms = None
        self._postbuild_tasks_are_finished = False  # WIP(joey): Remove and use build state.
        self._timing_file_path = None  # assigned in prepare()

        # The state machine calls back into this build when the ERROR or CANCELED state is entered.
        self._state_machine = BuildFsm(
            build_id=self._build_id,
            enter_state_callbacks={
                BuildState.ERROR: self._on_enter_error_state,
                BuildState.CANCELED: self._on_enter_canceled_state,
            }
        )

    def api_representation(self):
        """
        Build the dictionary that describes this build in API responses. The PREPARING and PREPARED states are
        reported as QUEUED.

        :rtype: dict
        """
        failed_atoms = self._get_failed_atoms()
        if failed_atoms is None:
            failed_atoms_api_representation = None
        else:
            failed_atoms_api_representation = [atom.api_representation() for atom in failed_atoms]

        # todo: PREPARING/PREPARED are new states -- make sure clients can handle them before exposing.
        reported_state = self._status()
        if reported_state in (BuildState.PREPARING, BuildState.PREPARED):
            reported_state = BuildState.QUEUED

        representation = {
            'id': self._build_id,
            'status': reported_state,
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
        }

        # Lower-case the state names and order entries by ascending timestamp, with unset (None) timestamps
        # sorting last. An OrderedDict keeps that order so the raw API response is easier to read.
        timestamp_pairs = [(state.lower(), timestamp)
                           for state, timestamp in self._state_machine.transition_timestamps.items()]
        representation['state_timestamps'] = OrderedDict(
            sorted(timestamp_pairs, key=lambda pair: pair[1] or float('inf')))
        return representation

    def generate_project_type(self):
        """
        Create the project type for this build and store it in self._project_type.

        A unique, build-specific symlink path is generated and added under 'build_project_directory' to the
        dict returned by the build request's build_parameters() before the project type is created from it.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Only the path is generated here; it gets symlinked to the actual project directory later on, when
        # the project is fetched.
        symlink_path = self._generate_unique_symlink_path_for_build_repo()

        # The path is internal to ClusterRunner, so it has to be injected into the params by hand.
        params = self.build_request.build_parameters()
        params['build_project_directory'] = symlink_path
        self._project_type = util.create_project_type(params)
        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, compute this build's subjobs, queue them as unstarted, and create the build results
        directory. Triggers START_PREPARE before the work and FINISH_PREPARE after it. May only be called once
        per build.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: if the build has no request or project type, if prepare() was already called, or
            if the job config is None
        """
        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(self._build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(self._build_id))

        # The coin can be spent only once, so a second call (from any thread) is rejected here.
        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._state_machine.trigger(BuildEvent.START_PREPARE)
        # WIP(joey): Move the following code into a PREPARING state callback
        #  (so that it won't execute if the build has already been canceled.)

        self._logger.info('Fetching project for build {}.', self._build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', self._build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(self._build_id, job_config, self.project_type)

        # Both queues are bounded by the subjob count.
        self._unstarted_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Move this into BuildScheduler?
        self._finished_subjobs = Queue(maxsize=len(subjobs))  # WIP(joey): Remove this and just record finished count.

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        app.util.fs.create_dir(self._build_results_dir())
        self._state_machine.trigger(BuildEvent.FINISH_PREPARE)

    def build_id(self):
        """
        Return the id assigned to this build.

        :rtype: int
        """
        assigned_id = self._build_id
        return assigned_id

    @property
    def build_request(self):
        """
        The request object this build was constructed with.

        :rtype: BuildRequest
        """
        originating_request = self._build_request
        return originating_request

    def all_subjobs(self):
        """
        Return a new list containing every subjob registered on this build.

        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Look up a single subjob of this build by id.

        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob is registered under the given id
        """
        found = self._all_subjobs_by_id.get(subjob_id)
        if found is not None:
            return found
        raise ItemNotFoundError('Invalid subjob id.')

    def complete_subjob(self, subjob_id, payload=None):
        """
        Process the payload of a subjob and then mark that subjob as complete. If either step raises, the build
        is marked as failed and the exception is re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            failure_reason = 'Error occurred while completing subjob {}.'.format(subjob_id)
            self.mark_failed(failure_reason)
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        subjob = self.subjob(subjob_id)
        for atom_id in range(len(subjob.atoms)):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory']
            )
            atom_exit_code_file_sys_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(atom_exit_code_file_sys_path, 'r') as atom_exit_code_file:
                subjob.atoms[atom_id].exit_code = int(atom_exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except:
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for _, subjob in self._all_subjobs_by_id.items():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            should_trigger_postbuild_tasks = self._all_subjobs_are_finished() and not self._is_stopped()

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if should_trigger_postbuild_tasks:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """
        Mark the build as started by sending the START_BUILDING event to its state machine.
        """
        start_event = BuildEvent.START_BUILDING
        self._state_machine.trigger(start_event)

    def finish(self):
        """
        Run the postbuild tasks directly on the calling thread, which also marks this build as finished.
        """
        # The postbuild tasks end by firing the state machine event that moves the build to finished.
        self._perform_async_postbuild_tasks()

    def mark_failed(self, failure_reason):
        """
        Fail the build by sending the FAIL event, carrying the given reason, to the state machine.

        The reason should be something we can present to the end user of ClusterRunner, so try not to include
        detailed references to internal implementation.

        :type failure_reason: str
        """
        fail_event = BuildEvent.FAIL
        self._state_machine.trigger(fail_event, error_msg=failure_reason)

    def mark_setup_failed(self, failure_reason):
        """
        Fail the build because of a setup error and still produce a build artifact.

        Because setup failures don't have any logs, the build id is appended to the failure reason and written
        to the setup_failed file, for easier querying of worker logs.

        :type failure_reason: str
        """
        annotated_reason = '{} Build Id: {}.'.format(failure_reason, self._build_id)
        self._state_machine.trigger(BuildEvent.FAIL, error_msg=annotated_reason)

        marker_path = os.path.join(self._build_results_dir(), BuildArtifact.SETUP_FAILED_FILE)
        app.util.fs.write_file(str(self._build_id), marker_path)
        self._create_build_artifact()

    def _on_enter_error_state(self, event):
        """
        Store an error message for the build and log the failure. This method is triggered by
        a state machine transition to the ERROR state.
        :param event: The Fysom event object
        """
        # WIP(joey): Should this be a reenter_state callback also? Should it check for previous error message?
        default_error_msg = 'An unspecified error occurred.'
        self._error_message = getattr(event, 'error_msg', default_error_msg)
        self._logger.warning('Build {} failed: {}', self.build_id(), self._error_message)

    def cancel(self):
        """
        Log the cancel request and send the CANCEL event to the build's state machine.
        """
        self._logger.notice('Request received to cancel build {}.', self._build_id)
        cancel_event = BuildEvent.CANCEL
        self._state_machine.trigger(cancel_event)

    def _on_enter_canceled_state(self, event):
        # Deplete the unstarted subjob queue.
        # WIP(joey): Just remove this completely and adjust behavior of other methods based on self._is_canceled().
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Check a dict of update params against the allowed keys and values.

        When several params are invalid, the error message describes the last invalid one encountered.

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        allowed_values_by_key = {'status': ['canceled']}
        error = None
        for name, requested in update_params.items():
            if name not in allowed_values_by_key:
                error = 'Key ({}) is not in list of allowed keys ({})'.format(
                    name, ",".join(allowed_values_by_key))
                continue

            permitted = allowed_values_by_key[name]
            if requested not in permitted:
                error = 'Value ({}) is not in list of allowed values ({}) for {}'.format(
                    requested, permitted, name)

        if error is None:
            return True, {}
        return False, {'error': error}

    def update_state(self, update_params):
        """
        Apply a set of update params to this build. The only update acted upon is status=canceled, which
        cancels the build.

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if an update was applied, False otherwise
        :rtype: bool
        """
        update_applied = False
        for name, requested in update_params.items():
            if name == 'status' and requested == 'canceled':
                self.cancel()
                update_applied = True
        return update_applied

    @property
    def project_type(self):
        """
        The project type stored on this build (None until one has been set).

        :rtype: ProjectType
        """
        return self._project_type

    @property
    def artifacts_archive_file(self):
        """
        The value stored by _create_build_artifact() for the compressed results archive, or None before that.
        """
        return self._artifacts_archive_file

    # WIP(joey): Change some of these private @properties to methods.
    @property
    def _num_subjobs_total(self):
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        Total number of atomic commands across all subjobs, or None unless the build is BUILDING or FINISHED.

        :rtype: int | None
        """
        # todo: blacklist states instead of whitelist, or just check _all_subjobs_by_id directly
        if self._status() in [BuildState.BUILDING, BuildState.FINISHED]:
            return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())
        return None

    def _all_subjobs_are_finished(self):
        return self._finished_subjobs and self._finished_subjobs.full()

    @property
    def is_finished(self):
        """
        True when the build was canceled or its postbuild tasks have completed.
        """
        # WIP(joey): Calling logic should check _is_canceled if it needs to instead of including the check here.
        if self._is_canceled():
            return True
        return self._postbuild_tasks_are_finished

    @property
    def _detail_message(self):
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):  # WIP(joey): Rename to _state.
        """
        :rtype: BuildState
        """
        return self._state_machine.state

    @property
    def has_error(self):
        """
        True when the build is currently in the ERROR state.

        :rtype: bool
        """
        current_state = self._status()
        return current_state is BuildState.ERROR

    def _is_canceled(self):
        """
        Tell whether the build is currently in the CANCELED state.

        :rtype: bool
        """
        current_state = self._status()
        return current_state is BuildState.CANCELED

    def _is_stopped(self):
        """
        Tell whether the build has been stopped, i.e. it is in the ERROR or the CANCELED state.

        :rtype: bool
        """
        current_state = self._status()
        return current_state in (BuildState.ERROR, BuildState.CANCELED)

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns empty set if
        build has completed and no atoms have failed.
        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled():
                return []

            self._failed_atoms = []
            for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids():
                subjob = self.subjob(subjob_id)
                atom = subjob.atoms[atom_id]
                self._failed_atoms.append(atom)

        return self._failed_atoms

    def _result(self):
        """
        Can return three states:
            None:
            FAILURE:
            NO_FAILURES:
        :rtype: BuildResult | None
        """
        if self._is_canceled():
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Run the tasks that follow a completed build: create the build artifact, delete the temporary result
        files, flag the postbuild tasks as finished and notify the state machine.
        """
        # The artifact must exist before the temporary files are removed, otherwise the results are lost.
        self._create_build_artifact()
        self._delete_temporary_build_artifact_files()

        self._postbuild_tasks_are_finished = True
        completion_event = BuildEvent.POSTBUILD_TASKS_COMPLETE
        self._state_machine.trigger(completion_event)

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the build results directory, generate its failures file, write the timing
        data collected from the subjobs, and compress the results directory into the artifact archive.
        """
        artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact = artifact
        artifact.generate_failures_file()

        subjob_timings = self._read_subjob_timings_from_results()
        artifact.write_timing_data(self._timing_file_path, subjob_timings)

        self._artifacts_archive_file = app.util.fs.compress_directory(
            self._build_results_dir(),
            BuildArtifact.ARTIFACT_FILE_NAME
        )

    def _delete_temporary_build_artifact_files(self):
        """
        Delete the temporary build result files that are no longer needed, due to the creation of the
        build artifact tarball. Everything in the build results directory except the tarball is removed.

        ONLY call this method after _create_build_artifact() has completed. Otherwise we have lost the build results.

        Deletion is best effort: directory trees are removed ignoring errors, and a file that cannot be
        removed is logged and skipped.
        """
        build_result_dir = self._build_results_dir()
        start_time = time.time()
        for entry_name in os.listdir(build_result_dir):
            # The build result tar-ball is also stored in this same directory, so we must not delete it.
            if entry_name == BuildArtifact.ARTIFACT_FILE_NAME:
                continue
            full_path = os.path.join(build_result_dir, entry_name)
            # Do NOT use app.util.fs.async_delete() here. That call will generate a temp directory for every
            # atom, which can be in the thousands per build, and can lead to running up against the ulimit -Hn.
            # isdir() must be called with the path: the bare function object is always truthy, which would send
            # plain files to rmtree() where the failure is silently ignored. Symlinks are unlinked like files
            # because rmtree() refuses to operate on them.
            if os.path.isdir(full_path) and not os.path.islink(full_path):
                shutil.rmtree(full_path, ignore_errors=True)
                continue
            try:
                os.remove(full_path)
            except OSError:
                self._logger.warning('Could not delete temporary artifact file {}.', full_path)
        elapsed_seconds = time.time() - start_time
        self._logger.info('Completed deleting artifact files for {}, took {:.1f} seconds.',
                          self._build_id, elapsed_seconds)

    def _build_results_dir(self):
        """
        Return the artifact directory of this build below the configured results directory.
        """
        results_root = Configuration['results_directory']
        return BuildArtifact.build_artifact_directory(self.build_id(), result_root=results_root)

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Build a unique path inside the configured build symlink directory for a build-specific repo. Only the
        path is computed here; this method does NOT generate the symlink itself.

        :rtype: str
        """
        symlink_directory = Configuration['build_symlink_directory']
        return os.path.join(symlink_directory, str(uuid.uuid4()))
Ejemplo n.º 18
0
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs

    The status is not stored; _status() derives it from the error message, the canceled flag, the prepared
    flag and the subjob/executor bookkeeping each time it is asked for.
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        Create a build with a fresh id and empty bookkeeping. Subjobs are attached later by prepare().

        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None
        self._build_artifact = None
        """ :type : BuildArtifact"""

        self._error_message = None
        self.is_prepared = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._is_canceled = False

        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish
        self._subjob_assignment_lock = Lock()  # prevents subjobs from being skipped
        self._slaves_allocated = []
        self._num_executors_allocated = 0
        self._num_executors_in_use = 0

        # No executor limits until prepare() copies them from the job config.
        self._max_executors = float('inf')
        self._max_executors_per_slave = float('inf')

        self._all_subjobs_by_id = {}
        # Both queues are created by prepare(), sized to the number of subjobs.
        self._unstarted_subjobs = None
        self._finished_subjobs = None
        self._postbuild_tasks_are_finished = False
        self._timing_file_path = None

    def api_representation(self):
        """
        Return a dict summarizing this build: id, status, artifact archive, progress details, error message,
        atom and subjob counts, failed atoms, result and the request's build parameters.

        :rtype: dict
        """
        return {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': self._failed_atoms(),  # todo: print the file contents instead of paths
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build, populating the self._project_type instance variable.

        As a side effect, this method also updates the build request's build_parameters dictionary
        with the unique workspace directory path for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Generate a unique project build directory name that will be symlinked to the actual project directory
        # later on when the project gets fetched.
        build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo()

        # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a
        # build-unique generated symlink), we must manually add it to the project_type_params
        project_type_params = self.build_request.build_parameters()
        project_type_params.update({'build_project_directory': build_specific_project_directory})
        self._project_type = util.create_project_type(project_type_params)

        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjobs, job_config):
        """
        Attach the subjobs to this build, queue all of them as unstarted, copy the executor limits from the
        job config and flag the build as prepared. May only be called once, after generate_project_type().

        :type subjobs: list[Subjob]
        :type job_config: JobConfig
        :raises RuntimeError: if no project type has been generated yet, or if prepare() was already called
        """
        if self.project_type is None:
            raise RuntimeError('prepare() was called before generate_project_type() on build {}.'
                               .format(self._build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        # Both queues are bounded by the number of subjobs, so full() on the finished queue means that every
        # subjob has been completed.
        self._unstarted_subjobs = Queue(maxsize=len(subjobs))
        self._finished_subjobs = Queue(maxsize=len(subjobs))

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._max_executors = job_config.max_executors
        self._max_executors_per_slave = job_config.max_executors_per_slave
        self._timing_file_path = self.project_type.timing_file_path(job_config.name)
        self.is_prepared = True

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    def needs_more_slaves(self):
        """
        Determine whether or not this build should have more slaves allocated to it: fewer executors are
        allocated than the maximum and there are still unstarted subjobs.

        NOTE(review): _unstarted_subjobs is None until prepare() has run, so this presumably must only be
        called on a prepared build -- confirm against callers.

        :rtype: bool
        """
        return self._num_executors_allocated < self._max_executors and not self._unstarted_subjobs.empty()

    def allocate_slave(self, slave):
        """
        Allocate a slave to this build. This tells the slave to execute setup commands for this build.

        :type slave: Slave
        """
        self._slaves_allocated.append(slave)
        slave.setup(self)
        # Count at most max_executors_per_slave of the slave's executors as allocated to this build.
        self._num_executors_allocated += min(slave.num_executors, self._max_executors_per_slave)
        analytics.record_event(analytics.BUILD_SETUP_START, build_id=self.build_id(), slave_id=slave.id)

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build
        :rtype: list[Subjob]
        """
        return [subjob for subjob in self._all_subjobs_by_id.values()]

    def subjob(self, subjob_id):
        """
        Returns a single subjob
        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob is registered under the given id
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def begin_subjob_executions_on_slave(self, slave):
        """
        Begin subjob executions on a slave. This should be called once after the specified slave has already run
        build_setup commands for this build.

        :type slave: Slave
        """
        analytics.record_event(analytics.BUILD_SETUP_FINISH, build_id=self.build_id(), slave_id=slave.id)
        for slave_executor_count in range(slave.num_executors):
            # Stop claiming executors once either the build-wide or the per-slave limit has been reached.
            if (self._num_executors_in_use >= self._max_executors
                    or slave_executor_count >= self._max_executors_per_slave):
                break
            slave.claim_executor()
            # NOTE(review): nothing visible in this class decrements _num_executors_in_use -- confirm intended.
            self._num_executors_in_use += 1
            self.execute_next_subjob_or_teardown_slave(slave)

    def execute_next_subjob_or_teardown_slave(self, slave):
        """
        Grabs an unstarted subjob off the queue and sends it to the specified slave to be executed. If the unstarted
        subjob queue is empty, we teardown the slave to free it up for other builds.

        :type slave: Slave
        """
        try:
            # This lock prevents the scenario where a subjob is pulled from the queue but cannot be assigned to this
            # slave because it is shutdown, so we put it back on the queue but in the meantime another slave enters
            # this method, finds the subjob queue empty, and is torn down.  If that was the last 'living' slave, the
            # build would be stuck.
            with self._subjob_assignment_lock:
                subjob = self._unstarted_subjobs.get(block=False)
                self._logger.debug('Sending subjob {} (build {}) to slave {}.',
                                   subjob.subjob_id(), subjob.build_id(), slave.url)
                try:
                    slave.start_subjob(subjob)
                except SlaveMarkedForShutdownError:
                    self._unstarted_subjobs.put(subjob)
                    # An executor is currently allocated for this subjob in begin_subjob_executions_on_slave.
                    # Since the slave has been marked for shutdown, we need to free the executor.
                    self._free_slave_executor(slave)

        except Empty:
            # No unstarted subjobs are left for this executor.
            self._free_slave_executor(slave)

    def _free_slave_executor(self, slave):
        """
        Free one executor on the slave. When the slave reports no executors left in use, remove it from this
        build's allocated slaves and tear it down, unless it had already been removed.

        :type slave: Slave
        """
        num_executors_in_use = slave.free_executor()
        if num_executors_in_use == 0:
            try:
                self._slaves_allocated.remove(slave)
            except ValueError:
                pass  # We have already deallocated this slave, no need to teardown
            else:
                slave.teardown()

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete. Any exception is
        logged, marks the build as failed, and is re-raised.

        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write the payload body to a file in the build results directory and extract it as a tar archive.
        A falsy payload is logged and ignored; any failure is logged and re-raised.

        :type subjob_id: int
        :param payload: dict with a 'filename' and a 'body' entry
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(
            self._build_results_dir(),
            payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
        except:
            # NOTE(review): the bare except also intercepts KeyboardInterrupt/SystemExit before re-raising.
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for _, subjob in self._all_subjobs_by_id.items():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Add the subjob to the finished queue and start the postbuild tasks on a separate thread when the
        subjobs are considered finished.

        NOTE(review): _subjobs_are_finished is also True for a canceled build, so every subjob that completes
        after cancel() appears to start another PostBuild thread -- confirm this is intended.

        :type subjob_id: int
        """
        # Direct dict access: an unknown id raises KeyError here rather than ItemNotFoundError.
        subjob = self._all_subjobs_by_id[int(subjob_id)]
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        # A non-None error message is what makes has_error (and therefore the ERROR status) true.
        self._error_message = failure_reason

    def cancel(self):
        """
        Cancel a running build: set the canceled flag and discard all subjobs that have not been started.
        Does nothing if the build is already finished, failed or canceled.
        """
        # Early exit if build is not running
        if self._status() in [BuildStatus.FINISHED, BuildStatus.ERROR, BuildStatus.CANCELED]:
            return

        self._is_canceled = True

        # Deplete the unstarted subjob queue.
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not. When several params are
        invalid, the error message describes the last invalid one encountered.

        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed.keys():
                message = 'Key ({}) is not in list of allowed keys ({})'.\
                    format(key, ",".join(keys_and_values_allowed.keys()))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.\
                    format(value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params. The only update acted upon is
        status=canceled, which cancels the build.

        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if an update was applied, False otherwise
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status':
                if value == 'canceled':
                    self.cancel()
                    success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def num_executors_allocated(self):
        """
        :rtype: int
        """
        return self._num_executors_allocated

    @property
    def artifacts_archive_file(self):
        """
        The value stored by _create_build_artifact() for the compressed results archive, or None before that.
        """
        return self._artifacts_archive_file

    @property
    def _num_subjobs_total(self):
        """
        Number of subjobs registered on this build.

        :rtype: int
        """
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """
        Number of subjobs currently in the finished queue; 0 while that queue is not set.

        :rtype: int
        """
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """
        Total number of atomic commands across all subjobs, or None unless the build is BUILDING or FINISHED.

        :rtype: int | None
        """
        if self._status() not in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return None
        return sum([len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values()])

    @property
    def _subjobs_are_finished(self):
        """
        True when the build is canceled, or when it is prepared and the finished queue is full.

        :rtype: bool
        """
        return self._is_canceled or (self.is_prepared and self._finished_subjobs.full())

    @property
    def is_finished(self):
        """
        True when the build was canceled or its postbuild tasks have completed.

        :rtype: bool
        """
        # TODO: Clean up this logic or move everything into a state machine
        return self._is_canceled or self._postbuild_tasks_are_finished

    @property
    def is_unstarted(self):
        """
        True when the build is prepared, has no executors allocated and its unstarted queue is still full.

        :rtype: bool
        """
        return self.is_prepared and self._num_executors_allocated == 0 and self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """
        True once an error message has been recorded by mark_failed().

        :rtype: bool
        """
        return self._error_message is not None

    @property
    def _detail_message(self):
        """
        Progress summary such as '1 of 4 subjobs are complete (25.0%).', or None when the build has no subjobs.

        :rtype: str | None
        """
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):
        """
        Derive the current status. The checks are ordered: an error wins over cancellation, which wins over
        QUEUED (not prepared or not yet started), which wins over FINISHED; anything else is BUILDING.

        :rtype: BuildStatus
        """
        if self.has_error:
            return BuildStatus.ERROR
        elif self._is_canceled:
            return BuildStatus.CANCELED
        elif not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        elif self.is_finished:
            return BuildStatus.FINISHED
        else:
            return BuildStatus.BUILDING

    def _failed_atoms(self):
        """
        The commands which failed: an empty list for a canceled build, the failed commands recorded in the
        build artifact for a finished build, and None otherwise.

        :rtype: list [str] | None
        """
        if self._is_canceled:
            return []

        if self.is_finished:
            # dict.values() returns a view object in python 3, so wrapping values() in a list
            return list(self._build_artifact.get_failed_commands().values())
        return None

    def _result(self):
        """
        Summarize the outcome: FAILURE for a canceled build or a finished build with failed commands,
        NO_FAILURES for a finished build without failed commands, and None otherwise.

        :rtype: str | None
        """
        if self._is_canceled:
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_commands()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously. Creates the build artifact
        and then flags the postbuild tasks as finished.
        """
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        self._postbuild_tasks_are_finished = True

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the build results directory, generate its failures file, write the timing
        data collected from the subjobs, and compress the results directory into 'results.tar.gz'.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """
        Return the results directory of this build: the configured results directory joined with the build id.

        :rtype: str
        """
        return os.path.join(
            Configuration['results_directory'],
            str(self.build_id()),
        )

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.
        :rtype: str
        """
        return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))
Ejemplo n.º 19
0
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None  # set by _create_build_artifact()
        self._build_artifact = None  # :type: BuildArtifact | None -- set by _create_build_artifact()

        self._error_message = None
        self.is_prepared = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once

        self._project_type = None
        self._num_slaves_in_use = 0
        self._num_allocated_executors = 0
        self._max_executors = float('inf')
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # Queue of subjobs, created in prepare()
        self._finished_subjobs = None  # Queue of subjobs, created in prepare()
        self._timing_file_path = None  # set in prepare(), read by _create_build_artifact()
        self._postbuild_tasks_are_finished = False
        self._teardowns_finished = False

    def api_representation(self):
        """
        Describe this build as a plain dict for API responses.
        :rtype: dict
        """
        return {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': self._failed_atoms(),  # todo: print the file contents instead of paths
            'result': self._result(),
        }

    def prepare(self, subjobs, project_type, job_config):
        """
        Register the subjobs for this build and queue all of them as unstarted. May only be called once.

        :type subjobs: list[Subjob]
        :type project_type: project_type.project_type.ProjectType
        :type job_config: master.job_config.JobConfig
        :raises RuntimeError: if prepare() has already been called on this build
        """
        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._project_type = project_type
        # NOTE(review): Queue(maxsize=0) is unbounded, so full() never returns True for an empty subjob
        # list and such a build would never be reported as finished -- confirm callers never pass [].
        num_subjobs = len(subjobs)
        self._unstarted_subjobs = Queue(maxsize=num_subjobs)
        self._finished_subjobs = Queue(maxsize=num_subjobs)

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._max_executors = job_config.max_executors
        self._timing_file_path = project_type.timing_file_path(job_config.name)
        self.is_prepared = True

    def finish(self):
        """
        Called when all slaves are done with this build (and any teardown is complete)
        :raises RuntimeError: if not all subjobs have finished yet
        """
        if not self._subjobs_are_finished:
            raise RuntimeError('Tried to finish build {} but not all subjobs are complete'.format(self._build_id))
        self._teardowns_finished = True

    def build_id(self):
        """
        :return: the id assigned to this build at construction time
        :rtype: int
        """
        return self._build_id

    def needs_more_slaves(self):
        """
        Whether this build could use another slave: it is below its executor limit and still has
        unstarted subjobs. A build that has not been prepared has no subjob queue yet and therefore
        reports False instead of failing on the missing queue.
        :rtype: bool
        """
        if self._unstarted_subjobs is None:
            return False
        return self._num_allocated_executors < self._max_executors and not self._unstarted_subjobs.empty()

    def allocate_slave(self, slave):
        """
        Allocate a slave to this build: run setup on it, then claim executors on it (stopping at the
        build's executor limit) and start a subjob for each claimed executor.
        :type slave: master.Slave
        """
        self._num_slaves_in_use += 1
        slave.setup(self.build_id(), project_type_params=self.build_request.build_parameters())

        for _ in range(slave.num_executors):
            if self._num_allocated_executors >= self._max_executors:
                break
            slave.claim_executor()
            self._num_allocated_executors += 1
            self.execute_next_subjob_on_slave(slave)

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build
        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob
        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob with the given id belongs to this build
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def execute_next_subjob_on_slave(self, slave):
        """
        Grabs an unstarted subjob off the queue and sends it to the specified slave to be executed. If the unstarted
        subjob queue is empty, one executor is freed on the slave, and the slave is torn down once it reports
        no executors in use.

        :type slave: master.Slave
        """
        try:
            subjob = self._unstarted_subjobs.get(block=False)
        except Empty:
            # Nothing left to hand out: release this executor and tear the slave down when it was the last one.
            num_executors_in_use = slave.free_executor()
            if num_executors_in_use == 0:
                slave.teardown()
            return

        # Kept outside the try so an Empty raised while starting the subjob is not mistaken for an empty queue.
        self._logger.debug('Sending subjob {} (build {}) to slave {}.',
                           subjob.subjob_id(), subjob.build_id(), slave.url)
        slave.start_subjob(subjob)

    def handle_subjob_payload(self, subjob_id, payload=None):
        """
        Write a subjob's result payload into this build's results directory and extract it there.

        :type subjob_id: int
        :param payload: dict with a 'filename' and a 'body' entry; a falsy payload is logged and ignored
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {}.', subjob_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(
            self._build_results_dir(),
            payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._logger.debug('Payload for subjob {} written.', subjob_id)
        except Exception:
            # Log which subjob failed, then let the caller deal with the original error.
            self._logger.warning('Writing payload for subjob {} FAILED.', subjob_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def mark_subjob_complete(self, subjob_id):
        """
        Record a subjob as finished and, once every subjob is finished, start the postbuild tasks on a
        separate thread.
        :type subjob_id: int
        """
        subjob = self._all_subjobs_by_id[int(subjob_id)]
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        self._error_message = failure_reason

    @property
    def artifacts_archive_file(self):
        """The value stored by _create_build_artifact(), or None if no artifact has been created yet."""
        return self._artifacts_archive_file

    @property
    def _num_subjobs_total(self):
        """Number of subjobs registered on this build."""
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """Number of subjobs in the finished queue; 0 before the queue exists."""
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """Total atomic command count across all subjobs, or None unless the build is BUILDING or FINISHED."""
        if self._status() not in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    @property
    def _subjobs_are_finished(self):
        """True once the build is prepared and the finished queue is full."""
        return self.is_prepared and self._finished_subjobs.full()

    @property
    def is_finished(self):
        """True once subjobs, postbuild tasks and teardowns have all finished."""
        return self._subjobs_are_finished and self._postbuild_tasks_are_finished and self._teardowns_finished

    @property
    def is_unstarted(self):
        """True while the build is prepared and the unstarted queue is still full."""
        return self.is_prepared and self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """True once mark_failed() has stored an error message."""
        return self._error_message is not None

    @property
    def _detail_message(self):
        """Human-readable subjob progress, or None when the build has no subjobs."""
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished,
                self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total
            )
        return None

    def _status(self):
        """
        Derive the build status. An error takes precedence over every other state.
        :rtype: str
        """
        if self.has_error:
            return BuildStatus.ERROR
        elif not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        elif self.is_finished:
            return BuildStatus.FINISHED
        else:
            return BuildStatus.BUILDING

    def _failed_atoms(self):
        """
        The commands which failed, or None while the build is not finished
        :rtype: list [str] | None
        """
        if self.is_finished:
            # dict.values() returns a view object in python 3, so wrapping values() in a list
            return list(self._build_artifact.get_failed_commands().values())
        return None

    def _result(self):
        """
        NO_FAILURES or FAILURE once the build is finished, None before that
        :rtype: str | None
        """
        if self.is_finished:
            if len(self._build_artifact.get_failed_commands()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        # @TODO There is a race condition here where the build is marked finished before the results archive
        # is prepared.  If the user requests the build status before archival finishes, the 'artifacts'
        # value in the post body will be None.  self.is_finished should be conditional on whether archival
        # is finished.
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        self._postbuild_tasks_are_finished = True

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the results directory, generate its failures file, write timing
        data, and compress the results directory into 'results.tar.gz'.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """
        The configured 'results_directory' joined with this build's id
        :rtype: str
        """
        return os.path.join(
            Configuration['results_directory'],
            str(self.build_id()),
        )
Ejemplo n.º 20
0
class Build(object):
    """
    A build is a single execution of any configured job. This class:
        - exposes the overall status of the build
        - keeps track of the build's subjobs and their completion state
        - manages slaves that have been assigned to accept this build's subjobs
    """
    _build_id_counter = Counter()  # class-level counter for assigning build ids

    def __init__(self, build_request):
        """
        :type build_request: BuildRequest
        """
        self._logger = get_logger(__name__)
        self._build_id = self._build_id_counter.increment()
        self.build_request = build_request
        self._artifacts_archive_file = None  # set by _create_build_artifact()
        self._build_artifact = None  # :type: BuildArtifact | None -- set by _create_build_artifact()

        self._error_message = None
        self.is_prepared = False
        self._setup_is_started = False
        self._preparation_coin = SingleUseCoin()  # protects against separate threads calling prepare() more than once
        self._is_canceled = False

        self._project_type = None
        self._build_completion_lock = Lock()  # protects against more than one thread detecting the build's finish

        self._all_subjobs_by_id = {}
        self._unstarted_subjobs = None  # WIP: Move subjob queues to BuildScheduler class.
        self._finished_subjobs = None
        self._failed_atoms = None  # cache filled lazily by _get_failed_atoms()
        self._postbuild_tasks_are_finished = False
        self._timing_file_path = None

        # initialize all timestamps to None
        self._state_timestamps = {status: None for status in BuildStatus}
        self._record_state_timestamp(BuildStatus.QUEUED)

    def api_representation(self):
        """
        Describe this build as a plain dict for API responses.
        :rtype: dict
        """
        # Look the failed atoms up once; the value is reused for both the None check and the conversion.
        failed_atoms = self._get_failed_atoms()
        failed_atoms_api_representation = None
        if failed_atoms is not None:
            failed_atoms_api_representation = [failed_atom.api_representation() for failed_atom in failed_atoms]

        # Convert self._state_timestamps to OrderedDict to make raw API response more readable. Sort the entries
        # by numerically increasing dict value, with None values sorting highest.
        state_timestamps = OrderedDict(sorted(
            ((state.lower(), timestamp) for state, timestamp in self._state_timestamps.items()),
            key=lambda item: item[1] or float('inf')))

        return {
            'id': self._build_id,
            'status': self._status(),
            'artifacts': self._artifacts_archive_file,  # todo: this should probably be a url, not a file path
            'details': self._detail_message,
            'error_message': self._error_message,
            'num_atoms': self._num_atoms,
            'num_subjobs': len(self._all_subjobs_by_id),
            'failed_atoms': failed_atoms_api_representation,
            'result': self._result(),
            'request_params': self.build_request.build_parameters(),
            'state_timestamps': state_timestamps,
        }

    def generate_project_type(self):
        """
        Instantiate the project type for this build, populating the self._project_type instance variable.

        As a side effect, this method also updates the build request's build_parameters dictionary
        with the unique workspace directory path for this build.

        :raises BuildProjectError when failed to instantiate project type
        """
        # Generate a unique project build directory name that will be symlinked to the actual project directory
        # later on when the project gets fetched.
        build_specific_project_directory = self._generate_unique_symlink_path_for_build_repo()

        # Because build_specific_project_directory is entirely internal and generated by ClusterRunner (it is a
        # build-unique generated symlink), we must manually add it to the project_type_params
        project_type_params = self.build_request.build_parameters()
        project_type_params.update({'build_project_directory': build_specific_project_directory})
        self._project_type = util.create_project_type(project_type_params)

        if self._project_type is None:
            raise BuildProjectError('Build failed due to an invalid project type.')

    def prepare(self, subjob_calculator):
        """
        Fetch the project, compute the subjobs for this build and queue all of them as unstarted.
        May only be called once per build.

        :param subjob_calculator: Used after project fetch to atomize and group subjobs for this build
        :type subjob_calculator: SubjobCalculator
        :raises RuntimeError: if the build has no request or project type, if prepare() was already
            called, or if the project's job config is None
        """
        if not isinstance(self.build_request, BuildRequest):
            raise RuntimeError('Build {} has no associated request object.'.format(self._build_id))

        if not isinstance(self.project_type, ProjectType):
            raise RuntimeError('Build {} has no project set.'.format(self._build_id))

        if not self._preparation_coin.spend():
            raise RuntimeError('prepare() was called more than once on build {}.'.format(self._build_id))

        self._logger.info('Fetching project for build {}.', self._build_id)
        self.project_type.fetch_project()
        self._logger.info('Successfully fetched project for build {}.', self._build_id)

        job_config = self.project_type.job_config()
        if job_config is None:
            raise RuntimeError('Build failed while trying to parse clusterrunner.yaml.')

        subjobs = subjob_calculator.compute_subjobs_for_build(self._build_id, job_config, self.project_type)

        # NOTE(review): Queue(maxsize=0) is unbounded, so full() never returns True when no subjobs were
        # computed and such a build would never start postbuild tasks -- confirm this cannot happen.
        num_subjobs = len(subjobs)
        self._unstarted_subjobs = Queue(maxsize=num_subjobs)
        self._finished_subjobs = Queue(maxsize=num_subjobs)

        for subjob in subjobs:
            self._all_subjobs_by_id[subjob.subjob_id()] = subjob
            self._unstarted_subjobs.put(subjob)

        self._timing_file_path = self._project_type.timing_file_path(job_config.name)
        self.is_prepared = True
        self._record_state_timestamp(BuildStatus.PREPARED)

    def build_id(self):
        """
        :rtype: int
        """
        return self._build_id

    def all_subjobs(self):
        """
        Returns a list of subjobs for this build
        :rtype: list[Subjob]
        """
        return list(self._all_subjobs_by_id.values())

    def subjob(self, subjob_id):
        """
        Returns a single subjob
        :type subjob_id: int
        :rtype: Subjob
        :raises ItemNotFoundError: if no subjob with the given id belongs to this build
        """
        subjob = self._all_subjobs_by_id.get(subjob_id)
        if subjob is None:
            raise ItemNotFoundError('Invalid subjob id.')
        return subjob

    def complete_subjob(self, subjob_id, payload=None):
        """
        Handle the subjob payload and mark the given subjob id for this build as complete. Any error is
        logged, recorded on the build via mark_failed(), and then re-raised.
        :type subjob_id: int
        :type payload: dict
        """
        try:
            self._handle_subjob_payload(subjob_id, payload)
            self._mark_subjob_complete(subjob_id)

        except Exception:
            self._logger.exception('Error while completing subjob; marking build as failed.')
            self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id))
            raise

    def _parse_payload_for_atom_exit_code(self, subjob_id):
        """
        Read the exit code file from each atom's artifact directory and store the integer value on the
        corresponding atom of the given subjob.
        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        for atom_id, atom in enumerate(subjob.atoms):
            artifact_dir = BuildArtifact.atom_artifact_directory(
                self.build_id(),
                subjob.subjob_id(),
                atom_id,
                result_root=Configuration['results_directory'])
            atom_exit_code_file_sys_path = os.path.join(artifact_dir, BuildArtifact.EXIT_CODE_FILE)
            with open(atom_exit_code_file_sys_path, 'r') as atom_exit_code_file:
                atom.exit_code = int(atom_exit_code_file.read())

    def _handle_subjob_payload(self, subjob_id, payload):
        """
        Write a subjob's result payload into this build's results directory, extract it there, and
        read the atom exit codes from the extracted results.

        :type subjob_id: int
        :param payload: dict with a 'filename' and a 'body' entry; a falsy payload is logged and ignored
        :type payload: dict | None
        """
        if not payload:
            self._logger.warning('No payload for subjob {} of build {}.', subjob_id, self._build_id)
            return

        # Assertion: all payloads received from subjobs are uniquely named.
        result_file_path = os.path.join(self._build_results_dir(), payload['filename'])

        try:
            app.util.fs.write_file(payload['body'], result_file_path)
            app.util.fs.extract_tar(result_file_path, delete=True)
            self._parse_payload_for_atom_exit_code(subjob_id)
        except Exception:
            # Log which subjob failed, then let the caller deal with the original error.
            self._logger.warning('Writing payload for subjob {} of build {} FAILED.', subjob_id, self._build_id)
            raise

    def _read_subjob_timings_from_results(self):
        """
        Collect timing data from all subjobs
        :rtype: dict [str, float]
        """
        timings = {}
        for subjob in self._all_subjobs_by_id.values():
            timings.update(subjob.read_timings())

        return timings

    def _mark_subjob_complete(self, subjob_id):
        """
        Mark a subjob as completed and, once every subjob is finished, start the postbuild tasks on a
        separate thread.
        :type subjob_id: int
        """
        subjob = self.subjob(subjob_id)
        subjob.mark_completed()
        with self._build_completion_lock:
            self._finished_subjobs.put(subjob, block=False)
            subjobs_are_finished = self._subjobs_are_finished

        # We use a local variable here which was set inside the _build_completion_lock to prevent a race condition
        if subjobs_are_finished:
            self._logger.info("All results received for build {}!", self._build_id)
            SafeThread(target=self._perform_async_postbuild_tasks, name='PostBuild{}'.format(self._build_id)).start()

    def mark_started(self):
        """Flag that setup has started and record the BUILDING timestamp."""
        self._setup_is_started = True
        self._record_state_timestamp(BuildStatus.BUILDING)

    def mark_failed(self, failure_reason):
        """
        Mark a build as failed and set a failure reason. The failure reason should be something we can present to the
        end user of ClusterRunner, so try not to include detailed references to internal implementation.

        :type failure_reason: str
        """
        self._logger.error('Build {} failed: {}', self.build_id(), failure_reason)
        self._error_message = failure_reason
        self._record_state_timestamp(BuildStatus.ERROR)

    def cancel(self):
        """
        Cancel a running build. A build that is already FINISHED, ERROR or CANCELED is left untouched.
        """
        # Early exit if build is not running
        if self._status() in [BuildStatus.FINISHED, BuildStatus.ERROR, BuildStatus.CANCELED]:
            self._logger.notice('Ignoring cancel request for build {}. Build is already in state {}.',
                                self._build_id, self._status())
            return

        self._logger.notice('Canceling build {}.', self._build_id)
        self._is_canceled = True
        self._record_state_timestamp(BuildStatus.CANCELED)

        # Deplete the unstarted subjob queue.
        # TODO: Handle situation where cancel() is called while subjobs are being added to _unstarted_subjobs
        while self._unstarted_subjobs is not None and not self._unstarted_subjobs.empty():
            try:
                # A subjob may be asynchronously pulled from this queue, so we need to avoid blocking when empty.
                self._unstarted_subjobs.get(block=False)
            except Empty:
                break

    def validate_update_params(self, update_params):
        """
        Determine if a dict of update params are valid, and generate an error if not. When several
        params are invalid, the error message describes the last invalid one encountered.
        :param update_params: Params passed into a PUT for this build
        :type update_params: dict [str, str]
        :return: Whether the params are valid and a response containing an error message if not
        :rtype: tuple [bool, dict [str, str]]
        """
        keys_and_values_allowed = {'status': ['canceled']}
        message = None
        for key, value in update_params.items():
            if key not in keys_and_values_allowed:
                message = 'Key ({}) is not in list of allowed keys ({})'.format(
                    key, ",".join(keys_and_values_allowed))
            elif value not in keys_and_values_allowed[key]:
                message = 'Value ({}) is not in list of allowed values ({}) for {}'.format(
                    value, keys_and_values_allowed[key], key)

        if message is not None:
            return False, {'error': message}
        return True, {}

    def update_state(self, update_params):
        """
        Make updates to the state of this build given a set of update params
        :param update_params: The keys and values to update on this build
        :type update_params: dict [str, str]
        :return: True if at least one param triggered an update, False otherwise
        :rtype: bool
        """
        success = False
        for key, value in update_params.items():
            if key == 'status' and value == 'canceled':
                self.cancel()
                success = True
        return success

    @property
    def project_type(self):
        """
        :rtype: ProjectType
        """
        return self._project_type

    @property
    def artifacts_archive_file(self):
        """The value stored by _create_build_artifact(), or None if no artifact has been created yet."""
        return self._artifacts_archive_file

    @property
    def _num_subjobs_total(self):
        """Number of subjobs registered on this build."""
        return len(self._all_subjobs_by_id)

    @property
    def _num_subjobs_finished(self):
        """Number of subjobs in the finished queue; 0 before the queue exists."""
        return 0 if not self._finished_subjobs else self._finished_subjobs.qsize()

    @property
    def _num_atoms(self):
        """Total atomic command count across all subjobs, or None unless the build is BUILDING or FINISHED."""
        if self._status() not in [BuildStatus.BUILDING, BuildStatus.FINISHED]:
            return None
        return sum(len(subjob.atomic_commands()) for subjob in self._all_subjobs_by_id.values())

    @property
    def _subjobs_are_finished(self):
        """True for a canceled build, or once the build is prepared and the finished queue is full."""
        return self._is_canceled or (self.is_prepared and self._finished_subjobs.full())

    @property
    def is_finished(self):
        """True for a canceled build, or once the postbuild tasks have finished."""
        # TODO: Clean up this logic or move everything into a state machine
        return self._is_canceled or self._postbuild_tasks_are_finished

    @property
    def is_unstarted(self):
        """True while the build is prepared, setup has not started and the unstarted queue is still full."""
        return self.is_prepared and not self._setup_is_started and self._unstarted_subjobs.full()

    @property
    def has_error(self):
        """True once mark_failed() has stored an error message."""
        return self._error_message is not None

    @property
    def _detail_message(self):
        """Human-readable subjob progress, or None when the build has no subjobs."""
        if self._num_subjobs_total > 0:
            return '{} of {} subjobs are complete ({:.1f}%).'.format(
                self._num_subjobs_finished, self._num_subjobs_total,
                100 * self._num_subjobs_finished / self._num_subjobs_total)
        return None

    def _status(self):
        """
        Derive the build status. An error takes precedence over cancellation, which takes precedence
        over every other state.
        :rtype: BuildStatus
        """
        if self.has_error:
            return BuildStatus.ERROR
        elif self._is_canceled:
            return BuildStatus.CANCELED
        elif not self.is_prepared or self.is_unstarted:
            return BuildStatus.QUEUED
        elif self.is_finished:
            return BuildStatus.FINISHED
        else:
            return BuildStatus.BUILDING

    def _get_failed_atoms(self):
        """
        The atoms that failed. Returns None if the build hasn't completed yet. Returns an empty list if
        the build was canceled, or if it has completed and no atoms have failed. The computed list is
        cached on the instance.
        :rtype: list[Atom] | None
        """
        if self._failed_atoms is None and self.is_finished:
            if self._is_canceled:
                return []

            # Build the list locally and publish it in a single assignment, so an error part-way through
            # (or a concurrent reader) can never observe or cache a partially filled list.
            failed_atoms = [
                self.subjob(subjob_id).atoms[atom_id]
                for subjob_id, atom_id in self._build_artifact.get_failed_subjob_and_atom_ids()
            ]
            self._failed_atoms = failed_atoms

        return self._failed_atoms

    def _result(self):
        """
        FAILURE for a canceled build, NO_FAILURES or FAILURE once the build is finished, None before that
        :rtype: str | None
        """
        if self._is_canceled:
            return BuildResult.FAILURE

        if self.is_finished:
            if len(self._build_artifact.get_failed_subjob_and_atom_ids()) == 0:
                return BuildResult.NO_FAILURES
            return BuildResult.FAILURE
        return None

    def _perform_async_postbuild_tasks(self):
        """
        Once a build is complete, certain tasks can be performed asynchronously.
        """
        self._create_build_artifact()
        self._logger.debug('Postbuild tasks completed for build {}', self.build_id())
        self._postbuild_tasks_are_finished = True
        self._record_state_timestamp(BuildStatus.FINISHED)

    def _create_build_artifact(self):
        """
        Create the BuildArtifact for the results directory, generate its failures file, write timing
        data, and compress the results directory into 'results.tar.gz'.
        """
        self._build_artifact = BuildArtifact(self._build_results_dir())
        self._build_artifact.generate_failures_file()
        self._build_artifact.write_timing_data(self._timing_file_path, self._read_subjob_timings_from_results())
        self._artifacts_archive_file = app.util.fs.compress_directory(self._build_results_dir(), 'results.tar.gz')

    def _build_results_dir(self):
        """
        The artifact directory for this build under the configured 'results_directory'
        """
        return BuildArtifact.build_artifact_directory(
            self.build_id(), result_root=Configuration['results_directory'])

    def _generate_unique_symlink_path_for_build_repo(self):
        """
        Generate a unique symlink path for a build-specific repo. This method does NOT generate the symlink itself.
        :rtype: str
        """
        return os.path.join(Configuration['build_symlink_directory'], str(uuid.uuid4()))

    def get_state_timestamp(self, build_status):
        """
        Get the recorded timestamp for a given build status. This may be None if the build has not yet reached
        the specified state.
        :param build_status: The build status for which to retrieve the corresponding timestamp
        :type build_status: BuildStatus
        :return: The timestamp for the specified status
        :rtype: float | None
        """
        return self._state_timestamps.get(build_status)

    def _record_state_timestamp(self, build_status):
        """
        Record a timestamp for a given build status. This is used to record the timing of the various build phases and
        is exposed via the Build object's API representation. Overwriting an existing timestamp logs a warning.
        :param build_status: The build status for which to record a timestamp
        :type build_status: BuildStatus
        """
        if self._state_timestamps.get(build_status) is not None:
            # Pass the values as logger arguments, matching every other log call in this class.
            self._logger.warning('Overwriting timestamp for build {}, status {}', self.build_id(), build_status)
        self._state_timestamps[build_status] = time.time()