Code example #1
0
    def _create_blue_batch(self, batch):
        """
        Builds the blue batch dictionary for the given batch description.

        Collects the batch's secret values from the trustee service, fills
        them into the batch, and converts the resulting red data into exactly
        one blue batch.

        :param batch: The batch description
        :type batch: dict
        :return: A dictionary containing a blue batch
        :rtype: dict
        :raise TrusteeServiceError: If the trustee service is unavailable or unable to collect the requested secret keys
        :raise ValueError: If there was more than one blue batch after red_to_blue
        """
        batch_id = str(batch['_id'])

        # Ask the trustee service for the secret values referenced by this batch.
        secret_keys = get_batch_secret_keys(batch)
        response = self._trustee_client.collect(secret_keys)

        if response['state'] == 'failed':
            debug_info = 'Trustee service failed:\n{}'.format(
                response['debug_info'])
            # Mark the batch as failed before propagating the error.
            batch_failure(self._mongo,
                          batch_id,
                          debug_info,
                          None,
                          batch['state'],
                          disable_retry_if_failed=response.get('disable_retry'))
            raise TrusteeServiceError(debug_info)

        batch = fill_batch_secrets(batch, response['secrets'])

        experiment = self._mongo.db['experiments'].find_one(
            {'_id': ObjectId(batch['experimentId'])})

        red_data = {
            'redVersion': experiment['redVersion'],
            'cli': experiment['cli'],
            'inputs': batch['inputs'],
            'outputs': batch['outputs']
        }

        blue_batches = convert_red_to_blue(red_data)

        # Exactly one blue batch is expected per red batch.
        if len(blue_batches) != 1:
            raise ValueError(
                'Got {} batches, but only one was asserted.'.format(
                    len(blue_batches)))

        return blue_batches[0]
Code example #2
0
    def _set_offline(self, debug_info):
        """
        Marks this node as offline in the database and fails every batch that
        is currently scheduled on or being processed by this node.

        :param debug_info: Description of why the node went offline; stored in
                           the node's state history.
        :type debug_info: str
        """
        print('Node offline:', self._node_name)

        self._online.clear()

        timestamp = time.time()
        bson_node_id = ObjectId(self._node_id)
        self._mongo.db['nodes'].update_one({'_id': bson_node_id}, {
            '$set': {
                'state': 'offline'
            },
            '$push': {
                'history': {
                    'state': 'offline',
                    'time': timestamp,
                    'debugInfo': debug_info
                }
            }
        })

        # change state of assigned batches
        cursor = self._mongo.db['batches'].find(
            {
                'node': self._node_name,
                'state': {
                    '$in': ['scheduled', 'processing']
                }
            }, {
                '_id': 1,
                'state': 1
            })

        # Loop-invariant failure message, hoisted out of the loop; a distinct
        # name avoids shadowing the debug_info parameter used above.
        batch_debug_info = 'Node offline: {}'.format(self._node_name)
        for batch in cursor:
            batch_failure(self._mongo, str(batch['_id']), batch_debug_info,
                          None, batch['state'])
Code example #3
0
    def _check_exited_container(self, container, batch):
        """
        Inspects the logs of the given exited container and updates the database accordingly.

        The container's stdout is expected to hold the agent result as a JSON
        document. Any failure along the way (docker API error, invalid JSON,
        schema violation, agent-reported failure, unexpected batch state)
        records a batch failure and returns early; only a fully valid result
        moves the batch from 'processing' to 'succeeded'.

        :param container: The container to inspect
        :type container: Container
        :param batch: The batch to update according to the result of the container execution.
        :type batch: dict
        """
        bson_batch_id = batch['_id']
        batch_id = str(bson_batch_id)

        # Fetch stdout (agent result), stderr and resource stats from docker.
        try:
            stdout_logs = container.logs(stderr=False).decode('utf-8')
            stderr_logs = container.logs(stdout=False).decode('utf-8')
            docker_stats = container.stats(stream=False)
        except Exception as e:
            err_str = repr(e)
            self._log('Failed to get container logs:\n{}'.format(err_str))
            debug_info = 'Could not get logs or stats of container: {}'.format(
                err_str)
            batch_failure(self._mongo, batch_id, debug_info, None,
                          batch['state'])
            return

        # Parse the agent result; data stays None if parsing fails.
        data = None
        try:
            data = json.loads(stdout_logs)
        except json.JSONDecodeError as e:
            err_str = repr(e)
            debug_info = 'CC-Agent data is not a valid json object: {}\n\nstdout was:\n{}'.format(
                err_str, stdout_logs)
            batch_failure(self._mongo,
                          batch_id,
                          debug_info,
                          data,
                          batch['state'],
                          docker_stats=docker_stats)
            self._log(
                'Failed to load json from blue agent:\n{}'.format(err_str))
            return

        # The parsed result must comply with the agent result schema.
        try:
            jsonschema.validate(data, agent_result_schema)
        except jsonschema.ValidationError as e:
            err_str = repr(e)
            debug_info = 'CC-Agent data sent by callback does not comply with jsonschema: {}'.format(
                err_str)
            batch_failure(self._mongo,
                          batch_id,
                          debug_info,
                          data,
                          batch['state'],
                          docker_stats=docker_stats)
            self._log(
                'Failed to validate blue agent output:\n{}'.format(err_str))
            return

        # The agent itself may report failure; include its stderr for context.
        if data['state'] == 'failed':
            debug_info = 'Batch failed.\nContainer stderr:\n{}\ndebug info:\n{}'.format(
                stderr_logs, data['debugInfo'])
            batch_failure(self._mongo,
                          batch_id,
                          debug_info,
                          data,
                          batch['state'],
                          docker_stats=docker_stats)
            return

        # Re-read the batch: its state may have changed while the container ran.
        batch = self._mongo.db['batches'].find_one({'_id': bson_batch_id}, {
            'attempts': 1,
            'node': 1,
            'state': 1
        })
        if batch['state'] != 'processing':
            debug_info = 'Batch failed.\nExited container, but not in state processing.'
            batch_failure(self._mongo,
                          batch_id,
                          debug_info,
                          data,
                          batch['state'],
                          docker_stats=docker_stats)
            return

        # Compare-and-set on state 'processing' so a concurrent state change
        # cannot be overwritten.
        self._mongo.db['batches'].update_one(
            {
                '_id': bson_batch_id,
                'state': 'processing'
            }, {
                '$set': {
                    'state': 'succeeded'
                },
                '$push': {
                    'history': {
                        'state': 'succeeded',
                        'time': time.time(),
                        'debugInfo': None,
                        'node': batch['node'],
                        'ccagent': data,
                        'dockerStats': docker_stats
                    }
                }
            })
Code example #4
0
 def _pull_image_failure(self, debug_info, batch_id, current_state):
     """Record a batch failure caused by a failed docker image pull."""
     batch_failure(self._mongo, batch_id, debug_info, None, current_state)
Code example #5
0
 def _run_batch_container_failure(self, batch_id, debug_info, current_state):
     """Record a batch failure that occurred while running the batch container."""
     batch_failure(self._mongo, batch_id, debug_info, None, current_state)
Code example #6
0
    def _schedule_batch(self, next_batch, nodes, batch_count_cache):
        """
        Tries to find a node that is capable of processing the given batch. If no capable node could be found, None is
        returned.
        If a node was found, that is capable of processing the given batch, this node is written to the node property of
        the batch. The batches state is then updated to 'scheduled'.

        :param next_batch: The batch to schedule.
        :param nodes: The nodes on which the batch should be scheduled.
        :type nodes: List[CompleteNode]
        :param batch_count_cache: A dictionary mapping experiment ids to the number of batches of this experiment, which
                                  in state processing or scheduled. This dictionary is allowed to overestimate the
                                  number of batches.
        :type batch_count_cache: Dict[str, int]
        :return: The name of the node on which the given batch is scheduled
        If the batch could not be scheduled None is returned
        :raise TrusteeServiceError: If the trustee service is unavailable.
        """
        batch_id = str(next_batch['_id'])
        experiment_id = next_batch['experimentId']

        # A missing/broken experiment is unrecoverable: fail without retry.
        try:
            experiment = self._get_experiment_of_batch(experiment_id)
        except Exception as e:
            batch_failure(self._mongo,
                          batch_id,
                          repr(e),
                          None,
                          next_batch['state'],
                          disable_retry_if_failed=True)
            return None

        ram = experiment['container']['settings']['ram']

        # limit the number of currently executed batches from a single experiment
        concurrency_limit = experiment.get('execution', {}).get(
            'settings', {}).get('batchConcurrencyLimit', 64)

        # number of batches which are scheduled or processing of the given experiment
        batch_count = self._get_number_of_batches_of_experiment(
            experiment_id, batch_count_cache)

        if batch_count >= concurrency_limit:
            return None

        # check impossible experiments
        if not Scheduler._check_nodes_possibly_sufficient(nodes, experiment):
            debug_info = 'There are no nodes configured that are possibly sufficient for experiment "{}"' \
                .format(next_batch['experimentId'])
            batch_failure(self._mongo,
                          batch_id,
                          debug_info,
                          None,
                          next_batch['state'],
                          disable_retry_if_failed=True)
            return None

        # select node
        selected_node = Scheduler._get_best_node(nodes, experiment)

        if selected_node is None:
            return None

        # calculate ram / gpus
        # Reserve the batch's resources on the in-memory node representation.
        selected_node.ram_available -= ram

        used_gpu_ids = None
        if selected_node.gpus_available:
            gpu_requirements = get_gpu_requirements(
                experiment['container']['settings'].get('gpus'))
            available_gpus = selected_node.gpus_available
            used_gpus = match_gpus(available_gpus,
                                   requirements=gpu_requirements)

            # Remove matched GPUs from the node's available pool.
            used_gpu_ids = []
            for gpu in used_gpus:
                used_gpu_ids.append(gpu.device_id)
                available_gpus.remove(gpu)

        # check mounting
        mount_connectors = red_get_mount_connectors_from_inputs(
            next_batch['inputs'])
        is_mounting = bool(mount_connectors)

        allow_insecure_capabilities = self._conf.d['controller']['docker'].get(
            'allow_insecure_capabilities', False)

        if not allow_insecure_capabilities and is_mounting:
            # set state to failed, because insecure_capabilities are not allowed but needed, by this batch.
            debug_info = 'FUSE support for this agency is disabled, but the following input/output-keys are ' \
                         'configured to mount inside a docker container.{}{}'.format(os.linesep, mount_connectors)
            batch_failure(self._mongo,
                          batch_id,
                          debug_info,
                          None,
                          next_batch['state'],
                          disable_retry_if_failed=True)
            return None

        # update batch data
        # Compare-and-set on the batch's current state so a concurrent
        # scheduler cannot schedule the same batch twice.
        # NOTE(review): time() here suggests `from time import time` at file
        # top (unlike time.time() elsewhere in this project) — confirm import.
        update_result = self._mongo.db['batches'].update_one(
            {
                '_id': next_batch['_id'],
                'state': next_batch['state']
            }, {
                '$set': {
                    'state': 'scheduled',
                    'node': selected_node.node_name,
                    'usedGPUs': used_gpu_ids,
                    'mount': is_mounting
                },
                '$push': {
                    'history': {
                        'state': 'scheduled',
                        'time': time(),
                        'debugInfo': None,
                        'node': selected_node.node_name,
                        'ccagent': None,
                        'dockerStats': None
                    }
                },
                '$inc': {
                    'attempts': 1
                }
            })

        if update_result.modified_count == 1:
            # The state of the scheduled batch switched from 'registered' to 'scheduled', so increase the batch_count.
            # batch_count_cache always contains experiment_id, because _get_number_of_batches_of_experiment()
            # always inserts the given experiment_id
            batch_count_cache[experiment_id] += 1

            return selected_node.node_name
        else:
            return None