def get_gpus(docker_manager, gpu_settings, gpu_ids):
    """
    Returns a list of gpus that are sufficient for the given gpu settings. Otherwise an InsufficientGPUError is raised.

    :param docker_manager: The DockerManager used to query gpus
    :type docker_manager: DockerManager
    :param gpu_settings: The gpu settings of the red experiment specifying the required gpus
    :type gpu_settings: Dict
    :param gpu_ids: The gpu_ids specified by the user to use for the execution. If None, all gpus are considered.
    :type gpu_ids: List[int] or None

    :return: A list of GPUDevices to use for this experiment
    :rtype: List[GPUDevice]

    :raise InsufficientGPUError: If GPU settings could not be fulfilled
    """
    gpus = None

    gpu_requirements = get_gpu_requirements(gpu_settings)

    # don't do anything if no gpus are required
    if gpu_requirements or gpu_ids:
        gpu_devices = get_gpu_devices(docker_manager, gpu_ids)

        gpus = match_gpus(gpu_devices, gpu_requirements)

        # if gpu_ids are specified, use exactly those devices (the match_gpus call above still validates the requirements)
        if gpu_ids:
            gpus = gpu_devices

    return gpus
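
A minimal usage sketch, not part of the original module: the gpu settings layout and the availability of InsufficientGPUError in the calling scope are assumptions based on the docstring above.

def _example_request_gpus(docker_manager):
    # hypothetical red gpu settings; the exact keys are an assumption
    gpu_settings = {'count': 2}
    try:
        # get_gpus() validates the requirements against the available devices
        return get_gpus(docker_manager, gpu_settings, gpu_ids=None)
    except InsufficientGPUError as e:
        print('gpu requirements could not be fulfilled: {}'.format(e))
        return None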
Example 2
    def _node_possibly_sufficient(node, experiment):
        """
        Returns True if the node could be sufficient for the experiment, even if the node does not have
        sufficient hardware at the moment (because of running batches).

        :param node: The node to check
        :type node: CompleteNode
        :param experiment: The experiment to check the node against.
        :type experiment: Dict
        :return: True if the node is possibly sufficient, otherwise False
        :rtype: bool
        """
        # check if node is initialized
        if (node.ram is None) or (node.gpus is None):
            return False

        if node.ram < experiment['container']['settings']['ram']:
            return False

        gpu_requirements = get_gpu_requirements(
            experiment['container']['settings'].get('gpus'))

        try:
            match_gpus(node.gpus, gpu_requirements)
        except InsufficientGPUError:
            return False
        return True
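
For reference, a sketch of the experiment layout that these sufficiency checks read; the values and the inner layout of the 'gpus' entry are assumptions, only the keys accessed in the code above are taken from it.

_example_experiment = {
    'container': {
        'settings': {
            'ram': 4096,           # required RAM; value and unit are assumed
            'gpus': {'count': 1}   # optional gpu settings; this layout is an assumption
        }
    }
}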
Example 3
    def _node_sufficient(node, experiment):
        """
        Returns True if the node's hardware is sufficient for the experiment

        :param node: The node to test
        :type node: CompleteNode
        :param experiment: A dictionary containing hardware requirements for the experiment
        :type experiment: Dict
        :return: True if the node's hardware is sufficient for the experiment, otherwise False
        :rtype: bool
        """

        if not node.online:
            return False

        if node.ram_available < experiment['container']['settings']['ram']:
            return False

        # check gpus
        gpu_requirements = get_gpu_requirements(
            experiment['container']['settings'].get('gpus'))

        try:
            match_gpus(node.gpus_available, gpu_requirements)
        except InsufficientGPUError:
            return False

        return True
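
A small sketch, not from the source, of a node stand-in carrying the attributes that distinguish the two checks: _node_possibly_sufficient reads the node's total resources, while _node_sufficient reads what is currently available.

import collections

# stand-in for CompleteNode with only the attributes used by the two checks
_FakeNode = collections.namedtuple(
    '_FakeNode',
    ['node_name', 'online', 'ram', 'ram_available', 'gpus', 'gpus_available']
)

_node = _FakeNode(
    node_name='node-1',
    online=True,
    ram=16384,           # total RAM, read by _node_possibly_sufficient
    ram_available=2048,  # currently free RAM, read by _node_sufficient
    gpus=[],             # all GPUs of the node
    gpus_available=[]    # GPUs not claimed by running batches
)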
Example 4
    def _schedule_batch(self, next_batch, nodes, batch_count_cache):
        """
        Tries to find a node that is capable of processing the given batch. If no capable node could be found, None is
        returned.
        If such a node is found, it is written to the node property of the batch and the batch's state is updated to
        'scheduled'.

        :param next_batch: The batch to schedule.
        :param nodes: The nodes on which the batch should be scheduled.
        :type nodes: List[CompleteNode]
        :param batch_count_cache: A dictionary mapping experiment ids to the number of batches of that experiment which
                                  are in state 'processing' or 'scheduled'. This dictionary is allowed to overestimate
                                  the number of batches.
        :type batch_count_cache: Dict[str, int]
        :return: The name of the node on which the given batch is scheduled, or None if the batch could not be
                 scheduled.
        :raise TrusteeServiceError: If the trustee service is unavailable.
        """
        batch_id = str(next_batch['_id'])
        experiment_id = next_batch['experimentId']

        try:
            experiment = self._get_experiment_of_batch(experiment_id)
        except Exception as e:
            batch_failure(self._mongo,
                          batch_id,
                          repr(e),
                          None,
                          next_batch['state'],
                          disable_retry_if_failed=True)
            return None

        ram = experiment['container']['settings']['ram']

        # limit the number of currently executed batches from a single experiment
        concurrency_limit = experiment.get('execution', {}).get(
            'settings', {}).get('batchConcurrencyLimit', 64)

        # number of batches of the given experiment which are scheduled or processing
        batch_count = self._get_number_of_batches_of_experiment(
            experiment_id, batch_count_cache)

        if batch_count >= concurrency_limit:
            return None

        # check impossible experiments
        if not Scheduler._check_nodes_possibly_sufficient(nodes, experiment):
            debug_info = 'There are no nodes configured that are possibly sufficient for experiment "{}"' \
                .format(next_batch['experimentId'])
            batch_failure(self._mongo,
                          batch_id,
                          debug_info,
                          None,
                          next_batch['state'],
                          disable_retry_if_failed=True)
            return None

        # select node
        selected_node = Scheduler._get_best_node(nodes, experiment)

        if selected_node is None:
            return None

        # calculate ram / gpus
        selected_node.ram_available -= ram

        used_gpu_ids = None
        if selected_node.gpus_available:
            gpu_requirements = get_gpu_requirements(
                experiment['container']['settings'].get('gpus'))
            available_gpus = selected_node.gpus_available
            used_gpus = match_gpus(available_gpus,
                                   requirements=gpu_requirements)

            used_gpu_ids = []
            for gpu in used_gpus:
                used_gpu_ids.append(gpu.device_id)
                available_gpus.remove(gpu)

        # check mounting
        mount_connectors = red_get_mount_connectors_from_inputs(
            next_batch['inputs'])
        is_mounting = bool(mount_connectors)

        allow_insecure_capabilities = self._conf.d['controller']['docker'].get(
            'allow_insecure_capabilities', False)

        if not allow_insecure_capabilities and is_mounting:
            # set state to failed, because insecure capabilities are not allowed but are needed by this batch.
            debug_info = 'FUSE support for this agency is disabled, but the following input/output-keys are ' \
                         'configured to mount inside a docker container.{}{}'.format(os.linesep, mount_connectors)
            batch_failure(self._mongo,
                          batch_id,
                          debug_info,
                          None,
                          next_batch['state'],
                          disable_retry_if_failed=True)
            return None

        # update batch data
        update_result = self._mongo.db['batches'].update_one(
            {
                '_id': next_batch['_id'],
                'state': next_batch['state']
            }, {
                '$set': {
                    'state': 'scheduled',
                    'node': selected_node.node_name,
                    'usedGPUs': used_gpu_ids,
                    'mount': is_mounting
                },
                '$push': {
                    'history': {
                        'state': 'scheduled',
                        'time': time(),
                        'debugInfo': None,
                        'node': selected_node.node_name,
                        'ccagent': None,
                        'dockerStats': None
                    }
                },
                '$inc': {
                    'attempts': 1
                }
            })

        if update_result.modified_count == 1:
            # The state of the scheduled batch switched from 'registered' to 'scheduled', so increase the batch_count.
            # batch_count_cache always contains experiment_id, because _get_number_of_batches_of_experiment()
            # always inserts the given experiment_id
            batch_count_cache[experiment_id] += 1

            return selected_node.node_name
        else:
            return None
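
A hypothetical driver, an assumption rather than code from the source, showing how _schedule_batch() could be called for all registered batches while sharing one batch_count_cache per pass, so the per-experiment concurrency limit holds across the whole pass.

def _example_schedule_pass(scheduler, nodes):
    # 'scheduler' is assumed to expose _mongo and _schedule_batch as in the method above
    batch_count_cache = {}  # experiment id -> scheduled/processing batch count
    for next_batch in scheduler._mongo.db['batches'].find({'state': 'registered'}):
        node_name = scheduler._schedule_batch(next_batch, nodes, batch_count_cache)
        if node_name is not None:
            print('scheduled batch {} on node {}'.format(next_batch['_id'], node_name))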