Example #1
    async def manage_start_partition_managers(self):
        """Start and manage partitions manager.

        Ensure that all started partitions manager registered in governor.
        """
        try:
            # start all partition managers
            try:
                _logger.info('starting all partition managers ...')
                await asyncio.gather(
                    *[manager.launch() for manager in self._partition_managers])
            except Exception:
                _logger.error(
                    'one of the partition managers failed to start - killing the rest')
                # if one of the partition managers didn't start, stop all instances
                # and set the governor manager finish flag
                await self._terminate_partition_managers_and_finish()
                raise InternalError('Partition managers not started')

            _logger.info('all (%s) partition managers started successfully',
                         len(self._partition_managers))

            # check whether they registered within the assumed time
            _logger.debug('waiting %s secs for partition manager registration',
                          self._wait_for_register_timeout)

            start_of_waiting_for_registration = datetime.now()
            while len(self.managers) < len(self._partition_managers):
                await asyncio.sleep(0.2)

                if (datetime.now() - start_of_waiting_for_registration).total_seconds() > \
                        self._wait_for_register_timeout:
                    # timeout exceeded
                    _logger.error(
                        'not all partition managers registered - only %d of %d total',
                        len(self.managers), len(self._partition_managers))
                    await self._terminate_partition_managers_and_finish()
                    raise InternalError('Partition managers not registered')

            _logger.info('available resources: %d (%d used) cores on %d nodes',
                         self.total_resources.total_cores,
                         self.total_resources.used_cores,
                         self.total_resources.total_nodes)

            _logger.info('all partition managers registered')
        except Exception:
            _logger.exception('setup of partition managers failed')
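
The registration wait above is a poll-with-timeout loop. A minimal standalone sketch of the same pattern (the wait_until helper and its predicate argument are illustrative, not part of QCG-PilotJob):

import asyncio
from datetime import datetime

async def wait_until(predicate, timeout, interval=0.2):
    """Poll predicate every interval seconds until it returns True.

    Returns True on success, False when timeout seconds elapse first.
    """
    start = datetime.now()
    while not predicate():
        if (datetime.now() - start).total_seconds() > timeout:
            return False
        await asyncio.sleep(interval)
    return True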
Example #2
    def _check_resources(self):
        """Validate if the resources has been set.

        Raises:
            InternalError: when resources has not been set
        """
        if self.resources is None:
            raise InternalError("Missing resources in scheduler algorithm")
Example #3
    def release(self, cralloc):
        """Release allocated consumable resources.

        Args:
            cralloc (CRAllocation): allocation to release

        Raises:
            InternalError: when the allocation has a wrong type or its size is greater than
                the currently used resources; in either case no resources are released.
        """
        if not isinstance(cralloc, CRAllocation):
            raise InternalError("failed type of CR allocation - {} vs expected CRAllocation".format(
                type(cralloc).__name__))

        if cralloc.count > self.used:
            raise InternalError("failed to release more resources {} than is allocated {} (CR {})".format(
                cralloc.count, self.used, self.crtype.name))

        self.used -= cralloc.count
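
For context, a self-contained sketch of the allocate/release bookkeeping this method guards, assuming a simple counter-based pool (SimpleCRPool is illustrative, not the QCG-PilotJob API):

class SimpleCRPool:
    """Counter-based pool of a consumable resource (illustrative only)."""

    def __init__(self, total):
        self.total = total
        self.used = 0

    def allocate(self, count):
        # refuse over-allocation, the mirror image of the release-side check above
        if self.used + count > self.total:
            raise ValueError('not enough free resources')
        self.used += count
        return count  # stands in for a CRAllocation object

    def release(self, count):
        if count > self.used:
            raise ValueError('failed to release more resources than are allocated')
        self.used -= count

pool = SimpleCRPool(4)
alloc = pool.allocate(3)
pool.release(alloc)
print(pool.used)  # 0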
Example #4
    def release(self, allocation):
        """Release allocation on a node.

        Args:
            allocation (NodeAllocation): allocated resources
        """
        if allocation.ncores > self.used:
            raise InternalError('trying to release more cores than are used on node {}'.format(self._name))

        self._free_cores = sorted(self._free_cores + allocation.cores,
                                  key=lambda c: int(str(c).split(',')[0]))

        if allocation.crs:
            if not self._crs:
                raise InternalError('trying to release CRs which are not available on node {}'.format(self._name))

            for crtype, cr_bind in allocation.crs.items():
                if crtype not in self._crs:
                    raise InternalError('CR {} not available on node {}'.format(crtype.name, self._name))

                self._crs[crtype].release(cr_bind)

        if self.resources is not None:
            self.resources.node_cores_released(allocation.ncores)
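
The sort key above orders core identifiers numerically, taking the first element when a core is given as a comma-separated group (e.g. '0,1'). A quick standalone check of that key:

key = lambda c: int(str(c).split(',')[0])
print(sorted(['10', '2', '0,1', 4], key=key))  # ['0,1', '2', 4, '10']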
Example #5
    @classmethod
    def get_schema(cls, resources, config):
        """Create and return suitable instance of execution schema.

        Currently the decision about the type of execution schema is based on the origin of the resources: if the
        QCG-PilotJob manager runs inside a Slurm allocation, SlurmExecution is selected; otherwise the DirectExecution
        schema is instantiated.

        Args:
            resources (Resources): available resources
            config (dict): QCG-PilotJob configuration

        Returns:
            ExecutionSchema: instance of execution schema

        Raises:
            InternalError: when the resources type is unknown
        """
        if resources.rtype not in __SCHEMAS__:
            raise InternalError('Unknown resources type: {}'.format(
                resources.rtype))

        return __SCHEMAS__[resources.rtype](resources, config)
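
A self-contained sketch of the registry-dispatch pattern behind __SCHEMAS__ (the enum values and schema classes below are stand-ins, not the QCG-PilotJob definitions):

from enum import Enum

class ResourcesTypeSketch(Enum):
    LOCAL = 'local'
    SLURM = 'slurm'

class DirectExecutionSketch:
    def __init__(self, resources, config):
        self.resources = resources
        self.config = config

class SlurmExecutionSketch(DirectExecutionSketch):
    pass

SCHEMAS_SKETCH = {
    ResourcesTypeSketch.LOCAL: DirectExecutionSketch,
    ResourcesTypeSketch.SLURM: SlurmExecutionSketch,
}

def get_schema_sketch(rtype, resources=None, config=None):
    # unknown resource types are rejected, as in the method above
    if rtype not in SCHEMAS_SKETCH:
        raise ValueError('Unknown resources type: {}'.format(rtype))
    return SCHEMAS_SKETCH[rtype](resources, config)

print(type(get_schema_sketch(ResourcesTypeSketch.SLURM)).__name__)  # SlurmExecutionSketch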
Example #6
    def preprocess(self, ex_job):
        """Preprocess job iteration description before launching.

        Prepare job iteration execution arguments.

        Args:
            ex_job (ExecutionJob): execution job iteration data
        """
        # since single-core jobs are launched directly by the agent or locally, without Slurm interaction,
        # this preprocessing should be executed only for parallel jobs
        if len(ex_job.allocation.nodes) != 1 or ex_job.allocation.nodes[0].ncores != 1:
            job_model = ex_job.job_execution.model or 'default'

            _logger.debug(f'looking for job model {job_model}')

            preprocess_method = SlurmExecution.JOB_MODELS.get(job_model)
            if not preprocess_method:
                raise InternalError(
                    f"unknown job execution model '{job_model}'")

            method = getattr(self, preprocess_method)
            method(ex_job)
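
The method resolves the preprocessing routine by name through a class-level JOB_MODELS mapping and getattr. A minimal self-contained sketch of that dispatch-by-name pattern (the class, model and method names here are illustrative, not the SlurmExecution definitions):

class ModelDispatcherSketch:
    JOB_MODELS = {
        'default': '_preprocess_default',
        'openmpi': '_preprocess_openmpi',
    }

    def preprocess(self, job, model='default'):
        method_name = self.JOB_MODELS.get(model)
        if method_name is None:
            raise ValueError("unknown job execution model '{}'".format(model))
        # resolve the bound method by name and invoke it
        getattr(self, method_name)(job)

    def _preprocess_default(self, job):
        print('default preprocessing for', job)

    def _preprocess_openmpi(self, job):
        print('openmpi preprocessing for', job)

ModelDispatcherSketch().preprocess('job-1', model='openmpi')  # openmpi preprocessing for job-1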
Example #7
    async def cancel_iteration(self, job, iteration):
        """Cancel already running job.

        Args:
            job (Job): an iteration to cancel
            iteration (int, optional): an iteraiton index
        """
        # find iteration to cancel
        attempt = 0
        while True:
            try:
                exec_job = next(exec_job for exec_job in self._not_finished.values()
                                if exec_job.job_iteration.job == job
                                and exec_job.job_iteration.iteration == iteration)
                _logger.info('found execution job to cancel')
                break
            except StopIteration:
                if self._manager.queued_to_execute > 0 and attempt < 3:
                    _logger.info('iteration to cancel not found but there are jobs queued for execution - waiting a moment')
                    attempt += 1
                    await asyncio.sleep(0.5)
                else:
                    _logger.error(f'iteration to cancel {job.name}:{iteration} not found in executor')
                    raise InternalError('iteration to cancel not found')

        asyncio.ensure_future(exec_job.cancel())
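
The lookup above relies on next() over a generator expression, which raises StopIteration when no element matches; a compact standalone illustration:

items = [{'job': 'a', 'iteration': 0}, {'job': 'a', 'iteration': 1}]
try:
    match = next(i for i in items if i['job'] == 'a' and i['iteration'] == 1)
except StopIteration:
    match = None
print(match)  # {'job': 'a', 'iteration': 1}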
Example #8
    async def _launch_partition_managers(self, nodes_in_partition):
        """Launch partition managers.

        The information about the Slurm allocation is gathered, all nodes are split into groups of
        ``nodes_in_partition`` nodes (each group forming a single partition), and a partition manager instance is
        created to control each partition.

        Args:
            nodes_in_partition (int): how many nodes each partition should have

        Raises:
            InvalidRequest: when
                * ``nodes_in_partition`` is less than 1
                * governor manager has not been launched in Slurm allocation
                * missing nodes in Slurm allocation
            InternalError: when
                * missing ZMQ interface in governor manager
        """
        _logger.info('setup allocation split into partitions by %s nodes',
                     nodes_in_partition)

        if nodes_in_partition < 1:
            _logger.error(
                'Failed to partition resources - partition size must be greater than or equal to 1')
            raise InvalidRequest(
                'Failed to partition resources - partition size must be greater than or equal to 1')

        # get slurm resources - currently only slurm is supported
        if not in_slurm_allocation():
            _logger.error(
                'Failed to partition resources - partitioning resources is currently available only within '
                'slurm allocation')
            raise InvalidRequest(
                'Failed to partition resources - partitioning resources is currently available only '
                'within slurm allocation')

        if not self.zmq_address:
            _logger.error(
                'Failed to partition resources - missing zmq interface address'
            )
            raise InternalError(
                'Failed to partition resources - missing zmq interface address'
            )

        slurm_resources = parse_slurm_resources(self._config)

        if slurm_resources.total_nodes < 1:
            raise InvalidRequest(
                'Failed to partition resources - allocation contains no nodes')

        npartitions = math.ceil(slurm_resources.total_nodes /
                                nodes_in_partition)
        _logger.info(
            '%s partitions will be created (in allocation containing %s total nodes)',
            npartitions, slurm_resources.total_nodes)

        # launch partition manager in the same directory as governor
        partition_manager_wdir = Config.EXECUTOR_WD.get(self._config)
        partition_manager_auxdir = Config.AUX_DIR.get(self._config)

        _logger.debug('partition managers working directory %s',
                      partition_manager_wdir)

        for part_idx in range(npartitions):
            _logger.debug('creating partition manager %s configuration',
                          part_idx)

            part_node = slurm_resources.nodes[part_idx * nodes_in_partition]

            _logger.debug('partition manager node %s', part_node.name)

            self._partition_managers.append(
                PartitionManager('partition-{}'.format(part_idx),
                                 part_node.name,
                                 part_idx * nodes_in_partition,
                                 min((part_idx + 1) * nodes_in_partition,
                                     slurm_resources.total_nodes),
                                 partition_manager_wdir,
                                 self.zmq_address,
                                 partition_manager_auxdir,
                                 self._config))

            _logger.debug('partition manager %s configuration created',
                          part_idx)

        self._min_scheduling_managers = len(self._partition_managers)

        _logger.info(
            'created partition managers configuration and set minimum scheduling managers to %s',
            self._min_scheduling_managers)

        asyncio.ensure_future(self.manage_start_partition_managers())

        # launch a background task that schedules buffered submit requests
        self._schedule_buffered_jobs_task = asyncio.ensure_future(
            self._schedule_buffered_jobs())
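
The partition boundaries computed above follow directly from the ceiling division: with ceil(total_nodes / nodes_in_partition) partitions, partition i covers nodes [i * size, min((i + 1) * size, total_nodes)). A standalone check of that arithmetic (partition_bounds is a hypothetical helper):

import math

def partition_bounds(total_nodes, nodes_in_partition):
    npartitions = math.ceil(total_nodes / nodes_in_partition)
    return [(i * nodes_in_partition,
             min((i + 1) * nodes_in_partition, total_nodes))
            for i in range(npartitions)]

print(partition_bounds(10, 4))  # [(0, 4), (4, 8), (8, 10)]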
Example #9
    async def _register_in_parent(self):
        """Register governor manager in parent instances - currently not supported."""
        _logger.error(
            'Governing managers cannot currently register in parent managers')
        raise InternalError(
            'Governing managers cannot currently register in parent managers')