Esempio n. 1
0
    def _validate_response(response):
        """Validate the response from the QCG PJM.

        This method checks the format of the response and exit code.

        Args:
            response (dict): deserialized JSON response

        Returns:
            dict: validated response data

        Raises:
            InternalError: in case the response format is invalid
            ConnectionError: in case of non zero exit code
        """
        if not isinstance(response, dict) or 'code' not in response:
            raise errors.InternalError('Invalid reply from the service')

        if response['code'] != 0:
            if 'message' in response:
                raise errors.ConnectionError('Request failed - {}'.format(
                    response['message']))

            raise errors.ConnectionError('Request failed')

        if 'data' not in response:
            raise errors.InternalError('Invalid reply from the service')

        return response['data']
Esempio n. 2
0
    def _assure_connected(self):
        """Check if connection has been successfully opened.

        Raises:
            ConnectionError: if connection has not been established yet
        """
        if not self._connected:
            raise errors.ConnectionError('Not connected')
Esempio n. 3
0
    def wait4(self, names):
        """Wait for finish of specific jobs.

        This method waits until all specified jobs finish its execution (successfully or not).
        The QCG-PilotJob manager is periodically polled about status of not finished jobs. The poll interval (2 sec by
        default) can be changed by defining a 'poll_delay' key with appropriate value (in seconds) in
        configuration of instance.

        Args:
            names (str|list(str)): name, or list of names, of the jobs to wait for

        Returns:
            dict - a map with job names and their terminal status

        Raises:
            ConnectionError: on any failure while polling; this includes errors raised by the status request as
                well as a malformed job entry in the reply (the underlying exception is attached as the cause)
        """
        job_names = [names] if isinstance(names, str) else list(names)

        _logger.info("waiting for finish of %d jobs", len(job_names))

        result = {}
        not_finished = job_names
        while not_finished:
            try:
                jobs_status = self.status(not_finished)

                # Rebuild the pending list from scratch on every poll.
                not_finished = []
                for job_name, job_data in jobs_status['jobs'].items():
                    # Check that the keys exist *before* reading them, so that a malformed entry is reported
                    # with the descriptive message below rather than a bare KeyError.
                    if ('data' not in job_data
                            or 'status' not in job_data
                            or job_data['status'] != 0
                            or 'status' not in job_data['data']):
                        raise errors.InternalError(
                            "Missing job's {} data".format(job_name))

                    job_state = job_data['data']['status']
                    if Manager.is_status_finished(job_state):
                        result[job_name] = job_state
                    else:
                        not_finished.append(job_name)

                if not_finished:
                    _logger.info("still %d jobs not finished",
                                 len(not_finished))
                    time.sleep(self._poll_delay)

            except Exception as exc:
                # Exceptions created without arguments have an empty ``args`` tuple; fall back to the textual
                # form so the handler itself never fails with IndexError.
                reason = exc.args[0] if exc.args else str(exc)
                raise errors.ConnectionError(reason) from exc

        _logger.info("all jobs finished")
        return result
Esempio n. 4
0
    def _disconnect(self):
        """Close connection to the QCG-PJM

        Raises:
            ConnectionError: if there was an error during closing the connection.
        """
        try:
            if self._connected:
                self._zmq_socket.close()
                self._connected = False
        except Exception as exc:
            raise errors.ConnectionError('Failed to disconnect {}'.format(
                exc.args[0]))
Esempio n. 5
0
    def _connect(self):
        """Connect to the QCG-PJM.

        Any previously opened connection is closed first. The connection is then made to the address stored in
        the instance. The success of this method does not mean that communication with QCG-PilotJob manager
        instance has been established, as in case of ZMQ communication, only when sending and receiving messages
        the real communication takes place.

        Raises:
            ConnectionError: in case of error during establishing connection (or while closing a previous one).
        """
        self._disconnect()

        _logger.info("connecting to the PJM @ %s", self._address)
        try:
            self._zmq_socket = self._zmq_ctx.socket(zmq.REQ)  # pylint: disable=maybe-no-member
            self._zmq_socket.connect(self._address)
            self._connected = True
            _logger.info("connection established")
        except Exception as exc:
            # Not every exception carries positional arguments, so ``exc.args[0]`` alone could raise IndexError
            # here and hide the real failure; fall back to the textual form of the exception.
            reason = exc.args[0] if exc.args else str(exc)
            raise errors.ConnectionError('Failed to connect to {} - {}'.format(
                self._address, reason)) from exc
Esempio n. 6
0
    def __init__(self, server_args=None, cfg=None):
        """Initialize instance.

        Launch QCG-PilotJob manager instance in a background process and connect to it. The port number for ZMQ
        interface of QCG-PilotJob manager instance is randomly selected.

        Note that this constructor forces the multiprocessing start method to "fork" for the whole interpreter.

        Args:
            server_args (list(str)): the command line arguments for QCG-PilotJob manager instance; the passed
                list is not modified

                  --net                 enable network interface
                  --net-port NET_PORT   port to listen for network interface (implies --net)
                  --net-port-min NET_PORT_MIN
                                        minimum port range to listen for network interface if
                                        exact port number is not defined (implies --net)
                  --net-port-max NET_PORT_MAX
                                        maximum port range to listen for network interface if
                                        exact port number is not defined (implies --net)
                  --file                enable file interface
                  --file-path FILE_PATH
                                        path to the request file (implies --file)
                  --wd WD               working directory for the service
                  --envschema ENVSCHEMA
                                        job environment schema [auto|slurm]
                  --resources RESOURCES
                                        source of information about available resources
                                        [auto|slurm|local] as well as a method of job
                                        execution (through local processes or as a Slurm sub
                                        jobs)
                  --report-format REPORT_FORMAT
                                        format of job report file [text|json]
                  --report-file REPORT_FILE
                                        name of the job report file
                  --nodes NODES         configuration of available resources (implies
                                        --resources local)
                  --log {critical,error,warning,info,debug,notset}
                                        log level
                  --system-core         reserve one of the core for the QCG-PJM
                  --disable-nl          disable custom launching method
                  --show-progress       print information about executing tasks
                  --governor            run manager in the governor mode, where jobs will be
                                        scheduled to execute to the dependant managers
                  --parent PARENT       address of the parent manager, current instance will
                                        receive jobs from the parent manager
                  --id ID               optional manager instance identifier - will be
                                        generated automatically when not defined
                  --tags TAGS           optional manager instance tags separated by commas
                  --slurm-partition-nodes SLURM_PARTITION_NODES
                                        split Slurm allocation by given number of nodes, where
                                        each group will be controlled by separate manager
                                        (implies --governor)
                  --slurm-limit-nodes-range-begin SLURM_LIMIT_NODES_RANGE_BEGIN
                                        limit Slurm allocation to specified range of nodes
                                        (starting node)
                  --slurm-limit-nodes-range-end SLURM_LIMIT_NODES_RANGE_END
                                        limit Slurm allocation to specified range of nodes
                                        (ending node)

                each command line argument and (optionally) its value should be passed as separate entry in the
                list; the '--net' argument is added automatically when missing

            cfg (dict) - the configuration; currently the following keys are supported:
              'init_timeout' - the timeout (in seconds) client should wait for QCG-PilotJob manager start until it raise
                 error, 300 by default
              'poll_delay' - the delay between following status polls in wait methods
              'log_file' - the location of the log file
              'log_level' - the log level ('DEBUG'); by default the log level is set to INFO

        Raises:
            ServiceError: if the qcg.pilotjob library cannot be imported, or the service process did not start
                properly (died, did not report its configuration in time, or reported an error)
            ConnectionError: if the started service did not report any ZMQ interface address
        """
        client_cfg = cfg or {}
        self._setup_logging(client_cfg)

        _logger.debug('initializing MP start method with "fork"')
        mp.set_start_method("fork", force=True)
        mp.freeze_support()

        if LocalManager.is_notebook():
            _logger.debug(
                'Creating a new event loop due to run in an interactive environment'
            )
            import asyncio
            asyncio.set_event_loop(asyncio.new_event_loop())

        try:
            from qcg.pilotjob.service import QCGPMServiceProcess
        except ImportError as exc:
            raise errors.ServiceError('qcg.pilotjob library is not available') from exc

        # Work on a private copy so that appending '--net' never modifies the list owned by the caller.
        server_args = list(server_args) if server_args else []
        if '--net' not in server_args:
            server_args.append('--net')

        server_args = [str(arg) for arg in server_args]

        self.qcgpm_queue = mp.Queue()
        self.qcgpm_process = QCGPMServiceProcess(server_args, self.qcgpm_queue)
        self.qcgpm_conf = None
        _logger.debug('manager process created')

        self.qcgpm_process.start()
        _logger.debug('manager process started')

        try:
            # timeout of single iteration
            wait_single_timeout = 2
            # number of iterations
            wait_iters = int(
                client_cfg.get('init_timeout', 300) / wait_single_timeout) + 1

            _logger.debug(
                f'waiting {wait_iters * wait_single_timeout} secs for service start ...'
            )
            service_wait_start = datetime.now()

            for _ in range(wait_iters):
                if not self.qcgpm_process.is_alive():
                    raise errors.ServiceError('Service not started')

                try:
                    self.qcgpm_conf = self.qcgpm_queue.get(
                        block=True, timeout=wait_single_timeout)
                    break
                except queue.Empty:
                    # Nothing reported yet - re-check that the process is alive and poll again.
                    continue
                except Exception as exc:
                    raise errors.ServiceError('Service not started: {}'.format(
                        str(exc))) from exc

            if not self.qcgpm_conf:
                raise errors.ServiceError('Service not started')

            if self.qcgpm_conf.get('error', None):
                raise errors.ServiceError(self.qcgpm_conf['error'])

            _logger.info(
                f'service started after {(datetime.now() - service_wait_start).total_seconds()} secs'
            )

            _logger.debug('got manager configuration: %s', str(self.qcgpm_conf))
            # Checked inside the guarded section so that the already started service process is terminated
            # (instead of being left running) when it exposes no network interface to connect to.
            if not self.qcgpm_conf.get('zmq_addresses', None):
                raise errors.ConnectionError(
                    'Missing QCGPM network interface address')
        except Exception:
            if self.qcgpm_process:
                try:
                    _logger.debug(
                        'killing pilotjob service process as not started properly'
                    )
                    self.qcgpm_process.terminate()
                except Exception:
                    # Best effort only: a failed cleanup must not replace the original error re-raised below.
                    _logger.exception('failed to kill pilotjob service')
            raise

        zmq_iface_address = self.qcgpm_conf['zmq_addresses'][0]
        _logger.info('manager zmq iface address: %s', zmq_iface_address)

        super(LocalManager, self).__init__(zmq_iface_address, cfg)