Esempio n. 1
0
    def _payload_template(self):
        """Build the skeleton payload used when talking to the server.

        Returns
        -------
        dict
            A dictionary with two keys: ``"meta"``, holding the manager's name
            data together with version, user, and pull information, and
            ``"data"``, an empty dictionary for the caller to fill in.
        """
        # Work on a copy so that self.name_data itself is never modified
        meta = dict(self.name_data)
        meta.update(
            # Version info
            qcengine_version=qcng.__version__,
            manager_version=get_information("version"),
            # User info
            username=self.client.username,
            # Pull info
            programs=self.available_programs,
            procedures=self.available_procedures,
            tag=self.queue_tag,
        )

        payload = {"meta": meta, "data": {}}
        return payload
Esempio n. 2
0
    def _payload_template(self):
        """Build the skeleton payload used when talking to the server.

        Returns
        -------
        dict
            A dictionary with two keys: ``"meta"``, holding the manager's name
            data together with version, user, pull, and statistics
            information, and ``"data"``, an empty dictionary for the caller to
            fill in.
        """
        # Work on a copy so that self.name_data itself is never modified
        meta = dict(self.name_data)

        # Version info
        meta["qcengine_version"] = qcng.__version__
        meta["manager_version"] = get_information("version")

        # User info
        meta["username"] = self.client.username

        # Pull info
        meta["programs"] = self.available_programs
        meta["procedures"] = self.available_procedures
        meta["tag"] = self.queue_tag

        # Statistics (note: "active_tasks" is read from active_task_slots)
        stats = self.statistics
        meta["total_worker_walltime"] = stats.total_worker_walltime
        meta["total_task_walltime"] = stats.total_task_walltime
        meta["active_tasks"] = stats.active_task_slots
        meta["active_cores"] = stats.active_cores
        meta["active_memory"] = stats.active_memory

        payload = {"meta": meta, "data": {}}
        return payload
Esempio n. 3
0
    def __init__(self,
                 client: Any,
                 queue_client: Any,
                 logger: Optional[logging.Logger] = None,
                 max_tasks: int = 200,
                 queue_tag: str = None,
                 manager_name: str = "unlabeled",
                 update_frequency: Union[int, float] = 2,
                 verbose: bool = True,
                 server_error_retries: Optional[int] = 1,
                 stale_update_limit: Optional[int] = 10,
                 cores_per_task: Optional[int] = None,
                 memory_per_task: Optional[Union[int, float]] = None,
                 scratch_directory: Optional[str] = None,
                 retries: Optional[int] = 2):
        """
        Configure the manager, build its queue adapter, log a startup report and,
        when connected, register with the server.

        Parameters
        ----------
        client : FractalClient
            A FractalClient connected to a server
        queue_client : QueueAdapter
            The object handed to ``build_queue_adapter`` to create the queue adapter
        logger : logging.Logger, optional, Default: None
            A logger for the QueueManager; when not given, the logger named
            'QueueManager' is used
        max_tasks : int
            The maximum number of tasks to hold at any given time. Reduced to the
            server's query limit when it exceeds that limit.
        queue_tag : str
            Allows managers to pull from specific tags
        manager_name : str
            The cluster the manager belongs to
        update_frequency : int or float
            The frequency to check for new tasks in seconds
        verbose : bool, optional, Default: True
            Whether to log the detailed startup report (name, QCEngine and
            connection details); the flag is also forwarded to the queue adapter
        server_error_retries : int, optional, Default: 1
            How many times finished jobs are attempted to be pushed to the server
            in the event of a server communication error.
            After this number of attempts, the failed jobs are dropped from this
            manager and considered "stale".
            Set to `None` to keep retrying
        stale_update_limit : int, optional, Default: 10
            Number of stale update attempts to keep around.
            If this limit is ever hit, a shutdown is initiated as gracefully as
            possible, since communication with the server has gone wrong too
            many times.
            Set to `None` for unlimited
        cores_per_task : int, optional, Default: None
            How many CPU cores per computation task to allocate for QCEngine
            None indicates "use however many you can detect"
        memory_per_task : int, optional, Default: None
            How much memory, in GiB, per computation task to allocate for QCEngine
            None indicates "use however much you can consume"
        scratch_directory : str, optional, Default: None
            Scratch directory location to do QCEngine compute
            None indicates "wherever the system default is"
        retries : int, optional, Default: 2
            Number of retries that QCEngine will attempt for RandomErrors detected
            when running its computations. After this many attempts (or on any
            other type of error), the error will be raised.
        """

        # Logging: use the supplied logger, otherwise the shared named one
        self.logger = logger if logger else logging.getLogger('QueueManager')
        info = self.logger.info

        # Identity of this manager: cluster label, host, and a fresh UUID
        cluster = manager_name
        hostname = socket.gethostname()
        unique_id = str(uuid.uuid4())
        self.name_data = {"cluster": cluster, "hostname": hostname, "uuid": unique_id}
        self._name = cluster + "-" + hostname + "-" + unique_id

        # Compute resources and the adapter that uses them
        self.client = client
        self.cores_per_task = cores_per_task
        self.memory_per_task = memory_per_task
        self.scratch_directory = scratch_directory
        self.retries = retries
        adapter_options = {
            "logger": self.logger,
            "cores_per_task": self.cores_per_task,
            "memory_per_task": self.memory_per_task,
            "scratch_directory": self.scratch_directory,
            "retries": self.retries,
            "verbose": verbose,
        }
        self.queue_adapter = build_queue_adapter(queue_client, **adapter_options)

        self.max_tasks = max_tasks
        self.queue_tag = queue_tag
        self.verbose = verbose
        # The statistics are created with the max_tasks value as requested,
        # i.e. before any reduction to the server's query limit below
        self.statistics = QueueStatistics(
            max_concurrent_tasks=self.max_tasks,
            cores_per_task=cores_per_task,
            update_frequency=update_frequency,
        )

        # Scheduling state
        self.scheduler = None
        self.update_frequency = update_frequency
        self.periodic = {}
        self.active = 0
        self.exit_callbacks = []

        # Server response/stale job handling
        self.server_error_retries = server_error_retries
        self.stale_update_limit = stale_update_limit
        self._stale_updates_tracked = 0
        self._stale_payload_tracking = []
        self.n_stale_jobs = 0

        # QCEngine data
        self.available_programs = qcng.list_available_programs()
        self.available_procedures = qcng.list_available_procedures()

        # Startup report
        info("QueueManager:")
        info("    Version:         {}\n".format(get_information("version")))

        if self.verbose:
            info("    Name Information:")
            info("        Cluster:     {}".format(self.name_data["cluster"]))
            info("        Hostname:    {}".format(self.name_data["hostname"]))
            info("        UUID:        {}\n".format(self.name_data["uuid"]))

        info("    Queue Adapter:")
        info("        {}\n".format(self.queue_adapter))

        if self.verbose:
            info("    QCEngine:")
            info("        Version:     {}".format(qcng.__version__))
            info("        Task Cores:  {}".format(self.cores_per_task))
            info("        Task Mem:    {}".format(self.memory_per_task))
            info("        Scratch Dir: {}".format(self.scratch_directory))
            info("        Programs:    {}".format(self.available_programs))
            info("        Procedures:  {}\n".format(self.available_procedures))

        # Without a connection there is nothing to pull or announce; the
        # server_* attributes and heartbeat_frequency are left unset
        if not self.connected():
            info("    QCFractal server information:")
            info("        Not connected, some actions will not be available")
            return

        # Pull server info
        self.server_info = client.server_information()
        self.server_name = self.server_info["name"]
        self.server_version = self.server_info["version"]
        self.server_query_limit = self.server_info["query_limit"]
        if self.max_tasks > self.server_query_limit:
            # Never ask the server for more tasks than it allows per query
            self.max_tasks = self.server_query_limit
            self.logger.warning(
                "Max tasks was larger than server query limit of {}, reducing to match query limit.".format(
                    self.server_query_limit))
        self.heartbeat_frequency = self.server_info["heartbeat_frequency"]

        # Tell the server we are up and running
        startup_payload = self._payload_template()
        startup_payload["data"]["operation"] = "startup"
        self.client._automodel_request("queue_manager", "put", startup_payload)

        if self.verbose:
            info("    Connected:")
            info("        Version:     {}".format(self.server_version))
            info("        Address:     {}".format(self.client.address))
            info("        Name:        {}".format(self.server_name))
            info("        Queue tag:   {}".format(self.queue_tag))
            info("        Username:    {}\n".format(self.client.username))
Esempio n. 4
0
    def __init__(self,
                 client: Any,
                 queue_client: Any,
                 logger: Optional[logging.Logger] = None,
                 max_tasks: int = 200,
                 queue_tag: Optional[str] = None,
                 manager_name: str = "unlabled",
                 update_frequency: Union[int, float] = 2,
                 verbose: bool = True,
                 cores_per_task: Optional[int] = None,
                 memory_per_task: Optional[Union[int, float]] = None,
                 scratch_directory: Optional[str] = None):
        """
        Configure the manager, build its queue adapter, log a startup report and,
        when connected, register with the server.

        Parameters
        ----------
        client : FractalClient
            A FractalClient connected to a server
        queue_client : QueueAdapter
            The object handed to ``build_queue_adapter`` to create the queue adapter
        logger : logging.Logger, optional, Default: None
            A logger for the QueueManager; when not given, the logger named
            'QueueManager' is used
        max_tasks : int
            The maximum number of tasks to hold at any given time. Reduced to the
            server's query limit when it exceeds that limit.
        queue_tag : str, optional, Default: None
            Allows managers to pull from specific tags
        manager_name : str
            The cluster the manager belongs to. The default is the literal
            string "unlabled" (sic), kept as-is for backward compatibility.
        update_frequency : int or float
            The frequency to check for new tasks in seconds
        verbose : bool, optional, Default: True
            Whether to log the detailed startup report (name, QCEngine and
            connection details)
        cores_per_task : int, optional, Default: None
            How many CPU cores per computation task to allocate for QCEngine
            None indicates "use however many you can detect"
        memory_per_task : int, optional, Default: None
            How much memory, in GiB, per computation task to allocate for QCEngine
            None indicates "use however much you can consume"
        scratch_directory : str, optional, Default: None
            Scratch directory location to do QCEngine compute
            None indicates "wherever the system default is"
        """

        # Setup logging
        if logger:
            self.logger = logger
        else:
            self.logger = logging.getLogger('QueueManager')

        # Identity of this manager: cluster label, host, and a fresh UUID
        self.name_data = {
            "cluster": manager_name,
            "hostname": socket.gethostname(),
            "uuid": str(uuid.uuid4())
        }
        self._name = self.name_data["cluster"] + "-" + self.name_data[
            "hostname"] + "-" + self.name_data["uuid"]

        self.client = client
        self.cores_per_task = cores_per_task
        self.memory_per_task = memory_per_task
        self.scratch_directory = scratch_directory
        self.queue_adapter = build_queue_adapter(
            queue_client,
            logger=self.logger,
            cores_per_task=self.cores_per_task,
            memory_per_task=self.memory_per_task,
            scratch_directory=self.scratch_directory)
        self.max_tasks = max_tasks
        self.queue_tag = queue_tag
        self.verbose = verbose

        # Scheduling state
        self.scheduler = None
        self.update_frequency = update_frequency
        self.periodic = {}
        self.active = 0
        self.exit_callbacks = []

        # QCEngine data
        self.available_programs = qcng.list_available_programs()
        self.available_procedures = qcng.list_available_procedures()

        self.logger.info("QueueManager:")
        self.logger.info("    Version:         {}\n".format(
            get_information("version")))

        if self.verbose:
            self.logger.info("    Name Information:")
            self.logger.info("        Cluster:     {}".format(
                self.name_data["cluster"]))
            self.logger.info("        Hostname:    {}".format(
                self.name_data["hostname"]))
            self.logger.info("        UUID:        {}\n".format(
                self.name_data["uuid"]))

        self.logger.info("    Queue Adapter:")
        self.logger.info("        {}\n".format(self.queue_adapter))

        if self.verbose:
            self.logger.info("    QCEngine:")
            self.logger.info("        Version:     {}".format(
                qcng.__version__))
            self.logger.info("        Task Cores:  {}".format(
                self.cores_per_task))
            self.logger.info("        Task Mem:    {}".format(
                self.memory_per_task))
            self.logger.info("        Scratch Dir: {}".format(
                self.scratch_directory))
            self.logger.info("        Programs:    {}".format(
                self.available_programs))
            self.logger.info("        Procedures:  {}\n".format(
                self.available_procedures))

        # DGAS Note: Not super happy about how this if/else turned out. Looking for alternatives.
        # Note that the server_* attributes and heartbeat_frequency are only
        # set in the connected branch.
        if self.connected():
            # Pull server info
            self.server_info = client.server_information()
            self.server_name = self.server_info["name"]
            self.server_version = self.server_info["version"]
            self.server_query_limit = self.server_info["query_limit"]
            if self.max_tasks > self.server_query_limit:
                # Never ask the server for more tasks than it allows per query
                self.max_tasks = self.server_query_limit
                self.logger.warning(
                    "Max tasks was larger than server query limit of {}, reducing to match query limit."
                    .format(self.server_query_limit))
            self.heartbeat_frequency = self.server_info["heartbeat_frequency"]

            # Tell the server we are up and running; the reply is not needed
            payload = self._payload_template()
            payload["data"]["operation"] = "startup"

            self.client._automodel_request("queue_manager", "put", payload)

            if self.verbose:
                self.logger.info("    Connected:")
                self.logger.info("        Version:     {}".format(
                    self.server_version))
                self.logger.info("        Address:     {}".format(
                    self.client.address))
                self.logger.info("        Name:        {}".format(
                    self.server_name))
                self.logger.info("        Queue tag:   {}".format(
                    self.queue_tag))
                self.logger.info("        Username:    {}\n".format(
                    self.client.username))

        else:
            self.logger.info("    QCFractal server information:")
            self.logger.info(
                "        Not connected, some actions will not be available")