def _payload_template(self):
    """Return a fresh request payload pre-filled with this manager's metadata.

    The ``meta`` section combines the entries of ``self.name_data`` with
    version, user, and pull information. The ``data`` section starts out as
    an empty dictionary for the caller to populate.

    Returns
    -------
    dict
        A new ``{"meta": ..., "data": {}}`` dictionary.
    """
    # Work on a shallow copy so ``self.name_data`` itself is never mutated.
    meta = dict(self.name_data)
    meta.update({
        # Version info
        "qcengine_version": qcng.__version__,
        "manager_version": get_information("version"),
        # User info
        "username": self.client.username,
        # Pull info
        "programs": self.available_programs,
        "procedures": self.available_procedures,
        "tag": self.queue_tag,
    })
    return {"meta": meta, "data": {}}
def _payload_template(self):
    """Assemble the skeleton payload, including current usage statistics.

    The ``meta`` section is seeded from ``self.name_data`` and then extended
    with version, user, pull, and statistics entries read from
    ``self.statistics``. The ``data`` section is returned empty for the
    caller to fill in.

    Returns
    -------
    dict
        A new ``{"meta": ..., "data": {}}`` dictionary.
    """
    # Shallow copy: later assignments must not leak into ``self.name_data``.
    meta = dict(self.name_data)

    # Version info
    meta["qcengine_version"] = qcng.__version__
    meta["manager_version"] = get_information("version")

    # User info
    meta["username"] = self.client.username

    # Pull info
    meta["programs"] = self.available_programs
    meta["procedures"] = self.available_procedures
    meta["tag"] = self.queue_tag

    # Statistics
    stats = self.statistics
    meta["total_worker_walltime"] = stats.total_worker_walltime
    meta["total_task_walltime"] = stats.total_task_walltime
    meta["active_tasks"] = stats.active_task_slots
    meta["active_cores"] = stats.active_cores
    meta["active_memory"] = stats.active_memory

    return {"meta": meta, "data": {}}
def __init__(self,
             client: Any,
             queue_client: Any,
             logger: Optional[logging.Logger] = None,
             max_tasks: int = 200,
             queue_tag: Optional[str] = None,
             manager_name: str = "unlabeled",
             update_frequency: Union[int, float] = 2,
             verbose: bool = True,
             server_error_retries: Optional[int] = 1,
             stale_update_limit: Optional[int] = 10,
             cores_per_task: Optional[int] = None,
             memory_per_task: Optional[Union[int, float]] = None,
             scratch_directory: Optional[str] = None,
             retries: Optional[int] = 2):
    """Configure the manager, build its queue adapter, and announce it to the server.

    Parameters
    ----------
    client : FractalClient
        A FractalClient connected to a server
    queue_client : QueueAdapter
        The object handed to ``build_queue_adapter`` to create the queue adapter
    logger : logging.Logger, optional, Default: None
        A logger for the QueueManager. When not given, the logger named
        ``'QueueManager'`` is used.
    max_tasks : int
        The maximum number of tasks to hold at any given time. Reduced to the
        server's query limit when connected and the limit is smaller.
    queue_tag : str, optional, Default: None
        Allows managers to pull from specific tags
    manager_name : str
        The cluster the manager belongs to
    update_frequency : int or float
        The frequency to check for new tasks in seconds
    verbose : bool, optional, Default: True
        Whether or not to have the manager be verbose (logger level debug and up).
        Also forwarded to the queue adapter.
    server_error_retries : int, optional, Default: 1
        How many times finished jobs are attempted to be pushed to the server in
        the event of a server communication error.
        After this number of attempts, the failed jobs are dropped from this
        manager and considered "stale".
        Set to `None` to keep retrying
    stale_update_limit : int, optional, Default: 10
        Number of stale update attempts to keep around.
        If this limit is ever hit, a shutdown is initiated as best it can since
        communication with the server has gone wrong too many times.
        Set to `None` for unlimited
    cores_per_task : int, optional, Default: None
        How many CPU cores per computation task to allocate for QCEngine
        None indicates "use however many you can detect"
    memory_per_task : int or float, optional, Default: None
        How much memory, in GiB, per computation task to allocate for QCEngine
        None indicates "use however much you can consume"
    scratch_directory : str, optional, Default: None
        Scratch directory location to do QCEngine compute
        None indicates "wherever the system default is"
    retries : int, optional, Default: 2
        Number of retries that QCEngine will attempt for RandomErrors detected when
        running its computations. After this many attempts (or on any other type of
        error), the error will be raised.
    """

    # Setup logging
    self.logger = logger or logging.getLogger('QueueManager')

    # Identity of this manager; the UUID makes every instance's name unique.
    self.name_data = {"cluster": manager_name, "hostname": socket.gethostname(), "uuid": str(uuid.uuid4())}
    self._name = self.name_data["cluster"] + "-" + self.name_data["hostname"] + "-" + self.name_data["uuid"]

    self.client = client
    self.cores_per_task = cores_per_task
    self.memory_per_task = memory_per_task
    self.scratch_directory = scratch_directory
    self.retries = retries
    self.queue_adapter = build_queue_adapter(queue_client,
                                             logger=self.logger,
                                             cores_per_task=self.cores_per_task,
                                             memory_per_task=self.memory_per_task,
                                             scratch_directory=self.scratch_directory,
                                             retries=self.retries,
                                             verbose=verbose)
    self.max_tasks = max_tasks
    self.queue_tag = queue_tag
    self.verbose = verbose

    # NOTE(review): the statistics object receives ``max_tasks`` before it is
    # possibly reduced to the server query limit below -- confirm whether
    # ``max_concurrent_tasks`` should track the reduced value.
    self.statistics = QueueStatistics(max_concurrent_tasks=self.max_tasks,
                                      cores_per_task=cores_per_task,
                                      update_frequency=update_frequency)

    self.scheduler = None
    self.update_frequency = update_frequency
    self.periodic = {}
    self.active = 0
    self.exit_callbacks = []

    # Server response/stale job handling
    self.server_error_retries = server_error_retries
    self.stale_update_limit = stale_update_limit
    self._stale_updates_tracked = 0
    self._stale_payload_tracking = []
    self.n_stale_jobs = 0

    # QCEngine data
    self.available_programs = qcng.list_available_programs()
    self.available_procedures = qcng.list_available_procedures()

    self.logger.info("QueueManager:")
    self.logger.info(" Version: {}\n".format(get_information("version")))

    if self.verbose:
        self.logger.info(" Name Information:")
        self.logger.info(" Cluster: {}".format(self.name_data["cluster"]))
        self.logger.info(" Hostname: {}".format(self.name_data["hostname"]))
        self.logger.info(" UUID: {}\n".format(self.name_data["uuid"]))

    self.logger.info(" Queue Adapter:")
    self.logger.info(" {}\n".format(self.queue_adapter))

    if self.verbose:
        self.logger.info(" QCEngine:")
        self.logger.info(" Version: {}".format(qcng.__version__))
        self.logger.info(" Task Cores: {}".format(self.cores_per_task))
        self.logger.info(" Task Mem: {}".format(self.memory_per_task))
        self.logger.info(" Scratch Dir: {}".format(self.scratch_directory))
        self.logger.info(" Programs: {}".format(self.available_programs))
        self.logger.info(" Procedures: {}\n".format(self.available_procedures))

    # DGAS Note: Not super happy about how this if/else turned out. Looking for alternatives.
    if self.connected():
        # Pull server info
        self.server_info = client.server_information()
        self.server_name = self.server_info["name"]
        self.server_version = self.server_info["version"]
        self.server_query_limit = self.server_info["query_limit"]
        if self.max_tasks > self.server_query_limit:
            self.max_tasks = self.server_query_limit
            self.logger.warning(
                "Max tasks was larger than server query limit of {}, reducing to match query limit.".format(
                    self.server_query_limit))
        self.heartbeat_frequency = self.server_info["heartbeat_frequency"]

        # Tell the server we are up and running
        payload = self._payload_template()
        payload["data"]["operation"] = "startup"
        self.client._automodel_request("queue_manager", "put", payload)

        if self.verbose:
            self.logger.info(" Connected:")
            self.logger.info(" Version: {}".format(self.server_version))
            self.logger.info(" Address: {}".format(self.client.address))
            self.logger.info(" Name: {}".format(self.server_name))
            self.logger.info(" Queue tag: {}".format(self.queue_tag))
            self.logger.info(" Username: {}\n".format(self.client.username))

    else:
        # The ``server_*`` and ``heartbeat_frequency`` attributes are only
        # assigned in the connected branch above.
        self.logger.info(" QCFractal server information:")
        self.logger.info(" Not connected, some actions will not be available")
def __init__(self,
             client: Any,
             queue_client: Any,
             logger: Optional[logging.Logger] = None,
             max_tasks: int = 200,
             queue_tag: Optional[str] = None,
             manager_name: str = "unlabled",
             update_frequency: Union[int, float] = 2,
             verbose: bool = True,
             cores_per_task: Optional[int] = None,
             memory_per_task: Optional[Union[int, float]] = None,
             scratch_directory: Optional[str] = None):
    """Configure the manager, build its queue adapter, and announce it to the server.

    Parameters
    ----------
    client : FractalClient
        A FractalClient connected to a server
    queue_client : QueueAdapter
        The object handed to ``build_queue_adapter`` to create the queue adapter
    logger : logging.Logger, optional, Default: None
        A logger for the QueueManager. When not given, the logger named
        ``'QueueManager'`` is used.
    max_tasks : int
        The maximum number of tasks to hold at any given time. Reduced to the
        server's query limit when connected and the limit is smaller.
    queue_tag : str, optional, Default: None
        Allows managers to pull from specific tags
    manager_name : str
        The cluster the manager belongs to. The default value's spelling
        ("unlabled") is kept as-is because it is part of the runtime name.
    update_frequency : int or float
        The frequency to check for new tasks in seconds
    verbose : bool, optional, Default: True
        When True, additional name, QCEngine, and connection details are logged
        during start-up.
    cores_per_task : int, optional, Default: None
        How many CPU cores per computation task to allocate for QCEngine
        None indicates "use however many you can detect"
    memory_per_task : int or float, optional, Default: None
        How much memory, in GiB, per computation task to allocate for QCEngine
        None indicates "use however much you can consume"
    scratch_directory : str, optional, Default: None
        Scratch directory location to do QCEngine compute
        None indicates "wherever the system default is"
    """

    # Setup logging
    self.logger = logger or logging.getLogger('QueueManager')

    # Identity of this manager; the UUID makes every instance's name unique.
    self.name_data = {
        "cluster": manager_name,
        "hostname": socket.gethostname(),
        "uuid": str(uuid.uuid4()),
    }
    self._name = self.name_data["cluster"] + "-" + self.name_data["hostname"] + "-" + self.name_data["uuid"]

    self.client = client
    self.cores_per_task = cores_per_task
    self.memory_per_task = memory_per_task
    self.scratch_directory = scratch_directory
    self.queue_adapter = build_queue_adapter(queue_client,
                                             logger=self.logger,
                                             cores_per_task=self.cores_per_task,
                                             memory_per_task=self.memory_per_task,
                                             scratch_directory=self.scratch_directory)
    self.max_tasks = max_tasks
    self.queue_tag = queue_tag
    self.verbose = verbose

    self.scheduler = None
    self.update_frequency = update_frequency
    self.periodic = {}
    self.active = 0
    self.exit_callbacks = []

    # QCEngine data
    self.available_programs = qcng.list_available_programs()
    self.available_procedures = qcng.list_available_procedures()

    self.logger.info("QueueManager:")
    self.logger.info(" Version: {}\n".format(get_information("version")))

    if self.verbose:
        self.logger.info(" Name Information:")
        self.logger.info(" Cluster: {}".format(self.name_data["cluster"]))
        self.logger.info(" Hostname: {}".format(self.name_data["hostname"]))
        self.logger.info(" UUID: {}\n".format(self.name_data["uuid"]))

    self.logger.info(" Queue Adapter:")
    self.logger.info(" {}\n".format(self.queue_adapter))

    if self.verbose:
        self.logger.info(" QCEngine:")
        self.logger.info(" Version: {}".format(qcng.__version__))
        self.logger.info(" Task Cores: {}".format(self.cores_per_task))
        self.logger.info(" Task Mem: {}".format(self.memory_per_task))
        self.logger.info(" Scratch Dir: {}".format(self.scratch_directory))
        self.logger.info(" Programs: {}".format(self.available_programs))
        self.logger.info(" Procedures: {}\n".format(self.available_procedures))

    # DGAS Note: Not super happy about how this if/else turned out. Looking for alternatives.
    if self.connected():
        # Pull server info
        self.server_info = client.server_information()
        self.server_name = self.server_info["name"]
        self.server_version = self.server_info["version"]
        self.server_query_limit = self.server_info["query_limit"]
        if self.max_tasks > self.server_query_limit:
            self.max_tasks = self.server_query_limit
            self.logger.warning(
                "Max tasks was larger than server query limit of {}, reducing to match query limit.".format(
                    self.server_query_limit))
        self.heartbeat_frequency = self.server_info["heartbeat_frequency"]

        # Tell the server we are up and running; the reply is not used here.
        payload = self._payload_template()
        payload["data"]["operation"] = "startup"
        self.client._automodel_request("queue_manager", "put", payload)

        if self.verbose:
            self.logger.info(" Connected:")
            self.logger.info(" Version: {}".format(self.server_version))
            self.logger.info(" Address: {}".format(self.client.address))
            self.logger.info(" Name: {}".format(self.server_name))
            self.logger.info(" Queue tag: {}".format(self.queue_tag))
            self.logger.info(" Username: {}\n".format(self.client.username))

    else:
        # The ``server_*`` and ``heartbeat_frequency`` attributes are only
        # assigned in the connected branch above.
        self.logger.info(" QCFractal server information:")
        self.logger.info(" Not connected, some actions will not be available")