def mark_software_available(software, version):
    """
    Report to the master that this agent has ``software``/``version``.

    Posts to the agent's software endpoint on the master.  The request is
    repeated, after waiting ``http_retry_delay()`` seconds, whenever it
    raises or the master answers with a status of INTERNAL_SERVER_ERROR
    or above.  Any other non-OK status is logged and not retried.

    This is a generator which yields deferreds.  ``self`` is not a
    parameter; it is resolved from the enclosing scope.
    """
    def pause(seconds):
        # Build a deferred which the reactor fires after ``seconds``
        timer = Deferred()
        reactor.callLater(seconds, timer.callback, None)
        return timer

    url = "{master_api}/agents/{agent}/software/".format(
        master_api=config.get("master_api"),
        agent=config.get("agent_id"))

    while True:
        try:
            response = yield post_direct(
                url, data={"software": software, "version": version})
        except Exception as error:
            delay = http_retry_delay()
            logger.error(
                "Failed to post availability of software %s, "
                "version %s to master: %r. Will retry in %s "
                "seconds.", software, version, error, delay)
            yield pause(delay)
            continue

        # Always read the body so it can be included in the error log
        data = yield treq.content(response)

        if response.code == OK:
            logger.info(
                "Posted availability of software %s, "
                "version %s to master.", software, version)
            break

        if response.code >= INTERNAL_SERVER_ERROR:
            delay = http_retry_delay()
            logger.warning(
                "Could not post availability of software %s, "
                "version %s. The master responded with "
                "INTERNAL_SERVER_ERROR. Retrying in %s "
                "seconds.", software, version, delay)
            yield pause(delay)
            continue

        # Anything else is not expected to succeed on a retry
        logger.error(
            "Failed to post availability of software %s, "
            "version %s: "
            "Unexpected status from server %s. Data: %s",
            software, version, response.code, data)
        break

    # Let a waiting test know the operation has finished
    if self.testing:
        self.operation_deferred.callback(None)
def mark_software_not_available(software, version):
    """
    Report to the master that this agent lacks ``software``/``version``.

    Sends a DELETE for the software version below this agent's entry on
    the master.  The request is repeated, after waiting
    ``http_retry_delay()`` seconds, whenever it raises or the master
    answers with a status of INTERNAL_SERVER_ERROR or above.  OK, ACCEPTED
    and NO_CONTENT count as success; any other status is logged and not
    retried.

    This is a generator which yields deferreds.  ``self`` is not a
    parameter; it is resolved from the enclosing scope.
    """
    def pause(seconds):
        # Build a deferred which the reactor fires after ``seconds``
        timer = Deferred()
        reactor.callLater(seconds, timer.callback, None)
        return timer

    url_template = (
        "{master_api}/agents/{agent}/software/{software}/"
        "versions/{version}")
    url = url_template.format(
        master_api=config.get("master_api"),
        agent=config.get("agent_id"),
        software=software,
        version=version)

    while True:
        try:
            response = yield delete_direct(url)
        except Exception as error:
            delay = http_retry_delay()
            logger.error(
                "Failed to remove software %s, version %s from this "
                "agent on master: %r. Will retry in %s seconds.",
                software, version, error, delay)
            yield pause(delay)
            continue

        # Always read the body so it can be included in the error log
        data = yield treq.content(response)

        if response.code in (OK, ACCEPTED, NO_CONTENT):
            logger.info(
                "Removed software %s, version %s from this "
                "agent on master.", software, version)
            break

        if response.code >= INTERNAL_SERVER_ERROR:
            delay = http_retry_delay()
            logger.warning(
                "Could not remove software %s, version %s from "
                "this agent. The master responded with "
                "INTERNAL_SERVER_ERROR. Retrying in %s "
                "seconds.", software, version, delay)
            yield pause(delay)
            continue

        # Anything else is not expected to succeed on a retry
        logger.error(
            "Failed to remove software %s, version %s from "
            "this agent: "
            "Unexpected status from server %s. Data: %s",
            software, version, response.code, data)
        break

    # Let a waiting test know the operation has finished
    if self.testing:
        self.operation_deferred.callback(None)
def _get_uid_gid_value(
        self, value, value_name, func_name, module, module_name):
    """
    Shared implementation behind the user and group id lookups.

    :param value:
        Either a name (string), which is converted to its numeric id, or
        an integer id, which is checked for existence and returned as is.

    :param str value_name:
        Label for ``value`` used in log and error messages.

    :param str func_name:
        Name of the calling function.  The piece after the first
        underscore is used in the conversion failure log message.

    :param module:
        ``NotImplemented`` when the platform does not provide the module,
        in which case a warning is logged and nothing is looked up.

    :param str module_name:
        ``"pwd"`` for users or ``"grp"`` for groups.

    :raises TypeError:
        If ``value`` is neither a string nor an integer.

    :raises ValueError:
        If ``module_name`` is neither ``"pwd"`` nor ``"grp"``.

    :raises KeyError:
        If the lookup fails and ``jobtype_ignore_id_mapping_errors`` is
        not set in the config.

    :return:
        The numeric id, or ``None`` when the module is unavailable or a
        failed lookup was ignored.
    """
    # Nothing can be resolved without the platform module
    if module is NotImplemented:
        logger.warning(
            "This platform does not implement the %r module, skipping "
            "%s()", module_name, func_name)
        return None

    # A name was given: translate it into the numeric id
    if isinstance(value, STRING_TYPES):
        try:
            if module_name == "pwd":
                return pwd.getpwnam(value).pw_uid
            if module_name == "grp":
                return grp.getgrnam(value).gr_gid
            raise ValueError(
                "Internal error, failed to get module to use for "
                "conversion. Was given %r" % module
            )
        except KeyError:
            logger.error(
                "Failed to convert %s to a %s",
                value, func_name.split("_")[1])
            if not config.get("jobtype_ignore_id_mapping_errors"):
                raise
        return None

    # An id was given: make sure it exists before handing it back
    if isinstance(value, INTEGER_TYPES):
        try:
            if module_name == "pwd":
                pwd.getpwuid(value)
            elif module_name == "grp":
                grp.getgrgid(value)
            else:
                raise ValueError(
                    "Internal error, failed to get module to use for "
                    "conversion. Was given %r" % module
                )
        except KeyError:
            logger.error("%s %s does not seem to exist", value_name, value)
            if not config.get("jobtype_ignore_id_mapping_errors"):
                raise
            return None
        return value

    raise TypeError(
        "Expected an integer or string for `%s`" % value_name)
def test_get_result(self):
    """
    Render ``Status`` for a GET request and compare the single written
    payload against values computed here, with the memory and time
    functions patched to fixed values.
    """
    # Child and grandchild process counts of the current process
    this_process = psutil.Process()
    num_direct = len(this_process.children(recursive=False))
    num_recursive = len(this_process.children(recursive=True))
    num_grandchildren = num_recursive - num_direct

    # Time since the master was last contacted (if ever)
    contacted = config.master_contacted(update=False)
    if isinstance(contacted, datetime):
        contacted = datetime.utcnow() - contacted

    # Time since the last announcement to the master (if ever)
    last_announce = config.get("last_announce", None)
    if isinstance(last_announce, datetime):
        last_announce = datetime.utcnow() - last_announce

    # Pretend the request arrives 30 seconds after the agent started
    future_time = config["start"] + 30
    process_memory = memory.process_memory()
    total_consumption = memory.total_consumption()
    uptime = total_seconds(
        timedelta(seconds=future_time - config["start"]))

    expected_data = {
        "state": config["state"],
        "agent_hostname": config["agent_hostname"],
        "agent_id": str(config["agent_id"]),
        "agent_lock_file": config["agent_lock_file"],
        "agent_process_ram": process_memory,
        "consumed_ram": total_consumption,
        "free_ram": 4242,
        "child_processes": num_direct,
        "grandchild_processes": num_grandchildren,
        "pids": config["pids"],
        "last_master_contact": contacted,
        "last_announce": last_announce,
        "uptime": uptime,
        "jobs": list(config["jobtypes"].keys()),
    }

    request = self.get()
    status = Status()

    patch_free_ram = mock.patch.object(
        memory, "free_ram", return_value=4242)
    patch_time = mock.patch.object(
        time, "time", return_value=future_time)
    patch_process_memory = mock.patch.object(
        memory, "process_memory", return_value=process_memory)
    patch_total_consumption = mock.patch.object(
        memory, "total_consumption", return_value=total_consumption)

    with nested(
        patch_free_ram,
        patch_time,
        patch_process_memory,
        patch_total_consumption
    ):
        response = status.render(request)

    self.assertEqual(response, NOT_DONE_YET)
    self.assertTrue(request.finished)
    self.assertEqual(request.responseCode, OK)
    self.assertEqual(len(request.written), 1)
    self.assertEqual(loads(request.written[0]), expected_data)
def setUp(self):
    """
    Build a fake master resource tree, serve it on a random local port
    and point ``config["master_api"]`` at it.

    Resources registered::

        /software/example_sw/versions/1.0/discovery_code
        /agents/<agent_id>/software/example_sw/versions/1.0
    """
    super(TestCheckSoftware, self).setUp()
    self.resource = Resource()

    # /software/example_sw/versions/1.0/discovery_code
    software_root = Resource()
    self.resource.putChild("software", software_root)
    example_software = Resource()
    software_root.putChild("example_sw", example_software)
    software_versions = Resource()
    example_software.putChild("versions", software_versions)
    self.fake_version_api = FakeSoftwareVersionAPI()
    software_versions.putChild("1.0", self.fake_version_api)
    self.fake_discovery_code_api = FakeSoftwareVersionCodeAPI()
    self.fake_version_api.putChild(
        "discovery_code", self.fake_discovery_code_api)

    # /agents/<agent_id>/software/example_sw/versions/1.0
    agents_root = AgentAPIRoot()
    self.resource.putChild("agents", agents_root)
    this_agent = FakeAgentAPI()
    agents_root.putChild(str(config.get("agent_id")), this_agent)
    self.fake_agent_software_api = FakeAgentSoftwareAPI()
    this_agent.putChild("software", self.fake_agent_software_api)
    agent_example_software = Resource()
    self.fake_agent_software_api.putChild(
        "example_sw", agent_example_software)
    agent_software_versions = Resource()
    agent_example_software.putChild("versions", agent_software_versions)
    self.agent_example_sw_version_1_0_api = FakeAgentSoftwareVersionAPI()
    agent_software_versions.putChild(
        "1.0", self.agent_example_sw_version_1_0_api)

    # Serve the tree and make it the master the code under test talks to
    self.site = Site(self.resource)
    self.server = reactor.listenTCP(random_port(), self.site)
    config["master_api"] = "http://127.0.0.1:%s" % self.server.port
def get(self, **_):
    """
    Collect status information about the agent and return it serialized
    with ``dumps``.

    The payload covers state, hostname, memory usage, child process
    counts, tracked pids, the agent id, the time since the last master
    contact and announcement, the lock file, uptime, the loaded jobtypes
    and, when configured, the farm name.  Keyword arguments are ignored.
    """
    # Count direct children and everything below them separately
    agent_process = psutil.Process()
    direct_child_processes = len(agent_process.children(recursive=False))
    all_child_processes = len(agent_process.children(recursive=True))

    # How long ago did we last talk to the master (if ever)?
    contacted = config.master_contacted(update=False)
    if isinstance(contacted, datetime):  # pragma: no cover
        contacted = datetime.utcnow() - contacted

    # How long ago did we last announce ourselves (if ever)?
    last_announce = config.get("last_announce", None)
    if isinstance(last_announce, datetime):  # pragma: no cover
        last_announce = datetime.utcnow() - last_announce

    data = {
        "state": config["state"],
        "agent_hostname": config["agent_hostname"],
        "free_ram": memory.free_ram(),
        "agent_process_ram": memory.process_memory(),
        "consumed_ram": memory.total_consumption(),
        "child_processes": direct_child_processes,
        "grandchild_processes":
            all_child_processes - direct_child_processes,
        "pids": config["pids"],
        "agent_id": config["agent_id"],
        "last_master_contact": contacted,
        "last_announce": last_announce,
        "agent_lock_file": config["agent_lock_file"],
        "uptime": total_seconds(
            timedelta(seconds=time.time() - config["start"])),
        "jobs": list(config["jobtypes"].keys()),
    }

    # Only report a farm name when one is configured
    farm_name = config["farm_name"]
    if farm_name:
        data["farm_name"] = farm_name

    return dumps(data)
def system_data(self, requery_timeoffset=False):
    """
    Build the dictionary describing this agent, the same information
    which is sent to the master.

    :param bool requery_timeoffset:
        When True the NTP server is queried again even if an offset has
        already been cached in ``config["agent_time_offset"]``.

    :return:
        A dictionary with the agent's id, hostname, version, operating
        system, hardware, memory, time offset, state, mac addresses,
        current assignments and disks.  ``gpus``, ``remote_ip`` and
        ``farm_name`` are included only when available.
    """
    # "auto" means no offset has been determined yet
    if config["agent_time_offset"] == "auto":
        config["agent_time_offset"] = None

    # The NTP request typically blocks, so the result is cached in the
    # config and only refreshed on request
    needs_query = \
        requery_timeoffset or config["agent_time_offset"] is None
    if needs_query:
        ntp_server = config["agent_ntp_server"]
        ntplog.info("Querying ntp server %r for current time", ntp_server)

        ntp_client = NTPClient()
        try:
            pool_time = ntp_client.request(
                config["agent_ntp_server"],
                version=config["agent_ntp_server_version"])
        except Exception as e:
            ntplog.warning("Failed to determine network time: %s", e)
        else:
            config["agent_time_offset"] = int(
                pool_time.tx_time - time.time())

            # Human readable network time, for the log only
            network_time = datetime.utcfromtimestamp(pool_time.tx_time)
            ntplog.debug(
                "network time: %s (local offset: %r)",
                network_time.isoformat(), config["agent_time_offset"])

            if config["agent_time_offset"] != 0:
                ntplog.warning(
                    "Agent is %r second(s) off from ntp server at %r",
                    config["agent_time_offset"],
                    config["agent_ntp_server"])

    data = {
        "id": config["agent_id"],
        "hostname": config["agent_hostname"],
        "version": config.version,
        "os_class": system.operating_system(),
        "os_fullname": platform(),
        "ram": int(config["agent_ram"]),
        "cpus": config["agent_cpus"],
        "cpu_name": cpu.cpu_name(),
        "port": config["agent_api_port"],
        "free_ram": memory.free_ram(),
        # Report 0 when no offset could be determined
        "time_offset": config["agent_time_offset"] or 0,
        "state": config["state"],
        "mac_addresses": list(network.mac_addresses()),
        # may not be set yet
        "current_assignments": config.get("current_assignments", {}),
        "disks": disks.disks(as_dict=True),
    }

    # GPU information is optional, leave it out if the lookup fails
    try:
        data["gpus"] = graphics.graphics_cards()
    except graphics.GPULookupError:
        pass

    if "remote_ip" in config:
        data["remote_ip"] = config["remote_ip"]

    farm_name = config["farm_name"]
    if farm_name:
        data["farm_name"] = farm_name

    return data
def reannounce(self, force=False):
    """
    Method which is used to periodically contact the master.  This
    method is generally called as part of a scheduled task.

    This is a generator which yields deferreds and hands its result back
    through ``returnValue``.

    :param bool force:
        When True the announcement is made even if
        ``should_reannounce()`` says it is not needed, and failed
        requests are retried after ``http_retry_delay()`` seconds instead
        of being dropped.

    :return:
        The decoded body of the master's last response, or ``None`` when
        the lock could not be acquired, no announcement was needed or no
        response was received.
    """
    # Attempt to acquire the reannounce lock but fail after 70%
    # of the total time between reannouncements elapses.  This should
    # help prevent an accumulation of requests in the event the master
    # is having issues.
    try:
        yield self.reannounce_lock.acquire(
            config["agent_master_reannounce"] * .70
        )
    except utility.LockTimeoutError:
        svclog.debug("Timed out while waiting to acquire reannounce_lock")
        returnValue(None)

    data = None

    # Everything below runs while holding the lock.  The ``finally``
    # makes sure the lock is released even if something unexpected is
    # raised (a response body which can't be decoded, a failure while
    # posting as a new agent, ...).  Otherwise the lock would stay held
    # and every later call would time out while trying to acquire it.
    try:
        if self.should_reannounce() or force:
            svclog.debug(
                "Announcing %s to master", config["agent_hostname"])
            num_retry_errors = 0

            while True:  # for retries
                try:
                    response = yield post_direct(
                        self.agent_api(),
                        data={
                            "state": config["state"],
                            "current_assignments": config.get(
                                "current_assignments", {}  # may not be set yet
                            ),
                            "free_ram": memory.free_ram(),
                            "disks": disks.disks(as_dict=True)
                        }
                    )

                except (ResponseNeverReceived,
                        RequestTransmissionFailed) as error:
                    # Broken connections are retried right away, but
                    # only a limited number of times in a row
                    num_retry_errors += 1
                    if num_retry_errors > \
                            config["broken_connection_max_retry"]:
                        svclog.error(
                            "Failed to announce self to the master, "
                            "caught try-again type errors %s times in a "
                            "row.", num_retry_errors)
                        break
                    else:
                        svclog.debug(
                            "While announcing self to master, caught "
                            "%s. Retrying immediately.",
                            error.__class__.__name__)

                except Exception as error:
                    if force:
                        delay = http_retry_delay()
                        svclog.error(
                            "Failed to announce self to the master: %s. "
                            "Will retry in %s seconds.", error, delay)
                        deferred = Deferred()
                        reactor.callLater(delay, deferred.callback, None)
                        yield deferred
                    else:
                        # Don't retry because reannounce is called
                        # periodically
                        svclog.error(
                            "Failed to announce self to the master: %s. "
                            "This request will not be retried.", error)
                        break

                else:
                    data = yield treq.json_content(response)

                    if response.code == OK:
                        config.master_contacted(announcement=True)
                        svclog.info("Announced self to the master server.")
                        break

                    elif response.code >= INTERNAL_SERVER_ERROR:
                        if not self.shutting_down:
                            delay = http_retry_delay()
                            svclog.warning(
                                "Could not announce self to the master "
                                "server, internal server error: %s. "
                                "Retrying in %s seconds.", data, delay)
                            deferred = Deferred()
                            reactor.callLater(
                                delay, deferred.callback, None)
                            yield deferred
                        else:
                            svclog.warning(
                                "Could not announce to master. Not "
                                "retrying because of pending shutdown.")
                            break

                    elif response.code == NOT_FOUND:
                        svclog.warning(
                            "The master says it does not know about our "
                            "agent id. Posting as a new agent.")
                        yield self.post_agent_to_master()
                        break

                    # If this is a client problem retrying the request
                    # is unlikely to fix the issue so we stop here
                    elif response.code >= BAD_REQUEST:
                        svclog.error(
                            "Failed to announce self to the master, bad "
                            "request: %s. This request will not be "
                            "retried.", data)
                        break

                    else:
                        svclog.error(
                            "Unhandled error when posting self to the "
                            "master: %s (code: %s). This request will "
                            "not be retried.", data, response.code)
                        break
    finally:
        yield self.reannounce_lock.release()

    returnValue(data)
def shutting_down(self):
    """
    Return the ``shutting_down`` value from the config, or ``False``
    when it has not been set.
    """
    is_shutting_down = config.get("shutting_down", False)
    return is_shutting_down
def get_software_version_data(software, version):
    """
    Asynchronously retrieve what the master knows about a software
    version.

    The request is repeated, after waiting ``http_retry_delay()``
    seconds, whenever it raises or the master answers with a status of
    INTERNAL_SERVER_ERROR or above.  This is a generator which yields
    deferreds and hands its result back through ``returnValue``.

    :param str software:
        The name of the software to get data for

    :param str version:
        The name of the version to get data for

    :raises VersionNotFound:
        If the master answers with NOT_FOUND.

    :raises Exception:
        If the master answers with any other unexpected status.

    :return:
        The decoded response body describing the software version.
    """
    def pause(seconds):
        # Build a deferred which the reactor fires after ``seconds``
        timer = Deferred()
        reactor.callLater(seconds, timer.callback, None)
        return timer

    url = "{master_api}/software/{software}/versions/{version}".format(
        master_api=config.get("master_api"),
        software=software,
        version=version)

    while True:
        try:
            response = yield get_direct(url)
        except Exception as error:
            delay = http_retry_delay()
            logger.error(
                "Failed to get data about software %s, version %s: %r. Will "
                "retry in %s seconds.", software, version, error, delay)
            yield pause(delay)
            continue

        if response.code == OK:
            data = yield treq.json_content(response)
            returnValue(data)

        elif response.code >= INTERNAL_SERVER_ERROR:
            delay = http_retry_delay()
            logger.warning(
                "Could not get data for software %s, version %s, server "
                "responded with INTERNAL_SERVER_ERROR. Retrying in %s "
                "seconds.", software, version, delay)
            yield pause(delay)

        elif response.code == NOT_FOUND:
            logger.error(
                "Got 404 NOT FOUND from server on getting data "
                "for software %s, version %s", software, version)
            raise VersionNotFound(
                "This software version was not found or "
                "has no discovery code.")

        else:
            logger.error(
                "Failed to get data for software %s, version %s: "
                "Unexpected status from server %s",
                software, version, response.code)
            raise Exception(
                "Unknown return code from master: %s" % response.code)