def generate_command(self, command_header):
  cluster_id = str(command_header['clusterId'])

  if cluster_id != '-1' and cluster_id != 'null':
    service_name = command_header['serviceName']
    component_name = command_header['role']
  else:
    cluster_id = None
    service_name = None
    component_name = None

  required_config_timestamp = command_header['requiredConfigTimestamp'] if 'requiredConfigTimestamp' in command_header else None

  command_dict = self.configuration_builder.get_configuration(cluster_id, service_name, component_name, required_config_timestamp)

  # remove topology-derived data so the header value overrides it instead of being merged with it
  if 'clusterHostInfo' in command_header:
    del command_dict['clusterHostInfo']

  command = Utils.update_nested(Utils.get_mutable_copy(command_dict), command_header)

  # topology needs to be decompressed if and only if it originates from the command header
  if 'clusterHostInfo' in command_header and command_header['clusterHostInfo']:
    command['clusterHostInfo'] = self.decompress_cluster_host_info(command['clusterHostInfo'])

  return command
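# Why delete 'clusterHostInfo' before merging? A minimal standalone sketch, assuming
# Utils.update_nested performs a recursive dict merge (the helper itself is not shown
# here). A plain merge would combine stale topology-derived host lists with the ones
# from the header; deleting first lets the header value replace them wholesale. The
# naive_update_nested helper and the '0-1' compressed index form are illustrative.
import copy

def naive_update_nested(base, update):
  """Hypothetical stand-in for Utils.update_nested: recursive dict merge."""
  for key, value in update.items():
    if isinstance(value, dict) and isinstance(base.get(key), dict):
      naive_update_nested(base[key], value)
    else:
      base[key] = value
  return base

command_dict = {'clusterHostInfo': {'all_hosts': ['h1', 'h2'], 'dn_hosts': ['h1']}}
header = {'clusterHostInfo': {'dn_hosts': ['0-1']}}  # hypothetical compressed form from the server

# plain merge keeps the stale topology-derived 'all_hosts' entry
merged = naive_update_nested(copy.deepcopy(command_dict), header)
assert merged['clusterHostInfo'] == {'all_hosts': ['h1', 'h2'], 'dn_hosts': ['0-1']}

# deleting first lets the header's value replace the topology data wholesale
pruned = copy.deepcopy(command_dict)
del pruned['clusterHostInfo']
overridden = naive_update_nested(pruned, header)
assert overridden['clusterHostInfo'] == {'dn_hosts': ['0-1']}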
def handle_heartbeat_response(self, response):
  serverId = int(response['id'])

  if serverId != self.responseId + 1:
    logger.error("Error in responseId sequence - restarting")
    Utils.restartAgent(self.stop_event)
  else:
    self.responseId = serverId

  if 'restartAgent' in response and response['restartAgent'].lower() == "true":
    logger.warn("Restarting the agent by request from the server")
    Utils.restartAgent(self.stop_event)
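# A minimal sketch of the responseId handshake, under the assumption that the server
# increments the id by exactly one for each heartbeat it answers (the response_id='-1'
# default in the register builder below suggests the sequence starts there). Any gap
# means agent and server are out of sync, so the agent restarts rather than act on a
# response it may have missed. The class and starting value are illustrative.
class SequenceChecker(object):
  def __init__(self):
    self.responseId = -1

  def accept(self, response):
    serverId = int(response['id'])
    if serverId != self.responseId + 1:
      return 'restart'  # out-of-sequence: the agent restarts itself
    self.responseId = serverId
    return 'ok'

checker = SequenceChecker()
assert checker.accept({'id': 0}) == 'ok'
assert checker.accept({'id': 1}) == 'ok'
assert checker.accept({'id': 3}) == 'restart'  # id 2 was lost somewhere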
def generate_command(self, command_header):
  cluster_id = str(command_header['clusterId'])

  if cluster_id != '-1' and cluster_id != 'null':
    service_name = command_header['serviceName']
    component_name = command_header['role']
  else:
    cluster_id = None
    service_name = None
    component_name = None

  required_config_timestamp = command_header['requiredConfigTimestamp'] if 'requiredConfigTimestamp' in command_header else None

  command_dict = self.configuration_builder.get_configuration(cluster_id, service_name, component_name, required_config_timestamp)
  command = Utils.update_nested(Utils.get_mutable_copy(command_dict), command_header)

  return command
def discard_stale_reports(self, cluster_reports):
  """
  Remove reports which are already stale, meaning another process has
  already updated the status to something different.
  """
  with self.reports_to_discard_lock:
    # nothing to discard
    if not self.reports_to_discard:
      return cluster_reports

    reports_to_discard = self.reports_to_discard[:]

  new_cluster_reports = defaultdict(lambda: [])
  for cluster_id, reports in cluster_reports.iteritems():
    for cluster_report in reports:
      for discarded_report in reports_to_discard:
        if Utils.are_dicts_equal(cluster_report, discarded_report, keys_to_skip=['status']):
          self.logger.info("Discarding outdated status {0} before sending".format(cluster_report))
          break
      else:
        new_cluster_reports[cluster_id].append(cluster_report)

  return new_cluster_reports
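# A small standalone sketch of the staleness test, assuming Utils.are_dicts_equal with
# keys_to_skip compares two dicts while ignoring the listed keys. A pending report is
# stale when a discarded report matches it in everything except 'status', i.e. another
# thread already reported a newer status. The helper and sample values are illustrative.
def dicts_equal_skipping(a, b, keys_to_skip):
  strip = lambda d: {k: v for k, v in d.items() if k not in keys_to_skip}
  return strip(a) == strip(b)

pending = {'componentName': 'DATANODE', 'status': 'INSTALLED'}
already_sent = {'componentName': 'DATANODE', 'status': 'STARTED'}

# same report apart from 'status' -> the pending one is outdated and dropped
assert dicts_equal_skipping(pending, already_sent, keys_to_skip=['status'])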
def __load_definitions(self):
  """
  Loads all alert definitions from a file. All clusters are stored in
  a single file. This will also populate the cluster-to-hash dictionary.
  :return:
  """
  definitions = []

  for cluster_id, command_json in self.alert_definitions_cache.iteritems():
    clusterName = '' if not 'clusterName' in command_json else command_json['clusterName']
    hostName = '' if not 'hostName' in command_json else command_json['hostName']
    publicHostName = '' if not 'publicHostName' in command_json else command_json['publicHostName']
    clusterHash = None if not 'hash' in command_json else command_json['hash']

    # cache the cluster and cluster hash after loading the JSON
    if clusterName != '' and clusterHash is not None:
      logger.info('[AlertScheduler] Caching cluster {0} with alert hash {1}'.format(clusterName, clusterHash))

    for definition in command_json['alertDefinitions']:
      alert = self.__json_to_callable(clusterName, hostName, publicHostName, Utils.get_mutable_copy(definition))

      if alert is None:
        continue

      alert.set_helpers(self._collector, self._cluster_configuration, self.configuration_builder)
      definitions.append(alert)

  return definitions
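# Judging from the keys read above, each cached command entry looks roughly like the
# following. The field values, and the fields inside the individual definition, are
# made up for illustration; only the top-level key names come from the code.
example_command_json = {
  'clusterName': 'c1',
  'hostName': 'agent-host-1',
  'publicHostName': 'agent-host-1.example.com',
  'hash': 'a1b2c3',  # per-cluster hash used to detect definition changes
  'alertDefinitions': [
    {'name': 'datanode_process', 'interval': 1, 'enabled': True}  # hypothetical definition fields
  ]
}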
def run(self):
  while not self.stop_event.is_set():
    try:
      if self.initializer_module.is_registered:
        report = self.get_report()

        if self.initializer_module.is_registered and not Utils.are_dicts_equal(report, self.last_report, keys_to_skip=["agentTimeStampAtReporting"]):
          self.initializer_module.connection.send(message=report, destination=Constants.HOST_STATUS_REPORTS_ENDPOINT)
          self.last_report = report

      # don't use else to avoid race condition
      if not self.initializer_module.is_registered:
        self.last_report = {}
    except ConnectionIsAlreadyClosed:
      # server and agent disconnected during sending data. Not an issue
      pass
    except:
      logger.exception("Exception in HostStatusReporter. Re-running it")

    self.stop_event.wait(self.report_interval)

  logger.info("HostStatusReporter has successfully finished")
def run(self):
  while not self.stop_event.is_set():
    try:
      if self.initializer_module.is_registered:
        report = self.get_report()

        if self.initializer_module.is_registered and not Utils.are_dicts_equal(report, self.last_report, keys_to_skip=["agentTimeStampAtReporting"]):
          correlation_id = self.initializer_module.connection.send(message=report, destination=Constants.HOST_STATUS_REPORTS_ENDPOINT)
          # default-arg binding pins the report being acked, not whatever the
          # variable points to when the server's confirmation arrives
          self.server_responses_listener.listener_functions_on_success[correlation_id] = \
            lambda headers, message, report=report: self.save_last_report(report)
    except ConnectionIsAlreadyClosed:
      # server and agent disconnected during sending data. Not an issue
      pass
    except:
      logger.exception("Exception in HostStatusReporter. Re-running it")

    self.stop_event.wait(self.report_interval)

  logger.info("HostStatusReporter has successfully finished")
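# The send/ack pattern above in a standalone form: send() returns a correlation id,
# and a callback registered under that id runs only once the server confirms delivery,
# so last_report advances only for acknowledged reports. All names here are illustrative.
saved = []
listener_functions_on_success = {}

def fake_send(message):
  # stand-in for connection.send(); pretend the transport assigned id 42
  return 42

report = {'host': 'h1', 'load': 0.3}
correlation_id = fake_send(report)
# default-arg binding pins down *this* report for the callback
listener_functions_on_success[correlation_id] = (
  lambda headers, message, report=report: saved.append(report))

# later, when the server's ack for correlation id 42 arrives:
listener_functions_on_success.pop(correlation_id)({}, {})
assert saved == [report]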
def is_json_equal():
  #json_topology = json.dumps(self.initializer_module.topology_cache, indent=2, sort_keys=True)
  #json_expected_topology = json.dumps(self.get_dict_from_file("topology_cache_expected.json"), indent=2, sort_keys=True)
  #print json_topology
  #print json_expected_topology
  self.assertEquals(Utils.get_mutable_copy(self.initializer_module.topology_cache), self.get_dict_from_file("topology_cache_expected.json"))
def is_json_equal():
  #json_alert_definitions = json.dumps(self.initializer_module.alert_definitions_cache, indent=2, sort_keys=True)
  #json_expected_definitions = json.dumps(self.get_dict_from_file("alert_definition_expected.json"), indent=2, sort_keys=True)
  #print json_alert_definitions
  #print json_expected_definitions
  self.assertEquals(Utils.get_mutable_copy(self.initializer_module.alert_definitions_cache), self.get_dict_from_file("alert_definition_expected.json"))
def rewrite_cluster_cache(self, cluster_id, cache):
  """
  Thread-safe method for writing out the specified cluster cache
  and rewriting the in-memory representation.
  :param cluster_id:
  :param cache:
  :return:
  """
  logger.info("Rewriting cache {0} for cluster {1}".format(self.__class__.__name__, cluster_id))

  # The cache should contain exactly the data received from the server.
  # Agent-side modifications would force an unnecessary cache sync on every agent
  # registration, which is a big concern on perf clusters! Mutability could also
  # lead to multithreading issues.
  immutable_cache = Utils.make_immutable(cache)

  with self._cache_lock:
    self[cluster_id] = immutable_cache
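# Why make the cache immutable? A standalone illustration of the idea (not necessarily
# how Utils.make_immutable is implemented): a read-only mapping raises on mutation, so
# any accidental agent-side write fails loudly instead of silently diverging from the
# server's copy and forcing a cache re-sync at the next registration.
class ReadOnlyDict(dict):
  def _readonly(self, *args, **kwargs):
    raise TypeError('cache is immutable; replace it via rewrite_cluster_cache')
  __setitem__ = _readonly
  __delitem__ = _readonly
  update = _readonly
  pop = _readonly

cache = ReadOnlyDict({'configurations': {'a': '1'}})
try:
  cache['configurations'] = {}
except TypeError:
  pass  # expected: mutation is rejected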
def update_definitions(self, event_type):
  """
  Updates the persisted alert definitions JSON.
  :return:
  """
  # prune out things we don't want to store
  alert_definitions = []
  for cluster_id, command in self.alert_definitions_cache.iteritems():
    command_copy = Utils.get_mutable_copy(command)
    alert_definitions.append(command_copy)

  if event_type == "CREATE":
    # reschedule all jobs, creating new instances
    self.reschedule_all()
  else:
    # reschedule only the jobs that have changed
    self.reschedule()
def build(self, response_id='-1'):
  timestamp = int(time.time() * 1000)

  hostInfo = HostInfo(self.config)
  agentEnv = {}
  hostInfo.register(agentEnv, runExpensiveChecks=True)

  current_ping_port = self.config.get('agent', 'ping_port')

  register = {
    'id': int(response_id),
    'timestamp': timestamp,
    'hostname': hostname.hostname(self.config),
    'currentPingPort': int(current_ping_port),
    'publicHostname': hostname.public_hostname(self.config),
    'hardwareProfile': self.hardware.get(),
    'agentEnv': agentEnv,
    'agentVersion': Utils.read_agent_version(self.config),
    'prefix': self.config.get('agent', 'prefix')
  }
  return register
def report_status_to_sender(self, headers, message, ex=None):
  """
  Reports the status of delivery of the message to the sender.

  @param headers: headers dictionary
  @param message: message payload dictionary
  @param ex: optional exception object for errors
  """
  if Constants.MESSAGE_ID not in headers:
    return

  if ex:
    confirmation_of_received = {
      Constants.MESSAGE_ID: headers[Constants.MESSAGE_ID],
      'status': 'ERROR',
      'reason': Utils.get_traceback_as_text(ex)
    }
  else:
    confirmation_of_received = {
      Constants.MESSAGE_ID: headers[Constants.MESSAGE_ID],
      'status': 'OK'
    }

  try:
    connection = self.initializer_module.connection
  except ConnectionIsAlreadyClosed:
    # access an early copy of the connection before it is exposed globally
    connection = self.initializer_module.heartbeat_thread.connection

  try:
    connection.send(message=confirmation_of_received, destination=Constants.AGENT_RESPONSES_TOPIC)
  except:
    logger.exception("Could not send a confirmation '{0}' to server".format(confirmation_of_received))
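# The confirmation sent back on AGENT_RESPONSES_TOPIC is a small dict keyed by the
# original message id. The shapes below follow the code above, assuming for the sake
# of the example that Constants.MESSAGE_ID resolves to 'messageId'; the id value and
# traceback text are illustrative.
ack_ok = {'messageId': 5, 'status': 'OK'}
ack_error = {
  'messageId': 5,
  'status': 'ERROR',
  'reason': 'Traceback (most recent call last): ...'  # Utils.get_traceback_as_text(ex)
}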
def _get_mutable_copy(self):
  with self._cache_lock:
    return Utils.get_mutable_copy(self)
def restart_agent(self):
  logger.warn("Restarting the agent by request from the server")
  Utils.restartAgent(self.stop_event)
def cache_update(self, update_dict, cache_hash):
  """
  Merge the given update dictionary into the current one.
  """
  merged_dict = Utils.update_nested(self._get_mutable_copy(), update_dict)
  self.rewrite_cache(merged_dict, cache_hash)