def on_data(self, fqdn, body):
    """Process all incoming messages from the Corosync agent plugin.

    Request to have the status changed for an instance. If the current
    state determines that a host is offline, then raise that alert.

    Old messages should not be processed.

    datetime is in UTC of the node's localtime, in the standard ISO
    string format.
    """
    try:
        host = ManagedHost.objects.get(fqdn=fqdn)
    except ManagedHost.DoesNotExist:
        # This might happen when we are deleting a host and a message is still sitting in the queue waiting to
        # be processed. Something has spoken to us that we don't know anything about, so really we can't do
        # anything other than drop it.
        log.warning("Corosync message from unknown host %s, the message was dropped." % fqdn)
        return

    # If corosync is not configured yet, or we don't actually have corosync - then ignore the input
    if (not host.corosync_configuration) or host.corosync_configuration.state == 'unconfigured':
        return

    if body.get('state'):
        job_scheduler_notify.notify(host.corosync_configuration,
                                    timezone.now(),
                                    {'state': body['state']['corosync']})

        job_scheduler_notify.notify(host.pacemaker_configuration,
                                    timezone.now(),
                                    {'state': body['state']['pacemaker']})

        if body['state']['corosync'] == 'stopped':
            return
    else:
        if host.corosync_configuration.state != 'started':
            return

    if body.get('crm_info'):
        nodes = body['crm_info']['nodes']
        dt = body['crm_info']['datetime']

        options = body['crm_info'].get('options', {'stonith_enabled': None})
        stonith_enabled = options['stonith_enabled']

        try:
            dt = IMLDateTime.parse(dt)
        except ValueError:
            if dt != '':
                log.warning("Invalid date or tz string from corosync plugin: %s" % dt)
                raise

        def is_new(peer_node_identifier):
            return (peer_node_identifier not in self._host_status or
                    self._host_status[peer_node_identifier].datetime < dt)

        peers_str = "; ".join(["%s: online=%s, new=%s" % (peer_node_identifier, data['online'], is_new(peer_node_identifier))
                               for peer_node_identifier, data in nodes.items()])
        log.debug("Incoming peer report from %s: %s" % (fqdn, peers_str))

        # NB: This will ignore any unknown peers in the report.
        cluster_nodes = ManagedHost.objects.select_related('ha_cluster_peers').filter(Q(nodename__in=nodes.keys()) |
                                                                                      Q(fqdn__in=nodes.keys()))

        unknown_nodes = (set(nodes.keys()) -
                         set([h.nodename for h in cluster_nodes]) -
                         set([h.fqdn for h in cluster_nodes]))

        # Leaving this out for now, because it raises issues caused by limitations in the simulator and the
        # test system as a whole. It is difficult to know whether they will be raised; it all depends on
        # past state.
        # CorosyncUnknownPeersAlert.notify(host.corosync_configuration, unknown_nodes != set())

        if unknown_nodes:
            log.warning("Unknown nodes in report from %s: %s" % (fqdn, unknown_nodes))

        if stonith_enabled is not None:
            StonithNotEnabledAlert.notify(host.corosync_configuration, stonith_enabled is False)

        CorosyncNoPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) == 1)
        # CorosyncToManyPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) > 2)

        # Consider all nodes in the peer group for this reporting agent
        for host in cluster_nodes:
            try:
                data = nodes[host.nodename]
                node_identifier = host.nodename
            except KeyError:
                data = nodes[host.fqdn]
                node_identifier = host.fqdn

            cluster_peer_keys = sorted([node.pk for node in cluster_nodes if node is not host])

            if is_new(node_identifier) and host.corosync_configuration:
                host_reported_online = data['online'] == 'true'

                log.debug("Corosync processing peer %s of %s" % (host.fqdn, fqdn))

                # Raise an Alert - the system suppresses duplicates
                log.debug("Alert notify on %s: active=%s" % (host, not host_reported_online))
                HostOfflineAlert.notify(host, not host_reported_online)
                if not host_reported_online:
                    log.debug("Host %s offline" % host.fqdn)
                else:
                    log.debug("Host %s online" % host.fqdn)

                # Attempt to save the state.
                if host.corosync_configuration.corosync_reported_up != host_reported_online:
                    job_scheduler_notify.notify(host.corosync_configuration,
                                                timezone.now(),
                                                {'corosync_reported_up': host_reported_online})

                peer_host_peer_keys = sorted([h.pk for h in host.ha_cluster_peers.all()])

                if peer_host_peer_keys != cluster_peer_keys:
                    job_scheduler_notify.notify(host,
                                                timezone.now(),
                                                {'ha_cluster_peers': cluster_peer_keys})

                # Keep internal track of the host's state.
                self._host_status[node_identifier] = self.HostStatus(status=host_reported_online,
                                                                     datetime=dt)
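# For illustration, a hypothetical agent message that would exercise both the
# 'state' and 'crm_info' branches of on_data() above. The field names mirror
# the dictionary accesses in the handler; the hostnames, timestamp, and values
# are invented for this sketch and are not taken from any real agent output.
EXAMPLE_COROSYNC_BODY = {
    'state': {
        'corosync': 'started',
        'pacemaker': 'started',
    },
    'crm_info': {
        # ISO-format UTC timestamp, parsed via IMLDateTime.parse()
        'datetime': '2016-01-01T00:00:00+00:00',
        # Peer nodes keyed by nodename or fqdn; 'online' is a string flag,
        # compared against the literal 'true' in the handler
        'nodes': {
            'node1.example.com': {'online': 'true'},
            'node2.example.com': {'online': 'false'},
        },
        'options': {'stonith_enabled': True},
    },
}
# service.on_data('node1.example.com', EXAMPLE_COROSYNC_BODY) would notify a
# HostOfflineAlert for node2 and record both peers in self._host_status.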
def test_removals(self):
    """Test that after objects are removed all GETs still work.

    The idea is to go through an add hosts, create FS, remove FS, remove hosts
    cycle and then spider the API to ensure that there aren't any exceptions
    rendering things (e.g. due to trying to dereference removed things
    incorrectly)."""
    host = synthetic_host('myserver')
    self.create_simple_filesystem(host)

    # Create a command/job/step result referencing the host
    command = Command.objects.create(message="test command", complete=True, errored=True)
    job = StopLNetJob.objects.create(lnet_configuration=host.lnet_configuration,
                                     state='complete',
                                     errored=True)
    command.jobs.add(job)
    step_klass, args = job.get_steps()[0]
    StepResult.objects.create(job=job,
                              backtrace="an error",
                              step_klass=step_klass,
                              args=args,
                              step_index=0,
                              step_count=1,
                              state='failed')

    # There will now be a CommandErroredAlert because the command above failed.
    alerts = self.deserialize(self.api_client.get("/api/alert/"))['objects']
    self.assertEqual(len(alerts), 1)
    self.assertEqual(alerts[0]['alert_type'], 'CommandErroredAlert')

    # Now create an alert/event referencing the host
    HostOfflineAlert.notify(host, True)
    alerts = self.deserialize(self.api_client.get("/api/alert/", data={'active': True}))['objects']
    self.assertEqual(len(alerts), 1)
    self.assertEqual(alerts[0]['alert_type'], 'HostOfflineAlert')

    # Double check that there are 2 alerts in total.
    alerts = self.deserialize(self.api_client.get("/api/alert/"))['objects']
    self.assertEqual(len(alerts), 2)

    # Cause JobScheduler() to delete the objects, check the objects are gone in the API
    # and the API can still be spidered cleanly
    job = ForceRemoveHostJob(host=host)
    for step_klass, args in job.get_steps():
        step_klass(job, args, None, None, None).run(args)

    # Check everything is gone
    self.assertEqual(ManagedTarget.objects.count(), 0)
    self.assertEqual(ManagedHost.objects.count(), 0)
    self.assertEqual(Volume.objects.count(), 0)
    self.assertEqual(VolumeNode.objects.count(), 0)

    self.assertListEqual(self.deserialize(self.api_client.get("/api/alert/?active=true"))['objects'], [])
    self.assertListEqual(self.deserialize(self.api_client.get("/api/volume/"))['objects'], [])
    self.assertListEqual(self.deserialize(self.api_client.get("/api/volume_node/"))['objects'], [])
    self.assertListEqual(self.deserialize(self.api_client.get("/api/target/"))['objects'], [])
    self.assertListEqual(self.deserialize(self.api_client.get("/api/host/"))['objects'], [])
    self.assertListEqual(self.deserialize(self.api_client.get("/api/filesystem/"))['objects'], [])

    # Check resources still render without exceptions
    self.spider_api()
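# The spider_api() helper used above is defined elsewhere. As a minimal sketch
# of what such a spider might do - assuming a Tastypie-style test api_client
# with deserialize(), and an illustrative (not exhaustive) resource list - it
# could GET every collection and every object each collection lists, asserting
# that nothing raises while rendering:
#
# def spider_api(self):
#     for resource in ['alert', 'command', 'filesystem', 'host',
#                      'job', 'target', 'volume', 'volume_node']:
#         response = self.api_client.get("/api/%s/" % resource)
#         self.assertEqual(response.status_code, 200)
#         for obj in self.deserialize(response)['objects']:
#             # 'resource_uri' is the standard Tastypie detail link
#             detail = self.api_client.get(obj['resource_uri'])
#             self.assertEqual(detail.status_code, 200)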