def on_message(self, message): try: # Deserialize any datetimes which were serialized for JSON deserialized_update_attrs = {} model_klass = ContentType.objects.get_by_natural_key(*message["instance_natural_key"]).model_class() for attr, value in message["update_attrs"].items(): try: field = [f for f in model_klass._meta.fields if f.name == attr][0] except IndexError: # e.g. _id names, they aren't datetimes so ignore them deserialized_update_attrs[attr] = value else: if isinstance(field, DateTimeField): deserialized_update_attrs[attr] = IMLDateTime.parse(value) else: deserialized_update_attrs[attr] = value log.debug("on_message: %s %s" % (message, deserialized_update_attrs)) self._job_scheduler.notify( message["instance_natural_key"], message["instance_id"], message["time"], deserialized_update_attrs, message["from_states"], ) except: # Log bad messages and continue, swallow the exception to avoid # bringing down the whole service log.warning("on_message: bad message: %s" % traceback.format_exc())
def test_HYD648(self): """Test that datetimes in the API have a timezone""" synthetic_host('myserver') response = self.api_client.get("/api/host/") self.assertHttpOK(response) host = self.deserialize(response)['objects'][0] t = IMLDateTime.parse(host['state_modified_at']) self.assertNotEqual(t.tzinfo, None)
def run(self, host_id, host_data): host = ManagedHost.objects.get(pk=host_id) self.started_at = IMLDateTime.parse(host_data["started_at"]) self.host = host self.host_data = host_data log.debug("UpdateScan.run: %s" % self.host) self.audit_host()
def test_fetch_not_dismissed_alerts_since_last_sample(self): data = { "begin__gte": str(self.sample_date), "dismissed": 'false', "severity__in": ['WARNING', 'ERROR'] } response = self.api_client.get("/api/alert/", data=data) self.assertHttpOK(response) objects = self.deserialize(response)['objects'] self.assertEqual(len(objects), 2, self.dump_objects(objects)) for ev in objects: self.assertEqual(ev['dismissed'], False) self.assertTrue(ev['severity'] in ['WARNING', 'ERROR']) self.assertTrue(IMLDateTime.parse(ev['begin']) >= self.sample_date)
def test_fetch_not_dismissed_alerts_since_last_sample(self): data = { "begin__gte": str(self.sample_date), "dismissed": False, "severity__in": ["WARNING", "ERROR"] } response = self.api_client.get("/api/alert/", data=data) self.assertHttpOK(response) objects = self.deserialize(response)["objects"] self.assertEqual(len(objects), 2, self.dump_objects(objects)) for ev in objects: self.assertEqual(ev["dismissed"], False) self.assertTrue(ev["severity"] in ["WARNING", "ERROR"]) self.assertTrue(IMLDateTime.parse(ev["begin"]) >= self.sample_date)
def _wait_for_server_boot_time(self, fqdn, old_boot_time=None): running_time = 0 while running_time < TEST_TIMEOUT: hosts = self.get_list("/api/host/") for host in hosts: if host['fqdn'] == fqdn: if host['boot_time'] is not None: boot_time = IMLDateTime.parse(host['boot_time']) if old_boot_time: if boot_time > old_boot_time: return boot_time else: return boot_time running_time += 1 time.sleep(1) self.assertLess(running_time, TEST_TIMEOUT, "Timed out waiting for host boot_time to be set.")
def on_data(self, fqdn, body): with transaction.commit_on_success(): with LogMessage.delayed as log_messages: for msg in body['log_lines']: try: log_messages.insert(dict( fqdn = fqdn, message = msg['message'], severity = msg['severity'], facility = msg['facility'], tag = msg['source'], datetime = IMLDateTime.parse(msg['datetime']).as_datetime, message_class = LogMessage.get_message_class(msg['message']) )) self._table_size += 1 self._parser.parse(fqdn, msg) except Exception, e: self.log.error("Error %s ingesting systemd-journal entry: %s" % (e, msg))
def on_data(self, fqdn, body): with transaction.atomic(): with DelayedContextFrom(LogMessage) as log_messages: for msg in body["log_lines"]: try: log_messages.insert( dict( fqdn=fqdn, message=msg["message"], severity=msg["severity"], facility=msg["facility"], tag=msg["source"], datetime=IMLDateTime.parse( msg["datetime"]).as_datetime, message_class=LogMessage.get_message_class( msg["message"]), )) self._table_size += 1 self._parser.parse(fqdn, msg) except Exception as e: self.log.error( "Error %s ingesting systemd-journal entry: %s" % (e, msg))
def metric_dispatch(self, request, **kwargs): """ GET parameters: :metrics: Comma separated list of strings (e.g. kbytesfree,kbytestotal) :job: 'id', 'user', or 'name'. only supply one metric of 'read_bytes', 'write_bytes', 'read_iops', 'write_iops', 'metadata_iops' :begin: Time ISO8601 string, e.g. '2008-09-03T20:56:35.450686Z' :end: Time ISO8601 string, e.g. '2008-09-03T20:56:35.450686Z' :latest: boolean -- if true, you are asking for a single time point, the latest value :max_points: maximum number of datapoints returned, may result in lower resolution samples :num_points: return exact number of data points scaled for the date range :reduce_fn: one of 'average', 'sum' :group_by: an attribute name of the object you're fetching. For example, to get the total OST stats for a filesystem, when requesting from the target resource, use reduce_fn=sum, group_by=filesystem. If the group_by attribute is absent from a record in the results, that record is discarded. """ errors = defaultdict(list) if request.method != 'GET': return self.create_response(request, "", response_class=HttpMethodNotAllowed) latest, update = (request.GET.get(name, '').lower() in ('true', '1') for name in ('latest', 'update')) if update and latest: errors['update'].append("update and latest are mutually exclusive") metrics = filter(None, request.GET.get('metrics', '').split(',')) job = request.GET.get('job', '') if job: if len(metrics) != 1: errors['job'].append("Job metrics must be a single string") if latest: errors['job'].append("Job metrics and latest are incompatible") num_points = 0 if 'num_points' in request.GET: try: num_points = int(request.GET['num_points']) except ValueError: errors['num_points'].append( "num_points must be a valid integer") if latest or update: errors['num_points'].append( "num_points requires a fixed begin and end") begin = end = None if not latest: try: begin = IMLDateTime.parse(request.GET['begin']) except KeyError: errors['begin'].append( "This field is mandatory when latest=false") except ValueError: errors['begin'].append("Malformed time string") try: end = IMLDateTime.parse(request.GET['end']) except KeyError: if update or num_points: errors['end'].append( "This field is mandatory when latest=false") else: end = timezone.now() except ValueError: errors['end'].append("Malformed time string") if update: begin, end = end, timezone.now() try: max_points = int(request.GET.get('max_points', 1000)) except ValueError: errors['max_points'].append("max_points must be a valid integer") if errors: return self.create_response(request, errors, response_class=HttpBadRequest) if 'pk' in kwargs: return self.get_metric_detail(request, metrics, begin, end, job, max_points, num_points, **kwargs) return self.get_metric_list(request, metrics, begin, end, job, max_points, num_points, **kwargs)
def send(self): events = [] envelope = dict(fqdn=self.client.fqdn, copytool=self.copytool.id, events=events) envelope_size = len(json.dumps(envelope)) while True: try: event = self.retry_queue.get_nowait() copytool_log.debug("Got event from retry queue: %s" % event) except Queue.Empty: try: raw_event = self.send_queue.get_nowait() event = json.loads(raw_event) copytool_log.debug("Got event from send queue: %s" % event) except Queue.Empty: break except ValueError: copytool_log.error("Invalid JSON: %s" % raw_event) break try: date = IMLDateTime.parse(event['event_time']) event['event_time'] = date.astimezone( tz=FixedOffset(0)).strftime("%Y-%m-%d %H:%M:%S+00:00") except ValueError as e: copytool_log.error("Invalid event date in event '%s': %s" % (event, e)) break # During restore operations, we don't know the data_fid until # after the operation has started (i.e. RUNNING). The tricky part # is that when the restore completes, the source_fid is set to # data_fid, so unless we do this swap we'll lose track of the # operation. if 'RUNNING' in event['event_type']: if event['source_fid'] in self.active_operations: self.active_operations[ event['data_fid']] = self.active_operations.pop( event['source_fid']) if self.active_operations.get(event.get('data_fid', None), None): event['active_operation'] = self.active_operations[ event['data_fid']] if 'FINISH' in event['event_type']: try: del self.active_operations[event['data_fid']] except KeyError: pass copytool_log.debug("event: %s" % json.dumps(event)) event_size = len(json.dumps(event)) if event_size > MAX_BYTES_PER_POST: copytool_log.error("Oversized event dropped: %s" % event) break if events and event_size > MAX_BYTES_PER_POST - envelope_size: copytool_log.info("Requeueing oversized message " "(%d + %d > %d, %d messages)" % (event_size, envelope_size, MAX_BYTES_PER_POST, len(events))) self.retry_queue.put(event) break events.append(event) envelope_size += event_size if events: copytool_log.debug("EventRelay sending %d events" % len(events)) try: data = self.client.post(envelope) copytool_log.debug("Got data back from POST: %s" % data) try: self.active_operations.update(data['active_operations']) except (KeyError, TypeError): pass # Reset any backoff delay that might have been added self.reset_backoff() except HttpError: copytool_log.error("Failed to relay events, requeueing") for event in envelope['events']: self.retry_queue.put(event) self.backoff()
def get(self, request): """ Send messages TO the agent. Handle a long-polling GET for messages to the agent """ fqdn = self.valid_fqdn(request) if not fqdn: return HttpForbidden() server_boot_time = IMLDateTime.parse(request.GET["server_boot_time"]) client_start_time = IMLDateTime.parse(request.GET["client_start_time"]) messages = [] try: reset_required = self.hosts.update(fqdn, server_boot_time, client_start_time) except ManagedHost.DoesNotExist: # This should not happen because the HTTPS frontend should have the # agent certificate revoked before removing the ManagedHost from the database log.error("GET from unknown server %s" % fqdn) return HttpResponseBadRequest("Unknown server '%s'" % fqdn) if reset_required: # This is the case where the http_agent service restarts, so # we have to let the agent know that all open sessions # are now over. messages.append({ "fqdn": fqdn, "type": "SESSION_TERMINATE_ALL", "plugin": None, "session_id": None, "session_seq": None, "body": None, }) log.debug("MessageView.get: composing messages for %s" % fqdn) queues = self.queues.get(fqdn) # If this handler is sitting on the TX queue, draining messages, then # when a new session starts, *before* sending any TX messages, we have to # make sure it has been disconnected, to avoid the TX messages being sent # to an 'old' session (old session meaning TCP connection from a now-dead agent) with queues.tx_lock: try: first_message = queues.tx.get(block=True, timeout=self.LONG_POLL_TIMEOUT) if first_message["type"] == "TX_BARRIER": if first_message["client_start_time"] != request.GET[ "client_start_time"]: log.warning("Cancelling GET due to barrier %s %s" % (first_message["client_start_time"], request.GET["client_start_time"])) return HttpResponse(json.dumps({"messages": []}), content_type="application/json") else: messages.append(first_message) except Queue.Empty: pass else: # TODO: limit number of messages per response while True: try: message = queues.tx.get(block=False) if message["type"] == "TX_BARRIER": if message["client_start_time"] != request.GET[ "client_start_time"]: log.warning( "Cancelling GET due to barrier %s %s" % (message["client_start_time"], request.GET["client_start_time"])) return HttpResponse( json.dumps({"messages": []}), content_type="application/json") else: messages.append(message) except Queue.Empty: break messages = self._filter_valid_messages(fqdn, messages) log.debug("MessageView.get: responding to %s with %s messages (%s)" % (fqdn, len(messages), client_start_time)) return HttpResponse(json.dumps({"messages": messages}), content_type="application/json")
def on_data(self, fqdn, body): """Process all incoming messages from the Corosync agent plugin Request to have the status changed for an instance. If the current state determines that a host is offline, then raise that alert. old messages should not be processed. datetime is in UTC of the node's localtime in the standard ISO string format """ try: host = ManagedHost.objects.get(fqdn=fqdn) except ManagedHost.DoesNotExist: # This might happen when we are deleting a host and the queues mean a message is still sat waiting to be # processed. Something has spoken to us and we don't know anything about it so really we can't do anything # other than drop it. log.warning( "Corosync message from unknown host %s, the message was dropped." % fqdn) return # If corosync is not configured yet, or we don't actually have corosync - then ignore the input if (not host.corosync_configuration ) or host.corosync_configuration.state == 'unconfigured': return if body.get('state'): job_scheduler_notify.notify(host.corosync_configuration, timezone.now(), {'state': body['state']['corosync']}) job_scheduler_notify.notify(host.pacemaker_configuration, timezone.now(), {'state': body['state']['pacemaker']}) if body['state']['corosync'] == 'stopped': return else: if host.corosync_configuration.state != 'started': return if body.get('crm_info'): nodes = body['crm_info']['nodes'] dt = body['crm_info']['datetime'] options = body['crm_info'].get('options', {'stonith_enabled': None}) stonith_enabled = options['stonith_enabled'] try: dt = IMLDateTime.parse(dt) except ValueError: if dt != '': log.warning( "Invalid date or tz string from corosync plugin: %s" % dt) raise def is_new(peer_node_identifier): return (peer_node_identifier not in self._host_status or self._host_status[peer_node_identifier].datetime < dt) peers_str = "; ".join([ "%s: online=%s, new=%s" % (peer_node_identifier, data['online'], is_new(peer_node_identifier)) for peer_node_identifier, data in nodes.items() ]) log.debug("Incoming peer report from %s: %s" % (fqdn, peers_str)) # NB: This will ignore any unknown peers in the report. cluster_nodes = ManagedHost.objects.select_related( 'ha_cluster_peers').filter( Q(nodename__in=nodes.keys()) | Q(fqdn__in=nodes.keys())) unknown_nodes = set(nodes.keys()) - set([ h.nodename for h in cluster_nodes ]) - set([h.fqdn for h in cluster_nodes]) # Leaving this out for now - because they raise issue caused by limitations in the simulator and # test system as a whole. Difficult to know if they will or won't be raised it all depends on the past. # CorosyncUnknownPeersAlert.notify(host.corosync_configuration, unknown_nodes != set()) if unknown_nodes: log.warning("Unknown nodes in report from %s: %s" % (fqdn, unknown_nodes)) if stonith_enabled is not None: StonithNotEnabledAlert.notify(host.corosync_configuration, stonith_enabled is False) CorosyncNoPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) == 1) # CorosyncToManyPeersAlert.notify(host.corosync_configuration, len(cluster_nodes) > 2) # Consider all nodes in the peer group for this reporting agent for host in cluster_nodes: try: data = nodes[host.nodename] node_identifier = host.nodename except KeyError: data = nodes[host.fqdn] node_identifier = host.fqdn cluster_peer_keys = sorted( [node.pk for node in cluster_nodes if node is not host]) if is_new(node_identifier) and host.corosync_configuration: host_reported_online = data['online'] == 'true' log.debug("Corosync processing " "peer %s of %s " % (host.fqdn, fqdn)) # Raise an Alert - system suppresses duplicates log.debug("Alert notify on %s: active=%s" % (host, not host_reported_online)) HostOfflineAlert.notify(host, not host_reported_online) if host_reported_online == False: log.debug("Host %s offline" % host.fqdn) else: log.debug("Host %s online" % host.fqdn) # Attempt to save the state. if host.corosync_configuration.corosync_reported_up != host_reported_online: job_scheduler_notify.notify( host.corosync_configuration, timezone.now(), {'corosync_reported_up': host_reported_online}) peer_host_peer_keys = sorted( [h.pk for h in host.ha_cluster_peers.all()]) if peer_host_peer_keys != cluster_peer_keys: job_scheduler_notify.notify( host, timezone.now(), {'ha_cluster_peers': cluster_peer_keys}) # Keep internal track of the hosts state. self._host_status[node_identifier] = self.HostStatus( status=host_reported_online, datetime=dt)
def __init__(self, event_time, **kwargs): self.timestamp = IMLDateTime.parse(event_time) self._parse_type(kwargs["event_type"]) self.error = None self.__dict__.update(kwargs)
def test_prase(self): for str, date_time in [ ( "2016/10/09T19:20:21+1000", IMLDateTime( year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(600) ), ), ( "2016/10/09T19:20:21.12345+1000", IMLDateTime( year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(600), ), ), ( "2016-10-09 19:20:21-1000", IMLDateTime( year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(-600) ), ), ( "2016-10-09 19:20:21.12345-1000", IMLDateTime( year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(-600), ), ), ( "19:20:21-1000 2016-10/09", IMLDateTime( year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(-600) ), ), ( "19:20:21.12345-0000 2016/10-09", IMLDateTime( year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(0) ), ), ( " 19:20:21.12345 -1000 2016/10-09 ", IMLDateTime( year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(-600), ), ), ("2016/10-09", IMLDateTime(year=2016, month=10, day=9, tzinfo=FixedOffset(0))), ( "19:20:21.12345-0000", IMLDateTime( year=1900, month=1, day=1, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(0) ), ), ( "19:20:21-0500", IMLDateTime( year=1900, month=1, day=1, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(-300) ), ), ]: self.assertEqual(IMLDateTime.parse(str), date_time)