def blocking_shutdown(): log.info("%s beginning shutdown" % self.fqdn) self._enter_shutdown() shutdown_start_time = IMLDateTime.utcnow() self.shutdown_agent() self._cluster.leave(self.nodename) shutdown_end_time = IMLDateTime.utcnow() shutdown_time = (shutdown_end_time - shutdown_start_time).seconds while shutdown_time < MIN_SHUTDOWN_DURATION: if not simulate_shutdown: break remaining_delay = MIN_SHUTDOWN_DURATION - shutdown_time log.info("%s, sleeping for %d seconds on shutdown" % (self.fqdn, remaining_delay)) shutdown_time += remaining_delay time.sleep(remaining_delay) self._exit_shutdown() log.info("%s shutdown complete" % self.fqdn) if reboot: log.info("%s rebooting" % self.fqdn) self.startup(simulate_shutdown)
def test_message_callback(self): """Test that when a callback is included in a Message(), it is invoked after the message is sent""" client = mock.Mock() client._fqdn = "test_server" client.device_plugins = mock.Mock() client.device_plugins.get_plugins = mock.Mock( return_value=['test_plugin']) client.boot_time = IMLDateTime.utcnow() client.start_time = IMLDateTime.utcnow() callback = mock.Mock() # Disable poll() so that it's not trying to set up sessions, just doing passthrough of messages with mock.patch("chroma_agent.agent_client.HttpWriter.poll"): try: writer = HttpWriter(client) writer.start() message = Message("DATA", "test_plugin", {'key1': 'val1'}, 'session_foo', 666, callback=callback) writer.put(message) TIMEOUT = 2 i = 0 while True: if client.post.call_count and callback.call_count: break else: time.sleep(1) i += 1 if i > TIMEOUT: raise RuntimeError( "Timeout waiting for .post() and callback (%s %s)" % (client.post.call_count, callback.call_count)) # Should have sent back the result self.assertEqual(client.post.call_count, 1) self.assertDictEqual( client.post.call_args[0][0], { 'messages': [message.dump(client._fqdn)], 'server_boot_time': client.boot_time.isoformat() + "Z", 'client_start_time': client.start_time.isoformat() + "Z" }) # Should have invoked the callback self.assertEqual(callback.call_count, 1) finally: writer.stop() writer.join()
def start_session(self): # This fake plugin needs to look at it corosync defined peers of # this fake server and determine # which are online. This happens in production by shelling out the # call crm_mon --one-shot --as-xml # To simulate this, the _server object which is a FakeServer, must # be able to tell this server what it's peers are. # This implementation looks at ALL the servers in the simulator, # and those ones that are also join'ed in the cluster are online. log.debug('cluster nodes: %s' % self._server._cluster.state['nodes']) nodes = [ (node_dict['nodename'], node_dict['online']) for node_dict in self._server._cluster.state['nodes'].values() ] log.debug('Nodes and state: %s' % nodes) dt = IMLDateTime.utcnow().isoformat() message = self.get_test_message(utc_iso_date_str=dt, node_status_list=nodes) log.debug(message) return message
def _scan(self, initial=False): started_at = IMLDateTime.utcnow().isoformat() audit = local.LocalAudit() # Only set resource_locations if we have the management package try: from chroma_agent.action_plugins import manage_targets resource_locations = manage_targets.get_resource_locations() except ImportError: resource_locations = None mounts = self._scan_mounts() if initial: packages = scan_packages() else: packages = None # FIXME: HYD-1095 we should be sending a delta instead of a full dump every time # FIXME: At this time the 'capabilities' attribute is unused on the manager return { "started_at": started_at, "agent_version": agent_version(), "capabilities": plugin_manager.ActionPluginManager().capabilities, "metrics": audit.metrics(), "properties": audit.properties(), "mounts": mounts, "packages": packages, "resource_locations": resource_locations }
def inject_log_message(self, message): log.debug("Injecting log message %s/%s" % (self.fqdn, message)) self._log_messages.append({ 'source': 'cluster_sim', 'severity': 1, 'facility': 1, 'message': message, 'datetime': IMLDateTime.utcnow().isoformat() })
def _parse_crm_as_xml(self, raw): """ Parse the crm_mon response returns dict of nodes status or None if corosync is down """ return_dict = None try: root = xml.fromstring(raw) except ParseError: # not xml, might be a known error message if CorosyncPlugin.COROSYNC_CONNECTION_FAILURE not in raw: daemon_log.warning("Bad xml from corosync crm_mon: %s" % raw) else: return_dict = {} # Got node info, pack it up and return tm_str = root.find('summary/last_update').get('time') tm_datetime = IMLDateTime.strptime(tm_str, '%a %b %d %H:%M:%S %Y') return_dict.update({ 'datetime': IMLDateTime.convert_datetime_to_utc(tm_datetime).strftime( "%Y-%m-%dT%H:%M:%S+00:00") }) nodes = {} for node in root.findall("nodes/node"): host = node.get("name") nodes.update({host: node.attrib}) return_dict['nodes'] = nodes return_dict['options'] = {'stonith_enabled': False} cluster_options = root.find('summary/cluster_options') if cluster_options is not None: return_dict['options'].update({ 'stonith_enabled': cluster_options.get('stonith-enabled') == 'true' }) return return_dict
def test_priorities(self): """ Test that messages are consumed for POST based on the priority of the payload (data plane), or at the highest priority if no payload (control plane) """ client = mock.Mock() client._fqdn = "test_server" client.boot_time = IMLDateTime.utcnow() client.start_time = IMLDateTime.utcnow() writer = HttpWriter(client) def inject_messages(*args, **kwargs): # A control plane message writer.put( Message("SESSION_CREATE_REQUEST", "plugin_fuz", None, None, None)) low_body = DevicePluginMessage('low', PRIO_LOW) normal_body = DevicePluginMessage('normal', PRIO_NORMAL) high_body = DevicePluginMessage('high', PRIO_HIGH) writer.put(Message("DATA", "plugin_foo", low_body, "foo", 0)) writer.put(Message("DATA", "plugin_bar", normal_body, "foo", 1)) writer.put(Message("DATA", "plugin_baz", high_body, "foo", 2)) inject_messages() writer.send() self.assertEqual(client.post.call_count, 1) messages = client.post.call_args[0][0]['messages'] self.assertEqual(len(messages), 4) # First two messages (of equal priority) arrive in order or insertion self.assertEqual(messages[0]['plugin'], 'plugin_fuz') self.assertEqual(messages[1]['plugin'], 'plugin_baz') # Remaining messages arrive in priority order self.assertEqual(messages[2]['plugin'], 'plugin_bar') self.assertEqual(messages[3]['plugin'], 'plugin_foo')
def pop_log_messages(self): messages = self._log_messages d = datetime.datetime.now() messages.extend([{ 'source': 'cluster_sim', 'severity': 1, 'facility': 1, 'message': "%s %s %s" % (self.fqdn, d, a), 'datetime': IMLDateTime.utcnow().isoformat() } for a in range(0, self.log_rate)]) self._log_messages = [] return messages
def start_session(self): return { 'log_lines': [{ 'source': 'cluster_sim', 'severity': 1, 'facility': 1, 'message': 'Lustre: Cluster simulator syslog session start %s %s' % (self._server.fqdn, datetime.datetime.now()), 'datetime': IMLDateTime.utcnow().isoformat() + 'Z' }] }
def blocking_startup(): log.info("%s beginning startup" % self.fqdn) startup_time = 0 # Mixing shutdown/startup state here is a little weird, but # this only happens when it's explicitly requested by a caller # that wants full server simulation. The end result is that # simulating a reboot is easier if we deal with waiting for # the shutdown to finish here before moving on to startup. while simulate_bootup and self.shutting_down: log.info("%s, waiting for shutdown (%d)" % (self.fqdn, startup_time)) startup_time += 1 time.sleep(1) # This may happen to the losing thread(s) during startup after # a multi-PDU power cycle. if self.starting_up: log.info( "%s exiting startup because another thread is already doing it?" % self.fqdn) return self._enter_startup() self.boot_time = IMLDateTime.utcnow() while startup_time < MIN_STARTUP_DURATION: if not simulate_bootup: break remaining_delay = MIN_STARTUP_DURATION - startup_time log.info("%s, sleeping for %d seconds on boot" % (self.fqdn, remaining_delay)) startup_time += remaining_delay time.sleep(remaining_delay) self.start_agent() self._cluster.join(self.nodename) self._exit_startup() log.info("%s startup complete" % self.fqdn)
def update_session(self, first=False): mounts = [] for resource in self._server._cluster.get_running_resources( self._server.nodename): mounts.append({ 'device': resource['device_path'], 'fs_uuid': resource['uuid'], 'mount_point': resource['mount_point'], 'recovery_status': {} }) if first: packages = self._server.scan_packages() else: packages = None return { 'resource_locations': self._server._cluster.resource_locations(), 'capabilities': ['manage_targets'], 'metrics': { 'raw': { 'node': self._server.get_node_stats(), 'lustre': self._server.get_lustre_stats(), 'lustre_client_mounts': self._server.lustre_client_mounts } }, 'packages': packages, 'mounts': mounts, "properties": { 'zfs_installed': False, 'distro': 'CentOS', 'distro_version': 6.6, 'python_version_major_minor': 2.6, 'python_patchlevel': 6, 'kernel_version': '2.6.32-504.8.1.el6_lustre.x86_64' }, 'started_at': IMLDateTime.utcnow().isoformat() + 'Z', 'agent_version': 'dummy' }
def test_oversized_messages(self): """ Test that oversized messages are dropped and the session is terminated """ # Monkey-patch this setting to a lower limit to make testing easier MAX_BYTES_PER_POST = 1024 from chroma_agent.agent_client import MAX_BYTES_PER_POST as LARGE_MAX_BYTES_PER_POST def set_post_limit(size): import chroma_agent.agent_client chroma_agent.agent_client.MAX_BYTES_PER_POST = size self.addCleanup(set_post_limit, LARGE_MAX_BYTES_PER_POST) set_post_limit(MAX_BYTES_PER_POST) client = mock.Mock() client._fqdn = "test_server" client.boot_time = IMLDateTime.utcnow() client.start_time = IMLDateTime.utcnow() writer = HttpWriter(client) def fake_post(envelope): if len(json.dumps(envelope)) > MAX_BYTES_PER_POST: daemon_log.info("fake_post(): rejecting oversized message") raise HttpError() client.post = mock.Mock(side_effect=fake_post) TestPlugin = mock.Mock() mock_plugin_instance = mock.Mock() mock_plugin_instance.start_session = mock.Mock( return_value={'foo': 'bar'}) client.device_plugins.get = mock.Mock( return_value=lambda (plugin_name): mock_plugin_instance) client.device_plugins.get_plugins = mock.Mock( return_value={'test_plugin': TestPlugin}) client.sessions = SessionTable(client) daemon_log.setLevel(logging.DEBUG) import string from random import choice oversized_string = "".join( choice(string.printable) for i in range(MAX_BYTES_PER_POST)) # There should be one message to set up the session writer.poll('test_plugin') self.assertTrue(writer.send()) self.assertEqual(client.post.call_count, 1) messages = client.post.call_args[0][0]['messages'] self.assertEqual(len(messages), 1) self.assertEqual(messages[0]['type'], "SESSION_CREATE_REQUEST") # Pretend we got a SESSION_CREATE_RESPONSE client.sessions.create('test_plugin', 'id_foo') self.assertEqual(len(client.sessions._sessions), 1) # Inject a normal and an oversized message normal_body = DevicePluginMessage('normal', PRIO_NORMAL) oversized_body = DevicePluginMessage(oversized_string, PRIO_NORMAL) writer.put(Message("DATA", "test_plugin", normal_body, "id_foo", 0)) writer.put(Message("DATA", "test_plugin", oversized_body, "id_foo", 1)) # Only the normal message should get through self.assertTrue(writer.send()) self.assertEqual(client.post.call_count, 2) messages = client.post.call_args[0][0]['messages'] self.assertEqual(len(messages), 1) self.assertEqual(messages[0]['type'], "DATA") # The oversized message should be dropped and the session # terminated self.assertFalse(writer.send()) self.assertEqual(client.post.call_count, 3) self.assertEqual(len(client.sessions._sessions), 0) # However, we should eventually get a new session for the # offending plugin writer.poll('test_plugin') self.assertTrue(writer.send()) self.assertEqual(client.post.call_count, 4) messages = client.post.call_args[0][0]['messages'] self.assertEqual(len(messages), 1) self.assertEqual(messages[0]['type'], "SESSION_CREATE_REQUEST")
def test_session_backoff(self): """Test that when messages to the manager are being dropped due to POST failure, sending SESSION_CREATE_REQUEST messages has a power-of-two backoff wait""" client = mock.Mock() client._fqdn = "test_server" client.boot_time = IMLDateTime.utcnow() client.start_time = IMLDateTime.utcnow() writer = client.writer = HttpWriter(client) reader = client.reader = HttpReader(client) daemon_log.setLevel(logging.DEBUG) TestPlugin = mock.Mock() mock_plugin_instance = mock.Mock() mock_plugin_instance.start_session = mock.Mock( return_value={'foo': 'bar'}) client.device_plugins.get = mock.Mock( return_value=lambda (plugin_name): mock_plugin_instance) client.device_plugins.get_plugins = mock.Mock( return_value={'test_plugin': TestPlugin}) client.sessions = SessionTable(client) client.post = mock.Mock(side_effect=HttpError()) # Pick an arbitrary time to use as a base for simulated waits t_0 = datetime.datetime.now() old_datetime = datetime.datetime try: def expect_message_at(t): datetime.datetime.now = mock.Mock( return_value=t - datetime.timedelta(seconds=0.1)) writer.poll('test_plugin') self.assertEqual(writer._messages.qsize(), 0) datetime.datetime.now = mock.Mock( return_value=t + datetime.timedelta(seconds=0.1)) writer.poll('test_plugin') self.assertEqual(writer._messages.qsize(), 1) datetime.datetime = mock.Mock() datetime.datetime.now = mock.Mock(return_value=t_0) # Stage 1: failing to POST, backing off # ===================================== # Poll should put some session creation messages writer.poll('test_plugin') self.assertEqual(writer._messages.qsize(), 1) # Another poll immediately after shouldn't add any messages (MIN_SESSION_BACKOFF hasn't passed) writer.poll('test_plugin') self.assertEqual(writer._messages.qsize(), 1) # Send should consume the messages, and they go to nowhere because the POST fails writer.send() client.post.assert_called_once() self.assertEqual(len(client.post.call_args[0][0]['messages']), 1) # First time boundary: where the first repeat should happen from chroma_agent.agent_client import MIN_SESSION_BACKOFF t_1 = t_0 + MIN_SESSION_BACKOFF expect_message_at(t_1) # Have another crack at sending, it should fail and empty the queue writer.send() self.assertTrue(writer._messages.empty()) # Second time boundary: where the second repeat should happen t_2 = t_1 + MIN_SESSION_BACKOFF * 2 expect_message_at(t_2) # Stage 2: success in POST, session creation # ========================================== # This time we'll let the message go through, and a session to begin. client.post = mock.Mock() writer.send() # HttpReader receives a response from the manager, and should reset the backoff counters. reader._handle_messages([{ 'type': 'SESSION_CREATE_RESPONSE', 'plugin': 'test_plugin', 'session_id': 'id_foo', 'session_seq': 0, 'body': {} }]) self.assertEqual(len(client.sessions._sessions), 1) session = client.sessions.get('test_plugin') # State 3: POSTs start failing again, see delay again # =================================================== # Break the POST link again client.post = mock.Mock(side_effect=HttpError()) # Poll will get a DATA message from initial_scan session.initial_scan = mock.Mock(return_value={'foo': 'bar'}) writer.poll('test_plugin') session.initial_scan.assert_called_once() self.assertFalse(writer._messages.empty()) # Send will fail to send it, and as a result destroy the session writer.send() self.assertTrue(writer._messages.empty()) self.assertEqual(len(client.sessions._sessions), 0) # Move to some point beyond the first backoff cycle t_3 = t_0 + datetime.timedelta(seconds=60) datetime.datetime.now = mock.Mock(return_value=t_3) writer.poll('test_plugin') self.assertEqual(writer._messages.qsize(), 1) writer.poll('test_plugin') self.assertEqual(writer._messages.qsize(), 1) writer.send() self.assertEqual(writer._messages.qsize(), 0) # Check the backoff time has gone back to MIN_SESSION_BACKOFF t_4 = t_3 + MIN_SESSION_BACKOFF expect_message_at(t_4) finally: datetime.datetime = old_datetime
def __init__(self, simulator, fake_cluster, server_id, fqdn, nodename, nid_tuples_or_network_interfaces, worker=False, client_mounts=None): self.fqdn = fqdn super(FakeServer, self).__init__(simulator.folder) self.nodename = nodename # When reloaded from a file the network interfaces are presented as a dict but when created from code they # are represented as nid_tuples. if type(nid_tuples_or_network_interfaces) is dict: self.network_interfaces = nid_tuples_or_network_interfaces else: self.network_interfaces = {} interface_no = 0 for nid in nid_tuples_or_network_interfaces: self.network_interfaces[nid[0]] = { 'interface_no': interface_no, 'type': nid[1], 'lnd_type': nid[1], 'lnd_network': nid[2] } interface_no += 1 self.boot_time = IMLDateTime.utcnow() self.id = server_id self.log_rate = 0 self._log_messages = [] self._simulator = simulator self._devices = simulator.devices self._cluster = fake_cluster self._lock = threading.Lock() self._agent_client = None self._cluster.join(nodename, fqdn=fqdn) if simulator.folder: # If simulator is persistent, save Crypto in same place crypto_folder = os.path.join(simulator.folder, "%s_crypto" % self.fqdn) else: # If simulator is ephemeral, create temp dir for Crypto crypto_folder = tempfile.mkdtemp() if not os.path.exists(crypto_folder): os.makedirs(crypto_folder) self.crypto = Crypto(crypto_folder) if not client_mounts: client_mounts = {} self.state['id'] = server_id self.state['network_interfaces'] = self.network_interfaces self.state['nodename'] = nodename self.state['fqdn'] = fqdn self.state['worker'] = worker self.state['client_mounts'] = client_mounts self.state['packages'][ 'chroma-agent'] = self._simulator.available_packages( self.node_type)[ 'chroma-agent'] # The agent is installed at this point. self.save()
def test_prase(self): for str, date_time in [("2016/10/09T19:20:21+1000", IMLDateTime(year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(600))), ("2016/10/09T19:20:21.12345+1000", IMLDateTime(year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(600))), ("2016-10-09 19:20:21-1000", IMLDateTime(year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(-600))), ("2016-10-09 19:20:21.12345-1000", IMLDateTime(year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(-600))), ("19:20:21-1000 2016-10/09", IMLDateTime(year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(-600))), ("19:20:21.12345-0000 2016/10-09", IMLDateTime(year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(0))), (" 19:20:21.12345 -1000 2016/10-09 ", IMLDateTime(year=2016, month=10, day=9, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(-600))), ("2016/10-09", IMLDateTime(year=2016, month=10, day=9, tzinfo=FixedOffset(0))), ("19:20:21.12345-0000", IMLDateTime(year=1900, month=1, day=1, hour=19, minute=20, second=21, microsecond=123450, tzinfo=FixedOffset(0))), ("19:20:21-0500", IMLDateTime(year=1900, month=1, day=1, hour=19, minute=20, second=21, microsecond=0, tzinfo=FixedOffset(-300))) ]: self.assertEqual(IMLDateTime.parse(str), date_time)
def send(self): events = [] envelope = dict(fqdn=self.client.fqdn, copytool=self.copytool.id, events=events) envelope_size = len(json.dumps(envelope)) while True: try: event = self.retry_queue.get_nowait() copytool_log.debug("Got event from retry queue: %s" % event) except Queue.Empty: try: raw_event = self.send_queue.get_nowait() event = json.loads(raw_event) copytool_log.debug("Got event from send queue: %s" % event) except Queue.Empty: break except ValueError: copytool_log.error("Invalid JSON: %s" % raw_event) break try: date = IMLDateTime.parse(event['event_time']) event['event_time'] = date.astimezone( tz=FixedOffset(0)).strftime("%Y-%m-%d %H:%M:%S+00:00") except ValueError as e: copytool_log.error("Invalid event date in event '%s': %s" % (event, e)) break # During restore operations, we don't know the data_fid until # after the operation has started (i.e. RUNNING). The tricky part # is that when the restore completes, the source_fid is set to # data_fid, so unless we do this swap we'll lose track of the # operation. if 'RUNNING' in event['event_type']: if event['source_fid'] in self.active_operations: self.active_operations[ event['data_fid']] = self.active_operations.pop( event['source_fid']) if self.active_operations.get(event.get('data_fid', None), None): event['active_operation'] = self.active_operations[ event['data_fid']] if 'FINISH' in event['event_type']: try: del self.active_operations[event['data_fid']] except KeyError: pass copytool_log.debug("event: %s" % json.dumps(event)) event_size = len(json.dumps(event)) if event_size > MAX_BYTES_PER_POST: copytool_log.error("Oversized event dropped: %s" % event) break if events and event_size > MAX_BYTES_PER_POST - envelope_size: copytool_log.info("Requeueing oversized message " "(%d + %d > %d, %d messages)" % (event_size, envelope_size, MAX_BYTES_PER_POST, len(events))) self.retry_queue.put(event) break events.append(event) envelope_size += event_size if events: copytool_log.debug("EventRelay sending %d events" % len(events)) try: data = self.client.post(envelope) copytool_log.debug("Got data back from POST: %s" % data) try: self.active_operations.update(data['active_operations']) except (KeyError, TypeError): pass # Reset any backoff delay that might have been added self.reset_backoff() except HttpError: copytool_log.error("Failed to relay events, requeueing") for event in envelope['events']: self.retry_queue.put(event) self.backoff()