def test_aborted(self):
    """Aborted event: serialization to dict/tuple and rebuild from data.
    """
    fields = {
        'timestamp': 1,
        'source': 'tests',
        'instanceid': 'proid.foo#123',
        'why': 'reason',
        'payload': 'test',
    }
    event = events.AbortedTraceEvent(**fields)

    # The dict form carries every constructor field plus the event type.
    expected_dict = dict(fields, event_type='aborted')
    self.assertEqual(event.to_dict(), expected_dict)

    # The data form is a flat positional tuple.
    expected_data = (
        1,
        'tests',
        'proid.foo#123',
        'aborted',
        'reason',
        'test',
    )
    self.assertEqual(event.to_data(), expected_data)

    # Rebuilding from the data fields must yield an equal event.
    rebuilt = events.AbortedTraceEvent.from_data(
        timestamp=1,
        source='tests',
        instanceid='proid.foo#123',
        event_type='aborted',
        event_data='reason',
        payload='test',
    )
    self.assertEqual(event, rebuilt)
def _abort_task(self, appname, exception): """Set task into aborted state in case of scheduling error.""" if self.events_dir: appevents.post( self.events_dir, traceevents.AbortedTraceEvent(instanceid=appname, why=type(exception).__name__))
def start_container(container_root, manifest):
    """Boot a Treadmill container.

    Reads the app manifest, pivots into ``container_root``, scrubs a few
    environment variables and finally execs ``s6_svscan`` on ``/services``.
    If pivoting fails, an aborted trace event is handed to ``_abort`` and the
    original exception is raised again.
    """
    _LOGGER.info('Initializing container: %s', container_root)
    app = app_manifest.read(manifest)

    try:
        pivot_root.make_root(container_root)
        os.chdir('/')
    except Exception as err:  # pylint: disable=broad-except
        aborted_event = traceevents.AbortedTraceEvent(
            instanceid=app['name'],
            why=app_abort.AbortedReason.PIVOT_ROOT.value,
            payload=str(err),
        )
        _abort(aborted_event, container_root)

        # Propagate the failure so that start_container exits.
        raise err

    # XXX: Debug info
    mounts = fs_linux.list_mounts()
    _LOGGER.debug('Current mounts: %s', pprint.pformat(mounts))

    # Clean the environ, then clear the aliases path (last entry).
    # TODO: Remove me once clean environment management is merged in.
    for variable in ('PYTHONPATH', 'LC_ALL', 'LANG',
                     'TREADMILL_ALIASES_PATH'):
        os.environ.pop(variable, None)

    subproc.safe_exec(['s6_svscan', '-s', '/services'])
def _abort_task(self, appname, exception): """Set task into aborted state in case of scheduling error.""" if self.events_dir: appevents.post( self.events_dir, traceevents.AbortedTraceEvent(instanceid=appname, why=app_abort.SCHEDULER, payload=exception))
def test_post_zk(self):
    """Test appevents.post_zk for pending, pending_delete and aborted."""
    zk = mock.Mock()
    zk.get_children.return_value = []

    # Keyword arguments expected on every create() call below.
    create_opts = dict(
        ephemeral=False,
        makepath=True,
        sequence=False,
        acl=mock.ANY,
    )

    pending = events.PendingTraceEvent(
        instanceid='foo.bar#123',
        why='created',
        payload='',
    )
    appevents.post_zk(zk, pending)
    zk.create.assert_called_once_with(
        '/trace/007B/foo.bar#123,100,baz,pending,created',
        b'',
        **create_opts
    )
    zk.reset_mock()

    pending_delete = events.PendingDeleteTraceEvent(
        instanceid='foo.bar#123',
        why='deleted',
    )
    appevents.post_zk(zk, pending_delete)
    zk.create.assert_called_once_with(
        '/trace/007B/foo.bar#123,100,baz,pending_delete,deleted',
        b'',
        **create_opts
    )
    zk.reset_mock()

    aborted = events.AbortedTraceEvent(
        instanceid='foo.bar#123',
        why='test',
    )
    appevents.post_zk(zk, aborted)
    # An aborted event is expected to create both the trace node and the
    # finished node.
    zk.create.assert_has_calls([
        mock.call(
            '/trace/007B/foo.bar#123,100,baz,aborted,test',
            b'',
            **create_opts
        ),
        mock.call(
            '/finished/foo.bar#123',
            b'{data: test, host: baz, state: aborted, when: \'100\'}\n',
            **create_opts
        ),
    ])
def test_post(self):
    """Posting an aborted event creates the matching event file."""
    aborted = events.AbortedTraceEvent(
        why='container_error',
        instanceid='foo.bar#123',
        payload=None,
    )
    appevents.post(self.root, aborted)

    event_file = os.path.join(
        self.root, '100,foo.bar#123,aborted,container_error'
    )
    self.assertTrue(os.path.exists(event_file))
def report_aborted(tm_env, instance, why=None, payload=None):
    """Report an aborted instance.

    Called when aborting after failed configure step or from cleanup.

    A payload other than ``None`` is converted to ``str`` before it is
    attached to the event; ``why`` is passed through ``_why_str``.
    """
    payload_text = None if payload is None else str(payload)

    events_dir = tm_env.app_events_dir
    aborted = events.AbortedTraceEvent(
        instanceid=instance,
        why=_why_str(why),
        payload=payload_text,
    )
    appevents.post(events_dir, aborted)
def finish(tm_env, zkclient, container_dir, watchdog):
    """Frees allocated resources and marks them as available.

    Stops the container, collects its exit information, cleans up the app,
    posts aborted/exit events where applicable, deletes the container
    directory and finally removes the watchdog.

    :param tm_env:
        Treadmill application environment; its ``app_events_dir`` is used
        when posting the aborted event.
    :param zkclient:
        Zookeeper client, handed to ``_cleanup``.
    :param container_dir:
        Path to the application container directory. It is deleted at the
        end of the cleanup.
    :param watchdog:
        Watchdog guarding this cleanup; removed once cleanup completes.
    """
    with lc.LogContext(_LOGGER, os.path.basename(container_dir),
                       lc.ContainerAdapter) as log:
        log.info('finishing %r', container_dir)

        _stop_container(container_dir)

        # Check if application reached restart limit inside the container.
        #
        # The container directory will be moved, this check is done first.
        #
        # If restart limit was reached, application node will be removed from
        # Zookeeper at the end of the cleanup process, indicating to the
        # scheduler that the server is ready to accept new load.
        exitinfo, aborted, aborted_reason = _collect_exit_info(container_dir)

        app = runtime.load_app(container_dir)
        if app:
            _cleanup(tm_env, zkclient, container_dir, app)
        else:
            # NOTE(review): presumably a fallback to the app manifest when
            # the default load yields nothing -- confirm.
            app = runtime.load_app(container_dir, appcfg.APP_JSON)

        if app:
            # All resources are cleaned up. If the app terminated inside the
            # container, remove the node from Zookeeper, which will notify the
            # scheduler that it is safe to reuse the host for other load.
            if aborted:
                appevents.post(
                    tm_env.app_events_dir,
                    events.AbortedTraceEvent(
                        instanceid=app.name,
                        why=None,  # TODO(boysson): extract this info
                        payload=aborted_reason
                    )
                )

            if exitinfo:
                _post_exit_event(tm_env, app, exitinfo)

        # cleanup monitor with container information
        if app:
            apphook.cleanup(tm_env, app)

        # Delete the app directory (this includes the tarball, if any)
        shutil.rmtree(container_dir)

        # cleanup was successful, remove the watchdog
        watchdog.remove()
        log.logger.info('Finished cleanup: %s', container_dir)
def test_aborted(self, stdout_mock):
    """Check the line printed for an Aborted event.
    """
    aborted = events.AbortedTraceEvent(
        timestamp=1,
        source='tests',
        instanceid='proid.foo#123',
        why='unknown',
        payload='test',
    )

    self.trace_printer.process(aborted)

    printed = stdout_mock.getvalue()
    expected = (
        'Thu, 01 Jan 1970 00:00:01+0000 - '
        'proid.foo#123 aborted on tests [reason: unknown]\n'
    )
    self.assertEqual(printed, expected)
def abort(tm_env, event, exc=None, reason=None):
    """Abort an unconfigured application.

    Called when aborting after failed configure step.

    The instance id is the base name of the ``event`` path. When no
    ``reason`` is given but ``exc`` is, the exception's class name is used
    as the reason. The aborted event is posted without a payload.
    """
    # If aborting after failed configure step, the 'name' attribute is
    # derived from the event file name.
    instanceid = os.path.basename(event)
    _LOGGER.info('Aborting %s', instanceid)

    # Report start failure.
    why = reason
    if why is None and exc:
        why = type(exc).__name__

    aborted = events.AbortedTraceEvent(
        why=why,
        instanceid=instanceid,
        payload=None,
    )
    appevents.post(tm_env.app_events_dir, aborted)
def test_report_aborted(self):
    """Tests report abort sequence.

    Verifies that ``app_abort.report_aborted`` posts an aborted trace event
    carrying the instance id, the string form of the abort reason and the
    payload.
    """
    context.GLOBAL.zk.url = 'zookeeper://xxx@hhh:123/treadmill/mycell'
    treadmill.zkutils.connect.return_value = kazoo.client.KazooClient()
    kazoo.client.KazooClient.get_children.return_value = []
    kazoo.client.KazooClient.exists.return_value = True

    # Clear any calls recorded on the patched client methods. The mock API
    # for this is reset_mock(); a plain .reset() on a Mock only creates and
    # calls a child mock, leaving the recorded calls untouched.
    kazoo.client.KazooClient.create.reset_mock()
    kazoo.client.KazooClient.delete.reset_mock()

    app_abort.report_aborted(self.tm_env, 'proid.myapp#001',
                             why=app_abort.AbortedReason.TICKETS,
                             payload='test')

    treadmill.appevents.post.assert_called_with(
        mock.ANY,
        events.AbortedTraceEvent(
            instanceid='proid.myapp#001',
            why='tickets',
            payload='test',
        ),
    )
def test_abort(self):
    """Tests abort sequence.

    Verifies that ``app_abort.abort`` posts an aborted trace event whose
    instance id is the manifest file's base name and whose reason is the
    class name of the exception passed in.
    """
    context.GLOBAL.zk.url = 'zookeeper://xxx@hhh:123/treadmill/mycell'
    treadmill.zkutils.connect.return_value = kazoo.client.KazooClient()
    kazoo.client.KazooClient.get_children.return_value = []
    kazoo.client.KazooClient.exists.return_value = True

    # Check abort sequence when name is not part of the manifest, rather
    # derived from the manifest appname.
    manifest_file = os.path.join(self.root, 'schema', 'proid.myapp#001')

    # Clear any calls recorded on the patched client methods. The mock API
    # for this is reset_mock(); a plain .reset() on a Mock only creates and
    # calls a child mock, leaving the recorded calls untouched.
    kazoo.client.KazooClient.create.reset_mock()
    kazoo.client.KazooClient.delete.reset_mock()

    app_abort.abort(self.tm_env, manifest_file, exc=Exception('test'))

    treadmill.appevents.post.assert_called_with(
        mock.ANY,
        events.AbortedTraceEvent(
            instanceid='proid.myapp#001',
            why='Exception',
            payload=None,
        ),
    )
def test_finish_aborted(self):
    """Tests container finish procedure when node is aborted.
    """
    app_unique_name = 'proid.myapp-001-0000000ID1234'
    treadmill.appmgr.manifest.read.return_value = {
        'app': 'proid.myapp',
        'cell': 'test',
        'cpu': '100%',
        'disk': '100G',
        'environment': 'dev',
        'host_ip': '172.31.81.67',
        'memory': '100M',
        'name': 'proid.myapp#001',
        'proid': 'foo',
        'shared_network': False,
        'task': '001',
        'uniqueid': '0000000ID1234',
        'archive': ['/var/tmp/treadmill'],
        'endpoints': [{
            'port': 8000,
            'name': 'http',
            'real_port': 5000,
            'proto': 'tcp',
        }],
        'services': [{
            'name': 'web_server',
            'command': '/bin/false',
            'restart': {
                'limit': 3,
                'interval': 60,
            },
        }],
        'ephemeral_ports': {
            'tcp': [],
            'udp': [],
        }
    }

    # Canned answers for the localdisk and network service clients.
    localdisk_client = self.app_env.svc_localdisk.make_client.return_value
    localdisk_client.get.return_value = {
        'block_dev': '/dev/foo',
    }
    network_client = self.app_env.svc_network.make_client.return_value
    network_client.get.return_value = {
        'vip': '192.168.0.2',
        'gateway': '192.168.254.254',
        'veth': 'testveth.0',
    }

    app_dir = os.path.join(self.root, 'apps', app_unique_name)
    # Create content in app root directory, verify that it is archived.
    for subdir in (('root', 'xxx'), ('services',)):
        fs.mkdir_safe(os.path.join(app_dir, *subdir))

    # Simulate daemontools finish script, marking the app is done.
    with open(os.path.join(app_dir, 'aborted'), 'w') as aborted_file:
        aborted_file.write('something went wrong')

    zk = kazoo.client.KazooClient()
    app_finish.finish(self.app_env, zk, app_dir)

    # NOTE(review): this *calls* treadmill.appevents.post instead of
    # asserting on it, so nothing about the posted event is verified here.
    # Presumably ``post.assert_called_with(...)`` was intended -- confirm
    # what finish() actually posts before tightening this.
    expected_event = events.AbortedTraceEvent(
        instanceid='proid.myapp#001',
        why=None,
        payload={
            'why': 'something went wrong',
            'node': 'hostname',
        },
    )
    treadmill.appevents.post(mock.ANY, expected_event)

    rrd_file = os.path.join(
        self.root, 'metrics', 'apps', app_unique_name + '.rrd'
    )
    treadmill.rrdutils.flush_noexc.assert_called_with(rrd_file)
    shutil.copy.assert_called_with(
        rrd_file,
        os.path.join(app_dir, 'metrics.rrd'),
    )
def finish(tm_env, zkclient, container_dir): """Frees allocated resources and mark then as available. :param tm_env: Treadmill application environment :type tm_env: `appmgr.AppEnvironment` :param container_dir: Full path to the application container directory :type container_dir: ``str`` """ # FIXME(boysson): Clean should be done inside the container. The watchdog # value below is inflated to account for the extra # archiving time. name_dir = os.path.basename(container_dir) with lc.LogContext(_LOGGER, name_dir, lc.ContainerAdapter) as log: log.info('finishing %r', container_dir) watchdog_name = '{name}-{app}'.format(name=__name__, app=name_dir) watchdog = tm_env.watchdogs.create( watchdog_name, '5m', 'Cleanup of ' '%r stalled' % container_dir) _stop_container(container_dir) # Check if application reached restart limit inside the container. # # The container directory will be moved, this check is done first. # # If restart limit was reached, application node will be removed from # Zookeeper at the end of the cleanup process, indicating to the # scheduler that the server is ready to accept new load. exitinfo, aborted, aborted_reason = _collect_exit_info(container_dir) app = _load_app(container_dir, _STATE_YML) if app: _cleanup(tm_env, zkclient, container_dir, app) else: app = _load_app(container_dir, _APP_YML) if app: # All resources are cleaned up. If the app terminated inside the # container, remove the node from Zookeeper, which will notify the # scheduler that it is safe to reuse the host for other load. if aborted: appevents.post( tm_env.app_events_dir, events.AbortedTraceEvent( instanceid=app.name, why=None, # TODO(boysson): extract this info payload=aborted_reason)) if exitinfo: _post_exit_event(tm_env, app, exitinfo) # Delete the app directory (this includes the tarball, if any) shutil.rmtree(container_dir) # cleanup was succesful, remove the watchdog watchdog.remove() log.info('Finished cleanup: %s', container_dir)
def test_post(self):
    """Test appevents.post together with the events watcher."""
    # Disable W0212(protected-access)
    # pylint: disable=W0212
    zk = mock.Mock()
    zk.get_children.return_value = []
    watcher = appevents.AppEventsWatcher(zk, self.root)

    # Keyword arguments expected on every create() call below.
    create_opts = dict(
        ephemeral=False,
        makepath=True,
        sequence=False,
        acl=mock.ANY,
    )

    pending = events.PendingTraceEvent(
        instanceid='foo.bar#123',
        why='created',
    )
    appevents.post(self.root, pending)
    event_file = os.path.join(self.root, '100,foo.bar#123,pending,created')
    self.assertTrue(os.path.exists(event_file))
    watcher._on_created(event_file)
    zk.create.assert_called_once_with(
        '/trace/007B/foo.bar#123,100,baz,pending,created',
        b'',
        **create_opts
    )
    zk.reset_mock()

    pending_delete = events.PendingDeleteTraceEvent(
        instanceid='foo.bar#123',
        why='deleted',
    )
    appevents.post(self.root, pending_delete)
    event_file = os.path.join(
        self.root, '100,foo.bar#123,pending_delete,deleted'
    )
    self.assertTrue(os.path.exists(event_file))
    watcher._on_created(event_file)
    zk.create.assert_called_once_with(
        '/trace/007B/foo.bar#123,100,baz,pending_delete,deleted',
        b'',
        **create_opts
    )
    zk.reset_mock()

    aborted = events.AbortedTraceEvent(
        instanceid='foo.bar#123',
        why='test',
    )
    appevents.post(self.root, aborted)
    event_file = os.path.join(self.root, '100,foo.bar#123,aborted,test')
    self.assertTrue(os.path.exists(event_file))
    watcher._on_created(event_file)
    # An aborted event is expected to create exactly the trace node and
    # the finished node, in that order.
    expected_calls = [
        mock.call(
            '/trace/007B/foo.bar#123,100,baz,aborted,test',
            b'',
            **create_opts
        ),
        mock.call(
            '/finished/foo.bar#123',
            b'{data: test, host: baz, state: aborted, when: \'100\'}\n',
            **create_opts
        ),
    ]
    self.assertEqual(zk.create.call_args_list, expected_calls)