def remove_app(self, appname):
    """Remove an app from the scheduler and record its termination.

    Nothing happens when ``appname`` is not in ``self.cell.apps``.
    Otherwise the placement node is deleted (when the app has a server),
    a ``DeletedTraceEvent`` is posted (when ``self.app_events_dir`` is
    set), a ``finished`` node is written if none exists yet, and finally
    the parent class' ``remove_app`` is invoked.

    :param appname:
        Instance name of the app to remove.
    """
    known_apps = self.cell.apps
    if appname not in known_apps:
        return

    instance = known_apps[appname]

    if instance.server:
        placement_node = z.path.placement(instance.server, appname)
        self.backend.delete(placement_node)

    if self.app_events_dir:
        deleted = app_events.DeletedTraceEvent(instanceid=appname)
        trace.post(self.app_events_dir, deleted)

    # If finished does not exist, it means app is terminated by
    # explicit request, not because it finished on the node.
    finished_node = z.path.finished(appname)
    if not self.backend.exists(finished_node):
        termination = {
            'state': 'terminated',
            'when': time.time(),
            'host': instance.server,
            'data': None,
        }
        self.backend.put(finished_node, termination)

    super(Master, self).remove_app(appname)
def _post_oom_event(tm_env, appname):
    """Post a killed trace event flagged as out-of-memory.

    :param tm_env:
        Environment object; its ``app_events_dir`` receives the event.
    :param appname:
        Instance name recorded as the event's ``instanceid``.
    """
    oom_event = events.KilledTraceEvent(instanceid=appname, is_oom=True)
    trace.post(tm_env.app_events_dir, oom_event)
def _post_exit_event(tm_env, appname, exitinfo):
    """Post a finished trace event built from ``exitinfo``.

    :param tm_env:
        Environment object; its ``app_events_dir`` receives the event.
    :param appname:
        Instance name recorded as the event's ``instanceid``.
    :param exitinfo:
        Mapping read with ``.get``: ``return_code`` and ``signal`` each
        default to 256 when missing. The mapping itself is attached as
        the event payload.
    """
    return_code = exitinfo.get('return_code', 256)
    signal_num = exitinfo.get('signal', 256)
    finished = events.FinishedTraceEvent(
        instanceid=appname,
        rc=return_code,
        signal=signal_num,
        payload=exitinfo,
    )
    trace.post(tm_env.app_events_dir, finished)
def _abort_task(self, appname, exception):
    """Post an aborted trace event for a scheduling error.

    The event carries the ``SCHEDULER`` abort reason and ``exception`` as
    its payload. Nothing is posted when ``self.app_events_dir`` is unset.

    :param appname:
        Instance name recorded as the event's ``instanceid``.
    :param exception:
        Value passed through unchanged as the event payload.
    """
    if not self.app_events_dir:
        return

    aborted = app_events.AbortedTraceEvent(
        instanceid=appname,
        why=app_abort.AbortedReason.SCHEDULER.value,
        payload=exception,
    )
    trace.post(self.app_events_dir, aborted)
def _run(self, manifest):
    """Create and start the docker container described by ``manifest``.

    Registers presence, creates and starts the container, stores the
    updated manifest, posts a ``ServiceRunningTraceEvent`` and finally
    hands the container to ``_print_container_logs``.

    :param manifest:
        Application manifest (mapping). It is modified in place: the
        ``tcp``/``udp`` ephemeral port lists are reset and network info
        is filled in by ``_update_network_info_in_manifest``.
    :raises exc.ContainerSetupError:
        When container creation raises ``docker.errors.ImageNotFound``.
    """
    zk_conn = context.GLOBAL.zk.conn
    zk_conn.add_listener(zkutils.exit_on_lost)

    service = self._service
    with lc.LogContext(_LOGGER, service.name, lc.ContainerAdapter) as log:
        log.info('Running %r', service.directory)

        for proto in ('tcp', 'udp'):
            manifest['ephemeral_ports'][proto] = []

        _create_docker_log_symlink(service.data_dir)

        app = runtime.save_app(manifest, service.data_dir)
        volume_mapping = self._get_volume_mapping()

        app_presence = presence.EndpointPresence(zk_conn, manifest)
        app_presence.register_identity()
        app_presence.register_running()

        client = self._get_client()

        try:
            container = _create_container(
                self._tm_env,
                self._get_config(),
                client,
                app,
                volume_mapping
            )
        except docker.errors.ImageNotFound:
            message = 'Image {0} was not found'.format(app.image)
            raise exc.ContainerSetupError(
                message, app_abort.AbortedReason.IMAGE
            )

        container.start()
        container.reload()

        _update_network_info_in_manifest(container, manifest)

        # The manifest needs to be shared with the container: it is saved
        # under the first key of the volume mapping, when there is one.
        if volume_mapping:
            shared_dir = next(iter(volume_mapping))
            runtime.save_app(manifest, shared_dir, app_json='app.json')

        _LOGGER.info('Container is running.')
        app_presence.register_endpoints()

        running = events.ServiceRunningTraceEvent(
            instanceid=app.name,
            uniqueid=app.uniqueid,
            service='docker'
        )
        trace.post(self._tm_env.app_events_dir, running)

        _print_container_logs(container)
def report_aborted(tm_env, instance, why=None, payload=None):
    """Report an aborted instance.

    Called when aborting after failed configure step or from cleanup.

    :param tm_env:
        Environment object; its ``app_events_dir`` receives the event.
    :param instance:
        Instance name recorded as the event's ``instanceid``.
    :param why:
        Abort reason; converted with ``_why_str`` before posting.
    :param payload:
        Optional payload. Anything other than ``None`` is converted with
        ``str`` before it is attached to the event.
    """
    payload_text = None if payload is None else str(payload)

    aborted = events.AbortedTraceEvent(
        instanceid=instance,
        why=_why_str(why),
        payload=payload_text,
    )
    trace.post(tm_env.app_events_dir, aborted)
def _update_task(self, appname, server, why):
    """Post a trace event describing the placement of ``appname``.

    A ``ScheduledTraceEvent`` is posted when ``server`` is truthy and a
    ``PendingTraceEvent`` otherwise. Nothing is posted when
    ``self.app_events_dir`` is unset.

    :param appname:
        Instance name recorded as the event's ``instanceid``.
    :param server:
        Server the app is placed on, or a falsy value when unplaced.
    :param why:
        Reason attached to the event.
    """
    # Servers in the cell have full control over task node.
    if not self.app_events_dir:
        return

    if server:
        event = app_events.ScheduledTraceEvent(
            instanceid=appname, where=server, why=why
        )
    else:
        event = app_events.PendingTraceEvent(instanceid=appname, why=why)

    trace.post(self.app_events_dir, event)
def _finish(self):
    """Remove the app's container and report how the app ended.

    Does nothing when no app state can be loaded from the service data
    directory. Otherwise the container (if it still exists) is force
    removed, an aborted / killed / finished event is reported, the
    credential spec is cleaned up on Windows and local logs are archived.
    """
    app = runtime.load_app(self._service.data_dir, runtime.STATE_JSON)
    if not app:
        return

    client = self._get_client()
    name = appcfg.app_unique_name(app)

    container = None
    state = None
    try:
        container = client.containers.get(name)
        state = container.attrs.get('State')
    except docker.errors.NotFound:
        # No container by that name: nothing to remove, no state to report.
        pass

    if container is not None:
        try:
            container.remove(force=True)
        except docker.errors.APIError:
            _LOGGER.error('Failed to remove %s', container.id)

    # An abort record takes precedence over the container's exit state.
    aborted = _check_aborted(self._service.data_dir)
    if aborted is not None:
        app_abort.report_aborted(
            self._tm_env,
            app.name,
            why=aborted.get('why'),
            payload=aborted.get('payload')
        )
    elif state is not None:
        oom_killed = state.get('OOMKilled', False)
        if oom_killed:
            event = events.KilledTraceEvent(instanceid=app.name, is_oom=True)
        else:
            event = events.FinishedTraceEvent(
                instanceid=app.name,
                rc=state.get('ExitCode', 256),
                signal=0,
                payload=state
            )
        trace.post(self._tm_env.app_events_dir, event)

    if os.name == 'nt':
        credential_spec.cleanup(name, client)

    # Archiving is best effort: any failure is logged, never propagated.
    try:
        runtime.archive_logs(self._tm_env, name, self._service.data_dir)
    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Unexpected exception storing local logs.')
def process_blackedout_servers(self, servers):
    """Callback invoked when server blacklist is modified.

    Compares ``servers`` with the currently stored blacklist, logs every
    server that entered or left it, posts the matching blackout /
    blackout-cleared trace events (only when ``self.server_events_dir``
    is set) and then stores the new blacklist.

    :param servers:
        Iterable of server names that are blacked out now.
    """
    current = set(servers)
    previous = self.servers_blacklist

    pending_events = []

    for servername in current - previous:
        _LOGGER.info('Server blackout: %s', servername)
        pending_events.append(
            server_events.ServerBlackoutTraceEvent(servername=servername)
        )

    for servername in previous - current:
        _LOGGER.info('Server blackout cleared: %s', servername)
        pending_events.append(
            server_events.ServerBlackoutClearedTraceEvent(
                servername=servername
            )
        )

    # The events directory does not change while posting, so it is checked
    # once rather than once per event.
    events_dir = self.server_events_dir
    if events_dir:
        for event in pending_events:
            trace.post(events_dir, event)

    self.servers_blacklist = current
def _record_server_state(self, servername):
    """Record server state.

    After delegating to the parent class, the server's state and the
    time it has held it are written to its placement node, and a
    ``ServerStateTraceEvent`` is posted when ``self.server_events_dir``
    is set. A server missing from ``self.servers`` is only logged.

    :param servername:
        Name of the server whose state is recorded.
    """
    super(Master, self)._record_server_state(servername)

    server = self.servers.get(servername)
    if not server:
        _LOGGER.warning('Server not found: %s', servername)
        return

    placement_node = z.path.placement(servername)
    state, since = server.get_state()

    record = {
        'state': state.value,
        'since': since
    }
    self.backend.put(placement_node, record)

    if self.server_events_dir:
        state_event = server_events.ServerStateTraceEvent(
            servername=servername,
            state=state.value
        )
        trace.post(self.server_events_dir, state_event)
def configure(tm_env, event, runtime, runtime_param=None):
    """Creates directory necessary for starting the application.

    :param tm_env:
        Environment object providing ``apps_dir``,
        ``running_tombstone_dir`` and ``app_events_dir``.
    :param event:
        Path of the event file; it is loaded as the runtime manifest and
        copied into the container directory as ``manifest.yml``.
    :param runtime:
        Runtime passed through to ``load_runtime_manifest``.
    :param runtime_param:
        describe runtime parameter
    :type runtime_param:
        ``str list`` if not None contains list of 'parami=xyz' used for
        passing param to runtime
    :returns:
        The container service directory, or ``None`` when the event file
        has disappeared.

    This operation is idempotent (it can be repeated).

    The directory layout is::

        - (treadmill root)/
          - apps/
            - (app unique name)/
              - data/
                - app_start
                - app.json
                - manifest.yml
              env/
                - TREADMILL_*
              run
              finish
              log/
                - run

    The 'run' script is responsible for creating container environment
    and starting the container.

    The 'finish' script is invoked when container terminates and will
    deallocate any resources (NAT rules, etc) that were allocated for the
    container.
    """
    # Load the app from the event
    try:
        manifest_data = load_runtime_manifest(tm_env, event, runtime)
    except IOError:
        # File is gone. Nothing to do.
        _LOGGER.exception('No event to load: %r', event)
        return None

    # Freeze the app data into a namedtuple object
    app = utils.to_obj(manifest_data)

    # Generate a unique name for the app
    uniq_name = appcfg.app_unique_name(app)

    # Write the actual container start script
    # NOTE(review): ``runtime_param`` is only used by the Windows branch
    # below - confirm that the other branch is meant to ignore it.
    treadmill_root = subproc.resolve('treadmill')
    if os.name == 'nt':
        if runtime_param:
            param = '--runtime-param {}'.format(','.join(runtime_param))
        else:
            param = ''
        run_script = (
            '{treadmill}/scripts/treadmill sproc run {param} .'.format(
                treadmill=treadmill_root,
                param=param,
            ))
    else:
        run_script = 'exec {treadmill}/bin/treadmill sproc run ../'.format(
            treadmill=treadmill_root,
        )

    # Create the service for that container
    monitor_policy = {
        'limit': 0,
        'interval': 60,
        'tombstone': {
            'uds': False,
            'path': tm_env.running_tombstone_dir,
            'id': app.name
        }
    }
    container_svc = supervisor.create_service(
        tm_env.apps_dir,
        name=uniq_name,
        app_run_script=run_script,
        userid='root',
        downed=False,
        monitor_policy=monitor_policy,
        environ={},
        environment=app.environment
    )
    data_dir = container_svc.data_dir

    # Copy the original event as 'manifest.yml' in the container dir
    manifest_copy = os.path.join(data_dir, 'manifest.yml')
    try:
        shutil.copyfile(event, manifest_copy)
    except IOError as err:
        if err.errno != errno.ENOENT:
            raise
        # File is gone, cleanup.
        shutil.rmtree(container_svc.directory)
        _LOGGER.exception('Event gone: %r', event)
        return None

    # Store the app.json in the container directory
    def _write_app_json(app_file):
        """Write the encoded manifest into the open ``app_file``."""
        return app_file.writelines(utils.json_genencode(manifest_data))

    fs.write_safe(
        os.path.join(data_dir, appcfg.APP_JSON),
        _write_app_json,
        mode='w',
        permission=0o644
    )

    configured = events.ConfiguredTraceEvent(
        instanceid=app.name,
        uniqueid=app.uniqueid
    )
    trace.post(tm_env.app_events_dir, configured)

    return container_svc.directory
def test_post(self):
    """Test trace.post."""
    # Disable W0212(protected-access)
    # pylint: disable=W0212
    zkclient_mock = mock.Mock()
    zkclient_mock.get_children.return_value = []
    publisher = events_publisher.EventsPublisher(
        zkclient_mock,
        app_events_dir=self.app_events_dir,
        server_events_dir=self.server_events_dir
    )

    # Keyword arguments shared by every expected ``create`` call.
    create_opts = {
        'ephemeral': False,
        'makepath': True,
        'sequence': False,
        'acl': mock.ANY,
    }

    def _post_and_publish(events_dir, event, filename, publish):
        """Post ``event``, check its file exists, then publish it."""
        trace.post(events_dir, event)
        path = os.path.join(events_dir, filename)
        self.assertTrue(os.path.exists(path))
        publisher._on_created(path, publish)

    # NOTE(review): the expected names rely on timestamp 100 and source
    # 'baz' - presumably patched outside this method; confirm.

    # Application event: pending.
    _post_and_publish(
        self.app_events_dir,
        app_events.PendingTraceEvent(
            instanceid='foo.bar#123',
            why='created',
        ),
        '100,foo.bar#123,pending,created',
        app_zk.publish,
    )
    zkclient_mock.create.assert_called_once_with(
        '/trace/007B/foo.bar#123,100,baz,pending,created',
        b'',
        **create_opts
    )

    zkclient_mock.reset_mock()

    # Application event: pending delete.
    _post_and_publish(
        self.app_events_dir,
        app_events.PendingDeleteTraceEvent(
            instanceid='foo.bar#123',
            why='deleted'
        ),
        '100,foo.bar#123,pending_delete,deleted',
        app_zk.publish,
    )
    zkclient_mock.create.assert_called_once_with(
        '/trace/007B/foo.bar#123,100,baz,pending_delete,deleted',
        b'',
        **create_opts
    )

    zkclient_mock.reset_mock()

    # Application event: aborted. Two nodes are expected, the trace entry
    # and a finished record.
    _post_and_publish(
        self.app_events_dir,
        app_events.AbortedTraceEvent(
            instanceid='foo.bar#123',
            why='test'
        ),
        '100,foo.bar#123,aborted,test',
        app_zk.publish,
    )
    finished_payload = json.dumps({
        'data': 'test',
        'host': 'baz',
        'state': 'aborted',
        'when': '100'
    }, sort_keys=True).encode()
    self.assertEqual(zkclient_mock.create.call_args_list, [
        mock.call(
            '/trace/007B/foo.bar#123,100,baz,aborted,test',
            b'',
            **create_opts
        ),
        mock.call(
            '/finished/foo.bar#123',
            finished_payload,
            **create_opts
        )
    ])

    zkclient_mock.reset_mock()

    # Server event: state change.
    _post_and_publish(
        self.server_events_dir,
        server_events.ServerStateTraceEvent(
            servername='test.xx.com',
            state='up'
        ),
        '100,test.xx.com,server_state,up',
        server_zk.publish,
    )
    zkclient_mock.create.assert_called_once_with(
        '/server-trace/005D/test.xx.com,100,baz,server_state,up',
        b'',
        **create_opts
    )

    zkclient_mock.reset_mock()

    # Server event: blackout.
    _post_and_publish(
        self.server_events_dir,
        server_events.ServerBlackoutTraceEvent(
            servername='test.xx.com'
        ),
        '100,test.xx.com,server_blackout,',
        server_zk.publish,
    )
    zkclient_mock.create.assert_called_once_with(
        '/server-trace/005D/test.xx.com,100,baz,server_blackout,',
        b'',
        **create_opts
    )
def _run(self, manifest):
    """Create and start the docker container described by ``manifest``.

    Prepares a ``container_data`` directory that is bind-mounted
    read-only into the container, registers presence, creates and starts
    the container, stores the updated manifest in that directory, posts
    a ``ServiceRunningTraceEvent`` and then loops until the container's
    status is no longer ``running``.

    :param manifest:
        Application manifest (mapping). It is modified in place: the
        ``tcp``/``udp`` ephemeral port lists are reset and network info
        is filled in by ``_update_network_info_in_manifest``.
    :raises exc.ContainerSetupError:
        When container creation raises ``docker.errors.ImageNotFound``.
    """
    zk_conn = context.GLOBAL.zk.conn
    zk_conn.add_listener(zkutils.exit_on_lost)

    service = self._service
    with lc.LogContext(_LOGGER, service.name, lc.ContainerAdapter) as log:
        log.info('Running %r', service.directory)

        for proto in ('tcp', 'udp'):
            manifest['ephemeral_ports'][proto] = []

        # create container_data dir
        container_data_dir = os.path.join(service.data_dir, 'container_data')
        log.info('container_data %r', container_data_dir)
        fs.mkdir_safe(container_data_dir)

        # volume mapping config : read-only mapping
        bind_spec = {
            'bind': 'c:\\container_data',
            'mode': 'ro'
        }
        volume_mapping = {container_data_dir: bind_spec}

        app = runtime.save_app(manifest, service.data_dir)

        app_presence = presence.EndpointPresence(zk_conn, manifest)
        app_presence.register_identity()
        app_presence.register_running()

        client = self._get_client()

        try:
            container = _create_container(
                self._tm_env,
                self._get_config(),
                client,
                app,
                volume_mapping
            )
        except docker.errors.ImageNotFound:
            message = 'Image {0} was not found'.format(app.image)
            raise exc.ContainerSetupError(
                message, app_abort.AbortedReason.IMAGE
            )

        container.start()
        container.reload()

        _update_network_info_in_manifest(container, manifest)
        runtime.save_app(manifest, container_data_dir, app_json='app.json')

        _LOGGER.info('Container is running.')
        app_presence.register_endpoints()

        running = events.ServiceRunningTraceEvent(
            instanceid=app.name,
            uniqueid=app.uniqueid,
            service='docker'
        )
        trace.post(self._tm_env.app_events_dir, running)

        # Stay here for as long as the container reports 'running'.
        # NOTE(review): the Docker SDK documents that ``wait(timeout=...)``
        # raises when the timeout expires - confirm this loop is expected
        # to propagate that for containers running longer than 10 seconds.
        while container.status == 'running':
            container.wait(timeout=10)
            container.reload()