def boot(approot, runtime, core_cpu_shares, core_cpuset_cpus,
         apps_cpuset_cpus, core_memory_limit, preserve_mounts):
    """Treadmill boot process.

    Initializes the app environment rooted at ``approot``, runs the mount
    cleanup, initializes cgroups with the given core/apps settings and then
    hands over to ``s6_svscan`` on the environment's init directory.

    ``runtime`` is only logged by this function. ``preserve_mounts`` is a
    comma separated string of paths.
    """
    _LOGGER.info('Initializing Treadmill: %s (%s)', approot, runtime)

    env = appenv.AppEnvironment(approot)
    env.initialize(None)

    # We preserve anything mounted on the install root (mounted by
    # plugins?) and whatever path was provided on the command line.
    keep_mounts = [env.root + '*']
    keep_mounts.extend(preserve_mounts.split(','))
    fs_linux.cleanup_mounts(keep_mounts)

    _cgroup_init(
        core_cpu_shares,
        core_cpuset_cpus,
        apps_cpuset_cpus,
        core_memory_limit,
    )

    svscan_cmd = ['s6_svscan', '-s', env.init_dir]
    subproc.safe_exec(svscan_cmd)
def create_endpoint_file(approot, port, appname, endpoint):
    """Create and link local endpoint file.

    Registers ``<hostname>:<port>`` in Zookeeper under the tcp endpoint path
    of ``appname``/``endpoint`` (making sure the proid node exists first),
    then replaces the matching local endpoint spec under the app
    environment's endpoints directory with one owned by this process.
    """
    hostport = '%s:%s' % (sysinfo.hostname(), port)
    zkclient = context.GLOBAL.zk.conn

    proid_path = z.path.endpoint_proid(appname)
    servers_acl = zkclient.make_servers_acl()
    _LOGGER.info('Ensuring %s exists with ACL %r', proid_path, servers_acl)
    zkutils.ensure_exists(zkclient, proid_path, acl=[servers_acl])

    endpoint_path = z.path.endpoint(appname, 'tcp', endpoint)
    _LOGGER.info('Registering %s %s', endpoint_path, hostport)

    # Need to delete/create endpoints for the discovery to pick it up in
    # case of master restart.
    zkutils.ensure_deleted(zkclient, endpoint_path)
    time.sleep(5)
    zkutils.put(zkclient, endpoint_path, hostport)

    env = appenv.AppEnvironment(approot)
    mgr = endpoints.EndpointsMgr(env.endpoints_dir)
    mgr.unlink_all(appname=appname, endpoint=endpoint, proto='tcp')

    pid = os.getpid()
    mgr.create_spec(
        appname=appname,
        endpoint=endpoint,
        proto='tcp',
        real_port=port,
        pid=pid,
        port=port,
        owner='/proc/{}'.format(pid),
    )
def __init__(self, root, runtime, runtime_param=None):
    """Set up the manager state.

    Logs the arguments, builds the app environment rooted at ``root`` and
    stores the ``runtime`` and ``runtime_param`` values. The instance starts
    out flagged as not active.
    """
    _LOGGER.info(
        'init appcfgmgr: %s, %s, %s', root, runtime, runtime_param
    )
    self.tm_env = appenv.AppEnvironment(root=root)
    self._runtime, self._runtime_param = runtime, runtime_param
    self._is_active = False
def test_get_spec(self):
    """Test get endpoint spec with partial pattern match.

    No spec is found before one is created; afterwards, looking it up by
    proto only, by endpoint only, or by both must yield the same result.
    """
    env = appenv.AppEnvironment(root=self.root)
    mgr = endpoints.EndpointsMgr(env.endpoints_dir)

    # pylint: disable=W0212
    self.assertIsNone(mgr.get_spec())

    spec_fields = {
        'appname': 'appname##0000000001',
        'proto': 'tcp',
        'endpoint': 'nodeinfo',
        'real_port': 12345,
        'pid': 5213,
        'port': 8000,
        'owner': None,
    }
    mgr.create_spec(**spec_fields)

    self.assertIsNotNone(mgr.get_spec(proto='tcp'))

    # Lookup by proto must agree with lookup by endpoint...
    self.assertEqual(
        mgr.get_spec(proto='tcp'),
        mgr.get_spec(endpoint='nodeinfo'),
    )
    # ...and with lookup by both.
    self.assertEqual(
        mgr.get_spec(proto='tcp'),
        mgr.get_spec(proto='tcp', endpoint='nodeinfo'),
    )
def run(approot, runtime, container_dir):
    """Runs container given a container dir.

    A termination signal flag is installed before starting and passed to the
    runtime's ``run``. If an exception is raised while the flag is not set,
    the failure is logged as critical and the container is flagged as
    aborted; if the flag is set, the exception is only logged.
    """
    container_name = os.path.basename(container_dir)

    with lc.LogContext(_LOGGER, container_name, lc.ContainerAdapter) as log:
        # Intercept SIGTERM from s6 supervisor, so that initialization is
        # not left in broken state.
        terminated = utils.make_signal_flag(utils.term_signal())

        # Stays None if building the environment itself fails.
        tm_env = None
        try:
            log.info('run %r %r', approot, container_dir)
            tm_env = appenv.AppEnvironment(approot)

            container_runtime = app_runtime.get_runtime(
                runtime, tm_env, container_dir
            )
            container_runtime.run(terminated)

            # If we reach here, the application was terminated.

        except Exception as exc:  # pylint: disable=W0703
            if terminated:
                log.logger.info('Exception while handling term, ignore.',
                                exc_info=True)
            else:
                log.critical('Failed to start, app will be aborted.',
                             exc_info=True)
                app_abort.flag_aborted(tm_env, container_dir, exc)
def run(approot, instance):
    """Starts discovery publisher process.

    Builds an ``EndpointPublisher`` over the endpoints directory of the app
    environment rooted at ``approot``, using the global Zookeeper
    connection, and runs it.
    """
    endpoints_dir = appenv.AppEnvironment(approot).endpoints_dir
    zkclient = context.GLOBAL.zk.conn

    endpoints.EndpointPublisher(
        endpoints_dir,
        zkclient,
        instance=instance,
    ).run()
def register_cmd(approot, refresh_interval, manifest, container_dir):
    """Register container presence.

    Installs the SIGTERM handler, reads the manifest, registers presence,
    fetches tickets and starts the service supervisor. A
    ``ContainerSetupError`` during that setup aborts the container. The
    function then loops forever, sleeping ``refresh_interval`` between
    iterations and refreshing tickets when the initial ticket fetch asked
    for it. The Zookeeper connection is stopped on the way out.
    """
    try:
        _LOGGER.info('Configuring sigterm handler.')
        signal.signal(utils.term_signal(), sigterm_handler)

        tm_env = appenv.AppEnvironment(approot)
        app = app_manifest.read(manifest)
        app_presence = presence.EndpointPresence(
            context.GLOBAL.zk.conn, app
        )

        # If tickets are not ok, app will be aborted.
        #
        # If tickets acquired successfully, services will start, and
        # tickets will be refreshed after each interval.
        needs_refresh = False
        try:
            app_presence.register()
            needs_refresh = _get_tickets(app, container_dir)
            _start_service_sup(tm_env, app, container_dir)
        except exc.ContainerSetupError as setup_err:
            app_abort.abort(
                container_dir,
                why=setup_err.reason,
                payload=traceback.format_exc(),
            )

        while True:
            # Need to sleep anyway even if not refreshing tickets.
            time.sleep(refresh_interval)
            if not needs_refresh:
                continue
            _refresh_tickets(app, container_dir)
    finally:
        _LOGGER.info('Stopping zookeeper.')
        context.GLOBAL.zk.conn.stop()
def test_update_app_rrds(self):
    """Test update container rrds.

    Feeds two container samples (one with only a timestamp, one with full
    metrics) and checks that an rrd is created for each and that the full
    sample is written with the expected derived values.
    """
    sparse_app = 'foo.bar-00001-KKmc7hBHskLWh'
    full_app = 'foo.bar-00002-KKmc7hBHskLWj'

    data = {
        sparse_app: {
            'timestamp': 1,
        },
        full_app: {
            'timestamp': 3,
            'memory.usage_in_bytes': 10,
            'memory.soft_limit_in_bytes': 10,
            'memory.limit_in_bytes': 10,
            'cpuacct.usage': 3000000000,
            'cpu.shares': 1024,
            'blkio.throttle.io_service_bytes': {
                '3:0': {'Read': 5, 'Write': 3},
            },
            'blkio.throttle.io_serviced': {
                '3:0': {'Read': 5, 'Write': 3},
            },
            'fs.used_bytes': 10,
        },
    }

    sparse_rrd = '{}/{}.rrd'.format(self.root, sparse_app)
    full_rrd = '{}/{}.rrd'.format(self.root, full_app)

    rrdclient = mock.Mock()
    env = appenv.AppEnvironment(self.root)

    # pylint: disable=W0212
    metrics._update_app_rrds(data, self.root, rrdclient, 5, env)

    expected_creates = [
        mock.call(full_rrd, 5, 10),
        mock.call(sparse_rrd, 5, 10),
    ]
    rrdclient.create.assert_has_calls(expected_creates, any_order=True)

    metrics_data = {
        'hardmem': 10,
        'softmem': 10,
        'blk_write_iops': 3,
        'memusage': 10,
        'fs_used_bytes': 10,
        'blk_read_bps': 5,
        'cpuusage_ratio': 0.000244140625,
        'cpuusage': 0.005,
        'blk_read_iops': 5,
        'cputotal': 3000000000,
        'blk_write_bps': 3,
        'timestamp': 3,
    }
    expected_updates = [
        mock.call(full_rrd, metrics_data, metrics_time=3),
    ]
    rrdclient.update.assert_has_calls(expected_updates)
def finish(approot, runtime, container_dir):
    """Finish treadmill application on the node.

    Looks up the runtime for ``container_dir`` in the app environment rooted
    at ``approot`` and calls its ``finish``, logging under a container log
    context named after the container directory's basename.
    """
    container_name = os.path.basename(container_dir)

    with lc.LogContext(_LOGGER, container_name, lc.ContainerAdapter) as log:
        log.info('finish (approot %s)', approot)
        env = appenv.AppEnvironment(approot)
        container_runtime = app_runtime.get_runtime(
            runtime, env, container_dir
        )
        container_runtime.finish()
def cleanup_instance(approot, runtime, instance, runtime_param):
    """Actually do the cleanup of the instance.

    ``runtime_param`` (possibly empty or None) is converted to a dict with
    ``utils.equals_list2dict`` and passed, together with ``runtime`` and
    ``instance``, to a ``Cleanup`` built on the app environment.
    """
    param = utils.equals_list2dict(runtime_param or [])
    env = appenv.AppEnvironment(root=approot)
    cleanup.Cleanup(env).invoke(runtime, instance, param)
def run(approot, config_dir):
    """Runs monitor.

    The monitor gets an app environment only when ``approot`` is truthy;
    otherwise it is created with ``tm_env=None``.
    """
    env = appenv.AppEnvironment(root=approot) if approot else None
    monitor.Monitor(tm_env=env, config_dir=config_dir).run()
def server(approot, register, port, auth, modules, title, cors_origin):
    """Runs nodeinfo server.

    :param approot: root of the Treadmill app environment (used only when
        registering).
    :param register: when true, register the ``nodeinfo`` endpoint both in
        Zookeeper (ephemeral node) and as a local endpoint spec.
    :param port: port to serve on; ``0`` means pick a free port.
    :param auth: passed to the REST server as ``auth_type``.
    :param modules: API modules to initialize; may be empty/None.
    :param title: API title; underscores are replaced with spaces.
    :param cors_origin: passed through to ``api.init``.
    """
    if port == 0:
        # Let the OS pick a free port by binding to port 0, then release it.
        # NOTE: the port is released before the REST server binds to it, so
        # another process could grab it in between.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.bind(('0.0.0.0', 0))
            port = sock.getsockname()[1]
        finally:
            # Release the probe socket even if bind()/getsockname() fails.
            sock.close()

    hostname = sysinfo.hostname()
    hostport = '%s:%s' % (hostname, port)

    if register:
        zkclient = context.GLOBAL.zk.conn
        zkclient.add_listener(zkutils.exit_on_lost)

        appname = 'root.%s#%010d' % (hostname, os.getpid())
        app_pattern = 'root.%s#*' % (hostname)
        path = z.path.endpoint(appname, 'tcp', 'nodeinfo')
        _LOGGER.info('register endpoint: %s %s', path, hostport)
        zkutils.create(zkclient, path, hostport,
                       acl=[zkclient.make_servers_acl()],
                       ephemeral=True)

        # TODO: remove "legacy" endpoint registration once conversion is
        #       complete.
        tm_env = appenv.AppEnvironment(approot)
        # TODO: need to figure out how to handle windows.
        assert os.name != 'nt'
        endpoints_mgr = endpoints.EndpointsMgr(tm_env.endpoints_dir)
        # Drop local specs matching the per-host pattern before publishing
        # the one for the current pid.
        endpoints_mgr.unlink_all(
            app_pattern, endpoint='nodeinfo', proto='tcp'
        )
        endpoints_mgr.create_spec(
            appname=appname,
            endpoint='nodeinfo',
            proto='tcp',
            real_port=port,
            pid=os.getpid(),
            port=port,
            owner='/proc/{}'.format(os.getpid()),
        )

    _LOGGER.info('Starting nodeinfo server on port: %s', port)

    utils.drop_privileges()

    api_paths = []
    if modules:
        api_modules = {module: None for module in modules}
        api_paths = api.init(
            api_modules,
            title.replace('_', ' '),
            cors_origin
        )

    rest_server = rest.TcpRestServer(port, auth_type=auth,
                                     protect=api_paths)
    rest_server.run()
def node_services(approot, scan_dir):
    """Setup a node services monitor enforcing restart policies.

    Monitors ``scan_dir`` with ``MonitorRestartPolicy``; the down action is
    ``MonitorNodeDown`` built on the app environment rooted at ``approot``.
    """
    env = appenv.AppEnvironment(root=approot)

    monitor_kwargs = dict(
        scan_dirs=scan_dir,
        service_dirs=None,
        policy_impl=monitor.MonitorRestartPolicy,
        down_action=monitor.MonitorNodeDown(env),
    )
    monitor.Monitor(**monitor_kwargs).run()
def tm_env(_metrics_api=None):
    """Lazy instantiate app environment.

    Returns the cached environment when there is one; otherwise builds it
    from the ``TREADMILL_APPROOT`` environment variable, caches it on
    ``self`` and returns it. ``_metrics_api`` is not used.
    """
    if self._tm_env:
        return self._tm_env

    # TODO: we need to pass this parameter to api, unfortunately
    #       in current api framework it is not trivial.
    approot = os.environ['TREADMILL_APPROOT']
    _LOGGER.info('Using approot: %s', approot)
    self._tm_env = appenv.AppEnvironment(approot)

    return self._tm_env
def alert_monitor_cmd(approot, plugin, max_queue_length, wait_interval):
    """Publish alerts.

    Watches the app environment's alerts directory; newly created entries
    are handled by a handler built from the alert backend loaded for
    ``plugin``. The watcher, directory, ``max_queue_length`` and
    ``wait_interval`` are then handed to ``_serve_forever``.
    """
    alerts_dir = appenv.AppEnvironment(root=approot).alerts_dir

    watcher = dirwatch.DirWatcher(alerts_dir)
    backend = _load_alert_backend(plugin)
    watcher.on_created = _get_on_create_handler(backend)

    _serve_forever(watcher, alerts_dir, max_queue_length, wait_interval)
def containers(approot, scan_dir):
    """Setup a monitor for the running containers.

    Monitors ``scan_dir`` with ``MonitorRestartPolicy``; the down action is
    ``MonitorContainerCleanup`` built on the app environment.
    """
    env = appenv.AppEnvironment(root=approot)

    monitor_kwargs = dict(
        scan_dirs=scan_dir,
        service_dirs=None,
        policy_impl=monitor.MonitorRestartPolicy,
        down_action=monitor.MonitorContainerCleanup(env),
    )
    monitor.Monitor(**monitor_kwargs).run()
def run(approot, scan_interval, instance):
    """Starts portscan process.

    Builds a ``PortScanner`` over the app environment's endpoints directory
    with the global Zookeeper connection and runs it.
    """
    _LOGGER.info('Staring portscan: scan interval: %d', scan_interval)

    endpoints_dir = appenv.AppEnvironment(approot).endpoints_dir
    zkclient = context.GLOBAL.zk.conn

    endpoints.PortScanner(
        endpoints_dir,
        zkclient,
        scan_interval=scan_interval,
        instance=instance,
    ).run()
def services(approot, container_dir, service_dir):
    """Setup a services monitor enforcing restart policies.

    Monitors ``service_dir`` with ``MonitorRestartPolicy``. The down action
    is ``MonitorContainerDown`` for ``container_dir`` and events are hooked
    with ``PresenceMonitorEventHook`` built on the app environment.
    """
    env = appenv.AppEnvironment(root=approot)

    monitor_kwargs = dict(
        scan_dirs=None,
        service_dirs=service_dir,
        policy_impl=monitor.MonitorRestartPolicy,
        down_action=monitor.MonitorContainerDown(container_dir),
        event_hook=monitor.PresenceMonitorEventHook(env),
    )
    monitor.Monitor(**monitor_kwargs).run()
def setUp(self, mock_resource_service):
    """Create a temp root and app environment with mocked services.

    ``mock_resource_service`` is made to return a distinct ``Mock``, named
    after the requested implementation, on every call.
    """
    # W0221 Arguments number differs from overridden method
    # pylint: disable=W0221

    # Generate a unique mock object for each service implementation.
    mock_resource_service.side_effect = (
        lambda impl, *_args, **_kw_args: mock.Mock(name=impl)
    )

    self.root = tempfile.mkdtemp()
    self.tm_env = appenv.AppEnvironment(root=self.root)
def finish(approot, runtime, container_dir, runtime_param):
    """Finish treadmill application on the node.

    ``runtime_param`` (possibly empty or None) is converted to a dict with
    ``utils.equals_list2dict`` and passed to the runtime lookup before
    ``finish`` is called on the result.
    """
    container_name = os.path.basename(container_dir)

    # Run with finish context as finish runs in cleanup.
    with lc.LogContext(_LOGGER, container_name, lc.ContainerAdapter) as log:
        log.info('finish (approot %s)', approot)

        env = appenv.AppEnvironment(approot)
        param = utils.equals_list2dict(runtime_param or [])

        container_runtime = app_runtime.get_runtime(
            runtime, env, container_dir, param
        )
        container_runtime.finish()
def top(no_lock, api, approot, once):
    """Sync LDAP data with Zookeeper data.

    Calls ``_run_sync`` with ``api``, the app environment's alerts directory
    and ``once``. Unless ``no_lock`` is set, the sync only runs while holding
    the Zookeeper election lock for this module.

    NOTE(review): the summary line mentions LDAP but the code only passes
    the alerts directory to ``_run_sync`` -- confirm the wording.
    """
    env = appenv.AppEnvironment(root=approot)

    if no_lock:
        _LOGGER.info('Running without lock.')
        _run_sync(api, env.alerts_dir, once)
        return

    lock = zkutils.make_lock(
        context.GLOBAL.zk.conn,
        z.path.election(__name__),
    )
    _LOGGER.info('Waiting for leader lock.')
    with lock:
        _run_sync(api, env.alerts_dir, once)
def alert_monitor_cmd(approot, plugin):
    """Publish alerts.

    Watches the app environment's alerts directory with a handler built
    from the alert backend loaded for ``plugin``. Entries already present in
    the directory are fed to the handler before serving starts.
    """
    alerts_dir = appenv.AppEnvironment(root=approot).alerts_dir

    watcher = dirwatch.DirWatcher(alerts_dir)
    backend = _load_alert_backend(plugin)
    watcher.on_created = _get_on_create_handler(backend)

    # if there are alerts in alerts_dir already
    for existing in os.listdir(alerts_dir):
        watcher.on_created(os.path.join(alerts_dir, existing))

    _serve_forever(watcher)
def run(approot, runtime, container_dir):
    """Runs container given a container dir.

    Any exception raised while obtaining or running the runtime is logged
    and the container is flagged as aborted.
    """
    # Make sure container_dir is a fully resolved path.
    container_dir = os.path.realpath(container_dir)

    _LOGGER.info('run %r %r', approot, container_dir)

    env = appenv.AppEnvironment(approot)
    try:
        container_runtime = app_runtime.get_runtime(
            runtime, env, container_dir
        )
        container_runtime.run()
    except Exception as exc:  # pylint: disable=W0703
        _LOGGER.exception('Failed to start, app will be aborted.')
        app_abort.flag_aborted(env, container_dir, exc)
def cgroup():
    """Runs cgroup node service.

    The root and watchdogs directories come from ``local_ctx``. The service
    directory is ``<root-dir>/cgroup_svc``.
    """
    root_dir = local_ctx['root-dir']
    watchdogs_dir = local_ctx['watchdogs-dir']

    service_dir = os.path.join(root_dir, 'cgroup_svc')
    svc = services.ResourceService(service_dir=service_dir, impl='cgroup')

    run_kwargs = dict(
        watchdogs_dir=os.path.join(root_dir, watchdogs_dir),
        tm_env=appenv.AppEnvironment(root_dir),
    )
    svc.run(**run_kwargs)
def cleaning(approot, scan_dir):
    """Setup a monitor for the cleanup services found in ``scan_dir``.

    The restart policy is a ``CleanupMonitorRestartPolicy`` bound to the app
    environment; the down action is ``MonitorNodeDown`` with the
    ``cleanup-`` prefix.
    """
    tm_env = appenv.AppEnvironment(root=approot)

    def _policy_factory():
        # Each call builds a fresh policy on the shared environment.
        return monitor.CleanupMonitorRestartPolicy(tm_env)

    monitor_kwargs = dict(
        scan_dirs=scan_dir,
        service_dirs=None,
        policy_impl=_policy_factory,
        down_action=monitor.MonitorNodeDown(tm_env, prefix='cleanup-'),
    )
    monitor.Monitor(**monitor_kwargs).run()
def __init__(self, approot, interval):
    """Initialize the reader state and start reading.

    :param approot: Treadmill app root; used to build the app environment,
        to list system services and to resolve the device holding it.
    :param interval: when zero or negative, ``_read`` is called once;
        otherwise ``_loop`` is started.
    """
    self.cache = {'treadmill': {}, 'core': {}, 'app': {}}
    self._interval = interval

    self._tm_env = appenv.AppEnvironment(root=approot)
    self._sys_svcs = _sys_svcs(approot)

    # Resolve the major/minor numbers of approot once and reuse them, so
    # both derived values are guaranteed to come from the same lookup.
    sys_maj_min = tuple(fs_linux.maj_min_from_path(approot))

    # TODO: sys_maj_min will be used changing treadmill.metrics.app_metrics
    self._sys_maj_min = '{}:{}'.format(*sys_maj_min)
    self._sys_block_dev = fs_linux.maj_min_to_blk(*sys_maj_min)

    # if interval is zero, we just read one time
    if interval <= 0:
        self._read()
    else:
        self._loop()
def setUp(self):
    """Build a temp app root with running apps and archive files.

    Creates the apps and archives directories, two running-instance entries
    with their app data directories, and a set of archive files whose names
    are tagged below as incorrect or correct.
    """
    self.root = tempfile.mkdtemp()
    os.environ['TREADMILL_APPROOT'] = self.root
    self.tm_env = appenv.AppEnvironment(root=self.root)

    env = self.tm_env
    fs.mkdir_safe(env.apps_dir)
    fs.mkdir_safe(env.archives_dir)

    full_names = (
        ('proid.simplehttp', '0001025686', 'ymweWiRm86C7A'),
        ('proid.myapi.test', '0001027473', 'kJoV4j0DU6dtJ'),
    )
    for app, instance, uniq in full_names:
        # "<app>#<instance>" under running, "<app>-<instance>-<uniq>"
        # under apps.
        running_path = os.path.join(
            env.running_dir, '#'.join([app, instance])
        )
        fs.mkfile_safe(running_path)

        target = '-'.join([app, instance, uniq])
        fs.mkdir_safe(os.path.join(env.apps_dir, target, 'data'))

        fs.symlink_safe(
            os.path.join(
                env.running_dir, '#'.join([app, instance])
            ),
            os.path.join(env.apps_dir, target),
        )

    archive_names = (
        # incorrect file
        'proid.app-foo-bar#123.sys.tar.gz',
        'proid.app#123.sys.tar.gz',
        # correct file
        'proid.app-123-uniq.sys.tar.gz',
        'proid.test.sleep-901-uniq.sys.tar.gz',
    )
    for archive_name in archive_names:
        fs.mkfile_safe(os.path.join(env.archives_dir, archive_name))

    self.api = local.API()
def run(approot, runtime, container_dir, runtime_param=None):
    """Runs container given a container dir.

    Opens the supervised service at the resolved ``container_dir`` and runs
    it with the requested runtime. On failure the container is flagged as
    aborted in the service data directory: with the error's own reason for
    ``ContainerSetupError``, with ``AbortedReason.UNKNOWN`` for any other
    exception.
    """
    # Make sure container_dir is a fully resolved path.
    container_dir = os.path.realpath(container_dir)
    service = supervisor.open_service(container_dir)

    _LOGGER.info('run %r %r', approot, container_dir)

    env = appenv.AppEnvironment(approot)
    param = utils.equals_list2dict(runtime_param or [])

    try:
        container_runtime = app_runtime.get_runtime(
            runtime, env, service, param
        )
        container_runtime.run()
    except exc.ContainerSetupError as setup_err:
        _LOGGER.exception('Failed to start, app will be aborted.')
        app_abort.flag_aborted(
            service.data_dir,
            why=setup_err.reason,
            payload=traceback.format_exc(),
        )
    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Failed to start, app will be aborted.')
        app_abort.flag_aborted(
            service.data_dir,
            why=app_abort.AbortedReason.UNKNOWN,
            payload=traceback.format_exc(),
        )
def __init__(self, root):
    """Build the app environment for ``root`` and record the hostname."""
    _LOGGER.info(
        'init eventmgr: %s', root
    )

    self.tm_env = appenv.AppEnvironment(root=root)
    self._hostname = sysinfo.hostname()
def accept_cmd(tkt_spool_dir, approot, port, appname, endpoint, keytab):
    """Run ticket locker acceptor.

    :param tkt_spool_dir: directory passed to ``tkt_recv_v2`` via ``-d``.
    :param approot: root of the Treadmill app environment.
    :param port: port to accept on; ``0`` means pick a free port.
    :param appname: application name used for the endpoint paths.
    :param endpoint: endpoint name to register (proto is always ``tcp``).
    :param keytab: when truthy, passed to ``_construct_keytab`` first.
    """
    if keytab:
        _construct_keytab(keytab)

    if port == 0:
        # Let the OS pick a free port by binding to port 0, then release it.
        # NOTE: the port is released before tkt_recv_v2 is started on it, so
        # another process could grab it in between.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.bind(('0.0.0.0', 0))
            port = sock.getsockname()[1]
        finally:
            # Release the probe socket even if bind()/getsockname() fails.
            sock.close()

    hostname = sysinfo.hostname()
    hostport = '%s:%s' % (hostname, port)

    endpoint_proid_path = z.path.endpoint_proid(appname)
    acl = context.GLOBAL.zk.conn.make_servers_acl()
    _LOGGER.info(
        'Ensuring %s exists with ACL %r',
        endpoint_proid_path,
        acl
    )
    zkutils.ensure_exists(
        context.GLOBAL.zk.conn,
        endpoint_proid_path,
        acl=[acl]
    )

    endpoint_path = z.path.endpoint(appname, 'tcp', endpoint)
    _LOGGER.info('Registering %s %s', endpoint_path, hostport)

    # Need to delete/create endpoints for the discovery to pick it up in
    # case of master restart.
    #
    # Unlike a typical endpoint, we cannot make the node ephemeral as we
    # exec into tkt-recv.
    zkutils.ensure_deleted(context.GLOBAL.zk.conn, endpoint_path)
    time.sleep(5)
    zkutils.put(context.GLOBAL.zk.conn, endpoint_path, hostport)

    context.GLOBAL.zk.conn.stop()

    # TODO: this will publish information about the endpoint state
    #       under /discovery. Once discovery is refactored (if it will be)
    #       we can remove the "manual" zookeeper manipulation.
    tm_env = appenv.AppEnvironment(approot)
    endpoints_mgr = endpoints.EndpointsMgr(tm_env.endpoints_dir)
    endpoints_mgr.unlink_all(
        appname=appname,
        endpoint=endpoint,
        proto='tcp'
    )
    endpoints_mgr.create_spec(
        appname=appname,
        endpoint=endpoint,
        proto='tcp',
        real_port=port,
        pid=os.getpid(),
        port=port,
        owner='/proc/{}'.format(os.getpid()),
    )

    subproc.safe_exec(['tkt_recv_v2',
                       '-p{}'.format(port),
                       '-d{}'.format(tkt_spool_dir)])