def _monitor_data_watch(data, stat, event):
    """Refresh the settings of one monitor from its node data.

    Only logs and returns when the node is gone (``DELETED`` event or no
    ``stat``) or when no ``count`` can be read from the YAML payload.
    Otherwise the entry for ``name`` in ``state['monitors']`` is replaced
    with values derived from the new count.
    """
    node_deleted = event is not None and event.type == 'DELETED'
    if node_deleted or stat is None:
        _LOGGER.info('Removing watch on deleted monitor: %s', name)
        return

    try:
        count = yaml.load(data)['count']
    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Invalid monitor: %s', name)
        return

    _LOGGER.info('Reconfigure monitor: %s, count: %s', name, count)

    # Both the available budget and the rate are based on twice the count.
    doubled = 2.0 * count
    state['monitors'][name] = {
        'count': count,
        'available': doubled,
        'last_update': time.time(),
        'rate': doubled / _INTERVAL,
    }
def _watch_finished_snapshots(snapshots):
    """Watch /finished.history nodes.

    Brings the finished-history cache in line with the listed snapshots:
    snapshots that are loaded but no longer listed have their instances
    dropped, and listed snapshots that are not loaded yet are fetched,
    decompressed into a temporary SQLite file and merged in.

    :param snapshots:
        Names of the snapshot nodes that currently exist.
    :returns:
        Always ``True``.
    """
    start_time = time.time()

    # Work on a copy; the shared history is replaced with a single
    # assignment once every snapshot has been processed.
    finished_history = cell_state.finished_history.copy()

    for db_node in sorted(set(loaded_snapshots) - set(snapshots)):
        _LOGGER.info('Unloading snapshot: %s', db_node)
        for instance in loaded_snapshots.pop(db_node):
            finished_history.pop(instance, None)

    for db_node in sorted(set(snapshots) - set(loaded_snapshots)):
        _LOGGER.info('Loading snapshot: %s', db_node)
        loading_start_time = time.time()
        loaded_snapshots[db_node] = []

        data, _stat = zkclient.get(z.path.finished_history(db_node))

        # delete=False because sqlite3 reopens the file by name; the
        # outer ``finally`` removes it on every path, including a failed
        # decompress or write.
        db_file = tempfile.NamedTemporaryFile(delete=False, mode='wb')
        try:
            with db_file:
                db_file.write(zlib.decompress(data))

            conn = sqlite3.connect(db_file.name)
            try:
                cur = conn.cursor()
                sql = 'SELECT name, data FROM finished ORDER BY timestamp'
                for instance, payload in cur.execute(sql):
                    # NOTE(review): yaml.load is called without an explicit
                    # Loader - confirm ``yaml`` here is a safe-loading
                    # wrapper.
                    if payload:
                        payload = yaml.load(payload)
                    finished_history[instance] = payload
                    loaded_snapshots[db_node].append(instance)
            finally:
                # Close even when a row fails to parse, so the connection
                # is released before the file is unlinked.
                conn.close()
        finally:
            os.unlink(db_file.name)

        _LOGGER.debug('Loading time: %s', time.time() - loading_start_time)

    cell_state.finished_history = finished_history
    _LOGGER.debug(
        'Loaded snapshots: %d, finished: %d, finished history: %d, '
        'time: %s',
        len(loaded_snapshots), len(cell_state.finished),
        len(cell_state.finished_history), time.time() - start_time)
    return True
def schedule(app, manifest, count, env, proid):
    """Schedule app(s) on the cell master.

    The manifest file is parsed as YAML, any ``_id`` key is removed, and the
    environment and proid are filled in. When the manifest carries no
    affinity, one is built from the first two dot-separated parts of
    ``app``. Each id returned by ``masterapi.create_apps`` is printed.
    """
    with io.open(manifest, 'rb') as manifest_file:
        spec = yaml.load(stream=manifest_file)

    # TODO: should we delete all potential attributes starting
    #       with _ ?
    if '_id' in spec:
        del spec['_id']

    spec['environment'] = env

    if 'affinity' not in spec:
        # TODO: allow custom affinity formats.
        name_parts = app.split('.')
        spec['affinity'] = '{0}.{1}'.format(*name_parts)

    spec['proid'] = proid

    zkclient = context.GLOBAL.zk.conn
    for app_id in masterapi.create_apps(zkclient, app, spec, count, 'admin'):
        print(app_id)
def _update_nodes_change(data):
    """Update local Treadmill Nodes IP IPSet when the globals server list
    gets updated.

    ``data`` is parsed as a YAML sequence of server names. Each name is
    resolved with ``socket.gethostbyname``; names that fail to resolve are
    logged and left out of the set.
    """
    resolved = []
    for hostname in yaml.load(data):
        try:
            address = socket.gethostbyname(hostname)
        except socket.gaierror:
            _LOGGER.warning('Unable to resolve %r', hostname)
        else:
            resolved.append(address)

    iptables.atomic_set(
        iptables.SET_TM_NODES,
        content=resolved,
        set_type='hash:ip',
        family='inet'
    )
def test_normalize_run_once(self):
    """Test missing defaults which cause the app to fail.

    A service with ``restart.limit: 0`` and no interval must come back from
    ``create`` with a restart interval of 60.
    """
    manifest_yaml = """
    services:
    - command: /bin/sleep 1m
      name: sleep1m
      restart:
        limit: 0
    memory: 150M
    cpu: 10%
    disk: 100M
    """

    masterapi.create_apps.side_effect = _create_apps

    created = self.instance.create('proid.app', yaml.load(manifest_yaml))

    # Disable E1126: Sequence index is not an int, slice, or instance
    # pylint: disable=E1126
    restart = created['services'][0]['restart']
    self.assertEqual(restart['interval'], 60)
    self.assertTrue(masterapi.create_apps.called)
def top(port, socket, auth, title, modules, config, cors_origin, workers,
        backlog, authz):
    """Run Treadmill API server.

    :param port:
        TCP port to serve on; takes precedence over ``socket``.
    :param socket:
        Unix domain socket to serve on when ``port`` is not set.
    :param auth:
        Authentication type passed to the REST server.
    :param title:
        API title; underscores are replaced with spaces.
    :param modules:
        Names of the API modules to initialise.
    :param config:
        Iterable of ``(module, stream)`` pairs holding per-module YAML
        configuration. Every stream is closed once it has been handled.
    :param cors_origin:
        Passed through to ``api.init``.
    :param workers:
        Passed through to the REST server.
    :param backlog:
        Passed through to the REST server.
    :param authz:
        Passed through to ``api.init``.
    :raises click.UsageError:
        If a configuration names a module that is not in ``modules``.

    Exits with status 1 when neither ``port`` nor ``socket`` is given.
    """
    context.GLOBAL.zk.add_listener(zkutils.exit_on_lost)

    api_modules = {module: None for module in modules}
    for module, cfg in config:
        # Close the stream on every path, including an orphan config or a
        # YAML parse failure.
        try:
            if module not in api_modules:
                raise click.UsageError(
                    'Orphan config: %s, not in: %r' % (module, modules)
                )
            api_modules[module] = yaml.load(stream=cfg)
        finally:
            cfg.close()

    api_paths = api.init(api_modules, title.replace('_', ' '), cors_origin,
                         authz)

    if port:
        rest_server = rest.TcpRestServer(port, auth_type=auth,
                                         protect=api_paths,
                                         workers=workers,
                                         backlog=backlog)
    elif socket:
        # TODO: need to rename that - conflicts with import socket.
        rest_server = rest.UdsRestServer(socket, auth_type=auth,
                                         workers=workers,
                                         backlog=backlog)
    else:
        click.echo('port or socket must be specified')
        sys.exit(1)

    try:
        rest_server.run()
    except sock.error as sock_err:
        _LOGGER.warning('Socket error: %s', sock_err)
        if sock_err.errno == errno.EADDRINUSE:
            # TODO: hack, but please keep it for now, otherwise on the
            #       setup several master processes run on same server
            #       lookup api (listen on port 8080) is in tight loop.
            time.sleep(5)
def on_event(filename, operation, content):
    """Event handler.

    Returns ``None`` for paths outside ``/identity-groups/``. Otherwise
    builds a message whose group is everything up to the last ``/`` of the
    remaining path and whose identity is the integer after it. ``sow`` is
    true when ``operation`` is ``None``. Non-empty ``content`` is parsed as
    YAML and merged into the message.
    """
    prefix = '/identity-groups/'
    if not filename.startswith(prefix):
        return

    identity_group, identity = filename[len(prefix):].rsplit('/', 1)
    message = {
        'topic': '/identity-groups',
        'identity-group': identity_group,
        'identity': int(identity),
        'app': None,
        'host': None,
        'sow': operation is None,
    }

    if content:
        message.update(yaml.load(content))

    return message
def status(self, timeout=30):
    """Query the status of the resource service.

    Connects to ``self.status_sock`` and parses the reply as YAML. While
    the connection is refused, the socket path does not exist, or the
    reply parses to ``None``, the call sleeps and retries with a doubling
    backoff (1, 2, 4, ... seconds) until the backoff exceeds half of
    ``timeout``.

    :param ``float`` timeout:
        Wait at least timeout seconds for the service to reply.
    :returns:
        The parsed status reply.
    :raises ``ResourceServiceTimeoutError``:
        If the requested service does not come up before timeout.
    :raises ``socket.error``:
        If there is a communication error with the service.
    """
    backoff = 0
    while backoff <= (timeout / 2):
        with contextlib.closing(socket.socket(socket.AF_UNIX,
                                              type=socket.SOCK_STREAM,
                                              proto=0)) as status_socket:
            try:
                status_socket.connect(self.status_sock)
                # Close the reader explicitly as well: closing the socket
                # alone leaves the object returned by makefile() open.
                with contextlib.closing(
                        status_socket.makefile('r')) as status_stream:
                    # NOTE(review): yaml.load is called without an
                    # explicit Loader - confirm ``yaml`` here is a
                    # safe-loading wrapper.
                    status = yaml.load(stream=status_stream)
            except socket.error as err:
                if err.errno in (errno.ECONNREFUSED, errno.ENOENT):
                    # Service not listening yet; retry after a pause.
                    status = None
                else:
                    raise

        if status is not None:
            break

        _LOGGER.info('Waiting for service %r to become available',
                     self.name)
        # Implement a backoff mechanism
        backoff += (backoff or 1)
        time.sleep(backoff)

    else:
        # The loop ended without ``break``: no reply was ever received.
        raise ResourceServiceTimeoutError(
            'Service %r timed out' % (self.name),
        )

    return status
def test_cache_placement_data(self):
    """Tests sync of placement data.

    Caching an app placed on this host must write a cache file whose
    ``identity`` comes from the placement node data.
    """
    # Access to a protected member _synchronize of a client class
    # pylint: disable=W0212
    app_name = 'xxx.app1#1234'

    placement = {
        'test.xx.com': {
            '.data': """
                state: up
                since: 100
            """,
            app_name: {
                '.data': '{identity: 1}\n',
            },
        }
    }
    scheduled = {
        app_name: {
            'affinity': 'app1',
            'memory': '1G',
            'disk': '1G',
            'cpu': '100%',
            'identity_group': 'xxx.app1',
        },
    }
    self.make_mock_zk({'placement': placement, 'scheduled': scheduled})

    zkclient = kazoo.client.KazooClient()
    self.evmgr._hostname = 'test.xx.com'
    self.evmgr._cache(zkclient, app_name)

    cache_file = os.path.join(self.cache, app_name)
    self.assertTrue(os.path.exists(cache_file))

    with io.open(cache_file) as cached:
        cached_data = yaml.load(stream=cached)
    self.assertEqual(cached_data['identity'], 1)
def _configure(apis, manifest, appname):
    """Configure a Treadmill app.

    Without a manifest, the existing configuration is fetched and
    displayed; a missing app is an error. With a manifest file, its YAML
    content is PUT when the app has a (truthy) existing configuration and
    POSTed otherwise, then the configuration is fetched again and displayed.
    """
    app_url = _APP_REST_PATH + appname

    try:
        existing = restclient.get(apis, app_url).json()
    except restclient.NotFoundError:
        # A missing app is only acceptable when we are about to create it.
        if not manifest:
            raise
        existing = None

    if manifest:
        with io.open(manifest, 'rb') as manifest_file:
            app = yaml.load(stream=manifest_file)

        submit = restclient.put if existing else restclient.post
        submit(apis, app_url, payload=app)

        # Get new value after update.
        existing = restclient.get(apis, app_url).json()

    cli.out(_FORMATTER(existing))
def _configure(apis, manifest, appname):
    """Configure a Treadmill app.

    Without a manifest, the existing configuration is fetched and
    displayed; a missing app is an error. With a manifest stream, its YAML
    content is PUT when the app has a (truthy) existing configuration and
    POSTed otherwise, and the body of that response is displayed.
    """
    app_url = _APP_REST_PATH + appname

    try:
        existing = restclient.get(apis, app_url).json()
    except restclient.NotFoundError:
        # A missing app is only acceptable when we are about to create it.
        if not manifest:
            raise
        existing = None

    if manifest:
        app = yaml.load(stream=manifest)
        submit = restclient.put if existing else restclient.post
        existing = submit(apis, app_url, payload=app).json()

    cli.out(_FORMATTER(existing))
def _load_server_info(self, path):
    """Loads the server info from the given path.

    The info is accepted when it has a ``partition`` matching the
    ``self._partition`` pattern and ``self._add_ldap_connection`` succeeds
    for it; the base name of ``path`` is then stored as its ``hostname``.

    :param path: The path to the server info
    :return: A `dict` representing the server info or None
    :raises OSError: If reading fails for a reason other than the file
        not existing.
    """
    try:
        # Only hold the file open while parsing it.
        with io.open(path, 'r') as f:
            server_info = yaml.load(stream=f)

        if not server_info or 'partition' not in server_info:
            return None

        if (fnmatch.fnmatch(server_info['partition'], self._partition) and
                self._add_ldap_connection(server_info)):
            server_info['hostname'] = os.path.basename(path)
            _LOGGER.info('Found valid server %r', server_info)
            return server_info

        _LOGGER.info('Found invalid server %r at path %r',
                     server_info, path)
    except OSError as err:
        _LOGGER.exception('Cannot read server info %r', path)
        # Compare errno by value; identity comparison of integers only
        # works by accident of interpreter caching.
        if err.errno != errno.ENOENT:
            raise
    except yaml.YAMLError:
        _LOGGER.exception('Invalid server info YAML %r', path)

    return None
def on_event(filename, operation, content):
    """Event handler.

    Returns ``None`` for paths outside the ``_SUB_DIR`` directory.
    Otherwise builds a message for the app group named by the last path
    component; ``sow`` is true when ``operation`` is ``None``. Non-empty
    ``content`` is parsed as YAML and merged in, with its ``data`` list of
    ``key=value`` strings converted to a dict.
    """
    if not filename.startswith('{}/'.format(_SUB_DIR)):
        return None

    message = {
        'topic': _TOPIC,
        'app-group': os.path.basename(filename),
        'sow': operation is None,
    }

    if content:
        app_group_data = yaml.load(content)
        pairs = app_group_data.pop('data', [])
        message.update(app_group_data)
        # Split on the first '=' only, so values may contain '=' themselves.
        message['data'] = {
            key: val
            for key, val in (pair.split('=', 1) for pair in pairs)
        }

    return message
def _render(name, ctx):
    """Render named template.

    The template is looked up through a package loader for this module,
    rendered with the attributes of ``ctx`` as variables, and the rendered
    text is parsed as YAML.
    """
    loader = jinja2.PackageLoader(__name__)
    template = jinja2.Environment(loader=loader).get_template(name)
    rendered = template.render(**ctx.__dict__)
    return yaml.load(rendered)
def server(approot, register, port, auth, modules, config, title, cors_origin,
           rate_limit_global, rate_limit_module, rate_limit_by):
    """Runs nodeinfo server.

    :param approot:
        Treadmill application root, used for the legacy endpoint spec.
    :param register:
        When true, register the ``nodeinfo`` endpoint in Zookeeper and in
        the local endpoints directory.
    :param port:
        Port passed to the TCP REST server.
    :param auth:
        Authentication type passed to the REST server.
    :param modules:
        Names of the API modules to initialise; nothing is protected when
        empty.
    :param config:
        Iterable of ``(module, stream)`` pairs holding per-module YAML
        configuration. Every stream is closed once it has been handled.
    :param title:
        API title; underscores are replaced with spaces.
    :param cors_origin:
        Passed through to ``api.init``.
    :param rate_limit_global:
        Passed through to ``_get_rate_limit``.
    :param rate_limit_module:
        Passed through to ``_get_rate_limit``.
    :param rate_limit_by:
        Passed through to ``_get_rate_limit``.
    :raises click.UsageError:
        If a configuration names a module that is not in ``modules``.
    """
    rate_limit = _get_rate_limit(
        rate_limit_global, rate_limit_module, rate_limit_by
    )
    rest_server = rest.TcpRestServer(port, auth_type=auth,
                                     rate_limit=rate_limit)
    # From here on use the port reported by the server object.
    port = rest_server.port

    hostname = sysinfo.hostname()
    hostport = '%s:%s' % (hostname, port)

    if register:
        zkclient = context.GLOBAL.zk.conn
        zkclient.add_listener(zkutils.exit_on_lost)

        appname = 'root.%s#%010d' % (hostname, os.getpid())
        app_pattern = 'root.%s#*' % (hostname)
        path = z.path.endpoint(appname, 'tcp', 'nodeinfo')
        _LOGGER.info('register endpoint: %s %s', path, hostport)
        zkutils.create(zkclient, path, hostport,
                       acl=[zkclient.make_servers_acl()],
                       ephemeral=True)

        # TODO: remove "legacy" endpoint registration once conversion is
        #       complete.
        tm_env = appenv.AppEnvironment(approot)

        endpoints_mgr = endpoints.EndpointsMgr(tm_env.endpoints_dir)
        endpoints_mgr.unlink_all(
            app_pattern, endpoint='nodeinfo', proto='tcp'
        )

        # On Linux endpoint for nodeinfo is a symlink pointing to
        # /proc/{pid}, on Windows it's just a regular file
        owner = '/proc/{}'.format(os.getpid()) if os.name == 'posix' \
            else None

        endpoints_mgr.create_spec(
            appname=appname,
            endpoint='nodeinfo',
            proto='tcp',
            real_port=port,
            pid=os.getpid(),
            port=port,
            owner=owner,
        )

    _LOGGER.info('Starting nodeinfo server on port: %s', port)

    utils.drop_privileges()

    if modules:
        api_modules = {module: None for module in modules}
        for module, cfg in config:
            # Close the stream on every path, including an orphan config
            # or a YAML parse failure.
            try:
                if module not in api_modules:
                    raise click.UsageError(
                        'Orphan config: %s, not in: %r' %
                        (module, api_modules)
                    )
                api_modules[module] = yaml.load(stream=cfg)
            finally:
                cfg.close()

        rest_server.protect = api.init(
            api_modules,
            title.replace('_', ' '),
            cors_origin
        )

    rest_server.run()
def _install_services(scan_dir, package, src_dir, dst_dir, params, prefix_len,
                      rec=None):
    """Expand services in scan directory and install.

    Walks the resources of ``package`` under ``src_dir``, skipping the
    control directory and control file names. Sub-directories are created
    under ``dst_dir`` and handed to ``_install``. Each ``.yml`` resource is
    treated as a service definition: it is interpolated with ``params``,
    turned into a supervised service in ``scan_dir``, and its ``data_dir``
    files are written out. Every created directory is recorded in ``rec``
    when one is given.
    """
    package_name = package.__name__

    for item in pkg_resources.resource_listdir(package_name, src_dir):
        if item in (_CONTROL_DIR_NAME, _CONTROL_DIR_FILE):
            continue

        resource_path = os.path.join(src_dir, item)

        if pkg_resources.resource_isdir(package_name, resource_path):
            dst_path = os.path.join(dst_dir, resource_path[prefix_len:])

            fs.mkdir_safe(dst_path)
            if rec:
                rec.write('%s\n' % os.path.join(dst_path, ''))

            _install(
                package,
                resource_path,
                dst_dir,
                params,
                prefix_len=prefix_len,
                rec=rec
            )
            continue

        if not resource_path.endswith('.yml'):
            continue

        # Strip the '.yml' suffix: the service directory is named after
        # the definition file.
        dst_path = os.path.join(dst_dir, resource_path[prefix_len:-4])
        name = os.path.basename(dst_path)

        _LOGGER.info('Expand service (%s): %s => %s', name, resource_path,
                     dst_path)

        fs.mkdir_safe(dst_path)
        if rec:
            rec.write('%s\n' % os.path.join(dst_path, ''))

        raw_conf = pkg_resources.resource_string(package_name, resource_path)
        if not raw_conf:
            _LOGGER.warning('Service def was empty: %s', resource_path)
            continue

        service_conf = bootstrap.interpolate_service_conf(
            resource_path, yaml.load(raw_conf.decode('utf8')), name, params
        )

        svc = supervisor.create_service(
            scan_dir,
            service_conf['name'],
            service_conf['command'],
            userid=service_conf['userid'],
            downed=service_conf['downed'],
            environ_dir=service_conf['environ_dir'],
            environ=service_conf['environ'],
            monitor_policy=service_conf['monitor_policy'],
            notification_fd=service_conf['notification_fd'],
            call_before_run=service_conf['call_before_run'],
            call_before_finish=service_conf['call_before_finish'],
            logger_args=service_conf['logger_args'],
            ionice_prio=0,
        )

        for entry in service_conf['data_dir']:
            permission = 0o755 if entry['executable'] else 0o644
            fs.write_safe(
                os.path.join(svc.data_dir, entry['path']),
                # Bind the current entry as a default so the writer does
                # not depend on the loop variable.
                lambda f, entry=entry: f.write(entry['content']),
                mode='w',
                permission=permission
            )
def _on_created(self, impl, filepath):
    """Private handler for request creation events.

    Loads the request file found under ``filepath``, validates it against
    ``impl.PAYLOAD_SCHEMA``, passes it to ``impl.on_create_request`` and
    writes the result as the reply file. A request whose file is missing
    is removed from disk.

    Returns ``True`` only when a reply without an ``_error`` entry was
    written; ignored paths, removed requests, requests that were not
    actioned and error replies all yield ``False``.
    """
    # Avoid triggering on changes to the service directory itself.
    if filepath == self._rsrc_dir:
        return False

    req_id = os.path.basename(filepath)

    # Avoid triggerring on temporary files
    if req_id[0] == '.':
        return False

    req_file = os.path.join(filepath, REQ_FILE)
    rep_file = os.path.join(filepath, REP_FILE)

    try:
        with io.open(req_file) as f:
            req_data = yaml.load(stream=f)

    except IOError as err:
        if err.errno not in (errno.ENOENT, errno.ENOTDIR):
            raise

        _LOGGER.exception('Removing invalid request: %r', req_id)
        try:
            fs.rm_safe(filepath)
        except OSError as rm_err:
            # The request may be a directory rather than a plain file.
            if rm_err.errno != errno.EISDIR:
                raise
            fs.rmtree_safe(filepath)
        return False

    # TODO: We should also validate the req_id format
    with lc.LogContext(_LOGGER, req_id,
                       adapter_cls=lc.ContainerAdapter) as log:

        log.debug('created %r: %r', req_id, req_data)

        try:
            # TODO: We should also validate the req_id format
            utils.validate(req_data, impl.PAYLOAD_SCHEMA)
            res = impl.on_create_request(req_id, req_data)

        except exc.InvalidInputError as err:
            log.error('Invalid request data: %r: %s', req_data, err)
            res = {'_error': {'input': req_data, 'why': str(err)}}

        except Exception as err:  # pylint: disable=W0703
            log.exception('Unable to process request: %r %r:',
                          req_id, req_data)
            res = {'_error': {'input': req_data, 'why': str(err)}}

    if res is None:
        # Request was not actioned
        return False

    def _write_reply(reply_stream):
        """Dump the result as an explicit, block-style YAML document."""
        yaml.dump(
            res, explicit_start=True, explicit_end=True,
            default_flow_style=False, stream=reply_stream
        )

    fs.write_safe(rep_file, _write_reply, mode='w', permission=0o644)

    # Return True if there were no error
    return not res.get('_error', False)
def _run(apis, count, manifest, memory, cpu, disk, tickets, service,
         restart_limit, restart_interval, endpoint, appname, command):
    """Run Treadmill app.

    Builds the app payload from the optional manifest file and the command
    line overrides, POSTs it to ``/instance/<appname>`` (with ``?count=``
    when ``count`` is set) and prints every returned instance id.

    When a command is given without a service name, the service is named
    after the base name of the command's executable. Resource defaults and
    overrides are applied only when the payload is not empty.
    """
    # too many branches
    #
    # pylint: disable=R0912
    app = {}
    if manifest:
        with io.open(manifest, 'rb') as manifest_file:
            app = yaml.load(stream=manifest_file)

    if endpoint:
        app['endpoints'] = [
            {'name': ep_name, 'port': ep_port}
            for ep_name, ep_port in endpoint
        ]
    if tickets:
        app['tickets'] = tickets

    if command and not service:
        # Take the basename of the command, always assume / on all
        # platforms.
        executable = shlex.split(command[0])[0]
        service = os.path.basename(executable)

    services_dict = {svc['name']: svc for svc in app.get('services', [])}
    if service:
        svc_def = services_dict.setdefault(service, {
            'name': service,
            'restart': {
                'limit': restart_limit,
                'interval': restart_interval,
            }
        })
        if command:
            svc_def['command'] = ' '.join(list(command))

    if services_dict:
        app['services'] = list(services_dict.values())

    if app:
        # Ensure defaults are set.
        resource_defaults = (
            ('memory', _DEFAULT_MEM),
            ('disk', _DEFAULT_DISK),
            ('cpu', _DEFAULT_CPU),
        )
        for key, default in resource_defaults:
            app.setdefault(key, default)

        # Override if requested.
        for key, requested in (('memory', memory),
                               ('disk', disk),
                               ('cpu', cpu)):
            if requested is not None:
                app[key] = str(requested)

    url = '/instance/' + appname
    if count:
        url += '?count=%d' % count

    instances = restclient.post(apis, url, payload=app).json()['instances']
    for instance_id in instances:
        cli.out(instance_id)