def _cache(self, zkclient, app):
    """Read the app manifest from Zookeeper and cache it as YAML.

    The scheduled manifest is updated with the data stored in this host's
    placement node (when that data is not ``None``) and written to a file
    named after the app inside ``self.tm_env.cache_dir``.

    The YAML is first written to a temporary file in the same directory and
    renamed into place only after that file has been closed, so the final
    name never refers to content that is still sitting in a write buffer.
    If writing or renaming fails, the temporary file is removed.

    A missing Zookeeper node is logged as a warning and otherwise ignored.

    :param zkclient: Zookeeper client handed to ``zkutils.get``.
    :param app: Instance name; everything after the first ``#`` is stored
        under the ``task`` key of the manifest.
    """
    appnode = z.path.scheduled(app)
    placement_node = z.path.placement(self._hostname, app)

    try:
        manifest = zkutils.get(zkclient, appnode)
        # TODO: need a function to parse instance id from name.
        manifest['task'] = app[app.index('#') + 1:]

        placement_info = zkutils.get(zkclient, placement_node)
        if placement_info is not None:
            manifest.update(placement_info)

        manifest_file = os.path.join(self.tm_env.cache_dir, app)
        temp_manifest = tempfile.NamedTemporaryFile(
            dir=self.tm_env.cache_dir,
            prefix='.%s-' % app,
            delete=False,
            mode='w'
        )
        try:
            # Leaving the ``with`` block closes (and flushes) the file
            # before it becomes visible under its final name.
            with temp_manifest:
                yaml.dump(manifest, stream=temp_manifest)
            os.rename(temp_manifest.name, manifest_file)
        except Exception:
            # delete=False means nothing else cleans this file up.
            try:
                os.unlink(temp_manifest.name)
            except OSError:
                # Best effort only; the original error is what matters.
                pass
            raise

        _LOGGER.info('Created cache manifest: %s', manifest_file)

    except kazoo.exceptions.NoNodeError:
        _LOGGER.warning('App %r not found', app)
def endpoints():
    """Show endpoints and their status.

    Reads the per-host discovery state and the per-host discovery entries
    from Zookeeper, builds one ``(app, proto, endpoint, hostport, state)``
    tuple per entry and prints all of them, merged in sorted order, through
    the ``endpoint`` CLI formatter.

    Each discovery entry is expected to be a string of the form
    ``app:endpoint:proto:port``.
    """
    zkclient = context.GLOBAL.zk.conn

    state = collections.defaultdict(dict)
    for hostname in zkclient.get_children(z.DISCOVERY_STATE):
        # Fall back to an empty mapping so the per-port lookup below still
        # works when the node carries no data.
        state[hostname] = zkutils.get(
            zkclient, z.path.discovery_state(hostname)
        ) or {}

    all_endpoints = []
    for hostname in zkclient.get_children(z.DISCOVERY):
        host_endpoints = []
        entries = zkutils.get(zkclient, z.path.discovery(hostname)) or []
        for entry in entries:
            app, endpoint, proto, port = entry.split(':')
            port = int(port)
            endpoint_state = state[hostname].get(port)
            hostport = '{}:{}'.format(hostname, port)
            host_endpoints.append(
                (app, proto, endpoint, hostport, endpoint_state)
            )

        # heapq.merge only produces sorted output when every input is
        # itself sorted, so order each host's entries first.
        host_endpoints.sort()
        all_endpoints.append(host_endpoints)

    merged = heapq.merge(*all_endpoints)

    formatter = cli.make_formatter('endpoint')
    cli.out(
        formatter([
            {
                'name': name,
                'endpoint': endpoint,
                'proto': proto,
                'hostport': hostport,
                'state': endpoint_state,
            }
            for name, proto, endpoint, hostport, endpoint_state in merged
        ])
    )
def kill_node(zkclient, node):
    """Remove the app presence, endpoints and server presence of a node.

    Returns early when the server node does not exist. Apps that are no
    longer scheduled are logged and skipped.

    :param zkclient: Zookeeper client.
    :param node: Server name.
    """
    _LOGGER.info('killing node: %s', node)

    # The read is only an existence probe; its result is not used.
    try:
        zkutils.get(zkclient, z.path.server(node))
    except kazoo.client.NoNodeError:
        _LOGGER.info('node does not exist.')
        return

    for app in zkclient.get_children(z.path.placement(node)):
        _LOGGER.info('removing app presence: %s', app)
        try:
            manifest = zkutils.get(zkclient, z.path.scheduled(app))
            presence = EndpointPresence(
                zkclient, manifest, hostname=node, appname=app
            )
            presence.unregister_running()
            presence.unregister_endpoints()
        except kazoo.client.NoNodeError:
            _LOGGER.info('app %s no longer scheduled.', app)

    _LOGGER.info('removing server presence: %s', node)
    unregister_server(zkclient, node)
def _cache(self, zkclient, app):
    """Fetch the app manifest from Zookeeper and cache it as a YAML file.

    The scheduled manifest is updated with the data of this host's placement
    node (when that data is not ``None``) and written, via
    ``fs.write_safe``, to a file named after the app inside
    ``self.tm_env.cache_dir``.

    A missing Zookeeper node is logged as a warning and otherwise ignored.

    :param zkclient: Zookeeper client handed to ``zkutils.get``.
    :param app: Instance name; everything after the first ``#`` is stored
        under the ``task`` key of the manifest.
    """
    scheduled_path = z.path.scheduled(app)
    placement_path = z.path.placement(self._hostname, app)

    try:
        manifest = zkutils.get(zkclient, scheduled_path)
        # TODO: need a function to parse instance id from name.
        manifest['task'] = app[app.index('#') + 1:]

        placement = zkutils.get(zkclient, placement_path)
        if placement is not None:
            manifest.update(placement)

        def _dump(stream):
            """Serialize the merged manifest into the given stream."""
            return yaml.dump(manifest, stream=stream)

        target = os.path.join(self.tm_env.cache_dir, app)
        fs.write_safe(
            target,
            _dump,
            prefix='.%s-' % app,
            mode='w',
            permission=0o644
        )
        _LOGGER.info('Created cache manifest: %s', target)
    except kazoo.exceptions.NoNodeError:
        _LOGGER.warning('App %r not found', app)
def _node_initialize(tm_env, runtime, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Initialize the node.

    Should only be done on a cold start.

    Collects the local node info (including detected traits), merges it
    into the data already stored at ``zk_server_path``, writes the result
    back, creates the ephemeral presence node and, on POSIX, initializes
    iptables. Any exception is logged and the Zookeeper client is stopped.
    """
    try:
        detected = sysinfo.node_info(tm_env, runtime)
        detected['traits'] = traits.detect(
            zkutils.get(zkclient, z.path.traits())
        )

        # Merging scheduler data with node_info data
        node_info = zkutils.get(zkclient, zk_server_path)
        node_info.update(detected)
        _LOGGER.info(
            'Registering node: %s: %s, %r',
            zk_server_path, hostname, node_info
        )
        zkutils.update(zkclient, zk_server_path, node_info)

        presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
        _LOGGER.debug('host_acl: %r', presence_acl)
        zkutils.put(
            zkclient,
            zk_presence_path,
            {'seen': False},
            acl=[presence_acl],
            ephemeral=True
        )

        # TODO: Fix the network initialization. Then the below can be part
        # of appenv.initialize()
        if os.name == 'posix':
            # Flush all rules in iptables nat and mangle tables (it is
            # assumed that none but Treadmill manages these tables) and
            # bulk load all the Treadmill static rules
            external_ip = node_info['network']['external_ip']
            iptables.initialize(external_ip)

    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Node initialization failed')
        zkclient.stop()
def test_get(self):
    """Check how zkutils.get decodes YAML, bad YAML and empty payloads."""
    client = treadmill.zkutils.ZkClient()

    # Well-formed YAML is parsed into a dict.
    treadmill.zkutils.ZkClient.get.return_value = ('{xxx: 123}', None)
    parsed = zkutils.get(client, '/foo')
    self.assertEqual({'xxx': 123}, parsed)

    # parsing error
    broken = '{xxx: 123'
    treadmill.zkutils.ZkClient.get.return_value = (broken, None)
    self.assertEqual(broken, zkutils.get(client, '/foo', strict=False))
    with self.assertRaises(yaml.YAMLError):
        zkutils.get(client, '/foo')

    # No payload at all yields None.
    treadmill.zkutils.ZkClient.get.return_value = (None, None)
    self.assertIsNone(zkutils.get(client, '/foo'))
def _get_identity_group(app):
    """Return the ``count`` value stored in the identity group node of app.
    """
    conn = context.GLOBAL.zk.conn
    group = zkutils.get(conn, z.path.identity_group(app))
    return group['count']
def _get_appmonitor(app):
    """Return the ``count`` value stored in the appmonitor node of app.
    """
    conn = context.GLOBAL.zk.conn
    monitor = zkutils.get(conn, z.path.appmonitor(app))
    return monitor['count']
def _node_initialize(tm_env, runtime, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Initialize the node.

    Should only be done on a cold start.

    Merges the locally collected node info into the data stored at
    ``zk_server_path``, writes it back, creates the ephemeral presence node
    and finally runs ``tm_env.initialize`` with the merged info. Any
    exception is logged and the Zookeeper client is stopped.
    """
    try:
        local_info = sysinfo.node_info(tm_env, runtime)

        # Merging scheduler data with node_info data
        node_info = zkutils.get(zkclient, zk_server_path)
        node_info.update(local_info)
        _LOGGER.info(
            'Registering node: %s: %s, %r',
            zk_server_path, hostname, node_info
        )
        zkutils.update(zkclient, zk_server_path, node_info)

        presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
        _LOGGER.debug('host_acl: %r', presence_acl)
        zkutils.put(
            zkclient,
            zk_presence_path,
            {'seen': False},
            acl=[presence_acl],
            ephemeral=True
        )

        # Invoke the local node initialization
        tm_env.initialize(node_info)

    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Node initialization failed')
        zkclient.stop()
def create_server(zkclient, server_id, parent_id):
    """Create the server definition in Zookeeper.

    Ensures the server node exists, records ``parent_id`` under the
    ``parent`` key when one is given, and stores the data. A ``servers``
    event is created when ``zkutils.put`` returns a truthy value.
    """
    server_node = z.path.server(server_id)
    server_acl = zkutils.make_host_acl(server_id, 'rwcd')

    zkutils.ensure_exists(zkclient, server_node, acl=[server_acl])

    # zkutils.get return dict/tuple if need_metadata is true.
    #
    # pylint: disable=R0204
    data = zkutils.get(zkclient, server_node)
    if parent_id:
        if data:
            data['parent'] = parent_id
        else:
            data = {'parent': parent_id}

    _LOGGER.info(
        'Creating server node %s with data %r and ACL %r',
        server_node, data, server_acl
    )
    stored = zkutils.put(
        zkclient, server_node, data, acl=[server_acl], check_content=True
    )
    if stored:
        create_event(zkclient, 0, 'servers', [server_id])
def update_server_features(zkclient, server_id, features):
    """Store ``features`` under the ``features`` key of the server node.

    A ``servers`` event is created when ``zkutils.update`` returns a truthy
    value.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)
    server_data['features'] = features

    updated = zkutils.update(
        zkclient, server_node, server_data, check_content=True
    )
    if updated:
        create_event(zkclient, 0, 'servers', [server_id])
def update_server_parent(zkclient, server_id, parent_id):
    """Store ``parent_id`` under the ``parent`` key of the server node.

    A ``servers`` event is created when ``zkutils.update`` returns a truthy
    value.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)
    server_data['parent'] = parent_id

    updated = zkutils.update(
        zkclient, server_node, server_data, check_content=True
    )
    if updated:
        create_event(zkclient, 0, 'servers', [server_id])
def update_server_attrs(zkclient, server_id, partition):
    """Store ``partition`` under the ``partition`` key of the server node.

    A ``servers`` event is created when ``zkutils.update`` returns a truthy
    value.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)
    server_data['partition'] = partition

    updated = zkutils.update(
        zkclient, server_node, server_data, check_content=True
    )
    if updated:
        create_event(zkclient, 0, 'servers', [server_id])
def get_server(zkclient, server_id, placement=False):
    """Return the data stored in the server node.

    :param placement: When truthy, the data of the server's placement node
        (or an empty dict if ``zkutils.get_default`` falls back to it) is
        merged into the result.
    """
    server = zkutils.get(zkclient, z.path.server(server_id))
    if not placement:
        return server

    placement_path = z.path.placement(server_id)
    server.update(zkutils.get_default(zkclient, placement_path, {}))
    return server
def reload_server(self, servername):
    """Reload an individual server from Zookeeper.

    A server that was never loaded is simply loaded. Otherwise the server
    node is read again and a fresh ``scheduler.Server`` is built from it:

    * if the node has no data, or no longer exists, the server is removed;
    * if the fresh server is the same as the loaded one and has the same
      parent bucket, only ``valid_until`` is refreshed;
    * otherwise the server is removed and loaded again.

    :param servername: Name of the server to reload.
    """
    _LOGGER.info('reloading server: %s', servername)
    if servername not in self.servers:
        # This server was never loaded.
        self.load_server(servername)
        return

    current_server = self.servers[servername]
    # Check if server is same
    try:
        data = zkutils.get(self.zkclient, z.path.server(servername))
        if not data:
            # The server is configured, but never reported its capacity.
            self.remove_server(servername)
            return

        # TODO: need better error handling.
        assert 'parent' in data
        assert data['parent'] in self.buckets

        # TODO: seems like this is cut/paste code from load_server.
        label = data.get('partition')
        if not label:
            label = admin.DEFAULT_PARTITION

        up_since = data.get('up_since', time.time())
        partition = self.cell.partitions[label]
        server = scheduler.Server(
            servername,
            resources(data),
            valid_until=partition.valid_until(up_since),
            label=label,
            traits=data.get('traits', 0)
        )

        parent = self.buckets[data['parent']]
        # TODO: assume that bucket topology is constant, e.g.
        #       rack can never change building. If this does not
        #       hold, comparing parents is not enough, need to
        #       compare recursively all the way up.
        if (current_server.is_same(server) and
                current_server.parent == parent):
            # Nothing changed, no need to update anything.
            _LOGGER.info('server is same, keeping old.')
            current_server.valid_until = server.valid_until
        else:
            # Something changed - clear everything and re-register server
            # as new.
            _LOGGER.info('server modified, replacing.')
            self.remove_server(servername)
            self.load_server(servername)

    except kazoo.client.NoNodeError:
        self.remove_server(servername)
        # Logger.warn is a deprecated alias of Logger.warning.
        _LOGGER.warning('Server node not found: %s', servername)
def get_appmonitor(zkclient, monitor_id, raise_notfound=False):
    """Return the app monitor data for the given id.

    The monitor id is added to the returned data under the ``_id`` key.

    :param raise_notfound: When truthy, a missing node re-raises the
        ``NoNodeError``; otherwise ``None`` is returned.
    """
    try:
        monitor = zkutils.get(zkclient, z.path.appmonitor(monitor_id))
        monitor['_id'] = monitor_id
    except kazoo.client.NoNodeError:
        _LOGGER.info('App monitor does not exist: %s', monitor_id)
        if not raise_notfound:
            return None
        raise

    return monitor
def load_partition(self, partition):
    """Load a partition definition from Zookeeper into the cell.

    Builds a ``scheduler.Partition`` from the ``server_uptime``,
    ``max_lease`` and ``threshold`` keys of the partition node and stores it
    in ``self.cell.partitions`` under the partition name. A missing node is
    logged as a warning and otherwise ignored.

    :param partition: Name of the partition to load.
    """
    try:
        data = zkutils.get(self.zkclient, z.path.partition(partition))
        self.cell.partitions[partition] = scheduler.Partition(
            max_server_uptime=data.get('server_uptime'),
            max_lease=data.get('max_lease'),
            threshold=data.get('threshold'),
        )

    except kazoo.client.NoNodeError:
        # Logger.warn is a deprecated alias of Logger.warning.
        _LOGGER.warning('Partition node not found: %s', partition)
def update_server_capacity(zkclient, server_id,
                           memory=None, cpu=None, disk=None):
    """Update the capacity values stored in the server node.

    Only the values that are truthy are written, under the ``memory``,
    ``cpu`` and ``disk`` keys. A ``servers`` event is created when
    ``zkutils.update`` returns a truthy value.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)

    requested = (('memory', memory), ('cpu', cpu), ('disk', disk))
    for key, value in requested:
        if value:
            server_data[key] = value

    updated = zkutils.update(
        zkclient, server_node, server_data, check_content=True
    )
    if updated:
        create_event(zkclient, 0, 'servers', [server_id])
def _cache(self, zkclient, app, check_existing=False):
    """Cache the manifest and placement data of an instance as YAML.

    The placement node of this host is read first; when it does not exist
    nothing is cached. The scheduled manifest is then updated with the
    placement data (when not ``None``) and written, via ``fs.write_safe``,
    to a file named after the app inside ``self.tm_env.cache_dir``.

    :param ``str`` app:
        Instance name.
    :param ``bool`` check_existing:
        Whether to skip the write when the cached file already exists and
        its ctime is not older than the placement node's ctime.
    """
    placement_path = z.path.placement(self._hostname, app)
    try:
        placement, placement_meta = zkutils.get_with_metadata(
            zkclient, placement_path
        )
        # The node ctime is divided by 1000 before being compared with the
        # file ctime reported by os.stat.
        placement_ctime = placement_meta.ctime / 1000.0
    except kazoo.exceptions.NoNodeError:
        _LOGGER.info('Placement %s/%s not found', self._hostname, app)
        return

    manifest_file = os.path.join(self.tm_env.cache_dir, app)

    if check_existing:
        try:
            cached_ctime = os.stat(manifest_file).st_ctime
        except FileNotFoundError:
            cached_ctime = None

        if cached_ctime:
            if cached_ctime >= placement_ctime:
                _LOGGER.info('%s is up to date', manifest_file)
                return

    scheduled_path = z.path.scheduled(app)
    try:
        manifest = zkutils.get(zkclient, scheduled_path)
        # TODO: need a function to parse instance id from name.
        manifest['task'] = app[app.index('#') + 1:]

        if placement is not None:
            manifest.update(placement)

        def _dump(stream):
            """Serialize the merged manifest into the given stream."""
            return yaml.dump(manifest, stream=stream)

        fs.write_safe(
            manifest_file,
            _dump,
            prefix='.%s-' % app,
            mode='w',
            permission=0o644
        )
        _LOGGER.info('Created cache manifest: %s', manifest_file)

    except kazoo.exceptions.NoNodeError:
        _LOGGER.info('App %s not found', app)
def get_appmonitor(zkclient, monitor_id,
                   raise_notfound=False, suspended_monitors=None):
    """Return the app monitor data for the given id.

    The monitor id is added under ``_id`` and the suspension entry for this
    monitor (or ``None``) under ``suspend_until``.

    :param raise_notfound: When truthy, a missing node re-raises the
        ``NoNodeError``; otherwise ``None`` is returned.
    :param suspended_monitors: Suspension mapping to look the monitor up
        in; fetched with ``get_suspended_appmonitors`` when ``None``.
    """
    try:
        monitor = zkutils.get(zkclient, z.path.appmonitor(monitor_id))
        monitor['_id'] = monitor_id

        suspended = suspended_monitors
        if suspended is None:
            suspended = get_suspended_appmonitors(zkclient)
        monitor['suspend_until'] = suspended.get(monitor_id)
    except kazoo.client.NoNodeError:
        _LOGGER.info('App monitor does not exist: %s', monitor_id)
        if not raise_notfound:
            return None
        raise

    return monitor
def load_server(self, servername, readonly=False):
    """Load an individual server from Zookeeper.

    Builds a ``scheduler.Server`` from the server node, adds it to its
    parent bucket and registers it in ``self.servers``. Nothing is loaded
    when the node has no data, when the parent bucket is unknown, or when
    the node does not exist (the latter two are logged as warnings).

    :param servername: Name of the server to load.
    :param readonly: When falsy, the server's placement node is ensured to
        exist; the flag is also passed to ``adjust_server_state``.
    """
    try:
        data = zkutils.get(self.zkclient, z.path.server(servername))
        if not data:
            # The server is configured, but never reported its capacity.
            _LOGGER.info('No capacity detected: %s',
                         z.path.server(servername))
            return

        assert 'parent' in data
        parentname = data['parent']
        label = data.get('partition')
        if not label:
            # TODO: it will be better to have separate module for constants
            #       and avoid unnecessary cross imports.
            label = admin.DEFAULT_PARTITION

        up_since = data.get('up_since', int(time.time()))
        partition = self.cell.partitions[label]
        server = scheduler.Server(
            servername,
            resources(data),
            valid_until=partition.valid_until(up_since),
            label=label,
            traits=data.get('traits', 0)
        )

        parent = self.buckets.get(parentname)
        if not parent:
            # Logger.warn is a deprecated alias of Logger.warning.
            _LOGGER.warning('Server parent does not exist: %s/%s',
                            servername, parentname)
            return

        self.buckets[parentname].add_node(server)
        self.servers[servername] = server
        assert server.parent == self.buckets[parentname]

        if not readonly:
            zkutils.ensure_exists(self.zkclient,
                                  z.path.placement(servername),
                                  acl=[_SERVERS_ACL])

        self.adjust_server_state(servername, readonly)

    except kazoo.client.NoNodeError:
        _LOGGER.warning('Server node not found: %s', servername)
def register_server(zkclient, hostname, node_info):
    """Register a server.

    Merges ``node_info`` into the data stored in the server node, writes it
    back and creates an ephemeral, sequential presence node for the host.

    :returns: Whatever ``zkutils.put`` returns for the presence node.
    """
    server_path = z.path.server(hostname)

    merged = zkutils.get(zkclient, server_path)
    merged.update(node_info)

    _LOGGER.info('Registering server %s: %r', hostname, merged)
    zkutils.update(zkclient, server_path, merged)

    presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
    presence_path = z.path.server_presence(hostname + '#')
    return zkutils.put(
        zkclient,
        presence_path,
        {'seen': False},
        acl=[presence_acl],
        ephemeral=True,
        sequence=True
    )
def upgrade(cell, ldap, ldap_search_base, batch, timeout, treadmill_root,
            continue_on_error, dry_run, force, servers):
    """Upgrade the supplied cell.

    :param cell: Cell name; used to look up the cell in LDAP and to resolve
        the global context.
    :param ldap: LDAP url stored on the global context.
    :param ldap_search_base: LDAP search base stored on the global context.
    :param batch: Passed through to ``versionmgr.upgrade``.
    :param timeout: Passed through to ``versionmgr.upgrade``.
    :param treadmill_root: Directory to checksum; when falsy it is taken
        from the ``treadmill_root`` attribute of the cell in LDAP.
    :param continue_on_error: When truthy, the upgrade does not stop on the
        first error.
    :param dry_run: When truthy, only ``versionmgr.verify`` is run.
    :param force: Passed to ``versionmgr.upgrade`` as ``force_upgrade``.
    :param servers: Iterable of comma-separated server lists. When it
        yields no servers, the list stored at ``zkutils.SERVERS`` is used.
    """
    context.GLOBAL.ldap.url = ldap
    context.GLOBAL.ldap.search_base = ldap_search_base

    # Collect into a separate name: rebinding ``servers`` to an empty list
    # before iterating it would discard everything the caller supplied.
    server_names = []
    for server_list in servers or ():
        server_names.extend(server_list.split(','))

    if not treadmill_root:
        admin_cell = admin.Cell(context.GLOBAL.ldap.conn)
        cell_info = admin_cell.get(cell)
        treadmill_root = cell_info.get('treadmill_root')

    _LOGGER.info('Treadmill root: %s', treadmill_root)
    digest = versionmgr.checksum_dir(treadmill_root).hexdigest()
    _LOGGER.info('Checksum: %s', digest)

    context.GLOBAL.resolve(cell)
    zkclient = context.GLOBAL.zk.conn

    if not server_names:
        # pylint: disable=R0204
        server_names = zkutils.get(zkclient, zkutils.SERVERS)

    if dry_run:
        failed = versionmgr.verify(zkclient, digest, server_names)
    else:
        failed = versionmgr.upgrade(
            zkclient,
            digest, server_names,
            batch,
            timeout,
            stop_on_error=(not continue_on_error),
            force_upgrade=force,
        )

    if not failed:
        _LOGGER.info('All servers are up to date.')
    else:
        _LOGGER.error('Upgrade failed.')
        utils.print_yaml(failed)
def unregister_identity(self):
    """Unregister the app identity.

    Does nothing when the manifest has no (or a falsy) ``identity_group``.
    The identity node is deleted only when its ``host`` value equals this
    hostname; a missing node is logged and ignored.
    """
    group = self.manifest.get('identity_group')

    # If identity_group is not set or set to None, nothing to unregister.
    if not group:
        return

    identity = self.manifest.get('identity', _INVALID_IDENTITY)
    _LOGGER.info('Unregister identity: %s, %s', group, identity)

    identity_path = z.path.identity_group(group, str(identity))
    try:
        owner = zkutils.get(self.zkclient, identity_path)
        if owner['host'] == self.hostname:
            zkutils.ensure_deleted(self.zkclient, identity_path)
    except kazoo.client.NoNodeError:
        _LOGGER.info('identity node %s does not exist.', identity_path)
def _node_initialize(tm_env, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Initialize the node.

    Should only be done on a cold start.

    Runs ``tm_env.initialize``, merges the locally collected node info into
    the data stored at ``zk_server_path``, writes it back and creates the
    ephemeral presence node.
    """
    tm_env.initialize()

    local_info = sysinfo.node_info(tm_env)

    # XXX: Why a get/update dance instead of set
    node_info = zkutils.get(zkclient, zk_server_path)
    node_info.update(local_info)
    _LOGGER.info(
        'Registering node: %s: %s, %r',
        zk_server_path, hostname, node_info
    )
    zkutils.update(zkclient, zk_server_path, node_info)

    presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
    _LOGGER.debug('host_acl: %r', presence_acl)
    zkutils.put(
        zkclient,
        zk_presence_path,
        {'seen': False},
        acl=[presence_acl],
        ephemeral=True
    )
def create_server(zkclient, server_id, parent_id, partition):
    """Create the server definition in Zookeeper.

    Ensures the server node exists, sets its ``parent`` and ``partition``
    keys and stores the data. A ``servers`` event is created when
    ``zkutils.put`` returns a truthy value.
    """
    server_node = z.path.server(server_id)
    server_acl = zkclient.make_host_acl(server_id, 'rwcd')

    zkutils.ensure_exists(zkclient, server_node, acl=[server_acl])

    data = zkutils.get(zkclient, server_node)
    if not data:
        # Start from scratch when the node carries nothing usable.
        data = {}
    data.update({'parent': parent_id, 'partition': partition})

    _LOGGER.info(
        'Creating server node %s with data %r and ACL %r',
        server_node, data, server_acl
    )
    stored = zkutils.put(
        zkclient, server_node, data, acl=[server_acl], check_content=True
    )
    if stored:
        create_event(zkclient, 0, 'servers', [server_id])
def _list_server_blackouts(zkclient, fmt):
    """Print the blacked out servers, in descending sort order.

    Each entry is printed through the formatter built from ``fmt``; a
    ``None`` reason is shown as ``-``. A missing Zookeeper node ends the
    collection silently.
    """
    # Collect (created, server, reason) for the blacked out nodes.
    entries = []
    try:
        for server in zkclient.get_children(z.BLACKEDOUT_SERVERS):
            data, metadata = zkutils.get(
                zkclient,
                z.path.blackedout_server(server),
                need_metadata=True
            )
            entries.append((metadata.created, server, data))
    except kazoo.client.NoNodeError:
        pass

    # [%t] %h %r will be printed as below
    # [Thu, 05 May 2016 02:59:58 +0000] HOSTNAME -
    formatter = _gen_formatter({'t': 0, 'h': 1, 'r': 2}, fmt)

    for when, server, reason in reversed(sorted(entries)):
        if reason is None:
            reason = '-'
        timestamp = utils.strftime_utc(when)
        print(formatter.format(timestamp, server, reason))
def get(self, path):
    """Return the object stored at the given path.

    :raises backend.ObjectNotFoundError: When the node does not exist.
    """
    try:
        stored = zkutils.get(self.zkclient, path)
    except kazoo.client.NoNodeError:
        raise backend.ObjectNotFoundError()
    return stored
def get_identity_group(zkclient, ident_group_id):
    """Return the identity group data for the given id.

    The id is added to the returned data under the ``_id`` key.
    """
    group_path = z.path.identity_group(ident_group_id)
    group = zkutils.get(zkclient, group_path)
    group['_id'] = ident_group_id
    return group
def get_suspended_appmonitors(zkclient):
    """Return the appmonitor suspension information.

    An empty dict is returned instead of any falsy value (such as ``None``)
    read from the appmonitor root node.
    """
    suspended = zkutils.get(zkclient, z.path.appmonitor())
    if not suspended:
        # we avoid returning None
        return {}
    return suspended