def _safe_create(self, rsrc_id, path, data):
    """Create an ephemeral Zookeeper node, tolerating a node we already own.

    If the node already exists, its owner session id is compared with the
    session id of our client. A node owned by another session gets a watch
    installed and the call returns False. A node owned by our session is
    rewritten when its content differs from ``data``.
    """
    zkclient = self.zkclient
    try:
        zkutils.create(zkclient, path, data, ephemeral=True)
    except kazoo.client.NodeExistsError:
        existing, stat = zkutils.get_with_metadata(zkclient, path)
        our_session, _pwd = zkclient.client_id
        owner_session = stat.owner_session_id

        if owner_session != our_session:
            # Somebody else holds the node: watch it and report failure.
            _LOGGER.info('Node exists, owned by other: %s - %s - %s',
                         path, existing, owner_session)
            self._watch(rsrc_id, path)
            return False

        if existing != data:
            _LOGGER.info('Content different: %s - old: %s, new: %s',
                         path, existing, data)
            zkutils.update(zkclient, path, data)

        _LOGGER.info('Node is up to date: %s - %s', path, our_session)
    else:
        _LOGGER.info('Created node: %s', path)

    return True
def _node_initialize(tm_env, runtime, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Node initialization. Should only be done on a cold start.

    Detects local node information (including traits), merges it into the
    data stored at ``zk_server_path``, publishes an ephemeral presence node
    and, on posix, initializes iptables. Any failure is logged and the
    Zookeeper client is stopped.
    """
    try:
        detected = sysinfo.node_info(tm_env, runtime)
        traits_spec = zkutils.get(zkclient, z.path.traits())
        detected['traits'] = traits.detect(traits_spec)

        # Layer the freshly detected values over what is stored for the
        # server node (scheduler data).
        server_info = zkutils.get(zkclient, zk_server_path)
        server_info.update(detected)
        _LOGGER.info('Registering node: %s: %s, %r',
                     zk_server_path, hostname, server_info)
        zkutils.update(zkclient, zk_server_path, server_info)

        presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
        _LOGGER.debug('host_acl: %r', presence_acl)
        zkutils.put(
            zkclient,
            zk_presence_path,
            {'seen': False},
            acl=[presence_acl],
            ephemeral=True
        )

        # TODO: Fix the network initialization. Then the below can be part
        # of appenv.initialize()
        if os.name == 'posix':
            # Flush all rules in iptables nat and mangle tables (it is
            # assumed that none but Treadmill manages these tables) and bulk
            # load all the Treadmill static rules
            external_ip = server_info['network']['external_ip']
            iptables.initialize(external_ip)

    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Node initialization failed')
        zkclient.stop()
def _node_initialize(tm_env, runtime, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Node initialization. Should only be done on a cold start.

    Merges detected node information into the data stored at
    ``zk_server_path``, publishes an ephemeral presence node and then runs
    the local environment initialization with the merged data. Any failure
    is logged and the Zookeeper client is stopped.
    """
    try:
        detected = sysinfo.node_info(tm_env, runtime)

        # Layer the freshly detected values over what is stored for the
        # server node (scheduler data).
        server_info = zkutils.get(zkclient, zk_server_path)
        server_info.update(detected)
        _LOGGER.info('Registering node: %s: %s, %r',
                     zk_server_path, hostname, server_info)
        zkutils.update(zkclient, zk_server_path, server_info)

        presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
        _LOGGER.debug('host_acl: %r', presence_acl)
        zkutils.put(
            zkclient,
            zk_presence_path,
            {'seen': False},
            acl=[presence_acl],
            ephemeral=True
        )

        # Invoke the local node initialization
        tm_env.initialize(server_info)

    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Node initialization failed')
        zkclient.stop()
def test_update_check_content(self):
    """Verifies put/update with check_content=True."""
    client_cls = treadmill.zkutils.ZkClient
    client_cls.get.return_value = (b'aaa', {})
    zkclient = client_cls()

    # Updating with the value get() already returns must not call set().
    zkutils.update(zkclient, '/a', 'aaa', check_content=True)
    self.assertFalse(client_cls.set.called)

    # A different value must be written with set().
    zkutils.update(zkclient, '/a', 'bbb', check_content=True)
    client_cls.set.assert_called_with('/a', b'bbb')
def test_update_check_content(self):
    """Verifies put/update with check_content=True."""
    client_cls = kazoo.client.KazooClient
    client_cls.get.return_value = ('aaa', {})
    zkclient = client_cls()

    # Updating with the value get() already returns must not call set().
    zkutils.update(zkclient, '/a', 'aaa', check_content=True)
    self.assertFalse(client_cls.set.called)

    # A different value must be written with set().
    zkutils.update(zkclient, '/a', 'bbb', check_content=True)
    client_cls.set.assert_called_with('/a', 'bbb')
def register_server(zkclient, hostname, node_info):
    """Register server.

    Merges ``node_info`` into the data stored for the server node, writes it
    back, then creates an ephemeral, sequential presence node protected by a
    host ACL and returns the result of that ``zkutils.put`` call.
    """
    server_node = z.path.server(hostname)

    merged = zkutils.get(zkclient, server_node)
    merged.update(node_info)
    _LOGGER.info('Registering server %s: %r', hostname, merged)
    zkutils.update(zkclient, server_node, merged)

    presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
    presence_node = z.path.server_presence(hostname + '#')
    return zkutils.put(
        zkclient,
        presence_node,
        {'seen': False},
        acl=[presence_acl],
        ephemeral=True,
        sequence=True
    )
def update_server_parent(zkclient, server_id, parent_id):
    """Set the ``parent`` of a server and create a 'servers' event.

    The event is only created when ``zkutils.update`` (called with
    ``check_content=True``) returns a truthy value.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)
    server_data['parent'] = parent_id

    changed = zkutils.update(zkclient, server_node, server_data,
                             check_content=True)
    if not changed:
        return

    create_event(zkclient, 0, 'servers', [server_id])
def update_server_features(zkclient, server_id, features):
    """Set the ``features`` of a server and create a 'servers' event.

    The event is only created when ``zkutils.update`` (called with
    ``check_content=True``) returns a truthy value.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)
    server_data['features'] = features

    changed = zkutils.update(zkclient, server_node, server_data,
                             check_content=True)
    if not changed:
        return

    create_event(zkclient, 0, 'servers', [server_id])
def update_server_attrs(zkclient, server_id, partition):
    """Set the ``partition`` of a server and create a 'servers' event.

    The event is only created when ``zkutils.update`` (called with
    ``check_content=True``) returns a truthy value.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)
    server_data['partition'] = partition

    changed = zkutils.update(zkclient, server_node, server_data,
                             check_content=True)
    if not changed:
        return

    create_event(zkclient, 0, 'servers', [server_id])
def _node_initialize(tm_env, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Node initialization. Should only be done on a cold start.

    Initializes the local environment, merges detected node information into
    the data stored at ``zk_server_path`` and publishes an ephemeral
    presence node protected by a host ACL.
    """
    tm_env.initialize()

    detected = sysinfo.node_info(tm_env)

    # XXX: Why a get/update dance instead of set
    server_info = zkutils.get(zkclient, zk_server_path)
    server_info.update(detected)
    _LOGGER.info('Registering node: %s: %s, %r',
                 zk_server_path, hostname, server_info)
    zkutils.update(zkclient, zk_server_path, server_info)

    presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
    _LOGGER.debug('host_acl: %r', presence_acl)
    zkutils.put(
        zkclient,
        zk_presence_path,
        {'seen': False},
        acl=[presence_acl],
        ephemeral=True
    )
def update_server_capacity(zkclient, server_id,
                           memory=None, cpu=None, disk=None):
    """Update server capacity.

    Only the capacity values that are truthy are written into the server
    data. A 'servers' event is created when ``zkutils.update`` (called with
    ``check_content=True``) returns a truthy value.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)

    requested = (('memory', memory), ('cpu', cpu), ('disk', disk))
    for key, value in requested:
        if value:
            server_data[key] = value

    changed = zkutils.update(zkclient, server_node, server_data,
                             check_content=True)
    if not changed:
        return

    create_event(zkclient, 0, 'servers', [server_id])
def update_app_priorities(zkclient, updates):
    """Updates app priority.

    :param zkclient: Zookeeper client passed to the zkutils helpers.
    :param updates: Mapping of app id to new priority. Every priority must
        be in the inclusive range 0..100.
    :raises AssertionError: If any priority is outside 0..100. Nothing is
        written in that case.

    Apps that do not exist are skipped. A single 'apps' event listing the
    apps whose update returned a truthy value is created at the end.
    """
    # Validate the whole request before touching Zookeeper. Checking inside
    # the update loop could abort half-way, leaving earlier apps rewritten
    # without the 'apps' event ever being created for them. The exception is
    # raised explicitly (same type as before) so the check is not stripped
    # when Python runs with -O.
    for app_id, priority in six.iteritems(updates):
        if not 0 <= priority <= 100:
            raise AssertionError(
                'Invalid priority for %s: %r' % (app_id, priority)
            )

    modified = []
    for app_id, priority in six.iteritems(updates):
        app = get_app(zkclient, app_id)
        if app is None:
            # app does not exist.
            continue

        app['priority'] = priority

        if zkutils.update(zkclient, _app_node(app_id), app,
                          check_content=True):
            modified.append(app_id)

    if modified:
        create_event(zkclient, 1, 'apps', modified)
def update(self, path, data, check_content=False):
    """Write ``data`` into the ZK node at ``path``.

    A missing node (``kazoo.client.NoNodeError``) is reported as
    ``backend.ObjectNotFoundError``.
    """
    zkclient = self.zkclient
    try:
        zkutils.update(zkclient, path, data, check_content)
    except kazoo.client.NoNodeError:
        not_found = backend.ObjectNotFoundError()
        raise not_found
def reevaluate(api_url, alert_f, state, zkclient, last_waited):
    """Evaluate state and adjust app count based on monitor

    For every monitor the number of scheduled instances is compared with the
    configured ``count``. Missing instances are requested through the REST
    API, limited by the monitor's ``available`` tokens; surplus instances are
    deleted according to the monitor's ``policy`` ('fifo' by default, or
    'lifo').

    :param api_url: API endpoint, passed to ``restclient.post`` as a
        one-element list.
    :param alert_f: Callable invoked as ``alert_f(name, message)`` or
        ``alert_f(name, message, status='clear')``.
    :param state: Dict with keys 'scheduled', 'monitors' and 'suspended'.
        The 'suspended' dict is modified in place, and the per-monitor conf
        dicts ('available', 'last_update') are mutated as well.
    :param zkclient: Zookeeper client used to store the result when
        something changed.
    :param last_waited: Container of monitor names that were rate limited
        previously (only membership is tested).
    :returns: Dict of monitor name to timestamp, covering both rate limited
        and suspended monitors.
    """
    # Disable too many branches/statements warning.
    #
    # pylint: disable=R0912
    # pylint: disable=R0915

    # dict() makes shallow copies only: the values (instance lists, conf
    # dicts) are still shared with ``state``.
    grouped = dict(state['scheduled'])
    monitors = dict(state['monitors'])

    # Do not create a copy, suspended is accessed by ref.
    suspended = state['suspended']
    waited = {}
    modified = False

    now = time.time()

    # remove outdated information in suspended dict
    extra = six.viewkeys(suspended) - six.viewkeys(monitors)
    for name in extra:
        suspended.pop(name, None)
        modified = True

    # Increase available tokens.
    for name, conf in six.iteritems(monitors):

        if suspended.get(name, 0) > now:
            _LOGGER.debug('Ignoring app %s - suspended.', name)
            continue

        # Either app is not suspended or it is past-due - remove it from
        # suspended dict.
        if suspended.pop(name, None) is not None:
            alert_f(name, 'Monitor active again', status='clear')
            modified = True

        # Tokens are capped at twice the target count; when the cap is
        # already reached there is nothing to add.
        max_value = conf['count'] * 2
        available = conf['available']
        if available < max_value:
            # Refill proportionally to the time since the last update.
            delta = conf['rate'] * (now - conf['last_update'])
            conf['available'] = min(available + delta, max_value)

        conf['last_update'] = now

    for name, conf in six.iteritems(monitors):

        if suspended.get(name, 0) > now:
            _LOGGER.debug('Monitor is suspended for: %s.', name)
            continue

        count = conf['count']
        available = conf['available']

        current_count = len(grouped.get(name, []))
        _LOGGER.debug('App: %r current: %d, target %d',
                      name, current_count, count)

        if count == current_count:
            continue

        elif count > current_count:
            needed = count - current_count
            allowed = int(min(needed, math.floor(available)))
            _LOGGER.debug('%s => need %d, allow %d', name, needed, allowed)
            if allowed <= 0:
                # needed >= 1 here, so allowed <= 0 means
                # floor(available) <= 0, i.e. available < 1.
                # we got estimated wait time, now + wait seconds
                waited[name] = now + int((1 - available) / conf['rate'])
                # new wait item, need modify
                if name not in last_waited:
                    alert_f(name, 'Monitor suspended: Rate limited')
                    modified = True

                continue

            try:
                # scheduled, remove app from waited list
                _scheduled = restclient.post(
                    [api_url],
                    '/instance/{}?count={}'.format(name, allowed),
                    payload={},
                    headers={'X-Treadmill-Trusted-Agent': 'monitor'}
                )

                if name in last_waited:
                    # this means app jump out of wait, need to clear it from zk
                    alert_f(name, 'Monitor active again', status='clear')
                    modified = True

                # Consume the tokens only after the request succeeded.
                conf['available'] -= allowed
            except restclient.NotFoundError:
                _LOGGER.info('App not configured: %s', name)
                suspended[name] = now + _DELAY_INTERVAL
                alert_f(name, 'Monitor suspended: App not configured')
                modified = True
            except restclient.BadRequestError:
                _LOGGER.exception('Unable to start: %s', name)
                suspended[name] = now + _DELAY_INTERVAL
                alert_f(name, 'Monitor suspended: Unable to start')
                modified = True
            except restclient.ValidationError:
                _LOGGER.exception('Invalid manifest: %s', name)
                suspended[name] = now + _DELAY_INTERVAL
                alert_f(name, 'Monitor suspended: Invalid manifest')
                modified = True
            except Exception:  # pylint: disable=W0703
                # Any other failure is only logged; the monitor is not
                # suspended.
                _LOGGER.exception('Unable to create instances: %s: %s',
                                  name, needed)

        elif count < current_count:
            extra = []
            policy = conf.get('policy')
            if policy is None:
                policy = 'fifo'

            if policy == 'fifo':
                # Take the surplus from the front of the instance list.
                extra = grouped[name][:current_count - count]
            elif policy == 'lifo':
                # Take the surplus from the end of the instance list.
                extra = grouped[name][count - current_count:]
            else:
                _LOGGER.warning('Invalid scale policy: %s', policy)
                continue

            try:
                response = restclient.post(
                    [api_url], '/instance/_bulk/delete',
                    payload=dict(instances=list(extra)),
                    headers={'X-Treadmill-Trusted-Agent': 'monitor'}
                )
                _LOGGER.info('deleted: %r - %s', extra, response)

                # this means we reduce the count number, no need to wait
                modified = True

            except Exception:  # pylint: disable=W0703
                _LOGGER.exception('Unable to delete instances: %r', extra)

    # The result covers every inactive monitor: rate limited ones collected
    # above plus the suspended ones.
    waited.update(suspended)
    if modified:
        _LOGGER.info('Updating suspended app monitors')
        zkutils.update(zkclient, z.path.appmonitor(), waited)

    return waited