def cleanup_trace(zkclient, batch_size, expires_after):
    """Move expired traces into history folder, compressed as sqlite db.

    Every trace event name is split as ``instanceid,timestamp,rest``. An
    event is selected when its instance id is not among the children of
    ``z.SCHEDULED`` and its timestamp is older than ``expires_after``
    seconds. Selected events are sorted and handed to ``_upload_batch`` in
    full batches of ``batch_size``; a trailing partial batch is left alone.

    :param zkclient: Zookeeper client (``get_children`` is called on it and
        it is passed through to ``_upload_batch``).
    :param batch_size: Number of events per uploaded batch.
    :param expires_after: Minimum age, in seconds, for an event to be moved.
    """
    # A set keeps the per-event "is it still scheduled?" check O(1).
    scheduled = set(zkclient.get_children(z.SCHEDULED))
    shards = zkclient.get_children(z.TRACE)

    # Single cutoff so that every event is judged against the same instant.
    cutoff = time.time() - expires_after

    traces = []
    for shard in shards:
        for event in zkclient.get_children(z.path.trace_shard(shard)):
            instanceid, timestamp, _ = event.split(',', 2)
            timestamp = float(timestamp)
            if instanceid not in scheduled and timestamp < cutoff:
                traces.append((timestamp, shard, event))

    # Sort traces from older to latest.
    traces.sort()

    for idx in range(0, len(traces), batch_size):
        # Take a slice of batch_size
        batch = traces[idx:idx + batch_size]
        if len(batch) < batch_size:
            # Only full batches are uploaded.
            _LOGGER.info('Traces: batch = %s, total = %s, exiting.',
                         batch_size, len(batch))
            break

        db_rows = [
            (z.join_zookeeper_path(z.TRACE, shard, event),
             timestamp,
             None,
             z.join_zookeeper_path(z.TRACE, shard),
             event)
            for timestamp, shard, event in batch
        ]

        _upload_batch(zkclient,
                      z.path.trace_history('trace.db.gzip-'),
                      'trace',
                      db_rows)
def _sync_collection(zkclient, entities, zkpath, match=None):
    """Sync ldap collection to Zookeeper.

    The ``'_id'`` key is popped from each entity and used as the node name.
    Entities rejected by ``match`` (when given) are skipped. Children of
    ``zkpath`` that are not in the resulting set are deleted, and the rest
    are written with ``zkutils.put``.
    """
    _LOGGER.info('Sync: %s', zkpath)

    zkclient.ensure_path(zkpath)
    existing = zkclient.get_children(zkpath)

    wanted = {}
    for entity in entities:
        name = entity.pop('_id')
        if not match or match(entity):
            wanted[name] = entity
        else:
            _LOGGER.debug('Skip: %s', name)

    # Drop nodes that no longer have a corresponding entity.
    for stale in set(existing) - set(wanted):
        _LOGGER.info('Delete: %s', stale)
        stale_path = z.join_zookeeper_path(zkpath, stale)
        zkutils.ensure_deleted(zkclient, stale_path)

    # Add or update current app-groups
    for name, entity in wanted.items():
        node_path = z.join_zookeeper_path(zkpath, name)
        changed = zkutils.put(zkclient, node_path, entity, check_content=True)
        if changed:
            _LOGGER.info('Update: %s', name)
        else:
            _LOGGER.info('Up to date: %s', name)
def _sync_collection(zkclient, entities, zkpath, match=None):
    """Syncs ldap collection to Zookeeper.

    Names come from each entity's ``'_id'`` key, after which ``_remove_id``
    is applied to every entity. Children of ``zkpath`` not among those names
    are deleted. When ``match`` is given it is called as
    ``match(name, entity)`` and its result is used as the node name; a falsy
    result skips the entity.
    """
    _LOGGER.info('Sync: %s', zkpath)

    zkclient.ensure_path(zkpath)
    existing = zkclient.get_children(zkpath)

    # Names are captured before the ids are stripped from the entities.
    names = [item['_id'] for item in entities]
    for item in entities:
        _remove_id(item)

    for stale in set(existing) - set(names):
        _LOGGER.debug('Delete: %s', stale)
        stale_path = z.join_zookeeper_path(zkpath, stale)
        zkutils.ensure_deleted(zkclient, stale_path)

    # Add or update current app-groups
    for name, entity in zip(names, entities):
        if match:
            zkname = match(name, entity)
            if not zkname:
                _LOGGER.debug('Skip: %s', name)
                continue
        else:
            zkname = name

        node_path = z.join_zookeeper_path(zkpath, zkname)
        changed = zkutils.put(zkclient, node_path, entity, check_content=True)
        if changed:
            _LOGGER.info('Update: %s', zkname)
        else:
            _LOGGER.info('Up to date: %s', zkname)
def _children_watch(self, zkpath, children, watch_data, on_add, on_del):
    """Callback invoked on children watch.

    Compares the file names found under ``self.fpath(zkpath)`` with the
    given children. ``on_del`` is called for names present only on disk and
    ``on_add`` for names present only in ``children``. Names present in both
    are passed to ``on_add`` only the first time ``zkpath`` is processed.
    Always returns True.
    """
    def _added(name):
        # Optionally start tracking the node, then notify.
        zknode = z.join_zookeeper_path(zkpath, name)
        if watch_data:
            self.watches.add(zknode)
        on_add(zknode)

    pattern = os.path.join(self.fpath(zkpath), '*')
    filenames = set(map(os.path.basename, glob.glob(pattern)))
    children = set(children)

    for extra in filenames - children:
        _LOGGER.info('Delete: %s', extra)
        zknode = z.join_zookeeper_path(zkpath, extra)
        self.watches.discard(zknode)
        on_del(zknode)

    first_time = zkpath not in self.processed_once
    if first_time:
        self.processed_once.add(zkpath)
        for common in filenames & children:
            _LOGGER.info('Common: %s', common)
            _added(common)

    for missing in children - filenames:
        _LOGGER.info('Add: %s', missing)
        _added(missing)

    return True
def test_join_zookeeper_path(self):
    """Checks zookeeper path construction."""
    # Two components.
    self.assertEqual(
        '/root/node',
        z.join_zookeeper_path('/root', 'node')
    )
    # Three components.
    self.assertEqual(
        '/root/node1/node2',
        z.join_zookeeper_path('/root', 'node1', 'node2')
    )
def zk_get(fullpath):
    """Mock the zkclient.get() method.

    Returns a canned ``(data, None)`` pair for paths under the two known
    endpoint prefixes and raises ValueError for anything else.
    """
    proid_a_prefix = z.join_zookeeper_path(z.ENDPOINTS, 'proid_A', 'foo')
    if fullpath.startswith(proid_a_prefix):
        return (b'xxx:123', None)

    proid_b_prefix = z.join_zookeeper_path(z.ENDPOINTS, 'proid_B', 'bar')
    if fullpath.startswith(proid_b_prefix):
        return (b'yyy:987', None)

    raise ValueError(fullpath)
def _children_watch(self, zkpath, children, watch_data, on_add, on_del,
                    cont_watch_predicate=None):
    """Callback invoked on children watch.

    Sorted children and sorted file names under ``self.fpath(zkpath)`` are
    handed to ``self._filter_children_actions`` together with three lists
    (presumably filled in place by that helper -- confirm) holding the nodes
    to add, to remove and the ones in common. Removed nodes go to ``on_del``,
    added nodes to ``on_add``; common nodes go to ``on_add`` only the first
    time ``zkpath`` is processed.

    Returns the result of ``cont_watch_predicate(zkpath, sorted_children)``
    when a predicate is given, True otherwise.
    """
    def _added(name):
        # Optionally start tracking the node, then notify.
        zknode = z.join_zookeeper_path(zkpath, name)
        if watch_data:
            self.watches.add(zknode)
        on_add(zknode)

    fpath = self.fpath(zkpath)

    sorted_children = sorted(children)
    sorted_filenames = sorted(
        map(os.path.basename, glob.glob(os.path.join(fpath, '*')))
    )

    add, remove, common = [], [], []
    self._filter_children_actions(sorted_children, sorted_filenames,
                                  add, remove, common)

    for node in remove:
        _LOGGER.info('Delete: %s', node)
        zknode = z.join_zookeeper_path(zkpath, node)
        self.watches.discard(zknode)
        on_del(zknode)

    first_time = zkpath not in self.processed_once
    if first_time:
        self.processed_once.add(zkpath)
        for node in common:
            _LOGGER.info('Common: %s', node)
            _added(node)

    for node in add:
        _LOGGER.info('Add: %s', node)
        _added(node)

    if not cont_watch_predicate:
        return True
    return cont_watch_predicate(zkpath, sorted_children)
def refresh_zk(self, zknodes=None):
    """Parse data from Zookeeper nodes.

    NOTE: This is intended to be called with the output of a
    `:func:get_children` or in the callback of a `:class:ChildrenWatch`.

    If zknodes is None, get Zookeeper nodes first and then parse data; a
    missing node is treated as having no children.

    Node names are split as ``name#chksum#seq``. The result, stored in
    ``self._zkdata``, maps each name to its entries ordered by descending
    sequence number.
    """
    if zknodes is None:
        try:
            zknodes = self._zkclient.get_children(self._zkpath)
        except kazoo.exceptions.NoNodeError:
            zknodes = []

    by_name = {}
    for node in zknodes:
        name, chksum, seq = node.split('#', 2)
        entries = by_name.setdefault(name, [])
        entries.append(
            ZkDataEntry(
                zname=z.join_zookeeper_path(self._zkpath, node),
                chksum=chksum,
                seq=int(seq)
            )
        )

    # Sort nodes by their sequence numbers, highest first.
    for entries in by_name.values():
        entries.sort(key=lambda entry: entry.seq, reverse=True)

    self._zkdata = by_name
def watch_task(zkclient, cell_state, scheduled, task):
    """Watch individual task.

    Installs a children watch on the task node. For every instance not yet
    present in ``cell_state.tasks`` the callback either sets up an instance
    watch (``watch_task_instance``) or loads the task data once into
    ``cell_state.tasks``.

    :param zkclient: Zookeeper client providing ``ChildrenWatch`` and
        ``exists``.
    :param cell_state: State object whose ``tasks`` mapping is updated.
    :param scheduled: Preloaded collection of scheduled instances; when
        empty, Zookeeper is queried per instance instead.
    :param task: Task name; instance names are ``<task>#<instance_id>``.
    """
    task_node = z.join_zookeeper_path(z.TASKS, task)

    # Establish watch on task instances.
    #
    # exit_on_unhandled has to be the inner decorator: assuming zkclient is a
    # kazoo client, ChildrenWatch registers the callable it is handed, so
    # wrapping afterwards would leave the unprotected function as the actual
    # watch callback.
    @zkclient.ChildrenWatch(task_node)
    @exc.exit_on_unhandled
    def _watch_task_instances(instance_ids):
        for instance_id in instance_ids:
            instance = '#'.join([task, instance_id])

            # Either watch is established or data is acquired.
            if instance in cell_state.tasks:
                continue

            # On first load, optimize lookup by preloading state
            # of all scheduled instances.
            #
            # Once initial load is done, scheduled will be cleared.
            if scheduled:
                need_watch = instance in scheduled
            else:
                need_watch = zkclient.exists(z.path.scheduled(instance))

            if need_watch:
                watch_task_instance(zkclient, cell_state, instance)
            else:
                data = zkutils.get_default(zkclient, z.path.task(instance))
                cell_state.tasks[instance] = data

        # Returned to the children watch from the callback.
        return True
def make_endpoint_watcher(zkclient, state, proid):
    """Make endpoint watcher function.

    Installs a children watch on the proid's endpoints node that keeps
    ``state[proid]`` in line with the node's children, and returns the
    decorated callback.
    """
    proid_instances = z.join_zookeeper_path(z.ENDPOINTS, proid)

    @zkclient.ChildrenWatch(proid_instances)
    @utils.exit_on_unhandled
    def _watch_instances(children):
        """Watch for proid instances."""
        # TODO: current implementation does not support instances, so
        #       state from masters will be stored, but will be never
        #       displayed.
        known = set(state[proid].keys())
        wanted = set(children)

        # Forget endpoints that disappeared.
        for gone in known - wanted:
            del state[proid][gone]

        # Load the payload of newly appeared endpoints.
        fresh = {}
        for name in wanted - known:
            endpoint_node = z.join_zookeeper_path(proid_instances, name)
            try:
                data, _metadata = zkclient.get(endpoint_node)
            except kazoo.client.NoNodeError:
                # Node vanished between listing and reading.
                continue
            fresh[name] = data.decode()

        state[proid].update(fresh)
        return True

    return _watch_instances
def prune_trace(zkclient, max_count):
    """Prune trace.

    Cleanup service (running/exited) events.

    Within each trace shard, events are visited in reverse sorted order of
    their node names. Once more than ``max_count`` service events have been
    seen for the same ``(instanceid, uniqueid, service)``, each further one
    is deleted.

    :param zkclient: Zookeeper client.
    :param max_count: Number of service events to keep per
        ``(instanceid, uniqueid, service)``.
    """
    prunable_types = ('service_running', 'service_exited')

    for shard in zkclient.get_children(z.TRACE):
        service_events = collections.Counter()
        events = zkclient.get_children(z.path.trace_shard(shard))

        for event in sorted(events, reverse=True):
            # Limit the split so that commas inside the event data stay
            # part of the last field instead of raising ValueError.
            instanceid, ts, src, event_type, event_data = event.split(',', 4)
            if event_type not in prunable_types:
                continue

            service_event = traceevents.AppTraceEvent.from_data(
                timestamp=ts,
                source=src,
                instanceid=instanceid,
                event_type=event_type,
                event_data=event_data,
            )
            if not service_event:
                continue

            key = (instanceid, service_event.uniqueid, service_event.service)
            service_events[key] += 1
            if service_events[key] > max_count:
                path = z.join_zookeeper_path(z.TRACE, shard, event)
                _LOGGER.info('Pruning trace: %s', path)
                zkutils.with_retry(zkutils.ensure_deleted, zkclient, path)
def prune_trace_evictions(zkclient, max_count):
    """Cleanup excessive trace events caused by evictions.

    Within each trace shard, events are visited in reverse sorted order of
    their node names. Evictions (``pending``/``scheduled`` events whose
    ``why`` is ``'evicted'``) are counted per instance; once ``max_count``
    of them have been seen for an instance, every further event of that
    instance is deleted, except pending events whose ``why`` contains
    ``'created'``.

    :param zkclient: Zookeeper client.
    :param max_count: Number of evictions to keep per instance; must be
        positive.
    """
    assert max_count > 0

    for shard in zkclient.get_children(z.TRACE):
        evictions = collections.Counter()
        events = zkclient.get_children(z.path.trace_shard(shard))

        for event in sorted(events, reverse=True):
            # Limit the split so that commas inside the event data stay
            # part of the last field instead of raising ValueError.
            instanceid, ts, src, event_type, event_data = event.split(',', 4)

            event_obj = traceevents.AppTraceEvent.from_data(
                timestamp=ts,
                source=src,
                instanceid=instanceid,
                event_type=event_type,
                event_data=event_data,
            )
            if not event_obj:
                continue

            # Leave pending/created events.
            if event_type == 'pending' and 'created' in event_obj.why:
                continue

            # Prune when number of evictions for an instance reached
            # max_count.
            if evictions.get(instanceid, 0) >= max_count:
                path = z.join_zookeeper_path(z.TRACE, shard, event)
                _LOGGER.info('Pruning trace: %s', path)
                zkutils.with_retry(zkutils.ensure_deleted, zkclient, path)
            elif (event_type in ['pending', 'scheduled'] and
                  event_obj.why == 'evicted'):
                evictions[instanceid] += 1
def _cleanup(zkclient, path, max_count):
    """Cleanup old nodes given path.

    Children of ``path`` are sorted by name and the first ones are deleted
    until at most ``max_count`` remain.
    """
    children = sorted(zkclient.get_children(path))
    surplus = len(children) - max_count
    if surplus <= 0:
        # Nothing exceeds the limit.
        return

    for child in children[:surplus]:
        child_path = z.join_zookeeper_path(path, child)
        zkutils.ensure_deleted(zkclient, child_path)
def resolve_endpoint(self, endpoint):
    """Resolves a endpoint to a hostport.

    Returns the data stored in the endpoint node under ``self.prefix``, or
    None when the node does not exist.
    """
    node = z.join_zookeeper_path(z.ENDPOINTS, self.prefix, endpoint)
    try:
        hostport, _metadata = self.zkclient.get(node)
    except kazoo.exceptions.NoNodeError:
        return None
    return hostport
def _is_up(self, server, server_endpoints):
    """Nodeinfo is up for server: {server}."""
    # NOTE(review): the docstring carries a ``{server}`` placeholder and is
    # presumably formatted by the test harness -- left untouched on purpose.
    endpoint_node = z.join_zookeeper_path(
        z.ENDPOINTS, 'root', server_endpoints[server]
    )
    hostport, _metadata = zkclient.get(endpoint_node)

    # The node payload may arrive as bytes, which cannot be split with a
    # str separator on Python 3; decode it first.
    if isinstance(hostport, bytes):
        hostport = hostport.decode()

    host, port = hostport.split(':')
    url = 'http://%s:%s' % (host, port)
    print(url)

    # Both the raw connection and the URL check have to succeed.
    self.assertTrue(chk.connect(host, port))
    self.assertTrue(chk.url_check(url))
def ensure_deleted(zkclient, path, recursive=True):
    """Deletes the node if it exists.

    With ``recursive`` set, every child is removed first through a recursive
    call. A missing node is only logged.
    """
    try:
        _LOGGER.debug('Deleting %s', path)
        if recursive:
            for name in zkclient.get_children(path):
                child_path = z.join_zookeeper_path(path, name)
                ensure_deleted(zkclient, child_path)

        zkclient.delete(path)
    except kazoo.client.NoNodeError:
        _LOGGER.debug('Node %s does not exist.', path)
def invoke_callback(self, path, node):
    """Invokes callback for each new node.

    Calls ``self.func(fullpath, data, stat)``; ``data`` and ``stat`` are
    read from Zookeeper only when ``self.include_data`` is set and are None
    otherwise. Any exception is logged and swallowed so it does not
    propagate to the caller.

    :param path: Parent Zookeeper path.
    :param node: Name of the child node under ``path``.
    """
    try:
        fullpath = z.join_zookeeper_path(path, node)
        data = None
        stat = None
        if self.include_data:
            data, stat = self.zkclient.get(fullpath)
        self.func(fullpath, data, stat)
    except Exception:  # pylint: disable=W0703
        # exc_info=True keeps the message and traceback; the exception class
        # alone is rarely enough to diagnose a failing callback.
        _LOGGER.critical('Unexpected error: %s', sys.exc_info()[0],
                         exc_info=True)
def cleanup_server_trace(zkclient, batch_size):
    """Move expired traces into history folder, compressed as sqlite db.

    Repeatedly scans all server trace shards, picks the ``batch_size``
    events with the lowest timestamps and uploads them with
    ``_zk.upload_batch``. Stops as soon as a scan yields fewer than
    ``batch_size`` events, then logs the totals.
    """
    num_events = uploaded_events = 0

    while True:
        num_events = 0
        batch = []

        for shard in zkclient.get_children(z.SERVER_TRACE):
            events = zkclient.get_children(z.path.server_trace_shard(shard))
            num_events += len(events)

            shard_traces = []
            for event in events:
                # Event names are split as "servername,timestamp,rest".
                _servername, stamp, _rest = event.split(',', 2)
                shard_traces.append((float(stamp), shard, event))

            # Sort traces from older to latest.
            shard_traces.sort()

            # Keep batch_size traces ordered by timestamp.
            merged = list(heapq.merge(batch, shard_traces))
            batch = merged[:batch_size]

        if len(batch) < batch_size:
            _LOGGER.info('Traces: batch = %s, total = %s, exiting.',
                         batch_size, len(batch))
            break

        db_rows = []
        for timestamp, shard, event in batch:
            db_rows.append((
                z.join_zookeeper_path(z.SERVER_TRACE, shard, event),
                timestamp,
                None,
                z.join_zookeeper_path(z.SERVER_TRACE, shard),
                event,
            ))

        _zk.upload_batch(
            zkclient,
            z.path.server_trace_history('server_trace.db.gzip-'),
            SERVER_TRACE_SOW_TABLE,
            db_rows
        )
        uploaded_events += len(db_rows)

    _LOGGER.info('Cleaned up %s server trace events, live events: %s',
                 uploaded_events, num_events)
def snapshot(zkclient, root, zkpath='/'):
    """Create a snapshot of ZK state to the filesystem.

    Writes the data and stat of ``zkpath`` to the file chosen by ``_fpath``
    and then recurses into every child. Paths listed in ``_ZK_BLACKLIST``
    are skipped together with their subtree.
    """
    if zkpath in _ZK_BLACKLIST:
        return

    _LOGGER.debug('snapshot %s', zkpath)

    target = _fpath(root, zkpath)
    parent_dir = os.path.dirname(target)
    fs.mkdir_safe(parent_dir)

    data, stat = zkclient.get(zkpath)
    _write_data(target, data, stat)

    for child in zkclient.get_children(zkpath):
        snapshot(zkclient, root, z.join_zookeeper_path(zkpath, child))
def get_endpoints_zk(self, watch_cb=None):
    """Returns the current list of endpoints.

    The result is the set of children of the prefix's endpoints node whose
    name matches ``<self.pattern>:*:<self.endpoint>``. When the node does
    not exist an empty set is returned and, if ``watch_cb`` is given, an
    ``exists`` watch is placed on the node instead.
    """
    endpoints_path = z.join_zookeeper_path(z.ENDPOINTS, self.prefix)
    full_pattern = ':'.join([self.pattern, '*', self.endpoint])

    try:
        endpoints = self.zkclient.get_children(
            endpoints_path, watch=watch_cb
        )
    except kazoo.exceptions.NoNodeError:
        if watch_cb:
            self.zkclient.exists(endpoints_path, watch=watch_cb)
        return set()

    return {
        name for name in endpoints
        if fnmatch.fnmatch(name, full_pattern)
    }
def _instance_healthy(instance, endpoints):
    """helper to see if instance is healthy (connectable)

    ``instance`` is split as ``proid.instance_name``. Every endpoint whose
    name starts with the instance name is looked up in Zookeeper and its
    ``host:port`` payload is probed with ``checkout.connect``. Returns False
    on the first failed connection, True otherwise.
    """
    proid, instance_name = instance.split('.', 1)
    own_endpoints = [
        name for name in endpoints if name.startswith(instance_name)
    ]

    zkclient = context.GLOBAL.zk.conn

    def _connectable(endpoint):
        # Resolve the endpoint node to host/port and try to connect.
        node = z.join_zookeeper_path(z.ENDPOINTS, proid, endpoint)
        hostport, _metadata = zkclient.get(node)
        host, port = hostport.decode().split(':')
        return checkout.connect(host, port)

    return all(_connectable(endpoint) for endpoint in own_endpoints)
def refresh_zk(self, zknodes):
    """Parse data from Zookeeper nodes.

    NOTE: This is intended to be called with the output of a
    `:func:get_children` or in the callback of a `:class:ChildrenWatch`.

    Node names are split as ``name#chksum#seq``. The result, stored in
    ``self._zkdata``, maps each name to its entries ordered by descending
    sequence number.
    """
    by_name = {}
    for node in zknodes:
        name, chksum, seq = node.split('#', 2)
        entries = by_name.setdefault(name, [])
        entries.append(
            ZkDataEntry(
                zname=z.join_zookeeper_path(self._zkpath, node),
                chksum=chksum,
                seq=int(seq)
            )
        )

    # Sort nodes by their sequence numbers, highest first.
    for entries in by_name.values():
        entries.sort(key=lambda entry: entry.seq, reverse=True)

    self._zkdata = by_name
def _watch_instances(children):
    """Watch for proid instances.

    Brings ``state[proid]`` in line with ``children``: entries no longer
    listed are removed and the decoded payload of new ones is added.
    Always returns True.
    """
    known = set(state[proid].keys())
    wanted = set(children)

    # Forget endpoints that disappeared.
    for gone in known - wanted:
        del state[proid][gone]

    # Load the payload of newly appeared endpoints.
    fresh = {}
    for name in wanted - known:
        endpoint_node = z.join_zookeeper_path(proid_instances, name)
        try:
            data, _metadata = zkclient.get(endpoint_node)
        except kazoo.client.NoNodeError:
            # Node vanished between listing and reading.
            continue
        fresh[name] = data.decode()

    state[proid].update(fresh)
    return True
def _watch_instances(children):
    """Watch for proid instances.

    Brings ``state[proid]`` in line with ``children``: entries no longer
    listed are removed and the decoded payload of new ones is added.
    Always returns True.
    """
    # TODO: current implementation does not support instances, so
    #       state from masters will be stored, but will be never
    #       displayed.
    known = set(state[proid].keys())
    wanted = set(children)

    # Forget endpoints that disappeared.
    for gone in known - wanted:
        del state[proid][gone]

    # Load the payload of newly appeared endpoints.
    fresh = {}
    for name in wanted - known:
        endpoint_node = z.join_zookeeper_path(proid_instances, name)
        try:
            data, _metadata = zkclient.get(endpoint_node)
        except kazoo.client.NoNodeError:
            # Node vanished between listing and reading.
            continue
        fresh[name] = data.decode()

    state[proid].update(fresh)
    return True
def run(no_lock, proid, root):
    """Run Treadmill DNS endpoint engine.

    Ensures the proid's endpoints node exists, derives the matching
    directory under ``root`` and starts ``_do_watch`` on it -- immediately
    when ``no_lock`` is set, otherwise while holding the election lock.
    Exits with status 1 when the derived directory is not an absolute path.
    """
    zkclient = context.GLOBAL.zk.conn

    zkendpointpath = z.join_zookeeper_path(z.ENDPOINTS, proid)
    zkclient.ensure_path(zkendpointpath)

    zk2fs_endpointpath = '{0}{1}'.format(root, zkendpointpath)
    if not os.path.isabs(zk2fs_endpointpath):
        _LOGGER.error('Invalid path: %s', zk2fs_endpointpath)
        sys.exit(1)

    if no_lock:
        _do_watch(zkclient=zkclient, zkfs_dir=zk2fs_endpointpath)
        return

    lock = zkutils.make_lock(zkclient, z.path.election(__name__))
    _LOGGER.info('Waiting for leader lock.')
    with lock:
        _do_watch(zkclient=zkclient, zkfs_dir=zk2fs_endpointpath)
def _get_endpoints(proid):
    """get all endpoints of a proid

    Returns the children of the proid's endpoints node, read through the
    global Zookeeper connection.
    """
    conn = context.GLOBAL.zk.conn
    return conn.get_children(z.join_zookeeper_path(z.ENDPOINTS, proid))