def restart(pod_ip):
    pod = util.get_pod_from_ip(client, pod_ip)
    pname = pod.metadata.name
    cname = pod.spec.containers[0].name

    # Kill every process in the container so Kubernetes restarts it.
    kill_cmd = 'kubectl exec -it %s -c %s -- /sbin/killall5' % (pname, cname)
    subprocess.run(kill_cmd, shell=True)

    # Wait for the pod to come back up at the same IP.
    pod_ips = util.get_pod_ips(client, selector='role=aft', is_running=True)
    while pod_ip not in pod_ips:
        pod_ips = util.get_pod_ips(client, selector='role=aft',
                                   is_running=True)

    # Send config file to the pod, backing off between retries.
    retry = 0
    while True:
        try:
            sendConfig(pod_ip, None)
            break
        except Exception as e:
            retry += 1
            print('Caught exception')

            if retry >= 5:
                print('Out of retries...')
                print(e)
                return

            print('Retrying in %d sec' % (retry * 10))
            time.sleep(retry * 10)

    print('Restarted aft node at %s' % pod_ip)
def run():
    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)

    # Waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now.
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    while True:
        socks = dict(poller.poll())

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                add_nodes(client, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).'
                             % (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % ip)

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)
            logging.info('Returning restart count ' + count + ' for IP ' +
                         ip + '.')
            restart_pull_socket.send_string(count)
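
# A minimal sketch (not part of the original source) of how a client might
# drive the two sockets bound in run() above: the churn PULL socket on port
# 7001 accepts 'add:<num>:<ntype>' and 'remove:<ip>:<ntype>' strings, and the
# REP socket on port 7000 expects a ':'-separated message whose second field
# is a pod IP, replying with that pod's restart count. The management IP, the
# example pod IP, and the 'memory' node type are illustrative assumptions.
def example_churn_client(mgmt_ip='127.0.0.1'):
    import zmq
    ctx = zmq.Context(1)

    # Ask the management daemon to add two new (hypothetical) 'memory' nodes.
    churn_sock = ctx.socket(zmq.PUSH)
    churn_sock.connect('tcp://%s:7001' % mgmt_ip)
    churn_sock.send_string('add:2:memory')

    # Ask for the restart count of the pod at a hypothetical IP; only the
    # second field of the message is inspected by the handler above.
    restart_sock = ctx.socket(zmq.REQ)
    restart_sock.connect('tcp://%s:7000' % mgmt_ip)
    restart_sock.send_string('restart:10.0.0.12')
    print('Restart count:', restart_sock.recv_string())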
def remove_node(ip, ntype):
    client, _ = util.init_k8s()

    pod = util.get_pod_from_ip(client, ip)
    hostname = 'ip-%s.ec2.internal' % (ip.replace('.', '-'))
    podname = pod.metadata.name

    # Delete the pod and then the underlying node.
    client.delete_namespaced_pod(name=podname, namespace=util.NAMESPACE,
                                 body=k8s.client.V1DeleteOptions())
    client.delete_node(name=hostname, body=k8s.client.V1DeleteOptions())

    # Shrink the corresponding instance group by one.
    prev_count = util.get_previous_count(client, ntype)
    util.run_process(['./modify_ig.sh', ntype, str(prev_count - 1)])
def sendConfig(nodeIP, configFile):
    pod = util.get_pod_from_ip(client, nodeIP)
    pname = pod.metadata.name

    # There is only 1 container in each pod.
    cname = pod.spec.containers[0].name

    cfile = configFile if configFile else BASE_CONFIG_FILE
    os.system(('cp %s ' + CONFIG_FILE) % cfile)

    util.copy_file_to_pod(client, CONFIG_FILE[2:], pname, POD_CONFIG_DIR,
                          cname)
    os.system('rm ' + CONFIG_FILE)
def sendConfig(nodeIP, configFile):
    pod = util.get_pod_from_ip(client, nodeIP)
    pname = pod.metadata.name

    # There is only 1 container in each pod.
    cname = pod.spec.containers[0].name

    cfile = configFile if configFile else BASE_CONFIG_FILE
    os.system(('cp %s ' + CONFIG_FILE) % cfile)

    util.copy_file_to_pod(client, CONFIG_FILE[2:], pname, POD_CONFIG_DIR,
                          cname)
    os.system('rm ' + CONFIG_FILE)

    # Send the replica list to the pod.
    replica_ips = util.get_node_ips(client, 'role=aft', 'ExternalIP')
    with open('replicas.txt', 'w') as f:
        for ip in replica_ips:
            f.write(ip + '\n')

    util.copy_file_to_pod(client, 'replicas.txt', pname,
                          '/go/src/github.com/tajshaik24/aft',
                          'aft-container')
    os.system('rm replicas.txt')
def run():
    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    list_executors_socket = context.socket(zmq.REP)
    list_executors_socket.bind('tcp://*:7002')

    function_status_socket = context.socket(zmq.PULL)
    function_status_socket.bind('tcp://*:7003')

    list_schedulers_socket = context.socket(zmq.REP)
    list_schedulers_socket.bind('tcp://*:7004')

    executor_depart_socket = context.socket(zmq.PULL)
    executor_depart_socket.bind('tcp://*:7005')

    executor_statistics_socket = context.socket(zmq.PULL)
    executor_statistics_socket.bind('tcp://*:7006')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)
    poller.register(function_status_socket, zmq.POLLIN)
    poller.register(list_executors_socket, zmq.POLLIN)
    poller.register(list_schedulers_socket, zmq.POLLIN)
    poller.register(executor_depart_socket, zmq.POLLIN)
    poller.register(executor_statistics_socket, zmq.POLLIN)

    add_push_socket = context.socket(zmq.PUSH)
    add_push_socket.connect('ipc:///tmp/node_add')

    remove_push_socket = context.socket(zmq.PUSH)
    remove_push_socket.connect('ipc:///tmp/node_remove')

    # Waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now.
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    # Track the self-reported status of each function execution thread.
    executor_statuses = {}
    departing_executors = {}
    function_frequencies = {}
    function_runtimes = {}
    latency_history = {}

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            # Forward the request as an 'ntype:arg' message, matching the
            # format used for departing executors below.
            if args[0] == 'add':
                msg = args[2] + ':' + args[1]
                add_push_socket.send_string(msg)
            elif args[0] == 'remove':
                msg = args[2] + ':' + args[1]
                remove_push_socket.send_string(msg)

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)
            logging.info('Returning restart count %s for IP %s.'
                         % (count, ip))
            restart_pull_socket.send_string(count)

        if list_executors_socket in socks and socks[list_executors_socket] == \
                zmq.POLLIN:
            # It doesn't matter what is in this message.
            msg = list_executors_socket.recv()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=function'):
                ks.keys.append(ip)

            list_executors_socket.send(ks.SerializeToString())

        if function_status_socket in socks and \
                socks[function_status_socket] == zmq.POLLIN:
            status = ThreadStatus()
            status.ParseFromString(function_status_socket.recv())

            key = (status.ip, status.tid)

            # If this executor is one of the ones that's currently departing,
            # we can just ignore its status updates since we don't want
            # utilization to be skewed downwards.
            if key in departing_executors:
                continue

            executor_statuses[key] = status
            logging.info(('Received thread status update from %s:%d: %.4f ' +
                          'occupancy, %d functions pinned') %
                         (status.ip, status.tid, status.utilization,
                          len(status.functions)))

        if list_schedulers_socket in socks and \
                socks[list_schedulers_socket] == zmq.POLLIN:
            # It doesn't matter what is in this message.
            msg = list_schedulers_socket.recv_string()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=scheduler'):
                ks.keys.append(ip)

            list_schedulers_socket.send(ks.SerializeToString())

        if executor_depart_socket in socks and \
                socks[executor_depart_socket] == zmq.POLLIN:
            ip = executor_depart_socket.recv_string()
            departing_executors[ip] -= 1

            # Wait until all the executors on this IP have cleared their
            # queues and left; then we remove the node.
            if departing_executors[ip] == 0:
                msg = 'function:' + ip
                remove_push_socket.send_string(msg)
                del departing_executors[ip]

        if executor_statistics_socket in socks and \
                socks[executor_statistics_socket] == zmq.POLLIN:
            stats = ExecutorStatistics()
            stats.ParseFromString(executor_statistics_socket.recv())

            for fstats in stats.statistics:
                fname = fstats.fname

                if fname not in function_frequencies:
                    function_frequencies[fname] = 0
                    function_runtimes[fname] = 0.0

                function_frequencies[fname] += fstats.call_count
                function_runtimes[fname] += fstats.runtime

        end = time.time()
        if end - start > REPORT_PERIOD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client, add_push_socket)

            check_executor_utilization(client, context, executor_statuses,
                                       departing_executors, add_push_socket)

            check_function_load(context, function_frequencies,
                                function_runtimes, executor_statuses,
                                latency_history)

            start = time.time()
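
# A minimal sketch (not part of the original source) of the reporting side of
# the function_status_socket handled above. It assumes the same ThreadStatus
# protobuf imported by this module, with the ip, tid, utilization, and
# functions fields that the handler reads; the management IP and the values
# below are illustrative assumptions only.
def example_status_report(mgmt_ip='127.0.0.1'):
    import zmq
    ctx = zmq.Context(1)

    sock = ctx.socket(zmq.PUSH)
    sock.connect('tcp://%s:7003' % mgmt_ip)

    status = ThreadStatus()
    status.ip = '10.0.0.34'        # hypothetical executor IP
    status.tid = 0
    status.utilization = 0.42      # fraction of time spent running functions
    status.functions.append('f1')  # names of functions pinned to this thread

    sock.send(status.SerializeToString())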
def run():
    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    extant_caches_socket = context.socket(zmq.REP)
    extant_caches_socket.bind('tcp://*:7002')

    func_pull_socket = context.socket(zmq.PULL)
    func_pull_socket.bind('tcp://*:7003')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)
    poller.register(func_pull_socket, zmq.POLLIN)
    poller.register(extant_caches_socket, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    # Waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now.
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    func_occ_map = {}

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                if len(args) > 3:
                    num_threads = int(args[3])
                else:
                    num_threads = 3

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                # Rewrite the thread count and capacity for this node type in
                # the base config before launching the new nodes.
                os.system('sed -i "s|%s: [0-9][0-9]*|%s: %d|g" %s'
                          % (ntype, ntype, num_threads, cfile))
                os.system('sed -i "s|%s-cap: [0-9][0-9]*|%s-cap: %d|g" %s'
                          % (ntype, ntype, num_threads * 15, cfile))

                add_nodes(client, cfile, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).'
                             % (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % ip)

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)
            logging.info('Returning restart count ' + count + ' for IP ' +
                         ip + '.')
            restart_pull_socket.send_string(count)

        if extant_caches_socket in socks and socks[extant_caches_socket] == \
                zmq.POLLIN:
            # It doesn't matter what is in this message.
            msg = extant_caches_socket.recv_string()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=function'):
                ks.keys.append(ip)

            extant_caches_socket.send(ks.SerializeToString())

        if func_pull_socket in socks and socks[func_pull_socket] == zmq.POLLIN:
            msg = func_pull_socket.recv_string()
            args = msg.split('|')

            ip, mutil = args[0], float(args[1])
            logging.info('Received node occupancy of %.2f%% from IP %s.'
                         % (mutil * 100, ip))

            func_occ_map[ip] = mutil

        end = time.time()
        if end - start > THRESHOLD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client)

            if func_occ_map:
                avg_focc = sum(func_occ_map.values()) / len(func_occ_map)
            else:
                avg_focc = 0
            logging.info('Average node occupancy is %f%%...'
                         % (avg_focc * 100))

            if avg_focc > FOCC_THRESHOLD:
                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_addr = get_service_address(client, 'routing-service')
                add_nodes(client, ['function'], [1], mon_ips,
                          route_addr=route_addr)

            start = time.time()
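
# A minimal sketch (not part of the original source) of querying the
# extant_caches_socket served above: send any request to port 7002 and parse
# the reply as the KeySet protobuf used by this module, whose repeated 'keys'
# field carries the 'role=function' pod IPs. The management IP is an
# illustrative assumption.
def example_list_caches(mgmt_ip='127.0.0.1'):
    import zmq
    ctx = zmq.Context(1)

    sock = ctx.socket(zmq.REQ)
    sock.connect('tcp://%s:7002' % mgmt_ip)

    sock.send_string('')  # the request body is ignored by the server
    ks = KeySet()
    ks.ParseFromString(sock.recv())

    return list(ks.keys)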
def run():
    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    # Waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now.
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                if len(args) > 3:
                    num_threads = int(args[3])
                else:
                    num_threads = 3

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                # Rewrite the thread count and capacity for this node type in
                # the base config before launching the new nodes.
                os.system('sed -i "s|%s: [0-9][0-9]*|%s: %d|g" %s'
                          % (ntype, ntype, num_threads, cfile))
                os.system('sed -i "s|%s-cap: [0-9][0-9]*|%s-cap: %d|g" %s'
                          % (ntype, ntype, num_threads * 15, cfile))

                add_nodes(client, cfile, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).'
                             % (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % ip)

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)
            logging.info('Returning restart count ' + count + ' for IP ' +
                         ip + '.')
            restart_pull_socket.send_string(count)

        end = time.time()
        if end - start > THRESHOLD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client)

            start = time.time()