def add_nodes(client, apps_client, cfile, kinds, counts, create=False,
              prefix=None):
    previously_created_pods_list = []
    expected_counts = []
    for i in range(len(kinds)):
        print('Adding %d %s server node(s) to cluster...' %
              (counts[i], kinds[i]))

        pods = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                          label_selector='role=' +
                                          kinds[i]).items

        previously_created_pods_list.append(
            get_current_pod_container_pairs(pods))

        prev_count = util.get_previous_count(client, kinds[i])
        util.run_process(['./modify_ig.sh', kinds[i],
                          str(counts[i] + prev_count)])
        expected_counts.append(counts[i] + prev_count)

    util.run_process(['./validate_cluster.sh'])

    management_ip = util.get_pod_ips(client, 'role=management')[0]
    route_ips = util.get_pod_ips(client, 'role=routing')
    if len(route_ips) > 0:
        seed_ip = random.choice(route_ips)
    else:
        seed_ip = ''

    mon_str = ' '.join(util.get_pod_ips(client, 'role=monitoring'))
    route_str = ' '.join(route_ips)
    sched_str = ' '.join(util.get_pod_ips(client, 'role=scheduler'))

    route_addr = util.get_service_address(client, 'routing-service')
    function_addr = util.get_service_address(client, 'function-service')

    for i in range(len(kinds)):
        kind = kinds[i]

        # Create should only be true when the DaemonSet is being created for
        # the first time -- i.e., when this is called from create_cluster.
        # After that, we can ignore this flag because the DaemonSet will take
        # care of adding pods to newly created nodes.
        if create:
            fname = 'yaml/ds/%s-ds.yml' % kind
            yml = util.load_yaml(fname, prefix)

            for container in yml['spec']['template']['spec']['containers']:
                env = container['env']
                util.replace_yaml_val(env, 'ROUTING_IPS', route_str)
                util.replace_yaml_val(env, 'ROUTE_ADDR', route_addr)
                util.replace_yaml_val(env, 'SCHED_IPS', sched_str)
                util.replace_yaml_val(env, 'FUNCTION_ADDR', function_addr)
                util.replace_yaml_val(env, 'MON_IPS', mon_str)
                util.replace_yaml_val(env, 'MGMT_IP', management_ip)
                util.replace_yaml_val(env, 'SEED_IP', seed_ip)

            apps_client.create_namespaced_daemon_set(namespace=util.NAMESPACE,
                                                     body=yml)

        # Wait until all pods of this kind are running.
        res = []
        while len(res) != expected_counts[i]:
            res = util.get_pod_ips(client, 'role=' + kind, is_running=True)

        pods = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                          label_selector='role=' +
                                          kind).items

        created_pods = get_current_pod_container_pairs(pods)
        new_pods = created_pods.difference(previously_created_pods_list[i])

        # Copy the KVS config into all recently created pods.
        os.system('cp %s ./anna-config.yml' % cfile)

        for pname, cname in new_pods:
            if kind != 'function' and kind != 'gpu':
                util.copy_file_to_pod(client, 'anna-config.yml', pname,
                                      '/hydro/anna/conf/', cname)
            else:
                if cname == 'cache-container':
                    # For the cache pods, we also copy the conf into the
                    # cache conf directory.
                    util.copy_file_to_pod(client, 'anna-config.yml', pname,
                                          '/hydro/anna-cache/conf/', cname)

        os.system('rm ./anna-config.yml')
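
# add_nodes relies on get_current_pod_container_pairs, which is not defined in
# this section. The sketch below is an assumption about its behavior -- it
# simply collects (pod name, container name) pairs from a Kubernetes pod list
# so that newly created pods can be identified by set difference. The real
# implementation elsewhere in the repo may differ.
def get_current_pod_container_pairs(pods):
    pod_container_pairs = set()
    for pod in pods:
        pname = pod.metadata.name
        for container in pod.spec.containers:
            pod_container_pairs.add((pname, container.name))

    return pod_container_pairs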
def check_hash_ring(client, context):
    route_ips = util.get_pod_ips(client, 'role=routing')

    # If there are no routing nodes in the system currently, the system is
    # still starting, so we do nothing.
    if not route_ips:
        return

    ip = random.choice(route_ips)

    # Retrieve a list of all current members of the cluster.
    socket = context.socket(zmq.REQ)
    socket.connect(get_routing_seed_address(ip, 0))
    socket.send_string('')
    resp = socket.recv()

    cluster = ClusterMembership()
    cluster.ParseFromString(resp)
    tiers = cluster.tiers

    # If there are no tiers, then we don't need to evaluate anything.
    if len(tiers) == 0:
        return
    elif len(tiers) == 1:
        # If there is one tier, it will be the memory tier.
        mem_tier, ebs_tier = tiers[0], None
    else:
        # If there are two tiers, we need to make sure that we assign the
        # correct tiers as the memory and EBS tiers, respectively.
        if tiers[0].tier_id == MEMORY:
            mem_tier = tiers[0]
            ebs_tier = tiers[1]
        else:
            mem_tier = tiers[1]
            ebs_tier = tiers[0]

    # Query the Kubernetes master for the list of memory nodes it is aware of
    # -- if any of the nodes in the hash ring aren't currently running, we add
    # them to the departed list.
    mem_ips = util.get_pod_ips(client, 'role=memory')
    departed = []
    for node in mem_tier.servers:
        if node.private_ip not in mem_ips:
            departed.append(('0', node))

    # Perform the same process for the EBS tier if it exists.
    ebs_ips = []
    if ebs_tier:
        ebs_ips = util.get_pod_ips(client, 'role=ebs')
        for node in ebs_tier.servers:
            if node.private_ip not in ebs_ips:
                departed.append(('1', node))

    logging.debug('Found %d departed nodes.' % (len(departed)))
    mon_ips = util.get_pod_ips(client, 'role=monitoring')
    storage_ips = mem_ips + ebs_ips

    # For each departed node the cluster is unaware of, we inform all storage
    # nodes, all routing nodes, and all monitoring nodes that it has departed.
    for pair in departed:
        logging.info('Informing cluster that node %s/%s has departed.' %
                     (pair[1].public_ip, pair[1].private_ip))
        msg = pair[0] + ':' + pair[1].public_ip + ':' + pair[1].private_ip

        # NOTE: In this code, we are presuming there are 4 threads per
        # storage/routing node. If there are more, this will be buggy; if
        # there are fewer, this is fine because the extra messages will go
        # into the void.
        for ip in storage_ips:
            for t in range(4):
                send_message(context, msg, get_storage_depart_address(ip, t))

        msg = 'depart:' + msg
        for ip in route_ips:
            for t in range(4):
                send_message(context, msg, get_routing_depart_address(ip, t))

        for ip in mon_ips:
            send_message(context, msg, get_monitoring_depart_address(ip))
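
# check_hash_ring uses send_message and several get_*_depart_address helpers
# that are defined elsewhere in the codebase. The sketch below is a minimal,
# assumed version of send_message only -- a fire-and-forget PUSH over ZeroMQ
# -- and is not necessarily the repo's actual implementation.
def send_message(context, message, address):
    # Connect a transient PUSH socket to the target and send the string
    # without waiting for an acknowledgment.
    socket = context.socket(zmq.PUSH)
    socket.connect(address)
    socket.send_string(message)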
def run(self_ip):
    context = zmq.Context(1)
    pusher_cache = SocketCache(context, zmq.PUSH)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    list_executors_socket = context.socket(zmq.PULL)
    list_executors_socket.bind('tcp://*:7002')

    function_status_socket = context.socket(zmq.PULL)
    function_status_socket.bind('tcp://*:7003')

    list_schedulers_socket = context.socket(zmq.REP)
    list_schedulers_socket.bind('tcp://*:7004')

    executor_depart_socket = context.socket(zmq.PULL)
    executor_depart_socket.bind('tcp://*:7005')

    statistics_socket = context.socket(zmq.PULL)
    statistics_socket.bind('tcp://*:7006')

    pin_accept_socket = context.socket(zmq.PULL)
    pin_accept_socket.setsockopt(zmq.RCVTIMEO, 10000)  # 10 seconds.
    pin_accept_socket.bind('tcp://*:' + PIN_ACCEPT_PORT)

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)
    poller.register(function_status_socket, zmq.POLLIN)
    poller.register(list_executors_socket, zmq.POLLIN)
    poller.register(list_schedulers_socket, zmq.POLLIN)
    poller.register(executor_depart_socket, zmq.POLLIN)
    poller.register(statistics_socket, zmq.POLLIN)

    add_push_socket = context.socket(zmq.PUSH)
    add_push_socket.connect('ipc:///tmp/node_add')

    remove_push_socket = context.socket(zmq.PUSH)
    remove_push_socket.connect('ipc:///tmp/node_remove')

    client, _ = util.init_k8s()

    scaler = DefaultScaler(self_ip, context, add_push_socket,
                           remove_push_socket, pin_accept_socket)
    policy = DefaultHydroPolicy(scaler)

    # Tracks the self-reported statuses of each executor thread in the
    # system.
    executor_statuses = {}

    # Tracks which executors are departing. This is used to ensure all
    # threads acknowledge that they are finished before we remove the node
    # from the system.
    departing_executors = {}

    # Tracks how often each function is called.
    function_frequencies = {}

    # Tracks the aggregated runtime for each function.
    function_runtimes = {}

    # Tracks the arrival times of DAG requests.
    arrival_times = {}

    # Tracks how often each DAG is called.
    dag_frequencies = {}

    # Tracks how long each DAG request spends in the system, end to end.
    dag_runtimes = {}

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if (churn_pull_socket in socks and
                socks[churn_pull_socket] == zmq.POLLIN):
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                scaler.add_vms(args[2], args[1])
            elif args[0] == 'remove':
                scaler.remove_vms(args[2], args[1])

        if (restart_pull_socket in socks and
                socks[restart_pull_socket] == zmq.POLLIN):
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            pod = util.get_pod_from_ip(client, args[1])
            count = str(pod.status.container_statuses[0].restart_count)

            restart_pull_socket.send_string(count)

        if (list_executors_socket in socks and
                socks[list_executors_socket] == zmq.POLLIN):
            # We can safely ignore this message's contents, as the response
            # does not depend on it.
            response_ip = list_executors_socket.recv_string()

            ips = StringSet()
            for ip in util.get_pod_ips(client, 'role=function'):
                ips.keys.append(ip)
            for ip in util.get_pod_ips(client, 'role=gpu'):
                ips.keys.append(ip)

            sckt = pusher_cache.get(response_ip)
            sckt.send(ips.SerializeToString())

        if (function_status_socket in socks and
                socks[function_status_socket] == zmq.POLLIN):
            # Dequeue all available ThreadStatus messages rather than
            # processing them one at a time -- this prevents starvation if
            # other operations (e.g., pin) take a long time.
            while True:
                status = ThreadStatus()
                try:
                    status.ParseFromString(
                        function_status_socket.recv(zmq.DONTWAIT))
                except zmq.ZMQError:
                    break  # We've run out of messages.

                key = (status.ip, status.tid)

                # If this executor is one of the ones that's currently
                # departing, we can just ignore its status updates since we
                # don't want utilization to be skewed downwards. The reason
                # we might still receive this message is that the depart
                # message may not have arrived when this status was sent.
                if key[0] in departing_executors:
                    continue

                executor_statuses[key] = status
                # logging.info(('Received thread status update from %s:%d: '
                #               '%.4f occupancy, %d functions pinned') %
                #              (status.ip, status.tid, status.utilization,
                #               len(status.functions)))
                logging.info(f'Functions {status.functions} are placed on '
                             f'node {status.ip}:{status.tid}')

        if (list_schedulers_socket in socks and
                socks[list_schedulers_socket] == zmq.POLLIN):
            # We can safely ignore this message's contents, as the response
            # does not depend on it.
            list_schedulers_socket.recv_string()

            ips = StringSet()
            for ip in util.get_pod_ips(client, 'role=scheduler'):
                ips.keys.append(ip)

            list_schedulers_socket.send(ips.SerializeToString())

        if (executor_depart_socket in socks and
                socks[executor_depart_socket] == zmq.POLLIN):
            ip = executor_depart_socket.recv_string()
            departing_executors[ip] -= 1

            # We wait until all the threads at this executor have
            # acknowledged that they are ready to leave, and we then remove
            # the VM from the system.
            if departing_executors[ip] == 0:
                logging.info('Removing node with IP %s.' % ip)
                scaler.remove_vms('function', ip)
                del departing_executors[ip]

        if (statistics_socket in socks and
                socks[statistics_socket] == zmq.POLLIN):
            stats = ExecutorStatistics()
            stats.ParseFromString(statistics_socket.recv())

            # Aggregate statistics reported for individual functions,
            # including call frequencies, processed requests, and total
            # runtimes.
            for fstats in stats.functions:
                fname = fstats.name

                if fname not in function_frequencies:
                    function_frequencies[fname] = 0

                if fname not in function_runtimes:
                    function_runtimes[fname] = (0.0, 0)

                if fstats.runtime:
                    old_latency = function_runtimes[fname]

                    # This tracks how many calls were processed for the
                    # function and the total runtime of all of those calls.
                    function_runtimes[fname] = (
                        old_latency[0] + sum(fstats.runtime),
                        old_latency[1] + fstats.call_count)
                else:
                    # This tracks how many calls are made to the function.
                    function_frequencies[fname] += fstats.call_count

            # Aggregate statistics for DAG requests, including call
            # frequencies, arrival rates, and end-to-end runtimes.
            for dstats in stats.dags:
                dname = dstats.name

                # Tracks the interarrival rates of requests to this DAG as
                # perceived by the scheduler.
                if dname not in arrival_times:
                    arrival_times[dname] = []
                arrival_times[dname] += list(dstats.interarrival)

                # Tracks how many calls to this DAG were received.
                if dname not in dag_frequencies:
                    dag_frequencies[dname] = 0
                dag_frequencies[dname] += dstats.call_count

                # Tracks the end-to-end runtime of individual requests
                # completed in the last epoch.
                if dname not in dag_runtimes:
                    dag_runtimes[dname] = []
                for rt in dstats.runtimes:
                    dag_runtimes[dname].append(rt)

        end = time.time()
        if end - start > REPORT_PERIOD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            # Invoke the configured policy to check system load and respond
            # appropriately.
            policy.replica_policy(function_frequencies, function_runtimes,
                                  dag_runtimes, executor_statuses,
                                  arrival_times)

            # TODO(simon): This disables the node scaling policy, which is
            # what we want for a static experiment environment.
            # policy.executor_policy(executor_statuses, departing_executors)

            # Clear all metadata that was passed in for this epoch.
            function_runtimes.clear()
            function_frequencies.clear()
            dag_runtimes.clear()
            arrival_times.clear()

            # Restart the timer for the next reporting epoch.
            start = time.time()
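
# A minimal entry-point sketch for launching this management loop. The
# MGMT_IP environment variable name is an assumption about how the pod's own
# IP is exposed; the actual deployment may wire this up differently.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    run(os.environ['MGMT_IP'])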
def create_cluster(mem_count, ebs_count, func_count, sched_count,
                   route_count, bench_count, cfile, ssh_key, cluster_name,
                   kops_bucket, aws_key_id, aws_key):
    if 'HYDRO_HOME' not in os.environ:
        raise ValueError('HYDRO_HOME environment variable must be set to be '
                         'the directory where all Hydro project repos are '
                         'located.')
    prefix = os.path.join(os.environ['HYDRO_HOME'], 'cluster/hydro/cluster')

    util.run_process(['./create_cluster_object.sh', kops_bucket, ssh_key])

    client, apps_client = util.init_k8s()

    print('Creating management pods...')
    management_spec = util.load_yaml('yaml/pods/management-pod.yml', prefix)
    env = management_spec['spec']['containers'][0]['env']

    util.replace_yaml_val(env, 'AWS_ACCESS_KEY_ID', aws_key_id)
    util.replace_yaml_val(env, 'AWS_SECRET_ACCESS_KEY', aws_key)
    util.replace_yaml_val(env, 'KOPS_STATE_STORE', kops_bucket)
    util.replace_yaml_val(env, 'HYDRO_CLUSTER_NAME', cluster_name)

    client.create_namespaced_pod(namespace=util.NAMESPACE,
                                 body=management_spec)

    # Wait until the management pod is running before moving forward -- we
    # need to do this because other pods depend on knowing the management
    # pod's IP address.
    management_ip = util.get_pod_ips(client, 'role=management',
                                     is_running=True)[0]

    # Copy the kube config file into the management pod so that it can
    # execute kubectl commands, in addition to the SSH keys and KVS config.
    management_podname = management_spec['metadata']['name']
    kcname = management_spec['spec']['containers'][0]['name']

    os.system('cp %s anna-config.yml' % cfile)
    kubecfg = os.path.join(os.environ['HOME'], '.kube/config')
    util.copy_file_to_pod(client, kubecfg, management_podname,
                          '/root/.kube/', kcname)
    util.copy_file_to_pod(client, ssh_key, management_podname,
                          '/root/.ssh/', kcname)
    util.copy_file_to_pod(client, ssh_key + '.pub', management_podname,
                          '/root/.ssh/', kcname)
    util.copy_file_to_pod(client, 'anna-config.yml', management_podname,
                          '/hydro/anna/conf/', kcname)

    # Start the monitoring pod.
    mon_spec = util.load_yaml('yaml/pods/monitoring-pod.yml', prefix)
    util.replace_yaml_val(mon_spec['spec']['containers'][0]['env'],
                          'MGMT_IP', management_ip)
    client.create_namespaced_pod(namespace=util.NAMESPACE, body=mon_spec)

    # Wait until the monitoring pod is finished creating to get its IP
    # address, and then copy the KVS config into the monitoring pod.
    util.get_pod_ips(client, 'role=monitoring')
    util.copy_file_to_pod(client, 'anna-config.yml',
                          mon_spec['metadata']['name'],
                          '/hydro/anna/conf/',
                          mon_spec['spec']['containers'][0]['name'])
    os.system('rm anna-config.yml')

    print('Creating %d routing nodes...' % (route_count))
    add_nodes(client, apps_client, cfile, ['routing'], [route_count], True,
              prefix)
    util.get_pod_ips(client, 'role=routing')

    print('Creating %d memory, %d ebs node(s)...' % (mem_count, ebs_count))
    add_nodes(client, apps_client, cfile, ['memory', 'ebs'],
              [mem_count, ebs_count], True, prefix)

    print('Creating routing service...')
    service_spec = util.load_yaml('yaml/services/routing.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d scheduler nodes...' % (sched_count))
    add_nodes(client, apps_client, cfile, ['scheduler'], [sched_count], True,
              prefix)
    util.get_pod_ips(client, 'role=scheduler')

    print('Adding %d function serving nodes...' % (func_count))
    add_nodes(client, apps_client, cfile, ['function'], [func_count], True,
              prefix)

    print('Creating function service...')
    service_spec = util.load_yaml('yaml/services/function.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d benchmark nodes...' % (bench_count))
    add_nodes(client, apps_client, cfile, ['benchmark'], [bench_count], True,
              prefix)

    print('Finished creating all pods...')
    os.system('touch setup_complete')
    util.copy_file_to_pod(client, 'setup_complete', management_podname,
                          '/hydro', kcname)
    os.system('rm setup_complete')

    # NOTE: ec2_client is assumed to be a module-level boto3 EC2 client
    # defined elsewhere in this file.
    sg_name = 'nodes.' + cluster_name
    sg = ec2_client.describe_security_groups(Filters=[{
        'Name': 'group-name',
        'Values': [sg_name]
    }])['SecurityGroups'][0]

    print('Authorizing ports for routing service...')
    permission = [{
        'FromPort': 6200,
        'IpProtocol': 'tcp',
        'ToPort': 6203,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }]

    ec2_client.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                                IpPermissions=permission)

    routing_svc_addr = util.get_service_address(client, 'routing-service')
    function_svc_addr = util.get_service_address(client, 'function-service')
    print('The routing service can be accessed here: \n\t%s' %
          (routing_svc_addr))
    print('The function service can be accessed here: \n\t%s' %
          (function_svc_addr))
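
# Illustrative usage of create_cluster. All values below are hypothetical
# placeholders (cluster name, bucket, key paths, and node counts), not
# defaults from this repo; the actual invocation may go through a separate
# CLI wrapper.
#
#   create_cluster(mem_count=3, ebs_count=0, func_count=2, sched_count=1,
#                  route_count=1, bench_count=0,
#                  cfile='conf/anna-config.yml',
#                  ssh_key='/home/ubuntu/.ssh/id_rsa',
#                  cluster_name='hydro.k8s.local',
#                  kops_bucket='my-kops-state-store',
#                  aws_key_id='<AWS_ACCESS_KEY_ID>',
#                  aws_key='<AWS_SECRET_ACCESS_KEY>')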