def remove_node(ip, ntype):
    """Remove the node at ``ip`` from the cluster using ``delete_node.sh``.

    Args:
        ip: Dotted IP address of the node; used to look up its pod and to
            build the ``ip-<a>-<b>-<c>-<d>.ec2.internal`` hostname.
        ntype: Node kind, forwarded to ``util.get_previous_count`` and to
            the shell script.
    """
    k8s_client, _ = util.init_k8s()

    # The returned pod is not used below. The lookup itself is kept so the
    # sequence of calls made against the cluster is unchanged.
    util.get_pod_from_ip(k8s_client, ip)

    dashed_ip = ip.replace('.', '-')
    node_hostname = 'ip-%s.ec2.internal' % (dashed_ip)

    current_count = util.get_previous_count(k8s_client, ntype)

    # The script receives both the current and the reduced node count.
    command = [
        './delete_node.sh',
        node_hostname,
        ntype,
        str(current_count),
        str(current_count - 1),
    ]
    util.run_process(command)
def remove_node(ip, ntype):
    """Delete the pod and node at ``ip`` and shrink the ``ntype`` group.

    The pod found for ``ip`` is deleted first, then the node named
    ``ip-<a>-<b>-<c>-<d>.ec2.internal``, and finally ``modify_ig.sh`` is run
    with the previous count for ``ntype`` minus one.

    Args:
        ip: Dotted IP address of the node to remove.
        ntype: Node kind, forwarded to ``util.get_previous_count`` and to
            the shell script.
    """
    k8s_client, _ = util.init_k8s()
    target_pod = util.get_pod_from_ip(k8s_client, ip)
    node_hostname = 'ip-%s.ec2.internal' % (ip.replace('.', '-'))

    k8s_client.delete_namespaced_pod(
        name=target_pod.metadata.name,
        namespace=util.NAMESPACE,
        body=k8s.client.V1DeleteOptions(),
    )
    k8s_client.delete_node(
        name=node_hostname,
        body=k8s.client.V1DeleteOptions(),
    )

    # The count is read only after the deletions, as before.
    reduced_count = util.get_previous_count(k8s_client, ntype) - 1
    util.run_process(['./modify_ig.sh', ntype, str(reduced_count)])
def add_nodes(client, apps_client, cfile, kinds, counts, create=False,
              prefix=None):
    """Add ``counts[i]`` nodes of each ``kinds[i]`` and configure new pods.

    For every kind the instance group is resized with ``modify_ig.sh``, the
    cluster is validated once, and then (per kind) the function waits for
    the expected number of running pods and copies the KVS config file into
    every pod/container pair that did not exist before the resize.

    Args:
        client: Kubernetes client used for pod listing and file copies.
        apps_client: Kubernetes apps client used to create DaemonSets.
        cfile: Path of the KVS config file to distribute.
        kinds: Sequence of node kinds (used as ``role=<kind>`` selectors).
        counts: Number of nodes to add per kind; indexed in parallel with
            ``kinds`` and must be at least as long.
        create: When true, also create the DaemonSet for each kind from
            ``yaml/ds/<kind>-ds.yml``.
        prefix: Forwarded to ``util.load_yaml`` when ``create`` is true.

    Raises:
        OSError: If ``cfile`` cannot be copied to ``./anna-config.yml``.
    """
    # Local import so this function does not depend on the module's import
    # block being updated.
    import shutil

    previously_created_pods_list = []
    expected_counts = []

    for i, kind in enumerate(kinds):
        print('Adding %d %s server node(s) to cluster...' %
              (counts[i], kind))

        pods = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                          label_selector='role=' + kind).items
        # Snapshot the existing pod/container pairs so newly created ones
        # can be identified after the resize.
        previously_created_pods_list.append(
            get_current_pod_container_pairs(pods))

        prev_count = util.get_previous_count(client, kind)
        target_count = counts[i] + prev_count
        util.run_process(['./modify_ig.sh', kind, str(target_count)])
        expected_counts.append(target_count)

    util.run_process(['./validate_cluster.sh'])

    management_ip = util.get_pod_ips(client, 'role=management')[0]
    route_ips = util.get_pod_ips(client, 'role=routing')

    # With no routing pods yet there is nothing to seed from.
    seed_ip = random.choice(route_ips) if route_ips else ''

    mon_str = ' '.join(util.get_pod_ips(client, 'role=monitoring'))
    route_str = ' '.join(route_ips)
    sched_str = ' '.join(util.get_pod_ips(client, 'role=scheduler'))

    route_addr = util.get_service_address(client, 'routing-service')
    function_addr = util.get_service_address(client, 'function-service')

    for i, kind in enumerate(kinds):
        # Create should only be true when the DaemonSet is being created for
        # the first time -- i.e., when this is called from create_cluster.
        # After that, we can basically ignore this because the DaemonSet will
        # take care of adding pods to created nodes.
        if create:
            fname = 'yaml/ds/%s-ds.yml' % kind
            yml = util.load_yaml(fname, prefix)

            for container in yml['spec']['template']['spec']['containers']:
                env = container['env']

                util.replace_yaml_val(env, 'ROUTING_IPS', route_str)
                util.replace_yaml_val(env, 'ROUTE_ADDR', route_addr)
                util.replace_yaml_val(env, 'SCHED_IPS', sched_str)
                util.replace_yaml_val(env, 'FUNCTION_ADDR', function_addr)
                util.replace_yaml_val(env, 'MON_IPS', mon_str)
                util.replace_yaml_val(env, 'MGMT_IP', management_ip)
                util.replace_yaml_val(env, 'SEED_IP', seed_ip)

            apps_client.create_namespaced_daemon_set(
                namespace=util.NAMESPACE, body=yml)

        # Wait until all pods of this kind are running.
        # NOTE(review): this loop has no sleep of its own; presumably
        # util.get_pod_ips blocks or polls internally -- confirm.
        res = []
        while len(res) != expected_counts[i]:
            res = util.get_pod_ips(client, 'role=' + kind, is_running=True)

        pods = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                          label_selector='role=' + kind).items
        created_pods = get_current_pod_container_pairs(pods)
        new_pods = created_pods.difference(previously_created_pods_list[i])

        # Copy the KVS config into all recently created pods. The copy is
        # done in-process (no shell), so paths with spaces or shell
        # metacharacters work and a failed copy raises instead of being
        # ignored.
        shutil.copy(cfile, './anna-config.yml')
        try:
            for pname, cname in new_pods:
                if kind != 'function' and kind != 'gpu':
                    util.copy_file_to_pod(client, 'anna-config.yml', pname,
                                          '/hydro/anna/conf/', cname)
                elif cname == 'cache-container':
                    # For the cache pods, we also copy the conf into the
                    # cache conf directory.
                    util.copy_file_to_pod(client, 'anna-config.yml', pname,
                                          '/hydro/anna-cache/conf/', cname)
        finally:
            # Always clean up the temporary copy, even if a pod copy failed.
            try:
                os.remove('./anna-config.yml')
            except FileNotFoundError:
                # Already gone; removal is best-effort, as `rm` was.
                pass
def create_cluster(mem_count, ebs_count, func_count, sched_count, route_count,
                   bench_count, cfile, ssh_key, cluster_name, kops_bucket,
                   aws_key_id, aws_key):
    """Create the cluster object, its pods, DaemonSets and services.

    Steps, in order: run ``create_cluster_object.sh``; start the management
    pod and copy the kube config, SSH key pair and KVS config into it; start
    the monitoring pod; add routing, memory/ebs, scheduler, function and
    benchmark nodes through ``add_nodes``; create the routing and function
    services; copy a ``setup_complete`` marker to the management pod; and
    authorize TCP ports 6200-6203 from ``0.0.0.0/0`` on the
    ``nodes.<cluster_name>`` security group.

    Args:
        mem_count: Number of memory nodes to add.
        ebs_count: Number of ebs nodes to add.
        func_count: Number of function nodes to add.
        sched_count: Number of scheduler nodes to add.
        route_count: Number of routing nodes to add.
        bench_count: Number of benchmark nodes to add.
        cfile: Path of the KVS config file to distribute to pods.
        ssh_key: Path of the SSH private key; ``ssh_key + '.pub'`` is copied
            as well.
        cluster_name: Set as ``HYDRO_CLUSTER_NAME`` in the management pod and
            used to derive the security group name.
        kops_bucket: Passed to ``create_cluster_object.sh`` and set as
            ``KOPS_STATE_STORE`` in the management pod.
        aws_key_id: Set as ``AWS_ACCESS_KEY_ID`` in the management pod.
        aws_key: Set as ``AWS_SECRET_ACCESS_KEY`` in the management pod.

    Raises:
        ValueError: If the ``HYDRO_HOME`` environment variable is not set.
        OSError: If ``cfile`` cannot be copied to ``anna-config.yml``.
    """
    # Local import so this function does not depend on the module's import
    # block being updated.
    import shutil

    if 'HYDRO_HOME' not in os.environ:
        raise ValueError('HYDRO_HOME environment variable must be set to be '
                         + 'the directory where all Hydro project repos are '
                         + 'located.')
    prefix = os.path.join(os.environ['HYDRO_HOME'], 'cluster/hydro/cluster')

    util.run_process(['./create_cluster_object.sh', kops_bucket, ssh_key])

    client, apps_client = util.init_k8s()

    print('Creating management pods...')
    management_spec = util.load_yaml('yaml/pods/management-pod.yml', prefix)
    env = management_spec['spec']['containers'][0]['env']

    util.replace_yaml_val(env, 'AWS_ACCESS_KEY_ID', aws_key_id)
    util.replace_yaml_val(env, 'AWS_SECRET_ACCESS_KEY', aws_key)
    util.replace_yaml_val(env, 'KOPS_STATE_STORE', kops_bucket)
    util.replace_yaml_val(env, 'HYDRO_CLUSTER_NAME', cluster_name)

    client.create_namespaced_pod(namespace=util.NAMESPACE,
                                 body=management_spec)

    # Waits until the management pod starts to move forward -- we need to do
    # this because other pods depend on knowing the management pod's IP
    # address.
    management_ip = util.get_pod_ips(client, 'role=management',
                                     is_running=True)[0]

    # Copy kube config file to management pod, so it can execute kubectl
    # commands, in addition to SSH keys and KVS config.
    management_podname = management_spec['metadata']['name']
    kcname = management_spec['spec']['containers'][0]['name']

    # In-process copy (no shell): paths with spaces or shell metacharacters
    # work and a failed copy raises instead of being silently ignored.
    shutil.copy(cfile, 'anna-config.yml')
    try:
        kubecfg = os.path.join(os.environ['HOME'], '.kube/config')
        util.copy_file_to_pod(client, kubecfg, management_podname,
                              '/root/.kube/', kcname)
        util.copy_file_to_pod(client, ssh_key, management_podname,
                              '/root/.ssh/', kcname)
        util.copy_file_to_pod(client, ssh_key + '.pub', management_podname,
                              '/root/.ssh/', kcname)
        util.copy_file_to_pod(client, 'anna-config.yml', management_podname,
                              '/hydro/anna/conf/', kcname)

        # Start the monitoring pod.
        mon_spec = util.load_yaml('yaml/pods/monitoring-pod.yml', prefix)
        util.replace_yaml_val(mon_spec['spec']['containers'][0]['env'],
                              'MGMT_IP', management_ip)
        client.create_namespaced_pod(namespace=util.NAMESPACE, body=mon_spec)

        # Wait until the monitoring pod is finished creating to get its IP
        # address and then copy KVS config into the monitoring pod.
        util.get_pod_ips(client, 'role=monitoring')
        util.copy_file_to_pod(client, 'anna-config.yml',
                              mon_spec['metadata']['name'],
                              '/hydro/anna/conf/',
                              mon_spec['spec']['containers'][0]['name'])
    finally:
        # Always clean up the temporary copy, even if a step above failed.
        try:
            os.remove('anna-config.yml')
        except FileNotFoundError:
            # Already gone; removal is best-effort, as `rm` was.
            pass

    print('Creating %d routing nodes...' % (route_count))
    add_nodes(client, apps_client, cfile, ['routing'], [route_count], True,
              prefix)
    util.get_pod_ips(client, 'role=routing')

    print('Creating %d memory, %d ebs node(s)...' % (mem_count, ebs_count))
    add_nodes(client, apps_client, cfile, ['memory', 'ebs'],
              [mem_count, ebs_count], True, prefix)

    print('Creating routing service...')
    service_spec = util.load_yaml('yaml/services/routing.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d scheduler nodes...' % (sched_count))
    add_nodes(client, apps_client, cfile, ['scheduler'], [sched_count], True,
              prefix)
    util.get_pod_ips(client, 'role=scheduler')

    print('Adding %d function serving nodes...' % (func_count))
    add_nodes(client, apps_client, cfile, ['function'], [func_count], True,
              prefix)

    print('Creating function service...')
    service_spec = util.load_yaml('yaml/services/function.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d benchmark nodes...' % (bench_count))
    add_nodes(client, apps_client, cfile, ['benchmark'], [bench_count], True,
              prefix)

    print('Finished creating all pods...')

    # Create an empty marker file (equivalent of `touch`), ship it to the
    # management pod, and always remove the local copy afterwards.
    with open('setup_complete', 'a'):
        pass
    try:
        util.copy_file_to_pod(client, 'setup_complete', management_podname,
                              '/hydro', kcname)
    finally:
        try:
            os.remove('setup_complete')
        except FileNotFoundError:
            # Already gone; removal is best-effort, as `rm` was.
            pass

    sg_name = 'nodes.' + cluster_name
    sg = ec2_client.describe_security_groups(Filters=[{
        'Name': 'group-name',
        'Values': [sg_name]
    }])['SecurityGroups'][0]

    print('Authorizing ports for routing service...')

    permission = [{
        'FromPort': 6200,
        'IpProtocol': 'tcp',
        'ToPort': 6203,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }]

    ec2_client.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                                IpPermissions=permission)

    routing_svc_addr = util.get_service_address(client, 'routing-service')
    function_svc_addr = util.get_service_address(client, 'function-service')
    print('The routing service can be accessed here: \n\t%s' %
          (routing_svc_addr))
    print('The function service can be accessed here: \n\t%s' %
          (function_svc_addr))