Ejemplo n.º 1
0
def remove_node(ip, ntype):
    """Remove the node with address ``ip`` by running ``./delete_node.sh``.

    Args:
        ip: Dotted IP address string of the node; it is turned into the
            ``ip-<a>-<b>-<c>-<d>.ec2.internal`` hostname handed to the script.
        ntype: Node type name, passed to ``util.get_previous_count`` and to
            the script.
    """
    k8s_client, _ = util.init_k8s()

    # The lookup result is not used below; the call itself is kept so that any
    # effect it has (including raising for an unknown IP) is preserved.
    util.get_pod_from_ip(k8s_client, ip)

    dashed_ip = ip.replace('.', '-')
    node_hostname = 'ip-%s.ec2.internal' % dashed_ip

    current_count = util.get_previous_count(k8s_client, ntype)
    target_count = current_count - 1

    # Script arguments: hostname, node type, current count, count after removal.
    command = [
        './delete_node.sh',
        node_hostname,
        ntype,
        str(current_count),
        str(target_count),
    ]
    util.run_process(command)
Ejemplo n.º 2
0
def remove_node(ip, ntype):
    """Delete the pod and node at ``ip``, then shrink the instance group.

    Args:
        ip: Dotted IP address string of the node; it is used to look up the
            pod and to build the ``ip-<a>-<b>-<c>-<d>.ec2.internal`` node name.
        ntype: Node type name, passed to ``util.get_previous_count`` and to
            ``./modify_ig.sh``.
    """
    k8s_client, _ = util.init_k8s()

    target_pod = util.get_pod_from_ip(k8s_client, ip)
    dashed_ip = ip.replace('.', '-')
    node_hostname = 'ip-%s.ec2.internal' % dashed_ip

    # Delete the pod first, then the node object it was running on.
    pod_name = target_pod.metadata.name
    k8s_client.delete_namespaced_pod(
        name=pod_name,
        namespace=util.NAMESPACE,
        body=k8s.client.V1DeleteOptions(),
    )
    k8s_client.delete_node(
        name=node_hostname,
        body=k8s.client.V1DeleteOptions(),
    )

    # Ask modify_ig.sh for one node fewer than the count reported by util.
    current_count = util.get_previous_count(k8s_client, ntype)
    resize_command = ['./modify_ig.sh', ntype, str(current_count - 1)]
    util.run_process(resize_command)
Ejemplo n.º 3
0
def add_nodes(client,
              apps_client,
              cfile,
              kinds,
              counts,
              create=False,
              prefix=None):
    """Grow the cluster by ``counts[i]`` nodes of each ``kinds[i]``.

    For every kind the function records the pod/container pairs that already
    exist, resizes the instance group through ``./modify_ig.sh``, and, once
    ``./validate_cluster.sh`` has run, waits for the expected number of
    running pods and copies the config file into every newly seen pod.

    Args:
        client: Kubernetes client used to list pods (``list_namespaced_pod``)
            and handed to the ``util`` helpers.
        apps_client: Kubernetes client used to create DaemonSets
            (``create_namespaced_daemon_set``); only used when ``create`` is
            true.
        cfile: Local path of the config file; it is staged as
            ``./anna-config.yml`` and copied into the new pods.
        kinds: Sequence of node kind names. Each is used as the ``role=<kind>``
            label selector and as the instance-group argument of
            ``./modify_ig.sh``.
        counts: Number of nodes to add per kind, index-aligned with ``kinds``.
        create: When true, ``yaml/ds/<kind>-ds.yml`` is loaded, its container
            environment is filled in, and the DaemonSet is created.
        prefix: Passed through to ``util.load_yaml``.

    Raises:
        IndexError: If ``counts`` is shorter than ``kinds`` or no management
            pod IP is returned.
        OSError: If ``cfile`` cannot be copied to the staging path.
    """
    # Function-scope import: this block cannot see the file's import section.
    import shutil

    staged_conf = './anna-config.yml'

    previously_created_pods_list = []
    expected_counts = []
    for i, kind in enumerate(kinds):
        count = counts[i]
        print('Adding %d %s server node(s) to cluster...' % (count, kind))

        pods = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                          label_selector='role=' + kind).items

        # Snapshot the existing pods so the new ones can be identified later.
        previously_created_pods_list.append(
            get_current_pod_container_pairs(pods))

        prev_count = util.get_previous_count(client, kind)
        target_count = count + prev_count
        util.run_process(['./modify_ig.sh', kind, str(target_count)])
        expected_counts.append(target_count)

    util.run_process(['./validate_cluster.sh'])

    management_ip = util.get_pod_ips(client, 'role=management')[0]
    route_ips = util.get_pod_ips(client, 'role=routing')

    # With no routing pods there is nothing to seed from.
    seed_ip = random.choice(route_ips) if len(route_ips) > 0 else ''

    mon_str = ' '.join(util.get_pod_ips(client, 'role=monitoring'))
    route_str = ' '.join(route_ips)
    sched_str = ' '.join(util.get_pod_ips(client, 'role=scheduler'))

    route_addr = util.get_service_address(client, 'routing-service')
    function_addr = util.get_service_address(client, 'function-service')

    for kind, expected, known_pods in zip(kinds, expected_counts,
                                          previously_created_pods_list):
        # Create should only be true when the DaemonSet is being created for the
        # first time -- i.e., when this is called from create_cluster. After that,
        # we can basically ignore this because the DaemonSet will take care of
        # adding pods to created nodes.
        if create:
            fname = 'yaml/ds/%s-ds.yml' % kind
            yml = util.load_yaml(fname, prefix)

            for container in yml['spec']['template']['spec']['containers']:
                env = container['env']

                util.replace_yaml_val(env, 'ROUTING_IPS', route_str)
                util.replace_yaml_val(env, 'ROUTE_ADDR', route_addr)
                util.replace_yaml_val(env, 'SCHED_IPS', sched_str)
                util.replace_yaml_val(env, 'FUNCTION_ADDR', function_addr)
                util.replace_yaml_val(env, 'MON_IPS', mon_str)
                util.replace_yaml_val(env, 'MGMT_IP', management_ip)
                util.replace_yaml_val(env, 'SEED_IP', seed_ip)

            apps_client.create_namespaced_daemon_set(namespace=util.NAMESPACE,
                                                     body=yml)

        # Wait until all pods of this kind are running.
        res = []
        while len(res) != expected:
            res = util.get_pod_ips(client, 'role=' + kind, is_running=True)

        pods = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                          label_selector='role=' + kind).items

        created_pods = get_current_pod_container_pairs(pods)
        new_pods = created_pods.difference(known_pods)

        # Stage the KVS config locally without going through a shell, so paths
        # containing spaces or shell metacharacters are handled correctly and
        # a failed copy raises instead of being silently ignored.
        shutil.copy(cfile, staged_conf)
        try:
            # Copy the KVS config into all recently created pods.
            for pname, cname in new_pods:
                if kind != 'function' and kind != 'gpu':
                    util.copy_file_to_pod(client, 'anna-config.yml', pname,
                                          '/hydro/anna/conf/', cname)
                elif cname == 'cache-container':
                    # For the cache pods, we also copy the conf into the cache
                    # conf directory.
                    util.copy_file_to_pod(client, 'anna-config.yml', pname,
                                          '/hydro/anna-cache/conf/', cname)
        finally:
            # Always remove the staged copy, even if a pod copy failed.
            try:
                os.remove(staged_conf)
            except FileNotFoundError:
                pass
Ejemplo n.º 4
0
def create_cluster(mem_count, ebs_count, func_count, sched_count, route_count,
                   bench_count, cfile, ssh_key, cluster_name, kops_bucket,
                   aws_key_id, aws_key):
    """Create the cluster object, its management/monitoring pods and all nodes.

    The steps run strictly in order: create the cluster object, start the
    management pod and copy credentials and config into it, start the
    monitoring pod, add routing, memory/ebs, scheduler, function and benchmark
    nodes through ``add_nodes`` (creating the routing and function services in
    between), drop a ``setup_complete`` marker into the management pod, and
    finally open TCP ports 6200-6203 on the ``nodes.<cluster_name>`` security
    group.

    Args:
        mem_count: Number of ``memory`` nodes to add.
        ebs_count: Number of ``ebs`` nodes to add.
        func_count: Number of ``function`` nodes to add.
        sched_count: Number of ``scheduler`` nodes to add.
        route_count: Number of ``routing`` nodes to add.
        bench_count: Number of ``benchmark`` nodes to add.
        cfile: Local path of the config file, staged as ``anna-config.yml``
            and copied into the management and monitoring pods (and passed on
            to ``add_nodes``).
        ssh_key: Path of the private SSH key; it and ``ssh_key + '.pub'`` are
            copied into the management pod, and the path is passed to
            ``./create_cluster_object.sh``.
        cluster_name: Stored as ``HYDRO_CLUSTER_NAME`` in the management pod
            and used to find the ``nodes.<cluster_name>`` security group.
        kops_bucket: Passed to ``./create_cluster_object.sh`` and stored as
            ``KOPS_STATE_STORE`` in the management pod.
        aws_key_id: Stored as ``AWS_ACCESS_KEY_ID`` in the management pod.
        aws_key: Stored as ``AWS_SECRET_ACCESS_KEY`` in the management pod.

    Raises:
        ValueError: If the ``HYDRO_HOME`` environment variable is not set.
        OSError: If ``cfile`` cannot be staged or the marker file cannot be
            created.
    """
    # Function-scope import: this block cannot see the file's import section.
    import shutil

    if 'HYDRO_HOME' not in os.environ:
        raise ValueError('HYDRO_HOME environment variable must be set to be ' +
                         'the directory where all Hydro project repos are ' +
                         'located.')
    prefix = os.path.join(os.environ['HYDRO_HOME'], 'cluster/hydro/cluster')

    util.run_process(['./create_cluster_object.sh', kops_bucket, ssh_key])

    client, apps_client = util.init_k8s()

    print('Creating management pods...')
    management_spec = util.load_yaml('yaml/pods/management-pod.yml', prefix)
    env = management_spec['spec']['containers'][0]['env']

    util.replace_yaml_val(env, 'AWS_ACCESS_KEY_ID', aws_key_id)
    util.replace_yaml_val(env, 'AWS_SECRET_ACCESS_KEY', aws_key)
    util.replace_yaml_val(env, 'KOPS_STATE_STORE', kops_bucket)
    util.replace_yaml_val(env, 'HYDRO_CLUSTER_NAME', cluster_name)

    client.create_namespaced_pod(namespace=util.NAMESPACE,
                                 body=management_spec)

    # Waits until the management pod starts to move forward -- we need to do
    # this because other pods depend on knowing the management pod's IP address.
    management_ip = util.get_pod_ips(client,
                                     'role=management',
                                     is_running=True)[0]

    # Copy kube config file to management pod, so it can execute kubectl
    # commands, in addition to SSH keys and KVS config.
    management_podname = management_spec['metadata']['name']
    kcname = management_spec['spec']['containers'][0]['name']

    # Stage the config without a shell so unusual characters in ``cfile`` are
    # safe and a failed copy raises instead of being silently ignored.
    shutil.copy(cfile, 'anna-config.yml')
    try:
        kubecfg = os.path.join(os.environ['HOME'], '.kube/config')
        util.copy_file_to_pod(client, kubecfg, management_podname,
                              '/root/.kube/', kcname)
        util.copy_file_to_pod(client, ssh_key, management_podname,
                              '/root/.ssh/', kcname)
        util.copy_file_to_pod(client, ssh_key + '.pub', management_podname,
                              '/root/.ssh/', kcname)
        util.copy_file_to_pod(client, 'anna-config.yml', management_podname,
                              '/hydro/anna/conf/', kcname)

        # Start the monitoring pod.
        mon_spec = util.load_yaml('yaml/pods/monitoring-pod.yml', prefix)
        util.replace_yaml_val(mon_spec['spec']['containers'][0]['env'],
                              'MGMT_IP', management_ip)
        client.create_namespaced_pod(namespace=util.NAMESPACE, body=mon_spec)

        # Wait until the monitoring pod is finished creating to get its IP
        # address and then copy KVS config into the monitoring pod.
        util.get_pod_ips(client, 'role=monitoring')
        util.copy_file_to_pod(client, 'anna-config.yml',
                              mon_spec['metadata']['name'],
                              '/hydro/anna/conf/',
                              mon_spec['spec']['containers'][0]['name'])
    finally:
        # The staged copy must be gone before add_nodes stages its own.
        _remove_local_file('anna-config.yml')

    print('Creating %d routing nodes...' % (route_count))
    add_nodes(client, apps_client, cfile, ['routing'], [route_count], True,
              prefix)
    util.get_pod_ips(client, 'role=routing')

    print('Creating %d memory, %d ebs node(s)...' % (mem_count, ebs_count))
    add_nodes(client, apps_client, cfile, ['memory', 'ebs'],
              [mem_count, ebs_count], True, prefix)

    print('Creating routing service...')
    service_spec = util.load_yaml('yaml/services/routing.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d scheduler nodes...' % (sched_count))
    add_nodes(client, apps_client, cfile, ['scheduler'], [sched_count], True,
              prefix)
    util.get_pod_ips(client, 'role=scheduler')

    print('Adding %d function serving nodes...' % (func_count))
    add_nodes(client, apps_client, cfile, ['function'], [func_count], True,
              prefix)

    print('Creating function service...')
    service_spec = util.load_yaml('yaml/services/function.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d benchmark nodes...' % (bench_count))
    add_nodes(client, apps_client, cfile, ['benchmark'], [bench_count], True,
              prefix)

    print('Finished creating all pods...')
    # Create the (empty) marker file if it does not exist yet, then ship it to
    # the management pod.
    with open('setup_complete', 'a'):
        pass
    try:
        util.copy_file_to_pod(client, 'setup_complete', management_podname,
                              '/hydro', kcname)
    finally:
        _remove_local_file('setup_complete')

    sg_name = 'nodes.' + cluster_name
    sg = ec2_client.describe_security_groups(Filters=[{
        'Name': 'group-name',
        'Values': [sg_name]
    }])['SecurityGroups'][0]

    print('Authorizing ports for routing service...')

    # NOTE(review): this opens TCP 6200-6203 to every address (0.0.0.0/0).
    permission = [{
        'FromPort': 6200,
        'IpProtocol': 'tcp',
        'ToPort': 6203,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }]

    ec2_client.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                                IpPermissions=permission)

    routing_svc_addr = util.get_service_address(client, 'routing-service')
    function_svc_addr = util.get_service_address(client, 'function-service')
    print('The routing service can be accessed here: \n\t%s' %
          (routing_svc_addr))
    print('The function service can be accessed here: \n\t%s' %
          (function_svc_addr))


def _remove_local_file(path):
    """Delete ``path``, ignoring the case where it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass