Example #1
def run():
    context = zmq.Context(1)
    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)

    # waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    while True:
        socks = dict(poller.poll())

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:

            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                add_nodes(client, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).' %
                             (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % (ip))
        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:

            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count ' + count + ' for IP ' + ip +
                         '.')
            restart_pull_socket.send_string(count)
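
A minimal client-side sketch of the protocol this loop serves, assuming a hypothetical MGMT_IP for the management pod: churn requests are plain 'add:<num>:<ntype>' or 'remove:<ip>:<ntype>' strings PUSHed to port 7001, and restart counts are fetched over REQ/REP on port 7000 (only args[1], the IP, is read from that message).

import zmq

MGMT_IP = '10.0.0.1'  # hypothetical management pod IP

context = zmq.Context(1)

# ask for two new memory-tier nodes via the churn PULL socket
churn_socket = context.socket(zmq.PUSH)
churn_socket.connect('tcp://%s:7001' % MGMT_IP)
churn_socket.send_string('add:2:memory')

# query the restart count for the pod at a given IP via the REP socket
restart_socket = context.socket(zmq.REQ)
restart_socket.connect('tcp://%s:7000' % MGMT_IP)
restart_socket.send_string('restart:10.0.1.12')
print(restart_socket.recv_string())
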
Example #2
def remove_node(ip, ntype):
    client, _ = util.init_k8s()

    pod = util.get_pod_from_ip(client, ip)
    hostname = 'ip-%s.ec2.internal' % (ip.replace('.', '-'))

    podname = pod.metadata.name
    client.delete_namespaced_pod(name=podname, namespace=util.NAMESPACE,
                                 body=k8s.client.V1DeleteOptions())
    client.delete_node(name=hostname, body=k8s.client.V1DeleteOptions())

    prev_count = util.get_previous_count(client, ntype)
    util.run_process(['./modify_ig.sh', ntype, str(prev_count - 1)])
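
A hypothetical invocation, for illustration: this removes the memory-tier node at 10.0.1.12 by deleting its pod and its Kubernetes node, then shrinking the kops instance group by one (the 'ip-10-0-1-12.ec2.internal' hostname pattern assumes an EC2 deployment in us-east-1).

remove_node('10.0.1.12', 'memory')
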
Example #3
def run():
    context = zmq.Context(1)
    client = util.init_k8s()

    node_add_socket = context.socket(zmq.PULL)
    node_add_socket.bind('ipc:///tmp/node_add')

    node_remove_socket = context.socket(zmq.PULL)
    node_remove_socket.bind('ipc:///tmp/node_remove')

    poller = zmq.Poller()
    poller.register(node_add_socket, zmq.POLLIN)
    poller.register(node_remove_socket, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    while True:
        socks = dict(poller.poll(timeout=1000))

        if node_add_socket in socks and socks[node_add_socket] == zmq.POLLIN:
            msg = node_add_socket.recv_string()
            args = msg.split(':')

            ntype = args[0]
            num = int(args[1])
            logging.info('Adding %d new %s node(s)...' % (num, ntype))

            mon_ips = util.get_pod_ips(client, 'role=monitoring')
            route_ips = util.get_pod_ips(client, 'role=routing')
            scheduler_ips = util.get_pod_ips(client, 'role=scheduler')
            route_addr = util.get_service_address(client, 'routing-service')

            add_nodes(client,
                      cfile, [ntype], [num],
                      mon_ips,
                      route_ips=route_ips,
                      route_addr=route_addr,
                      scheduler_ips=scheduler_ips)
            logging.info('Successfully added %d %s node(s).' % (num, ntype))

        if node_remove_socket in socks and socks[node_remove_socket] == \
                zmq.POLLIN:
            msg = node_remove_socket.recv_string()
            args = msg.split(':')

            ntype = args[0]
            ip = args[1]

            remove_node(ip, ntype)
            logging.info('Successfully removed node %s.' % (ip))
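
A sketch of the producer side of the two IPC channels above, assuming the sender uses the 'ntype:num' and 'ntype:ip' formats this loop parses (the management server in Example #6 forwards churn requests in exactly this shape):

import zmq

context = zmq.Context(1)

add_sock = context.socket(zmq.PUSH)
add_sock.connect('ipc:///tmp/node_add')
add_sock.send_string('memory:2')  # ask for two memory-tier nodes

remove_sock = context.socket(zmq.PUSH)
remove_sock.connect('ipc:///tmp/node_remove')
remove_sock.send_string('memory:10.0.1.12')  # retire the node at this IP
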
Example #4
def main():
    client, apps_client = util.init_k8s()
    context = zmq.Context()

    # Sockets for hash ring membership changes
    rtr_join_sock = context.socket(zmq.PULL)
    rtr_join_sock.bind('tcp://*:%s' % (str(ru.MNG_JOIN_PORT)))
    rtr_depart_sock = context.socket(zmq.PULL)
    rtr_depart_sock.bind('tcp://*:%s' % (str(ru.MNG_DEPART_PORT)))

    poller = zmq.Poller()
    poller.register(rtr_join_sock, zmq.POLLIN)
    # also register the depart socket; the loop below polls it
    poller.register(rtr_depart_sock, zmq.POLLIN)

    while True:
        socks = dict(poller.poll())
        if rtr_join_sock in socks and socks[rtr_join_sock] == zmq.POLLIN:
            logging.info('Received join')
            msg = rtr_join_sock.recv()
            router_broadcast(client, 'join', msg)
        if rtr_depart_sock in socks and socks[rtr_depart_sock] == zmq.POLLIN:
            logging.info('Received depart')
            msg = rtr_depart_sock.recv()
            router_broadcast(client, 'depart', msg)
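
The join and depart payloads are opaque to this loop; they arrive as raw bytes and are handed to router_broadcast verbatim. A minimal sketch of the announcing side, assuming the same ru port constants and that the payload is the announcing router's IP:

import zmq

import routing_util as ru  # assumed to be the same module aliased as ru above

MGMT_IP = '10.0.0.1'  # hypothetical management node IP

context = zmq.Context()
join_sock = context.socket(zmq.PUSH)
join_sock.connect('tcp://%s:%d' % (MGMT_IP, ru.MNG_JOIN_PORT))
join_sock.send(b'10.0.2.7')  # forwarded as-is to every router
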
Example #5
def create_cluster(mem_count, ebs_count, func_count, sched_count, route_count,
                   bench_count, cfile, ssh_key, cluster_name, kops_bucket,
                   aws_key_id, aws_key):

    # create the cluster object with kops
    util.run_process(
        ['./create_cluster_object.sh', cluster_name, kops_bucket, ssh_key])

    client, apps_client = util.init_k8s()

    # create the kops pod
    print('Creating management pods...')
    kops_spec = util.load_yaml('yaml/pods/kops-pod.yml')
    env = kops_spec['spec']['containers'][0]['env']

    util.replace_yaml_val(env, 'AWS_ACCESS_KEY_ID', aws_key_id)
    util.replace_yaml_val(env, 'AWS_SECRET_ACCESS_KEY', aws_key)
    util.replace_yaml_val(env, 'KOPS_STATE_STORE', kops_bucket)
    util.replace_yaml_val(env, 'FLUENT_CLUSTER_NAME', cluster_name)

    client.create_namespaced_pod(namespace=util.NAMESPACE, body=kops_spec)

    # wait for the kops pod to start
    kops_ip = util.get_pod_ips(client, 'role=kops', is_running=True)[0]

    # copy kube config file to kops pod, so it can execute kubectl commands
    kops_podname = kops_spec['metadata']['name']
    kcname = kops_spec['spec']['containers'][0]['name']

    os.system('cp %s kvs-config.yml' % cfile)
    util.copy_file_to_pod(client, '/home/ubuntu/.kube/config', kops_podname,
                          '/root/.kube/', kcname)
    util.copy_file_to_pod(client, ssh_key, kops_podname, '/root/.ssh/', kcname)
    util.copy_file_to_pod(client, ssh_key + '.pub', kops_podname,
                          '/root/.ssh/', kcname)
    util.copy_file_to_pod(client, 'kvs-config.yml', kops_podname,
                          '/fluent/conf/', kcname)

    # start the monitoring pod
    mon_spec = util.load_yaml('yaml/pods/monitoring-pod.yml')
    util.replace_yaml_val(mon_spec['spec']['containers'][0]['env'], 'MGMT_IP',
                          kops_ip)
    client.create_namespaced_pod(namespace=util.NAMESPACE, body=mon_spec)

    # wait for the monitoring pods to come up before copying the config file
    # into the monitoring pod
    util.get_pod_ips(client, 'role=monitoring')

    util.copy_file_to_pod(client, 'kvs-config.yml',
                          mon_spec['metadata']['name'], '/fluent/conf/',
                          mon_spec['spec']['containers'][0]['name'])
    os.system('rm kvs-config.yml')

    print('Creating %d routing nodes...' % (route_count))
    add_nodes(client, apps_client, cfile, ['routing'], [route_count], True)
    util.get_pod_ips(client, 'role=routing')

    print('Creating %d memory, %d ebs node(s)...' % (mem_count, ebs_count))
    add_nodes(client, apps_client, cfile, ['memory', 'ebs'],
              [mem_count, ebs_count], True)

    print('Creating routing service...')
    service_spec = util.load_yaml('yaml/services/routing.yml')
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d scheduler nodes...' % (sched_count))
    add_nodes(client, apps_client, cfile, ['scheduler'], [sched_count], True)
    util.get_pod_ips(client, 'role=scheduler')

    print('Adding %d function serving nodes...' % (func_count))
    add_nodes(client, apps_client, cfile, ['function'], [func_count], True)

    print('Creating function service...')
    service_spec = util.load_yaml('yaml/services/function.yml')
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Adding %d benchmark nodes...' % (bench_count))
    add_nodes(client, apps_client, cfile, ['benchmark'], [bench_count], True)

    print('Finished creating all pods...')
    os.system('touch setup_complete')
    util.copy_file_to_pod(client, 'setup_complete', kops_podname, '/fluent',
                          kcname)
    os.system('rm setup_complete')

    sg_name = 'nodes.' + cluster_name
    sg = ec2_client.describe_security_groups(Filters=[{
        'Name': 'group-name',
        'Values': [sg_name]
    }])['SecurityGroups'][0]

    print('Authorizing ports for routing service...')

    permission = [{
        'FromPort': 6200,
        'IpProtocol': 'tcp',
        'ToPort': 6203,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }]
    ec2_client.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                                IpPermissions=permission)

    routing_svc_addr = util.get_service_address(client, 'routing-service')
    function_svc_addr = util.get_service_address(client, 'function-service')
    print('The routing service can be accessed here: \n\t%s' %
          (routing_svc_addr))
    print('The function service can be accessed here: \n\t%s' %
          (function_svc_addr))
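
The snippet relies on a module-level ec2_client defined elsewhere in the file; a minimal sketch of how it might be constructed with boto3, assuming credentials come from the environment and the cluster runs in us-east-1:

import boto3

ec2_client = boto3.client('ec2', region_name='us-east-1')
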
Example #6
def run():
    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    list_executors_socket = context.socket(zmq.REP)
    list_executors_socket.bind('tcp://*:7002')

    function_status_socket = context.socket(zmq.PULL)
    function_status_socket.bind('tcp://*:7003')

    list_schedulers_socket = context.socket(zmq.REP)
    list_schedulers_socket.bind('tcp://*:7004')

    executor_depart_socket = context.socket(zmq.PULL)
    executor_depart_socket.bind('tcp://*:7005')

    executor_statistics_socket = context.socket(zmq.PULL)
    executor_statistics_socket.bind('tcp://*:7006')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)
    poller.register(function_status_socket, zmq.POLLIN)
    poller.register(list_executors_socket, zmq.POLLIN)
    poller.register(list_schedulers_socket, zmq.POLLIN)
    poller.register(executor_depart_socket, zmq.POLLIN)
    poller.register(executor_statistics_socket, zmq.POLLIN)

    add_push_socket = context.socket(zmq.PUSH)
    add_push_socket.connect('ipc:///tmp/node_add')

    remove_push_socket = context.socket(zmq.PUSH)
    remove_push_socket.connect('ipc:///tmp/node_remove')

    # waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    # track the self-reported status of each function execution thread
    executor_statuses = {}
    departing_executors = {}
    function_frequencies = {}
    function_runtimes = {}
    latency_history = {}

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                # forward as 'ntype:num', the format the add daemon parses
                msg = args[2] + ':' + args[1]
                add_push_socket.send_string(msg)
            elif args[0] == 'remove':
                # forward as 'ntype:ip'
                msg = args[2] + ':' + args[1]
                remove_push_socket.send_string(msg)

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count %s for IP %s.' % (count, ip))
            restart_pull_socket.send_string(count)

        if list_executors_socket in socks and socks[list_executors_socket] == \
                zmq.POLLIN:
            # it doesn't matter what is in this message
            msg = list_executors_socket.recv()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=function'):
                ks.keys.append(ip)

            list_executors_socket.send(ks.SerializeToString())

        if function_status_socket in socks and \
                socks[function_status_socket] == zmq.POLLIN:
            status = ThreadStatus()
            status.ParseFromString(function_status_socket.recv())

            key = (status.ip, status.tid)

            # if this executor is one of the ones that's currently departing,
            # we can just ignore its status updates since we don't want
            # utilization to be skewed downwards
            if key in departing_executors:
                continue

            executor_statuses[key] = status
            logging.info(('Received thread status update from %s:%d: %.4f '
                          'occupancy, %d functions pinned') %
                         (status.ip, status.tid, status.utilization,
                          len(status.functions)))

        if list_schedulers_socket in socks and socks[list_schedulers_socket] == \
                zmq.POLLIN:
            # It doesn't matter what is in this message
            msg = list_schedulers_socket.recv_string()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=scheduler'):
                ks.keys.append(ip)

            list_schedulers_socket.send(ks.SerializeToString())

        if executor_depart_socket in socks and \
                socks[executor_depart_socket] == zmq.POLLIN:
            ip = executor_depart_socket.recv_string()
            departing_executors[ip] -= 1

            # wait until all the executors on this IP have cleared their queues
            # and left; then we remove the node
            if departing_executors[ip] == 0:
                msg = 'function:' + ip
                remove_push_socket.send_string(msg)
                del departing_executors[ip]

        if executor_statistics_socket in socks and \
                socks[executor_statistics_socket] == zmq.POLLIN:
            stats = ExecutorStatistics()
            stats.ParseFromString(executor_statistics_socket.recv())

            for fstats in stats.statistics:
                fname = fstats.fname

                if fname not in function_frequencies:
                    function_frequencies[fname] = 0
                    function_runtimes[fname] = 0.0

                function_frequencies[fname] += fstats.call_count
                function_runtimes[fname] += fstats.runtime

        end = time.time()
        if end - start > REPORT_PERIOD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client, add_push_socket)

            check_executor_utilization(client, context, executor_statuses,
                    departing_executors, add_push_socket)

            check_function_load(context, function_frequencies, function_runtimes,
                    executor_statuses, latency_history)
            start = time.time()
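
A sketch of the executor side of the status-update protocol, assuming ThreadStatus is the same generated protobuf parsed above (with ip, tid, utilization, and repeated functions fields) and a hypothetical MGMT_IP for the management pod:

import zmq

MGMT_IP = '10.0.0.1'  # hypothetical

context = zmq.Context(1)
status_sock = context.socket(zmq.PUSH)
status_sock.connect('tcp://%s:7003' % MGMT_IP)

status = ThreadStatus()  # from the same generated protobuf module used above
status.ip = '10.0.3.4'
status.tid = 0
status.utilization = 0.42
status.functions.extend(['square', 'incr'])  # hypothetical pinned functions
status_sock.send(status.SerializeToString())
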
Example #7
import util
from routing_util import register, deregister
import subprocess
import sys
import time

# AWS Info
aws_key_id = util.check_or_get_env_arg('AWS_ACCESS_KEY_ID')
aws_key = util.check_or_get_env_arg('AWS_SECRET_ACCESS_KEY')

# Config File Info
BASE_CONFIG_FILE = '../config/tasc-base.yml'
CONFIG_FILE = './tasc-config.yml'
POD_CONFIG_DIR = '/go/src/github.com/saurav-c/tasc/config'

NODE_TYPES = ['tasc', 'keynode', 'routing', 'lb', 'worker', 'benchmark']
client, apps_client = util.init_k8s()


def main():
    args = sys.argv[1:]
    cmd = args[0]

    if cmd == 'send-conf':
        ip = args[1]
        conf = args[2] if len(args) > 2 else None
        sendConfig(ip, conf)
    elif cmd == 'add':
        ntype = args[1]
        count = int(args[2])
        if ntype not in NODE_TYPES:
            print('Unknown node type: ' + ntype)
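
Hypothetical invocations of this CLI (the script's filename is not shown in the snippet, so cluster.py is assumed):

    python cluster.py send-conf 10.0.1.12 ./my-config.yml
    python cluster.py add keynode 2
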
Example #8
def create_cluster(replica_count, gc_count, lb_count, bench_count, cfile,
                   ssh_key, cluster_name, kops_bucket, aws_key_id, aws_key):
    prefix = './'
    util.run_process(['./create_cluster_object.sh', kops_bucket, ssh_key])

    client, apps_client = util.init_k8s()

    print('Creating management pod')
    management_spec = util.load_yaml('yaml/pods/management-pod.yml')
    env = management_spec['spec']['containers'][0]['env']
    util.replace_yaml_val(env, 'AWS_ACCESS_KEY_ID', aws_key_id)
    util.replace_yaml_val(env, 'AWS_SECRET_ACCESS_KEY', aws_key)

    client.create_namespaced_pod(namespace=util.NAMESPACE,
                                 body=management_spec)

    # management_spec and the management pod are used below (the config and
    # replica lists are copied into it), so the pod must actually be created
    management_ip = util.get_pod_ips(client, 'role=management',
                                     is_running=True)[0]

    print('Creating standby replicas...')
    util.run_process(['./modify_ig.sh', 'standby', '1'])
    util.run_process(['./validate_cluster.sh'])
    print('Creating %d load balancer, %d GC replicas...' %
          (lb_count, gc_count))
    add_nodes(client, apps_client, cfile, ['lb', 'gc'], [lb_count, gc_count],
              management_ip, aws_key_id, aws_key, True, prefix)

    lb_pods = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                         label_selector="role=lb").items
    kubecfg = os.path.join(os.environ['HOME'], '.kube/config')
    for pod in lb_pods:
        util.copy_file_to_pod(client, kubecfg, pod.metadata.name,
                              '/root/.kube', 'lb-container')

    gc_ips = util.get_node_ips(client, 'role=gc', 'ExternalIP')
    with open('gcs.txt', 'w') as f:
        for ip in gc_ips:
            f.write(ip + '\n')

    # Create the Aft replicas and wait until they are running before reading
    # their IP addresses.
    print('Creating %d Aft replicas...' % (replica_count))
    add_nodes(client, apps_client, cfile, ['aft'], [replica_count],
              management_ip, aws_key_id, aws_key, True, prefix)
    util.get_pod_ips(client, 'role=aft')

    replica_ips = util.get_node_ips(client, 'role=aft', 'ExternalIP')
    with open('replicas.txt', 'w') as f:
        for ip in replica_ips:
            f.write(ip + '\n')

    os.system('cp %s aft-config.yml' % cfile)
    management_pname = management_spec['metadata']['name']
    management_cname = management_spec['spec']['containers'][0]['name']
    util.copy_file_to_pod(client, 'aft-config.yml', management_pname,
                          '/go/src/github.com/tajshaik24/aft/config',
                          management_cname)
    util.copy_file_to_pod(client, 'replicas.txt', management_pname,
                          '/go/src/github.com/tajshaik24/aft',
                          management_cname)
    util.copy_file_to_pod(client, 'gcs.txt', management_pname,
                          '/go/src/github.com/tajshaik24/aft',
                          management_cname)
    util.copy_file_to_pod(client, kubecfg, management_pname, '/root/.kube/',
                          management_cname)
    os.system('rm aft-config.yml')
    os.system('rm gcs.txt')

    # Copy replicas.txt to all Aft pods.
    aft_pod_list = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                              label_selector="role=aft").items
    aft_pod_list = list(map(lambda pod: pod.metadata.name, aft_pod_list))
    for pname in aft_pod_list:
        util.copy_file_to_pod(client, 'replicas.txt', pname,
                              '/go/src/github.com/tajshaik24/aft',
                              'aft-container')

    gc_pod_list = client.list_namespaced_pod(namespace=util.NAMESPACE,
                                             label_selector="role=gc").items
    gc_pod_list = list(map(lambda pod: pod.metadata.name, gc_pod_list))
    for pname in gc_pod_list:
        util.copy_file_to_pod(client, 'replicas.txt', pname,
                              '/go/src/github.com/tajshaik24/aft',
                              'gc-container')
    os.system('rm replicas.txt')

    print('Adding %d benchmark nodes...' % (bench_count))
    add_nodes(client, apps_client, cfile, ['benchmark'], [bench_count],
              management_ip, aws_key_id, aws_key, True, prefix)

    print('Finished creating all pods...')

    print('Creating Aft service...')
    service_spec = util.load_yaml('yaml/services/aft.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    sg_name = 'nodes.' + cluster_name
    sg = ec2_client.describe_security_groups(Filters=[{
        'Name': 'group-name',
        'Values': [sg_name]
    }])['SecurityGroups'][0]

    print('Authorizing ports for Aft replicas...')
    permission = [{
        'FromPort': 7654,
        'IpProtocol': 'tcp',
        'ToPort': 7656,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }, {
        'FromPort': 7777,
        'IpProtocol': 'tcp',
        'ToPort': 7782,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }, {
        'FromPort': 8000,
        'IpProtocol': 'tcp',
        'ToPort': 8003,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }]

    ec2_client.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                                IpPermissions=permission)
    print('Finished!')
Example #9
def run():
    context = zmq.Context(1)
    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    extant_caches_socket = context.socket(zmq.REP)
    extant_caches_socket.bind('tcp://*:7002')

    func_pull_socket = context.socket(zmq.PULL)
    func_pull_socket.bind('tcp://*:7003')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)
    poller.register(func_pull_socket, zmq.POLLIN)
    poller.register(extant_caches_socket, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    # waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    func_occ_map = {}

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:

            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                if len(args) > 3:
                    num_threads = int(args[3])
                else:
                    num_threads = 3

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                os.system('sed -i "s|%s: [0-9][0-9]*|%s: %d|g" %s' %
                          (ntype, ntype, num_threads, cfile))
                os.system('sed -i "s|%s-cap: [0-9][0-9]*|%s-cap: %d|g" %s' %
                          (ntype, ntype, num_threads * 15, cfile))

                add_nodes(client, cfile, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).' % (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % (ip))

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:

            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count ' + count + ' for IP ' + ip + '.')
            restart_pull_socket.send_string(count)

        if extant_caches_socket in socks and socks[extant_caches_socket] == \
                zmq.POLLIN:

            # It doesn't matter what is in this message
            msg = extant_caches_socket.recv_string()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=function'):
                ks.keys.append(ip)

            # SerializeToString() returns bytes, so use send(), not send_string()
            extant_caches_socket.send(ks.SerializeToString())

        if func_pull_socket in socks and socks[func_pull_socket] == zmq.POLLIN:
            msg = func_pull_socket.recv_string()
            args = msg.split('|')
            ip, mutil = args[0], float(args[1])

            logging.info('Received node occupancy of %.2f%% from IP %s.' %
                    (mutil * 100, ip))

            func_occ_map[ip] = mutil

        end = time.time()
        if end - start > THRESHOLD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client)

            if func_occ_map:
                avg_focc = sum(func_occ_map.values()) / len(func_occ_map)
            else:
                avg_focc = 0
            logging.info('Average node occupancy is %f%%...' % (avg_focc * 100))

            if avg_focc > FOCC_THRESHOLD:
                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_addr = util.get_service_address(client, 'routing-service')
                add_nodes(client, cfile, ['function'], [1], mon_ips,
                          route_addr=route_addr)

            start = time.time()
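
A sketch of the function-node side of the occupancy reports handled above, assuming nodes PUSH '<ip>|<utilization>' strings to port 7003 on a hypothetical MGMT_IP:

import zmq

MGMT_IP = '10.0.0.1'  # hypothetical

context = zmq.Context(1)
occ_sock = context.socket(zmq.PUSH)
occ_sock.connect('tcp://%s:7003' % MGMT_IP)
occ_sock.send_string('10.0.3.4|0.37')  # this node is at 37% occupancy
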
Example #10
def create_cluster(txn_count, keynode_count, rtr_count, worker_count, lb_count,
                   benchmark_count, config_file, branch_name, ssh_key,
                   cluster_name, kops_bucket, aws_key_id, aws_key,
                   anna_config_file):
    prefix = './'
    util.run_process(['./create_cluster_object.sh', kops_bucket, ssh_key],
                     'kops')

    client, apps_client = util.init_k8s()

    print('Creating Monitor Node...')
    add_nodes(client, apps_client, config_file, "monitor", 1, aws_key_id,
              aws_key, True, prefix, branch_name)

    print('Creating %d Anna Routing Nodes...' % (rtr_count))
    add_nodes(client, apps_client, anna_config_file, "routing", rtr_count,
              aws_key_id, aws_key, True, prefix, branch_name)

    print('Creating routing service...')
    service_spec = util.load_yaml('yaml/services/routing.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)
    util.get_service_address(client, 'routing-service')

    print('Creating %d Key Nodes...' % (keynode_count))
    add_nodes(client, apps_client, config_file, "keynode", keynode_count,
              aws_key_id, aws_key, True, prefix, branch_name)

    print('Creating %d Worker Nodes...' % (worker_count))
    add_nodes(client, apps_client, config_file, "worker", worker_count,
              aws_key_id, aws_key, True, prefix, branch_name)

    print('Creating Worker Service...')
    service_spec = util.load_yaml('yaml/services/worker.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)
    util.get_service_address(client, 'worker-service')

    print('Creating %d TASC nodes...' % (txn_count))
    add_nodes(client, apps_client, config_file, 'tasc', txn_count, aws_key_id,
              aws_key, True, prefix, branch_name)

    print('Creating %d Load Balancers...' % (lb_count))
    add_nodes(client, apps_client, config_file, 'lb', lb_count, aws_key_id,
              aws_key, True, prefix, branch_name)

    print('Creating TASC Load Balancing service...')
    service_spec = util.load_yaml('yaml/services/tasc.yml', prefix)
    client.create_namespaced_service(namespace=util.NAMESPACE,
                                     body=service_spec)

    print('Creating %d Benchmark nodes...' % (benchmark_count))
    add_nodes(client, apps_client, config_file, 'benchmark', benchmark_count,
              aws_key_id, aws_key, True, prefix, branch_name)

    benchmark_ips = util.get_node_ips(client, 'role=benchmark', 'ExternalIP')
    with open('../cmd/benchmark/benchmarks.txt', 'w+') as f:
        for ip in benchmark_ips:
            f.write(ip + '\n')

    print('Finished creating all pods...')

    sg_name = 'nodes.' + cluster_name
    sg = ec2_client.describe_security_groups(Filters=[{
        'Name': 'group-name',
        'Values': [sg_name]
    }])['SecurityGroups'][0]
    print("Authorizing Ports for TASC...")
    permission = [{
        'FromPort': 0,
        'IpProtocol': 'tcp',
        'ToPort': 65535,
        'IpRanges': [{
            'CidrIp': '0.0.0.0/0'
        }]
    }]

    ec2_client.authorize_security_group_ingress(GroupId=sg['GroupId'],
                                                IpPermissions=permission)

    print('Registering Key Nodes...')
    keynode_pod_ips = util.get_pod_ips(client, 'role=keynode', is_running=True)
    register(client, keynode_pod_ips)

    print("\nThe TASC ELB Endpoint: " +
          util.get_service_address(client, "tasc-service") + "\n")
    print('Finished!')
Example #11
def run():
    context = zmq.Context(1)
    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    # waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:

            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                if len(args) > 3:
                    num_threads = int(args[3])
                else:
                    num_threads = 3

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                os.system('sed -i "s|%s: [0-9][0-9]*|%s: %d|g" %s' %
                          (ntype, ntype, num_threads, cfile))
                os.system('sed -i "s|%s-cap: [0-9][0-9]*|%s-cap: %d|g" %s' %
                          (ntype, ntype, num_threads * 15, cfile))

                add_nodes(client, cfile, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).' %
                             (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % (ip))

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:

            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count ' + count + ' for IP ' + ip +
                         '.')
            restart_pull_socket.send_string(count)

        end = time.time()
        if end - start > THRESHOLD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client)

            start = time.time()