Example #1
def restart(pod_ip):
    # client is a module-level Kubernetes client handle
    pod = util.get_pod_from_ip(client, pod_ip)
    pname = pod.metadata.name
    cname = pod.spec.containers[0].name
    kill_cmd = 'kubectl exec -it %s -c %s -- /sbin/killall5' % (pname, cname)
    subprocess.run(kill_cmd, shell=True)

    pod_ips = util.get_pod_ips(client, selector='role=aft', is_running=True)
    while pod_ip not in pod_ips:
        pod_ips = util.get_pod_ips(client,
                                   selector='role=aft',
                                   is_running=True)

    # Send config file to the pod
    retry = 0
    while True:
        try:
            sendConfig(pod_ip, None)
            break
        except Exception as e:
            retry += 1
            print('Caught exception')
            if retry >= 5:
                print('Out of retries...')
                print(e)
                return
            print('Retrying in %d sec' % (retry * 10))
            time.sleep(retry * 10)

    print('Restarted aft node at %s' % pod_ip)
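
The retry loop in restart() is a linear-backoff pattern that recurs in this codebase. As a standalone sketch (not part of the original module), the same logic factors into a reusable helper:

import time

# Sketch: call fn() until it succeeds, backing off 10s, 20s, ... between
# attempts, and re-raise the last exception once the retries run out.
def retry_with_backoff(fn, max_retries=5, base_delay=10):
    for attempt in range(1, max_retries + 1):
        try:
            return fn()
        except Exception:
            if attempt >= max_retries:
                raise
            print('Retrying in %d sec' % (attempt * base_delay))
            time.sleep(attempt * base_delay)

With this helper, the config-push step above reduces to retry_with_backoff(lambda: sendConfig(pod_ip, None)).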
Example #2
def run():
    context = zmq.Context(1)
    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)

    # waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    while True:
        socks = dict(poller.poll())

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:

            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                add_nodes(client, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).' %
                             (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % (ip))
        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:

            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count %s for IP %s.' %
                         (count, ip))
            restart_pull_socket.send_string(count)
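
A minimal sketch of a client driving this management loop, with the message formats inferred from how run() parses them: churn requests are 'add:<num>:<ntype>' or 'remove:<ip>:<ntype>' strings pushed to port 7001, and restart-count queries are '<cmd>:<ip>' requests on port 7000. The address and the 'aft' node type are assumptions based on the role selectors used elsewhere on this page.

import zmq

MGMT_IP = '127.0.0.1'  # hypothetical management-server address
context = zmq.Context(1)

# Fire-and-forget churn requests go to the PULL socket on port 7001.
churn_socket = context.socket(zmq.PUSH)
churn_socket.connect('tcp://%s:7001' % MGMT_IP)
churn_socket.send_string('add:2:aft')            # add two aft nodes
churn_socket.send_string('remove:10.0.0.5:aft')  # remove a node by IP

# Restart-count queries use the REQ/REP pair on port 7000.
restart_socket = context.socket(zmq.REQ)
restart_socket.connect('tcp://%s:7000' % MGMT_IP)
restart_socket.send_string('restart-count:10.0.0.5')
print(restart_socket.recv_string())              # the pod's restart count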
Example #3
def remove_node(ip, ntype):
    client, _ = util.init_k8s()

    pod = util.get_pod_from_ip(client, ip)
    hostname = 'ip-%s.ec2.internal' % (ip.replace('.', '-'))

    podname = pod.metadata.name
    client.delete_namespaced_pod(name=podname, namespace=util.NAMESPACE,
                                 body=k8s.client.V1DeleteOptions())
    client.delete_node(name=hostname, body=k8s.client.V1DeleteOptions())

    prev_count = util.get_previous_count(client, ntype)
    util.run_process(['./modify_ig.sh', ntype, str(prev_count - 1)])
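
The hostname derivation above assumes EC2's default internal DNS scheme in us-east-1 (other regions use '<region>.compute.internal'). A tiny sketch of the mapping:

# Sketch: derive the Kubernetes node name from a pod's IP, as remove_node()
# does inline above.
def ec2_hostname(ip):
    return 'ip-%s.ec2.internal' % ip.replace('.', '-')

assert ec2_hostname('10.0.0.5') == 'ip-10-0-0-5.ec2.internal'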
Example #4
def sendConfig(nodeIP, configFile):
    # client is a module-level Kubernetes client handle
    pod = util.get_pod_from_ip(client, nodeIP)
    pname = pod.metadata.name
    # There is only 1 container in each Pod
    cname = pod.spec.containers[0].name

    cfile = configFile if configFile else BASE_CONFIG_FILE
    os.system('cp %s %s' % (cfile, CONFIG_FILE))

    util.copy_file_to_pod(client, CONFIG_FILE[2:], pname, POD_CONFIG_DIR,
                          cname)

    os.system('rm ' + CONFIG_FILE)
Example #5
def sendConfig(nodeIP, configFile):
    pod = util.get_pod_from_ip(client, nodeIP)
    pname = pod.metadata.name
    # There is only 1 container in each Pod
    cname = pod.spec.containers[0].name

    cfile = configFile if configFile else BASE_CONFIG_FILE
    os.system('cp %s %s' % (cfile, CONFIG_FILE))

    util.copy_file_to_pod(client, CONFIG_FILE[2:], pname, POD_CONFIG_DIR,
                          cname)

    os.system('rm ' + CONFIG_FILE)

    # Send the list of replica IPs (replicas.txt) to the pod
    replica_ips = util.get_node_ips(client, 'role=aft', 'ExternalIP')
    with open('replicas.txt', 'w') as f:
        for ip in replica_ips:
            f.write(ip + '\n')

    util.copy_file_to_pod(client, 'replicas.txt', pname,
                          '/go/src/github.com/tajshaik24/aft', 'aft-container')
    os.system('rm replicas.txt')
Example #6
def run():
    context = zmq.Context(1)

    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    list_executors_socket = context.socket(zmq.REP)
    list_executors_socket.bind('tcp://*:7002')

    function_status_socket = context.socket(zmq.PULL)
    function_status_socket.bind('tcp://*:7003')

    list_schedulers_socket = context.socket(zmq.REP)
    list_schedulers_socket.bind('tcp://*:7004')

    executor_depart_socket = context.socket(zmq.PULL)
    executor_depart_socket.bind('tcp://*:7005')

    executor_statistics_socket = context.socket(zmq.PULL)
    executor_statistics_socket.bind('tcp://*:7006')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)
    poller.register(function_status_socket, zmq.POLLIN)
    poller.register(list_executors_socket, zmq.POLLIN)
    poller.register(list_schedulers_socket, zmq.POLLIN)
    poller.register(executor_depart_socket, zmq.POLLIN)
    poller.register(executor_statistics_socket, zmq.POLLIN)

    add_push_socket = context.socket(zmq.PUSH)
    add_push_socket.connect('ipc:///tmp/node_add')

    remove_push_socket = context.socket(zmq.PUSH)
    remove_push_socket.connect('ipc:///tmp/node_remove')

    # waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    # track the self-reported status of each function execution thread
    executor_statuses = {}
    departing_executors = {}
    function_frequencies = {}
    function_runtimes = {}
    latency_history = {}

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:
            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                # forward as '<ntype>:<num>', matching the '<ntype>:<ip>'
                # format used for removals (see the 'function:' + ip message
                # in the executor-departure handler below)
                msg = args[2] + ':' + args[1]
                add_push_socket.send_string(msg)
            elif args[0] == 'remove':
                msg = args[2] + ':' + args[1]
                remove_push_socket.send_string(msg)

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:
            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count %s for IP %s.' % (count, ip))
            restart_pull_socket.send_string(count)

        if list_executors_socket in socks and socks[list_executors_socket] == \
                zmq.POLLIN:
            # it doesn't matter what is in this message
            msg = list_executors_socket.recv()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=function'):
                ks.keys.append(ip)

            list_executors_socket.send(ks.SerializeToString())

        if function_status_socket in socks and \
                socks[function_status_socket] == zmq.POLLIN:
            status = ThreadStatus()
            status.ParseFromString(function_status_socket.recv())

            key = (status.ip, status.tid)

            # if this executor is one of the ones that's currently departing,
            # we can just ignore its status updates since we don't want
            # utilization to be skewed downwards
            if key in departing_executors:
                continue

            executor_statuses[key] = status
            logging.info(('Received thread status update from %s:%d: %.4f '
                          'occupancy, %d functions pinned') %
                         (status.ip, status.tid, status.utilization,
                          len(status.functions)))

        if list_schedulers_socket in socks and socks[list_schedulers_socket] == \
                zmq.POLLIN:
            # It doesn't matter what is in this message
            msg = list_schedulers_socket.recv_string()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=scheduler'):
                ks.keys.append(ip)

            list_schedulers_socket.send(ks.SerializeToString())

        if executor_depart_socket in socks and \
                socks[executor_depart_socket] == zmq.POLLIN:
            ip = executor_depart_socket.recv_string()
            departing_executors[ip] -= 1

            # wait until all the executors on this IP have cleared their queues
            # and left; then we remove the node
            if departing_executors[ip] == 0:
                msg = 'function:' + ip
                remove_push_socket.send_string(msg)
                del departing_executors[ip]

        if executor_statistics_socket in socks and \
                socks[executor_statistics_socket] == zmq.POLLIN:
            stats = ExecutorStatistics()
            stats.ParseFromString(executor_statistics_socket.recv())

            for fstats in stats.statistics:
                fname = fstats.fname

                if fname not in function_frequencies:
                    function_frequencies[fname] = 0
                    function_runtimes[fname] = 0.0

                function_frequencies[fname] += fstats.call_count
                function_runtimes[fname] += fstats.runtime

        end = time.time()
        if end - start > REPORT_PERIOD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client, add_push_socket)

            check_executor_utilization(client, context, executor_statuses,
                    departing_executors, add_push_socket)

            check_function_load(context, function_frequencies, function_runtimes,
                    executor_statuses, latency_history)
            start = time.time()
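
The list_executors and list_schedulers handlers reply with a serialized KeySet protobuf. A sketch of the querying side, assuming KeySet is importable from the project's generated protobuf module (the import path and address below are hypothetical):

import zmq

from shared_pb2 import KeySet  # hypothetical import path

MGMT_IP = '127.0.0.1'  # hypothetical management-server address
context = zmq.Context(1)

# Ask for the current executor IPs on port 7002; the server ignores the
# request body, so an empty message suffices.
sock = context.socket(zmq.REQ)
sock.connect('tcp://%s:7002' % MGMT_IP)
sock.send(b'')

ks = KeySet()
ks.ParseFromString(sock.recv())
print(list(ks.keys))  # one IP per executor pod with role=function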
Example #7
def run():
    context = zmq.Context(1)
    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    extant_caches_socket = context.socket(zmq.REP)
    extant_caches_socket.bind('tcp://*:7002')

    func_pull_socket = context.socket(zmq.PULL)
    func_pull_socket.bind('tcp://*:7003')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)
    poller.register(func_pull_socket, zmq.POLLIN)
    poller.register(extant_caches_socket, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    # waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    func_occ_map = {}

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:

            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                if len(args) > 3:
                    num_threads = int(args[3])
                else:
                    num_threads = 3

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                os.system('sed -i "s|%s: [0-9][0-9]*|%s: %d|g" %s' %
                          (ntype, ntype, num_threads, cfile))
                # keep the '-cap' suffix in the replacement so the YAML key
                # is preserved
                os.system('sed -i "s|%s-cap: [0-9][0-9]*|%s-cap: %d|g" %s' %
                          (ntype, ntype, num_threads * 15, cfile))

                add_nodes(client, cfile, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).' % (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % (ip))

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:

            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count %s for IP %s.' % (count, ip))
            restart_pull_socket.send_string(count)

        if extant_caches_socket in socks and socks[extant_caches_socket] == \
                zmq.POLLIN:

            # It doesn't matter what is in this message
            msg = extant_caches_socket.recv_string()

            ks = KeySet()
            for ip in util.get_pod_ips(client, 'role=function'):
                ks.keys.append(ip)

            # SerializeToString() returns bytes, so send raw rather than
            # send_string
            extant_caches_socket.send(ks.SerializeToString())

        if func_pull_socket in socks and socks[func_pull_socket] == zmq.POLLIN:
            msg = func_pull_socket.recv_string()
            args = msg.split('|')
            ip, mutil = args[0], float(args[1])

            logging.info('Received node occupancy of %.2f%% from IP %s.' %
                    (mutil * 100, ip))

            func_occ_map[ip] = mutil

        end = time.time()
        if end - start > THRESHOLD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client)

            if func_occ_map:
                avg_focc = sum(func_occ_map.values()) / len(func_occ_map)
            else:
                avg_focc = 0
            logging.info('Average node occupancy is %f%%...' % (avg_focc * 100))

            if avg_focc > FOCC_THRESHOLD:
                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_addr = get_service_address(client, 'routing-service')
                add_nodes(client, ['function'], [1], mon_ips,
                        route_addr=route_addr)

            start = time.time()
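
The func_pull_socket handler expects occupancy reports as '<ip>|<utilization>' strings. A sketch of the reporting side (the address is again a hypothetical placeholder):

import zmq

MGMT_IP = '127.0.0.1'  # hypothetical management-server address
context = zmq.Context(1)

# Push this node's occupancy to the management server on port 7003.
sock = context.socket(zmq.PUSH)
sock.connect('tcp://%s:7003' % MGMT_IP)
sock.send_string('10.0.0.5|0.42')  # this node is 42% occupied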
Example #8
def run():
    context = zmq.Context(1)
    restart_pull_socket = context.socket(zmq.REP)
    restart_pull_socket.bind('tcp://*:7000')

    churn_pull_socket = context.socket(zmq.PULL)
    churn_pull_socket.bind('tcp://*:7001')

    poller = zmq.Poller()
    poller.register(restart_pull_socket, zmq.POLLIN)
    poller.register(churn_pull_socket, zmq.POLLIN)

    cfile = '/fluent/conf/kvs-base.yml'

    # waits until the kubecfg file gets copied into the pod -- this might be
    # brittle if we try to move to a non-Ubuntu setting, but I'm not worried
    # about that for now
    while not os.path.isfile('/root/.kube/config'):
        pass

    client = util.init_k8s()

    start = time.time()
    while True:
        socks = dict(poller.poll(timeout=1000))

        if churn_pull_socket in socks and socks[churn_pull_socket] == \
                zmq.POLLIN:

            msg = churn_pull_socket.recv_string()
            args = msg.split(':')

            if args[0] == 'add':
                num = int(args[1])
                ntype = args[2]
                logging.info('Adding %d new %s node(s)...' % (num, ntype))

                if len(args) > 3:
                    num_threads = int(args[3])
                else:
                    num_threads = 3

                mon_ips = util.get_pod_ips(client, 'role=monitoring')
                route_ips = util.get_pod_ips(client, 'role=routing')

                os.system('sed -i "s|%s: [0-9][0-9]*|%s: %d|g" %s' %
                          (ntype, ntype, num_threads, cfile))
                # keep the '-cap' suffix in the replacement so the YAML key
                # is preserved
                os.system('sed -i "s|%s-cap: [0-9][0-9]*|%s-cap: %d|g" %s' %
                          (ntype, ntype, num_threads * 15, cfile))

                add_nodes(client, cfile, [ntype], [num], mon_ips, route_ips)
                logging.info('Successfully added %d %s node(s).' %
                             (num, ntype))
            elif args[0] == 'remove':
                ip = args[1]
                ntype = args[2]

                remove_node(ip, ntype)
                logging.info('Successfully removed node %s.' % (ip))

        if restart_pull_socket in socks and socks[restart_pull_socket] == \
                zmq.POLLIN:

            msg = restart_pull_socket.recv_string()
            args = msg.split(':')

            ip = args[1]
            pod = util.get_pod_from_ip(client, ip)

            count = str(pod.status.container_statuses[0].restart_count)

            logging.info('Returning restart count %s for IP %s.' %
                         (count, ip))
            restart_pull_socket.send_string(count)

        end = time.time()
        if end - start > THRESHOLD:
            logging.info('Checking hash ring...')
            check_hash_ring(client, context)

            logging.info('Checking for extra nodes...')
            check_unused_nodes(client)

            start = time.time()