Example #1
0
    def schedule_link_join(self, time, graph, origin, destination):
        current_graph = self.get_current_graph(graph)
        new_graph = self.initialize_new_graph(current_graph)

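        # Move the link(s) matching origin--destination (in either direction) from removed_links back into the active link list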
        joining_links = []
        for link in new_graph.removed_links:
            if ((link.source.name == origin and link.destination.name == destination)
                    or (link.source.name == destination and link.destination.name == origin)):
                joining_links.append(link)
        for link in joining_links:
            new_graph.removed_links.remove(link)
            new_graph.links.append(link)

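        # Re-attach each rejoining link to the service instances and bridges it originates from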
        for l in joining_links:
            for node in new_graph.services:
                if l.source.name == node:
                    for nodeinstance in new_graph.services[node]:
                        nodeinstance.links.append(l)
            for bridge in new_graph.bridges:
                if l.source == new_graph.bridges[bridge][0]:
                    new_graph.bridges[bridge][0].links.append(l)

        self.recompute_and_store(new_graph, time)

        print_message("Link " + origin + "--" + destination +
                      " scheduled to join at " + str(time))
        self.graph_changes.append((time, [graph, new_graph]))
Example #2
0
    def schedule_link_leave(self, time, graph, origin, destination):
        current_graph = self.get_current_graph(graph)
        new_graph = self.initialize_new_graph(current_graph)

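        # Collect the link(s) matching origin--destination (in either direction), then detach them from services and bridges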
        for l in new_graph.links:
            if (l.source.name == origin and l.destination.name
                    == destination) or (l.source.name == destination
                                        and l.destination.name == origin):
                new_graph.removed_links.append(l)
        for l in new_graph.removed_links:
            if l in new_graph.links:  # and has not been removed before
                new_graph.links.remove(l)
                for node in new_graph.services:
                    for nodeinstance in new_graph.services[node]:
                        if l in nodeinstance.links:
                            nodeinstance.links.remove(l)
                for bridge in new_graph.bridges:
                    if l in new_graph.bridges[bridge][0].links:
                        new_graph.bridges[bridge][0].links.remove(l)

        self.recompute_and_store(new_graph, time)

        print_message("Link " + origin + "--" + destination +
                      " scheduled to leave at " + str(time))
        self.graph_changes.append((time, [graph, new_graph]))
Example #3
0
    def __init__(self, net_graph, event_scheduler):
        self.graph = net_graph  # type: NetGraph
        self.scheduler = event_scheduler  # type: EventScheduler
        self.active_paths = []  # type: List[NetGraph.Path]
        self.active_paths_ids = []  # type: List[int]
        self.flow_accumulator = {
        }  # type: Dict[str, List[List[int], int, int]]
        self.state_lock = Lock()
        self.last_time = 0
        # self.delayed_flows = 0

        EmulationCore.POOL_PERIOD = float(
            environ.get(ENVIRONMENT.POOL_PERIOD,
                        str(EmulationCore.POOL_PERIOD)))
        EmulationCore.ITERATIONS_TO_INTEGRATE = int(
            environ.get(ENVIRONMENT.ITERATION_COUNT,
                        str(EmulationCore.ITERATIONS_TO_INTEGRATE)))

        print_message("Pool Period: " + str(EmulationCore.POOL_PERIOD))
        # print_message("Iteration Count: " + str(EmulationCore.ITERATIONS_TO_INTEGRATE))

        self.check_flows_time_delta = 0
        # We need to give the callback a reference to ourselves (kind of hackish...)
        global emuManager
        emuManager = self

        if getenv('RUNTIME_EMULATION', 'true') != 'false':
            self.comms = CommunicationsManager(self.collect_flow, self.graph,
                                               self.scheduler)
Example #4
0
def path_change(graphs):
    start = time()
    graph = graphs[0]
    new_graph = graphs[1]
    try:
        #is a service not reachable after this change? Then set packet loss to 100%
        to_remove = []
        for service in graph.paths:
            if service not in new_graph.paths and isinstance(
                    service, NetGraph.Service):
                to_remove.append(service)
                change_loss(service, 1.0)
        for service in to_remove:
            del graph.paths[service]

        graph.links_by_index = new_links_by_index(
            new_graph.links_by_index,
            graph.links_by_index)  #update necessary??

        #apply paths that do exist now and *were* already in the last graph...
        new_paths_by_id = {}
        for service in new_graph.paths:
            if service in graph.paths:
                if isinstance(service,
                              NetGraph.Service) and not service == graph.root:
                    current_bw = graph.paths[service].current_bandwidth
                    new_path = new_graph.paths[service]
                    with graph.paths[service].lock:
                        new_path.links = [
                            graph.links_by_index[link.index]
                            for link in new_path.links
                        ]
                        graph.paths[service] = new_path
                        graph.paths[
                            service].current_bandwidth = current_bw  #the new paths have the clean maximum computed. Here we need the bookkeeping of the old path.
                        change_loss(service, graph.paths[service].drop)
                        change_latency(service, graph.paths[service].latency,
                                       graph.paths[service].jitter)
                    new_paths_by_id[new_path.id] = new_path
            #... or not
            else:  # service is now reachable after not having been reachable
                if isinstance(service, NetGraph.Service):
                    new_path = new_graph.paths[service]
                    with new_path.lock:
                        new_path.links = update_links(
                            [link.index for link in new_path.links],
                            graph.links_by_index)
                        new_path.current_bandwidth = 0
                        graph.paths[service] = new_path
                        change_loss(service, new_path.drop)
                        change_latency(service, new_path.latency,
                                       new_path.jitter)
                    new_paths_by_id[new_path.id] = new_path
        graph.paths_by_id = new_paths_by_id
        graph.links_by_index = new_graph.links_by_index  #update necessary??
    except Exception as e:
        print_message("Error updating paths: " + str(e))
    end = time()
    print_message("recalculated in " + '{p:.4f}'.format(p=end - start))
Example #5
0
    def schedule_bridge_join(self, time, graph, name):
        current_graph = self.get_current_graph(graph)
        new_graph = self.initialize_new_graph(current_graph)

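        # Move the bridge from removed_bridges back into the active bridge map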
        bridge = new_graph.removed_bridges[name]
        del new_graph.removed_bridges[name]
        new_graph.bridges[name] = bridge

        self.recompute_and_store(new_graph, time)

        print_message("Bridge " + name + " scheduled to join back at " +
                      str(time))
        self.graph_changes.append((time, [graph, new_graph]))
Example #6
0
    def schedule_new_link(self, time, graph, source, destination, latency,
                          jitter, drop, bandwidth, network):
        current_graph = self.get_current_graph(graph)
        new_graph = self.initialize_new_graph(current_graph)

        new_graph.new_link(source, destination, latency, jitter, drop,
                           bandwidth, network)

        self.recompute_and_store(new_graph, time)

        print_message("Link " + source + "--" + destination +
                      " scheduled to newly join at " + str(time))
        self.graph_changes.append((time, [graph, new_graph]))
Example #7
0
    def schedule_bridge_leave(self, time, graph, name):
        current_graph = self.get_current_graph(graph)
        new_graph = self.initialize_new_graph(current_graph)

        #hack to still find the bridge in XMLGraphParser at startup time
        #it doesn't matter to have it there, because it will be overwritten by dynamic changes at runtime
        graph.removed_bridges[name] = new_graph.bridges[name]

        new_graph.removed_bridges[name] = new_graph.bridges[name]
        del new_graph.bridges[name]

        self.recompute_and_store(new_graph, time)

        print_message("Bridge " + name + " scheduled to leave at " + str(time))
        self.graph_changes.append((time, [graph, new_graph]))
Example #8
0
    def start_god_container(self, label):
        while True:
            try:
                # If we are bootstrapper:
                us = None
                while not us:
                    containers = self.high_level_client.containers.list()
                    for container in containers:
                        if "boot" + label in container.labels:
                            us = container

                    sleep(1)

                boot_image = us.image

                inspect_result = self.low_level_client.inspect_container(us.id)
                env = inspect_result["Config"]["Env"]

                print_message("[Py (bootstrapper)] ip: " + str(socket.gethostbyname(socket.gethostname())))

                # create a "God" container that is in the host's Pid namespace
                self.high_level_client.containers.run(image=boot_image,
                                                      command=["-g", label, str(us.id)],
                                                      privileged=True,
                                                      pid_mode="host",
                                                      network="host",
                                                      shm_size=int(os.getenv('SHM_SIZE', '8000000000')),
                                                      remove=True,
                                                      name="god_" + str(random.getrandbits(64)),  # grep friendly
                                                      environment=env,
                                                      volumes_from=[us.id],
                                                      # network_mode="container:"+us.id,  # share the network stack with this container
                                                      # network='test_overlay',
                                                      labels=["god" + label],
                                                      detach=True)
                                                      # stderr=True,
                                                      # stdout=True)

                print_named("bootstrapper", "Started God container. Waiting for experiment to finish...")

                pause()
                return

            except Exception as e:
                print_error(e)
                sleep(5)
                continue  # If we get any exceptions try again
Example #9
0
    def schedule_link_change(self, time, graph, origin, destination, bandwidth,
                             latency, jitter, drop):
        current_graph = self.get_current_graph(graph)
        new_graph = self.initialize_new_graph(current_graph)

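        # Negative argument values are sentinels meaning "keep the link's current value" for that property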
        for link in new_graph.links:
            if link.source.name == origin and link.destination.name == destination:
                link.bandwidth_bps = bandwidth if bandwidth >= 0 else link.bandwidth_bps
                link.latency = float(latency) if latency >= 0 else link.latency
                link.jitter = float(jitter) if jitter >= 0 else link.jitter
                link.drop = float(drop) if drop >= 0 else link.drop

        self.recompute_and_store(new_graph, time)

        print_message("Link " + origin + "--" + destination +
                      " scheduled to change at " + str(time))
        self.graph_changes.append((time, [graph, new_graph]))
Example #10
0
    def parse_schedule(self, service, graph):
        """
        :param service: NetGraph.Service
        :return:
        """
        XMLtree = ET.parse(self.file)
        root = XMLtree.getroot()
        if root.tag != 'experiment':
            print_and_fail(
                'Not a valid Kollaps topology file, root is not <experiment>')

        dynamic = None

        for child in root:
            if child.tag == 'dynamic':
                if dynamic is not None:
                    print_and_fail("Only one <dynamic> block is allowed.")
                dynamic = child

        scheduler = EventScheduler()
        first_join = -1.0
        first_leave = float('inf')

        # if there is no dynamic block then this instance joins straight away
        if dynamic is None:
            scheduler.schedule_join(0.0)
            return scheduler

        seed(12345)
        replicas = []
        for i in range(service.replica_count):
            replicas.append(
                [False, False,
                 False])  # Joined = False, Disconnected = False, Used = False

        # indexes for replicas entries
        JOINED = 0
        DISCONNECTED = 1
        USED = 2
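        # e.g. replicas[i] == [True, False, True] means replica i has joined, is not disconnected, and has already been used once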

        # there is a dynamic block, so check if there is anything scheduled for us
        for event in dynamic:
            if event.tag != 'schedule':
                print_and_fail("Only <schedule> is allowed inside <dynamic>")

            # parse time of event
            time = 0.0
            try:
                time = float(event.attrib['time'])
                if time < 0.0:
                    print_and_fail("time attribute must be a positive number")
            except ValueError as e:
                print_and_fail("time attribute must be a valid real number")

            if 'name' in event.attrib and 'time' in event.attrib and 'action' in event.attrib:
                node_name = event.attrib['name']
                bridge_names = []
                for bridge in list(graph.bridges.keys()) + list(
                        graph.removed_bridges.keys()):
                    bridge_names.append(bridge)

                # if a bridge is scheduled
                if node_name in bridge_names:
                    if event.attrib['action'] == 'join':
                        scheduler.schedule_bridge_join(time, graph, node_name)
                    elif event.attrib['action'] == 'leave':
                        scheduler.schedule_bridge_leave(time, graph, node_name)
                    continue

                # parse name of service. only process actions that target us
                if node_name != service.name:
                    continue

                # parse amount of replicas affected
                amount = 1
                if 'amount' in event.attrib:
                    amount = int(event.attrib['amount'])

                # parse action
                if event.attrib['action'] == 'join':
                    for i in range(amount):
                        available = False
                        id = 0
                        # Pick a random replica
                        while (not available):
                            id = randrange(0, service.replica_count)
                            available = not replicas[id][JOINED]
                            if not service.reuse_ip:
                                available = available and not replicas[id][USED]

                        # Mark the state
                        replicas[id][JOINED] = True
                        if not service.reuse_ip:
                            replicas[id][USED] = True

                        # if it's us, schedule the action
                        if service.replica_id == id:
                            scheduler.schedule_join(time)
                            print_message(service.name + " replica " +
                                          str(service.replica_id) +
                                          " scheduled to join at " + str(time))
                        if first_join < 0.0:
                            first_join = time

                elif event.attrib['action'] == 'leave' or event.attrib[
                        'action'] == 'crash':
                    for i in range(amount):
                        up = False
                        id = 0
                        # Pick a random replica
                        while (not up):
                            id = randrange(0, service.replica_count)
                            up = replicas[id][JOINED]

                        # Mark the state
                        replicas[id][JOINED] = False

                        # if it's us, schedule the action
                        if service.replica_id == id:
                            if event.attrib['action'] == 'leave':
                                scheduler.schedule_leave(time)
                                print_message(service.name + " replica " +
                                              str(service.replica_id) +
                                              " scheduled to leave at " +
                                              str(time))
                            elif event.attrib['action'] == 'crash':
                                scheduler.schedule_crash(time)
                                print_message(service.name + " replica " +
                                              str(service.replica_id) +
                                              " scheduled to crash at " +
                                              str(time))
                        if first_leave > time:
                            first_leave = time

                elif event.attrib['action'] == 'reconnect':
                    for i in range(amount):
                        disconnected = False
                        id = 0
                        # Pick a random replica
                        while (not disconnected):
                            id = randrange(0, service.replica_count)
                            disconnected = replicas[id][DISCONNECTED]

                        # Mark the state
                        replicas[id][DISCONNECTED] = False

                        # if it's us, schedule the action
                        if service.replica_id == id:
                            print_message(service.name + " replica " +
                                          str(service.replica_id) +
                                          " scheduled to reconnect at " +
                                          str(time))
                            scheduler.schedule_reconnect(time)

                elif event.attrib['action'] == 'disconnect':
                    for i in range(amount):
                        connected = False
                        id = 0
                        # Pick a random replica
                        while (not connected):
                            id = randrange(0, service.replica_count)
                            connected = replicas[id][
                                JOINED] and not replicas[id][DISCONNECTED]

                        # Mark the state
                        replicas[id][DISCONNECTED] = True

                        # if it's us, schedule the action
                        if service.replica_id == id:
                            print_message(service.name + " replica " +
                                          str(service.replica_id) +
                                          " scheduled to disconnect at " +
                                          str(time))
                            scheduler.schedule_disconnect(time)
                else:
                    print_and_fail(
                        "Unrecognized action: " + event.attrib['action'] +
                        " , allowed actions are join, leave, crash, disconnect, reconnect"
                    )

            #Do something dynamically with a link
            elif 'origin' in event.attrib and 'dest' in event.attrib and 'time' in event.attrib:

                #parse origin and destination
                origin = event.attrib['origin']
                destination = event.attrib['dest']

                if 'action' in event.attrib:  #link is joining or leaving
                    if event.attrib['action'] == 'leave':
                        scheduler.schedule_link_leave(time, graph, origin,
                                                      destination)
                    elif event.attrib['action'] == 'join':
                        #Link is already defined but has been removed before
                        if 'upload' not in event.attrib and 'latency' not in event.attrib:
                            scheduler.schedule_link_join(
                                time, graph, origin, destination)
                        #A completely new link must define at least latency, upload, and network
                        elif 'upload' not in event.attrib or 'latency' not in event.attrib or 'network' not in event.attrib:
                            print_and_fail(
                                "Link description incomplete. For a new link, you must provide at least latency, upload, and network attributes."
                            )
                        else:
                            bandwidth = event.attrib['upload']
                            latency = float(event.attrib['latency'])
                            drop = 0
                            if 'drop' in event.attrib:
                                drop = float(event.attrib['drop'])
                            jitter = 0
                            if 'jitter' in event.attrib:
                                jitter = float(event.attrib['jitter'])
                            network = event.attrib['network']

                            scheduler.schedule_new_link(
                                time, graph, origin, destination, latency,
                                jitter, drop, bandwidth, network)
                            if 'download' in event.attrib:
                                bandwidth = event.attrib['download']
                                scheduler.schedule_new_link(
                                    time, graph, destination, origin, latency,
                                    jitter, drop, bandwidth, network)

                    else:
                        print_and_fail("Unrecognized action for link: " +
                                       event.attrib['action'] +
                                       ", allowed are join and leave")

                else:  #properties of link are changing
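                    # -1 is a sentinel: schedule_link_change keeps the current value for any property left at -1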
                    bandwidth = -1
                    if 'upload' in event.attrib:
                        bandwidth = graph.bandwidth_in_bps(
                            event.attrib['upload'])
                    latency = -1
                    if 'latency' in event.attrib:
                        latency = float(event.attrib['latency'])
                    drop = -1
                    if 'drop' in event.attrib:
                        drop = float(event.attrib['drop'])
                    jitter = -1
                    if 'jitter' in event.attrib:
                        jitter = float(event.attrib['jitter'])

                    scheduler.schedule_link_change(time, graph, origin,
                                                   destination, bandwidth,
                                                   latency, jitter, drop)

            else:
                print_and_fail(
                    '<schedule> must have either name, time, and action attributes,'
                    + ' or origin, dest, time, and link property attributes')

        # deal with auto join
        if first_join < 0.0:
            print_message(service.name + " scheduled to join at " + str(0.0))
            scheduler.schedule_join(0.0)
        if first_leave < first_join:
            print_and_fail("Dynamic: service " + service.name +
                           " leaves before having joined")

        scheduler.schedule_graph_changes()

        return scheduler
Example #11
0
def main():
    if len(sys.argv) < 4:
        print_and_fail("Missing arguments. emucore <topology> <container id>")
    else:
        topology_file = sys.argv[1]
    # For future reference: This topology file must not exceed 512KB otherwise docker refuses
    # to copy it as a config file, this has happened with the 2k scale-free topology...

    setup_container(sys.argv[2], sys.argv[3])

    # Because of the bootstrapper hack we can't get output from the emucore through standard docker logs...
    #sys.stdout = open("/var/log/need.log", "w")
    #sys.stderr = sys.stdout

    graph = NetGraph()

    parser = XMLGraphParser(topology_file, graph)
    parser.fill_graph()
    print_message("Done parsing topology")

    print_message("Resolving hostnames...")
    graph.resolve_hostnames()
    print_message("All hosts found!")

    print_message("Determining the root of the tree...")
    # Get our own ip address and set the root of the "tree"
    ownIP = get_own_ip(graph)
    graph.root = graph.hosts_by_ip[ip2int(ownIP)]

    if graph.root is None:
        print_and_fail(
            "Failed to identify current service instance in topology!")
    print_message("We are " + graph.root.name + "@" + ownIP)

    print_identified(graph, "Calculating shortest paths...")
    graph.calculate_shortest_paths()

    print_message("Parsing dynamic event schedule...")
    scheduler = parser.parse_schedule(graph.root, graph)

    signal(SIGTERM, lambda signum, frame: exit(0))

    print_message("Initializing network emulation...")
    manager = EmulationCore(graph, scheduler)
    manager.initialize()
    print_identified(graph, "Waiting for command to start experiment")
    sys.stdout.flush()
    sys.stderr.flush()

    if getenv('RUNTIME_EMULATION', 'true') != 'false':
        # Enter the emulation loop
        manager.emulation_loop()
Example #12
0
    def receive_dashboard_commands(self):
        self.dashboard_socket.listen()
        while True:
            connection, addr = self.dashboard_socket.accept()
            connection.settimeout(5)
            try:
                data = connection.recv(1)
                if data:
                    command = struct.unpack("<1B", data)[0]
                    if command == CommunicationsManager.STOP_COMMAND:
                        connection.close()
                        with self.stop_lock:
                            print_message("Stopping experiment")
                            self.broadcast_groups = []
                            #TODO Stop is now useless, probably best to just replace with shutdown

                    elif command == CommunicationsManager.SHUTDOWN_COMMAND:
                        print_message("Received Shutdown command")

                        msg = "packets: recv " + str(
                            self.received) + ", prod " + str(self.produced)
                        print_identified(self.graph, msg)

                        connection.send(
                            struct.pack("<3Q", self.produced, 50,
                                        self.received))
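                        # the middle field of the "<3Q" reply is unused; the dashboard only reads produced and received (see stopExperiment)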
                        ack = connection.recv(1)

                        if len(ack) != 1:
                            print_error("Bad ACK len:" + str(len(ack)))
                            connection.close()
                            continue

                        if struct.unpack("<1B",
                                         ack)[0] != CommunicationsManager.ACK:
                            print_error("Bad ACK, not and ACK" +
                                        str(struct.unpack("<1B", ack)))
                            connection.close()
                            continue

                        connection.close()

                        with self.stop_lock:
                            # self.process_pool.terminate()
                            # self.process_pool.join()
                            self.dashboard_socket.close()
                            for s in broadcast_sockets:
                                s.close()

                            # self.sock.close()
                            PathEmulation.tearDown()
                            print_identified(self.graph, "Shutting down")
                            sys.stdout.flush()
                            sys.stderr.flush()
                            stop_experiment()
                            interrupt_main()

                            return

                    elif command == CommunicationsManager.READY_COMMAND:
                        connection.send(
                            struct.pack("<1B", CommunicationsManager.ACK))
                        connection.close()

                    elif command == CommunicationsManager.START_COMMAND:
                        connection.close()
                        print_message("Starting Experiment!")
                        self.scheduler.start()

            except OSError as e:
                continue  # Connection timed out (most likely)
Example #13
0
def stopExperiment():
    with DashboardState.lock:
        if DashboardState.stopping or not DashboardState.ready:
            return
        else:
            DashboardState.stopping = True
    produced = 0
    received = 0
    gaps = []

    to_kill = []
    for node in DashboardState.hosts:
        host = DashboardState.hosts[node]
        if node.supervisor:
            continue
        to_kill.append(host)
    to_stop = to_kill[:]

    # Stop all services
    while to_stop:
        host = to_stop.pop()
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(2)
            s.connect((host.ip, CommunicationsManager.TCP_PORT))
            s.send(struct.pack("<1B", CommunicationsManager.STOP_COMMAND))
            s.close()
            
        except OSError as e:
            print_error(e)
            to_stop.insert(0, host)
            sleep(0.5)

    # Collect sent/received statistics and shutdown
    while to_kill:
        host = to_kill.pop()
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(2)
            s.connect((host.ip, CommunicationsManager.TCP_PORT))
            s.send(struct.pack("<1B", CommunicationsManager.SHUTDOWN_COMMAND))
            data = s.recv(64)
            if len(data) < struct.calcsize("<3Q"):
                s.close()
                print_message("Got less than 24 bytes for counters.")
                to_kill.insert(0, host)
                continue
                
            s.send(struct.pack("<1B", CommunicationsManager.ACK))
            s.close()
            data_tuple = struct.unpack("<3Q", data)
            produced += data_tuple[0]
            received += data_tuple[2]
            with DashboardState.lock:
                host.status = 'Down'
                continue
                
        except OSError as e:
            print_error("timed out\n" + str(e))
            to_kill.insert(0, host)
            sleep(0.5)

    with DashboardState.lock:
    
        print_named("dashboard", "packets: recv " + str(received) + ", prod " + str(produced))
        sys.stdout.flush()
        
        if produced > 0:
            DashboardState.lost_packets = 1-(received/produced)
        else:
            DashboardState.lost_packets = 0
        DashboardState.stopping = False