Exemple #1
0
    def invoke(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
        """Simulate one invocation of ``request`` on ``replica``.

        Claims 10% of the node's ``cpu_millis`` capacity for the duration of the call, registers the request in
        the node's ``current_requests``, waits for a function- and node-dependent amount of simulation time and
        then gives the CPU back and unregisters the request.

        :param env: the simulation environment (``now``, ``timeout`` and ``resource_state`` are used)
        :param replica: the replica that serves the request
        :param request: the request being served
        :return: a generator yielding the timeout event of the invocation
        """
        # you would probably either create one simulator per function, or use a generalized simulator, this is just
        # to demonstrate how the simulators are used to encapsulate simulator behavior.
        node = replica.node

        logger.info('[simtime=%.2f] invoking function %s on node %s', env.now, request, node.name)

        # for full flexibility you decide the resources used
        cpu_millis = node.capacity.cpu_millis * 0.1
        env.resource_state.put_resource(replica, 'cpu', cpu_millis)
        node.current_requests.add(request)

        try:
            if replica.function.name == 'python-pi':
                if node.name.startswith('rpi3'):  # those are nodes we created in basic.example_topology()
                    duration = 20  # invoking this function takes 20 seconds on a raspberry pi
                else:
                    duration = 2  # invoking this function takes 2 seconds on all other nodes in the cluster
            elif replica.function.name == 'resnet50-inference':
                duration = 0.5  # invoking this function takes 500 ms
            else:
                duration = 0
            yield env.timeout(duration)
        finally:
            # also, you have to release them at the end; doing it in ``finally`` makes sure the claim does not
            # leak when the process is interrupted or the generator is closed while waiting on the timeout
            env.resource_state.remove_resource(replica, 'cpu', cpu_millis)
            node.current_requests.remove(request)
Exemple #2
0
    def execute(self, env: Environment, replica: FunctionReplica,
                request: FunctionRequest):
        """Mock a training run as three consecutive fixed delays.

        The download and upload are only mocked here; for an actual network download simulation look at
        simulate_data_download.

        :return: a generator yielding one timeout event per phase
        """
        phase_durations = (
            1,  # mock download
            5,  # training
            1,  # mock upload
        )
        for duration in phase_durations:
            yield env.timeout(duration)
Exemple #3
0
    def teardown(self, env: Environment, replica: FunctionReplica):
        """Remove the replica's baseline cpu and memory usage, then yield a zero-delay timeout."""
        baseline_usage = (
            ('cpu', 0.08),  # basic cpu usage, in %
            ('memory', 200),  # basic memory consumption, in MB
        )
        for resource, amount in baseline_usage:
            env.resource_state.remove_resource(replica, resource, amount)

        yield env.timeout(0)
Exemple #4
0
    def startup(self, env: Environment, replica: FunctionReplica):
        """Log the start of the replica and wait a fixed delay that stands in for the docker startup."""
        message = '[simtime=%.2f] starting up function replica for function %s'
        logger.info(message, env.now, replica.function.name)

        # you could create a very fine-grained setup routines here
        docker_startup = 10  # simulate docker startup
        yield env.timeout(docker_startup)
 def solve(self,
           env: Environment) -> Generator[simpy.events.Event, Any, Any]:
     """Run the wrapped solver once per reconcile interval for as long as ``self.running`` is truthy.

     Each round first waits ``self.reconcile_interval`` and then delegates to ``self.solver.solve(env)``.
     ``self.stop()`` is called after every round (presumably this clears ``self.running`` - verify against
     the class).
     """
     while True:
         if not self.running:
             return

         pause = env.timeout(self.reconcile_interval)
         yield pause
         yield from self.solver.solve(env)

         # TODO remove when contention is implemented
         self.stop()
Exemple #6
0
def faas_idler(env: Environment,
               inactivity_duration=300,
               reconcile_interval=30):
    """
    Periodically suspend idle functions of deployments that allow scaling to zero.

    Modelled after the OpenFaaS faas-idler:
    https://github.com/openfaas-incubator/faas-idler
    https://github.com/openfaas-incubator/faas-idler/blob/master/main.go

    default values of the original:
    https://github.com/openfaas-incubator/faas-idler/blob/668991c532156275993399ee79a297a4c2d651ec/docker-compose.yml

    :param env: the faas environment
    :param inactivity_duration: idle time (simulation time since the last invocation) from which on a function
        gets suspended
    :param reconcile_interval: simulation time to wait between two checks
    :return: an event generator
    """
    faas: FaasSystem = env.faas
    while True:
        yield env.timeout(reconcile_interval)

        for deployment in faas.get_deployments():
            if not deployment.scale_zero:
                # this deployment must not be scaled to zero
                continue

            for function in deployment.function_definitions.values():
                name = function.name
                running_replicas = faas.get_replicas(name, FunctionState.RUNNING)
                if len(running_replicas) == 0:
                    # nothing is running, so there is nothing to suspend
                    continue

                idle_time = env.now - env.metrics.last_invocation[name]
                if idle_time >= inactivity_duration:
                    env.process(faas.suspend(name))
                    logger.debug('%.2f function %s has been idle for %.2fs',
                                 env.now, name, idle_time)
Exemple #7
0
def simulate_data_upload(env: Environment, replica: FunctionReplica):
    """Simulate the upload of a replica's result data to the storage node that holds its target path.

    Size and path are read from the pod labels ``data.skippy.io/sends-to-storage`` and
    ``data.skippy.io/sends-to-storage/path``. Without the size label nothing is simulated. If the storage
    node is the replica's own node only a fixed-bandwidth delay is simulated; otherwise a network flow along
    the route to the storage node is started and logged in the metrics.

    :param env: the simulation environment
    :param replica: the replica that uploads data
    :return: an event generator
    """
    size_label = 'data.skippy.io/sends-to-storage'
    path_label = 'data.skippy.io/sends-to-storage/path'

    ether_node = replica.node.ether_node
    t_start = env.now
    labels = replica.pod.spec.labels

    if size_label not in labels:
        return

    # FIXME: storage
    size = parse_size_string(labels[size_label])
    path = labels[path_label]

    # the first storage node returned for the path is used as the upload target
    storage_node_name = env.cluster.get_storage_nodes(path)[0]
    logger.debug('%.2f replica %s uploading data %s to %s', env.now, ether_node,
                 path, storage_node_name)

    if storage_node_name == ether_node.name:
        # FIXME this is essentially a disk read and not a network connection
        local_bandwidth = 1.25e+8  # 1.25e+8 = 1 GBit/s
        yield env.timeout(size / local_bandwidth)
        return

    storage_node = env.cluster.get_node(storage_node_name)
    route = env.topology.route_by_node_name(ether_node.name, storage_node.name)
    transfer = SafeFlow(env, size, route)
    yield transfer.start()

    # account the transferred bytes on every hop, and the flow as a whole
    for hop in route.hops:
        env.metrics.log_network(size, 'data_upload', hop)
    duration = env.now - t_start
    env.metrics.log_flow(size, duration, route.source, route.destination,
                         'data_upload')
    def invoke(self, env: Environment, replica: FunctionReplica,
               request: FunctionRequest):
        """Simulate one invocation: queue for access, execute with degradation and log the result.

        The call first waits for a slot in ``self.queue``. It then samples the function execution time (FET)
        for the replica's node, scales it by a concurrency factor, simulates data download/upload for
        preprocessing and training images, adds a delay derived from the node's estimated degradation and
        logs everything via ``env.metrics.log_fet``.

        The queue slot is released in a ``finally`` block, so it is also returned when the FET is missing
        (``ValueError``), when the process is interrupted or when any other error occurs.

        :param env: the simulation environment
        :param replica: the replica that serves the request
        :param request: the request being served
        :return: an event generator
        :raises ValueError: if no FET can be sampled for the replica's node
        """
        token = self.queue.request()
        t_wait_start = env.now
        yield token  # wait for access
        t_wait_end = env.now
        t_fet_start = env.now

        try:
            # because of GIL and Threads, we can easily estimate the additional time caused by concurrent
            # requests to the same Function
            factor = max(1, self.scale(self.queue.count, self.queue.capacity))
            try:
                fet = self.deployment.sample_fet(replica.node.name)
                if fet is None:
                    logging.error(
                        f"FET for node {replica.node.name} for function {self.deployment.image} was not found"
                    )
                    raise ValueError(f'{replica.node.name}')
                fet = float(fet) * factor

                image = replica.pod.spec.containers[0].image
                moves_data = 'preprocessing' in image or 'training' in image
                if moves_data:
                    yield from simulate_data_download(env, replica)

                start = env.now
                call = FunctionCall(request, replica, start)
                replica.node.all_requests.append(call)
                yield env.timeout(fet)

                # add degradation: only the part of the degraded FET that exceeds the plain FET is waited for
                end = env.now
                degradation = replica.node.estimate_degradation(start, end)
                delay = max(0, (fet * degradation) - fet)
                yield env.timeout(delay)

                if moves_data:
                    yield from simulate_data_upload(env, replica)

                t_fet_end = env.now
                env.metrics.log_fet(request.name, replica.function.image,
                                    replica.node.name, t_fet_start, t_fet_end,
                                    t_wait_start, t_wait_end, degradation,
                                    id(replica))
                replica.node.set_end(request.request_id, end + delay)
            except KeyError:
                # NOTE(review): a KeyError anywhere in the execution path silently aborts the invocation;
                # kept as in the original - confirm that this best-effort behavior is intended
                pass
        finally:
            # always hand the slot back, otherwise a failed invocation would block the queue forever
            self.queue.release(token)
Exemple #9
0
    def invoke(self, env: Environment, replica: FunctionReplica,
               request: FunctionRequest):
        """Log the invocation together with the node's current request count and wait one time unit.

        Sketch of the intended model, which is not implemented here:
          1) get parameters of base distribution (ideal case)
          2) check the utilization of the node the replica is running on
          3) transform distribution parameters with degradation function depending on utilization
          4) sample from that distribution
        """
        request_name = request.name
        node = replica.node
        logger.info('invoking %s on %s (%d in parallel)', request_name,
                    node.name, len(node.current_requests))

        placeholder_duration = 1
        yield env.timeout(placeholder_duration)
Exemple #10
0
def function_trigger(env: Environment,
                     deployment: FunctionDeployment,
                     ia_generator,
                     max_requests=None):
    """Fire invocations of ``deployment`` with interarrival times drawn from ``ia_generator``.

    For every request the next interarrival time is taken from the generator, waited for, and then an
    invocation process for ``deployment.name`` is started on ``env.faas``.

    :param env: the simulation environment
    :param deployment: the deployment whose name is used for the requests
    :param ia_generator: iterator producing interarrival times
    :param max_requests: number of requests to fire, or None to fire until interrupted or until the
        generator is exhausted
    :return: an event generator
    """
    unlimited = max_requests is None
    triggered = 0
    try:
        while unlimited or triggered < max_requests:
            interarrival = next(ia_generator)
            yield env.timeout(interarrival)
            env.process(env.faas.invoke(FunctionRequest(deployment.name)))
            triggered += 1
    except simpy.Interrupt:
        # an interrupt simply ends the trigger
        pass
    except StopIteration:
        logging.error(f'{deployment.name} gen has finished')
Exemple #11
0
    def invoke(self, env: Environment, replica: FunctionReplica,
               request: FunctionRequest):
        """Simulate one invocation by waiting a function- and node-dependent amount of time.

        :return: a generator yielding the timeout event of the invocation
        """
        # you would probably either create one simulator per function, or use a generalized simulator, this is just
        # to demonstrate how the simulators are used to encapsulate simulator behavior.

        logger.info('[simtime=%.2f] invoking function %s on node %s', env.now,
                    request, replica.node.name)

        function_name = replica.function.name
        if function_name == 'python-pi':
            # nodes starting with 'rpi3' are the ones we created in basic.example_topology()
            on_raspberry_pi = replica.node.name.startswith('rpi3')
            # invoking this function takes 20 seconds on a raspberry pi and 2 seconds on all other nodes
            duration = 20 if on_raspberry_pi else 2
        elif function_name == 'resnet50-inference':
            duration = 0.5  # invoking this function takes 500 ms
        else:
            duration = 0

        yield env.timeout(duration)
    def invoke(self, env: Environment, replica: FunctionReplica,
               request: FunctionRequest):
        """Simulate one invocation: queue for access, then wait for the scaled function execution time.

        The call first waits for a slot in ``self.queue``, samples the function execution time (FET) for the
        replica's node and scales it by a concurrency factor. The queue slot is released in a ``finally``
        block, so it is also returned when the FET is missing (``ValueError``), when the process is
        interrupted or when any other error occurs.

        :param env: the simulation environment
        :param replica: the replica that serves the request
        :param request: the request being served
        :return: an event generator
        :raises ValueError: if no FET can be sampled for the replica's node
        """
        token = self.queue.request()
        yield token  # wait for access

        try:
            # because of GIL and Threads, we can easily estimate the additional time caused by concurrent
            # requests to the same Function
            factor = max(1, self.scale(self.queue.count, self.queue.capacity))
            try:
                fet = self.deployment.sample_fet(replica.node.name)
                if fet is None:
                    logging.error(
                        f"FET for node {replica.node.name} for function {self.deployment.image} was not found"
                    )
                    raise ValueError(f'{replica.node.name}')
                fet = float(fet) * factor
                yield env.timeout(fet)
            except KeyError:
                # NOTE(review): a KeyError while sampling silently skips the execution; kept as in the
                # original - confirm that this best-effort behavior is intended
                pass
        finally:
            # always hand the slot back, otherwise a failed invocation would block the queue forever
            self.queue.release(token)
    def solve(self,
              env: Environment) -> Generator[simpy.events.Event, Any, Any]:
        """Calculate the pod requirements and charge the elapsed wall-clock time to the simulation.

        Builds clusters, devices and state on first use (or whenever ``self.clusters`` is empty), runs
        ``execute_ga_parallel`` or ``execute_ga_single_threaded`` depending on ``self.settings.parallel`` and
        applies the requirements of every result to its cluster via ``set_reqs_for_cluster``. Finally a
        timeout as long as the real time spent in this method is yielded.

        :param env: the simulation environment
        :return: a generator yielding a single timeout event
        """
        logging.info('Calculating Pod Labels')
        # a monotonic clock is used for the measurement: time.time() may jump backwards when the system clock
        # is adjusted, which would result in a negative (invalid) timeout below
        start = time.perf_counter()

        if not self.clusters:
            # TODO caching because this may bottleneck - needs to figure out if clusters/devices have changed
            self.clusters: Dict[str, Cluster] = create_clusters(env)
            self.devices = get_devices(env)
            self.state = State(self.devices, self.clusters)

        results = []
        if self.settings.parallel:
            self.execute_ga_parallel(results)
        else:
            self.execute_ga_single_threaded(results)

        for result in results:
            set_reqs_for_cluster(result.instance.cluster, result.requirements,
                                 env)
        elapsed = time.perf_counter() - start
        logging.info("Done calculating pods")
        yield env.timeout(elapsed)
Exemple #14
0
 def run(self, env: Environment):
     """No-op run routine: yields a single zero-delay timeout."""
     noop = env.timeout(0)
     yield noop
Exemple #15
0
 def teardown(self, env: Environment, replica: FunctionReplica):
     """No-op teardown routine: yields a single zero-delay timeout."""
     noop = env.timeout(0)
     yield noop
Exemple #16
0
 def setup(self, env: Environment, replica: FunctionReplica):
     """No setup routine is simulated: yields a single zero-delay timeout."""
     noop = env.timeout(0)
     yield noop
Exemple #17
0
 def claim_resources(self, env: Environment, replica: FunctionReplica,
                     request: FunctionRequest):
     """Claim 0.7 of 'cpu' and 0.3 of 'memory' for the replica, then yield a zero-delay timeout."""
     claims = (('cpu', 0.7), ('memory', 0.3))
     for resource, amount in claims:
         env.resource_state.put_resource(replica, resource, amount)
     yield env.timeout(0)
Exemple #18
0
 def invoke(self, env: Environment, replica: FunctionReplica,
            request: FunctionRequest):
     """No-op invocation: yields a single zero-delay timeout."""
     noop = env.timeout(0)
     yield noop
Exemple #19
0
 def claim_resources(self, env: Environment, replica: FunctionReplica,
                     request: FunctionRequest):
     """Claim 0.2 of 'cpu' for the replica, then yield a zero-delay timeout.

     There is no setup time and no memory claim because everything is cached - only cpu usage.
     """
     cpu_usage = 0.2
     env.resource_state.put_resource(replica, 'cpu', cpu_usage)
     yield env.timeout(0)
Exemple #20
0
 def startup(self, env: Environment, replica: FunctionReplica):
     """No-op startup routine: yields a single zero-delay timeout."""
     noop = env.timeout(0)
     yield noop
Exemple #21
0
 def deploy(self, env: Environment, replica: FunctionReplica):
     """No-op deploy routine: yields a single zero-delay timeout."""
     noop = env.timeout(0)
     yield noop
 def solve(self,
           env: Environment) -> Generator[simpy.events.Event, Any, Any]:
     """No-op solver: yields a single zero-delay timeout."""
     noop = env.timeout(0)
     yield noop
Exemple #23
0
 def release_resources(self, env: Environment, replica: FunctionReplica,
                       request: FunctionRequest):
     """Give back 0.2 of 'cpu' for the replica, then yield a zero-delay timeout."""
     cpu_usage = 0.2
     env.resource_state.remove_resource(replica, 'cpu', cpu_usage)
     yield env.timeout(0)
Exemple #24
0
 def execute(self, env: Environment, replica: FunctionReplica,
             request: FunctionRequest):
     """Simulate the execution with a fixed delay of 0.2 simulation time units."""
     execution_time = 0.2
     yield env.timeout(execution_time)