Example #1
0
    def invoke(self, env: Environment, replica: FunctionReplica, request: FunctionRequest):
        """
        Simulate one invocation of ``request`` on ``replica``.

        Claims 10% of the node's CPU capacity (in millis) for the duration of the call and registers the request
        in ``node.current_requests``. Both are undone in a ``finally`` block, so they are also released when the
        process is interrupted or fails while waiting for the invocation to finish.

        :param env: the simulation environment
        :param replica: the replica that handles the request
        :param request: the request being served
        :return: an event generator
        """
        # you would probably either create one simulator per function, or use a generalized simulator, this is just
        # to demonstrate how the simulators are used to encapsulate simulator behavior.

        logger.info('[simtime=%.2f] invoking function %s on node %s', env.now, request, replica.node.name)

        node = replica.node

        # for full flexibility you decide the resources used
        cpu_millis = node.capacity.cpu_millis * 0.1
        env.resource_state.put_resource(replica, 'cpu', cpu_millis)
        node.current_requests.add(request)

        try:
            if replica.function.name == 'python-pi':
                if node.name.startswith('rpi3'):  # those are nodes we created in basic.example_topology()
                    duration = 20  # invoking this function takes 20 seconds on a raspberry pi
                else:
                    duration = 2  # invoking this function takes 2 seconds on all other nodes in the cluster
            elif replica.function.name == 'resnet50-inference':
                duration = 0.5  # invoking this function takes 500 ms
            else:
                duration = 0

            yield env.timeout(duration)
        finally:
            # also, you have to release them at the end - even if the invocation did not complete normally
            env.resource_state.remove_resource(replica, 'cpu', cpu_millis)
            node.current_requests.remove(request)
Example #2
0
def faas_idler(env: Environment,
               inactivity_duration=300,
               reconcile_interval=30):
    """
    Periodically suspend running functions that have not been invoked for a while.

    Modelled after the OpenFaaS faas-idler:
    https://github.com/openfaas-incubator/faas-idler
    https://github.com/openfaas-incubator/faas-idler/blob/master/main.go

    default values:
    https://github.com/openfaas-incubator/faas-idler/blob/668991c532156275993399ee79a297a4c2d651ec/docker-compose.yml

    Only deployments with ``scale_zero`` set are considered, and only functions that currently have at least one
    replica in the RUNNING state.

    :param env: the faas environment
    :param inactivity_duration: idle time (in simulation time units) after which a function is suspended
    :param reconcile_interval: simulation time to wait between two reconciliation passes
    :return: an event generator
    """
    system: FaasSystem = env.faas
    while True:
        yield env.timeout(reconcile_interval)

        for deployment in system.get_deployments():
            if deployment.scale_zero:
                for fn in deployment.function_definitions.values():
                    fn_name = fn.name
                    running = system.get_replicas(fn_name, FunctionState.RUNNING)
                    if len(running) == 0:
                        # nothing to suspend
                        continue

                    idle_time = env.now - env.metrics.last_invocation[fn_name]
                    if idle_time >= inactivity_duration:
                        env.process(system.suspend(fn_name))
                        logger.debug('%.2f function %s has been idle for %.2fs',
                                     env.now, fn_name, idle_time)
Example #3
0
    def run(self, env: Environment):
        """
        Deploy the prepared functions, wait for their replicas, then fire 10 parallel requests at each function.

        :param env: the simulation environment
        :return: an event generator that finishes once all invocation processes have completed
        """
        faas = env.faas

        # deploy functions
        for deployment in self.prepare_deployments():
            yield from faas.deploy(deployment)

        # block until replicas become available (scheduling has finished and replicas have been deployed on the node)
        logger.info('waiting for replica')
        for function_name in ('python-pi', 'resnet50-inference'):
            yield env.process(faas.poll_available_replica(function_name))

        # run workload: 10 requests per function, all started without waiting in between
        logger.info('executing 10 python-pi requests')
        pending = [
            env.process(faas.invoke(FunctionRequest('python-pi')))
            for _ in range(10)
        ]

        logger.info('executing 10 resnet50-inference requests')
        pending.extend(
            env.process(faas.invoke(FunctionRequest('resnet50-inference')))
            for _ in range(10)
        )

        # wait for invocation processes to finish
        for process in pending:
            yield process
Example #4
0
    def execute(self, env: Environment, replica: FunctionReplica,
                request: FunctionRequest):
        """
        Simulate a training invocation as three consecutive fixed delays.

        The phases are a mock download (1), the training itself (5) and a mock upload (1). For an actual network
        download simulation look at simulate_data_download.

        :param env: the simulation environment
        :param replica: the replica executing the request (unused)
        :param request: the request being executed (unused)
        :return: an event generator
        """
        # durations of: mock download, training, mock upload
        for phase_duration in (1, 5, 1):
            yield env.timeout(phase_duration)
 def solve(self,
           env: Environment) -> Generator[simpy.events.Event, Any, Any]:
     """
     Run the wrapped solver once per reconcile interval for as long as ``self.running`` is truthy.

     ``self.stop()`` is currently called after every pass (see the TODO below); presumably this clears
     ``self.running`` so that only a single pass is executed.

     :param env: the simulation environment
     :return: an event generator
     """
     while self.running:
         wait_for_next_pass = env.timeout(self.reconcile_interval)
         yield wait_for_next_pass
         yield from self.solver.solve(env)
         # TODO remove when contention is implemented
         self.stop()
Example #6
0
def simulate_data_upload(env: Environment, replica: FunctionReplica):
    """
    Simulate the upload of a replica's result data to a storage node.

    The amount of data and the storage path are read from the pod labels 'data.skippy.io/sends-to-storage' and
    'data.skippy.io/sends-to-storage/path'. Without the size label nothing is simulated. If the first storage
    node for the path is the replica's own node, a fixed-rate local transfer is simulated; otherwise a network
    flow along the route to the storage node is started and logged in the metrics.

    :param env: the simulation environment
    :param replica: the replica that uploads its data
    :return: an event generator
    """
    source_node = replica.node.ether_node
    started = env.now
    labels = replica.pod.spec.labels

    if 'data.skippy.io/sends-to-storage' not in labels:
        return

    # FIXME: storage
    size = parse_size_string(labels['data.skippy.io/sends-to-storage'])
    path = labels['data.skippy.io/sends-to-storage/path']

    target_name = env.cluster.get_storage_nodes(path)[0]
    logger.debug('%.2f replica %s uploading data %s to %s', env.now, source_node,
                 path, target_name)

    if target_name == source_node.name:
        # FIXME this is essentially a local disk access and not a network connection
        yield env.timeout(size / 1.25e+8)  # 1.25e+8 = 1 GBit/s
    else:
        target_node = env.cluster.get_node(target_name)
        route = env.topology.route_by_node_name(source_node.name, target_node.name)
        yield SafeFlow(env, size, route).start()

        # account the transferred data on every hop and for the flow as a whole
        for hop in route.hops:
            env.metrics.log_network(size, 'data_upload', hop)
        env.metrics.log_flow(size, env.now - started, route.source,
                             route.destination, 'data_upload')
Example #7
0
    def teardown(self, env: Environment, replica: FunctionReplica):
        """
        Release the baseline resources of ``replica`` when it is torn down.

        :param env: the simulation environment
        :param replica: the replica being torn down
        :return: an event generator (takes no simulation time)
        """
        baseline = (
            ('cpu', 0.08),  # basic cpu usage, in %
            ('memory', 200),  # basic memory consumption, in MB
        )
        for resource, amount in baseline:
            env.resource_state.remove_resource(replica, resource, amount)

        yield env.timeout(0)
Example #8
0
    def startup(self, env: Environment, replica: FunctionReplica):
        """
        Simulate the startup of a function replica as a fixed delay of 10.

        :param env: the simulation environment
        :param replica: the replica being started
        :return: an event generator
        """
        function_name = replica.function.name
        logger.info('[simtime=%.2f] starting up function replica for function %s', env.now, function_name)

        # you could create a very fine-grained setup routines here
        docker_startup = env.timeout(10)  # simulate docker startup
        yield docker_startup
    def invoke(self, env: Environment, replica: FunctionReplica,
               request: FunctionRequest):
        """
        Simulate one invocation of ``request`` on ``replica``, serialized through ``self.queue``.

        After a slot in the queue has been granted, the function execution time (FET) is sampled for the replica's
        node, scaled by the current queue load and extended by a degradation delay estimated by the node. For
        images whose name contains 'preprocessing' or 'training', a data download precedes and a data upload
        follows the execution. The timings are reported via ``env.metrics.log_fet``.

        The queue slot is released in a ``finally`` block, so it is also freed when the invocation fails (e.g. no
        FET is known for the node) or is interrupted while executing. A ``KeyError`` aborts the invocation as
        before, but is now logged instead of being dropped silently.

        :param env: the simulation environment
        :param replica: the replica that handles the request
        :param request: the request being served
        :return: an event generator
        :raises ValueError: if no FET could be sampled for the replica's node
        """
        token = self.queue.request()
        t_wait_start = env.now
        yield token  # wait for access

        # from here on we hold a slot of the queue and must give it back under all circumstances
        try:
            t_wait_end = env.now
            t_fet_start = env.now
            # because of GIL and Threads, we can easily estimate the additional time caused by concurrent requests
            # to the same Function
            factor = max(1, self.scale(self.queue.count, self.queue.capacity))

            fet = self.deployment.sample_fet(replica.node.name)
            if fet is None:
                logging.error(
                    f"FET for node {replica.node.name} for function {self.deployment.image} was not found"
                )
                raise ValueError(f'{replica.node.name}')
            fet = float(fet) * factor

            image = replica.pod.spec.containers[0].image
            moves_data = 'preprocessing' in image or 'training' in image
            if moves_data:
                yield from simulate_data_download(env, replica)

            start = env.now
            call = FunctionCall(request, replica, start)
            replica.node.all_requests.append(call)
            yield env.timeout(fet)

            # add degradation: only the time exceeding the undisturbed FET is waited for additionally
            end = env.now
            degradation = replica.node.estimate_degradation(start, end)
            delay = max(0, (fet * degradation) - fet)
            yield env.timeout(delay)

            if moves_data:
                yield from simulate_data_upload(env, replica)

            t_fet_end = env.now
            env.metrics.log_fet(request.name, replica.function.image,
                                replica.node.name, t_fet_start, t_fet_end,
                                t_wait_start, t_wait_end, degradation,
                                id(replica))
            replica.node.set_end(request.request_id, end + delay)
        except KeyError as e:
            # best effort: the invocation is dropped, but leave a trace of what was missing
            logging.warning('invocation of %s on node %s aborted, missing key: %s',
                            request.name, replica.node.name, e)
        finally:
            self.queue.release(token)
Example #10
0
    def invoke(self, env: Environment, replica: FunctionReplica,
               request: FunctionRequest):
        """
        Placeholder invocation that logs the call and takes a fixed time of 1.

        The intended model (not implemented here) is:
          1) get parameters of base distribution (ideal case)
          2) check the utilization of the node the replica is running on
          3) transform distribution parameters with degradation function depending on utilization
          4) sample from that distribution

        :param env: the simulation environment
        :param replica: the replica that handles the request
        :param request: the request being served
        :return: an event generator
        """
        node = replica.node
        in_parallel = len(node.current_requests)
        logger.info('invoking %s on %s (%d in parallel)', request.name, node.name, in_parallel)

        yield env.timeout(1)
Example #11
0
 def __init__(self,
              topology: Topology,
              benchmark: Benchmark,
              env: Environment = None,
              timeout=None,
              name=None):
     """
     Store the simulation configuration.

     :param topology: the topology to simulate on
     :param benchmark: the benchmark to run
     :param env: the environment to use; a fresh ``Environment`` is created if none (or a falsy one) is given
     :param timeout: stored as ``self.timeout``
     :param name: stored as ``self.name``
     """
     self.topology = topology
     self.benchmark = benchmark
     self.timeout = timeout
     self.name = name

     if env:
         self.env = env
     else:
         self.env = Environment()
Example #12
0
    def run(self, env: Environment):
        """
        Deploy all deployments, wait for their replicas and drive each one with its arrival profile.

        Without a configured duration every trigger is capped at 1000 requests; with a duration the triggers are
        unbounded and ``self.wait`` is started as a separate process with the list of trigger processes.

        :param env: the simulation environment
        :return: an event generator that finishes once all trigger processes have completed
        """
        faas = env.faas

        for deployment in self.deployments:
            yield from faas.deploy(deployment)
        for deployment in self.deployments:
            yield env.process(faas.poll_available_replica(deployment.name))

        triggers = []
        logging.info('executing requests')
        for deployment in self.deployments:
            try:
                profile = self.arrival_profiles[deployment.name]
                # only bound the number of requests if there is no duration that ends the run
                limit = {'max_requests': 1000} if self.duration is None else {}
                trigger = function_trigger(env, deployment, profile, **limit)
                triggers.append(env.process(trigger))
            except KeyError:
                logging.warning('no arrival profile for deployment %s', deployment.name)

        if self.duration is not None:
            env.process(self.wait(env, triggers))

        yield from triggers
Example #13
0
    def invoke(self, env: Environment, replica: FunctionReplica,
               request: FunctionRequest):
        """
        Simulate an invocation as a fixed delay that depends on the function and the node.

        you would probably either create one simulator per function, or use a generalized simulator, this is just
        to demonstrate how the simulators are used to encapsulate simulator behavior.

        :param env: the simulation environment
        :param replica: the replica that handles the request
        :param request: the request being served
        :return: an event generator
        """
        logger.info('[simtime=%.2f] invoking function %s on node %s', env.now,
                    request, replica.node.name)

        function_name = replica.function.name
        if function_name == 'python-pi':
            # 'rpi3' nodes are the ones we created in basic.example_topology():
            # 20 seconds on a raspberry pi, 2 seconds on all other nodes in the cluster
            on_raspberry_pi = replica.node.name.startswith('rpi3')
            duration = 20 if on_raspberry_pi else 2
        elif function_name == 'resnet50-inference':
            duration = 0.5  # invoking this function takes 500 ms
        else:
            duration = 0

        yield env.timeout(duration)
Example #14
0
def pull(env: Environment, image_str: str, node: Node):
    """
    Simulate a docker pull command of the given image on the given node.

    The first registry entry matching the image name and the node's architecture is used. If the node state
    already lists that image, nothing is simulated; otherwise the image is recorded on the node state and, for
    images with a positive size, a network flow from the registry to the node is simulated and logged.

    :param env: the simulation environment
    :param image_str: the name of the image (<repository[:tag]>)
    :param node: the node on which to run the pull command
    :return: a simpy process (a generator)
    :raises ValueError: if the registry has no image of that name for the node's architecture
    """
    started = env.now
    # TODO: there's a lot of potential to improve fidelity here: consider image layers, simulate extraction time, etc.
    #  e.g., docker pull on a 13MB container takes about 5 seconds. the simulated time at 120 MBit/sec would be <1s

    # find the image in the registry with the node's architecture
    candidates = env.container_registry.find(image_str, arch=node.arch)
    if not candidates:
        raise ValueError('image not in registry: %s arch=%s' %
                         (image_str, node.arch))
    image = candidates[0]

    node_state = env.get_node_state(node.name)
    if node_state:
        if image in node_state.docker_images:
            # image is already present on the node
            return
        node_state.docker_images.add(image)

    size = image.size
    if size <= 0:
        return

    # NOTE: layer sharing across images is not simulated, the full image size is transferred
    route = env.topology.route(DockerRegistry, node)
    yield SafeFlow(env, size, route).start()

    # only the flow as a whole is logged, not the traffic on the individual hops
    env.metrics.log_flow(size, env.now - started, route.source,
                         route.destination, 'docker_pull')
Example #15
0
    def run(self, env: Environment):
        """
        Deploy the prepared functions, wait for a 'python-pi' replica and send it 100 requests.

        Interarrival times come from an exponential arrival profile built on a constant profile of 20 rps.

        :param env: the simulation environment
        :return: an event generator
        """
        faas = env.faas

        # deploy functions
        deployments = self.prepare_deployments()
        for deployment in deployments:
            yield from faas.deploy(deployment)

        # block until replicas become available (scheduling has finished and replicas have been deployed on the node)
        logger.info('waiting for replica')
        yield env.process(faas.poll_available_replica('python-pi'))

        # generate profile
        rps_profile = constant_rps_profile(rps=20)
        ia_generator = expovariate_arrival_profile(rps_profile)

        # run profile against the first deployment
        trigger = function_trigger(env, deployments[0], ia_generator, max_requests=100)
        yield from trigger
Example #16
0
    def invoke(self, env: Environment, replica: FunctionReplica,
               request: FunctionRequest):
        """
        Simulate one invocation of ``request`` on ``replica``, serialized through ``self.queue``.

        After a slot in the queue has been granted, the function execution time (FET) is sampled for the replica's
        node, scaled by the current queue load, and waited for.

        The queue slot is released in a ``finally`` block, so it is also freed when the invocation fails (e.g. no
        FET is known for the node) or is interrupted while executing. A ``KeyError`` aborts the invocation as
        before, but is now logged instead of being dropped silently.

        :param env: the simulation environment
        :param replica: the replica that handles the request
        :param request: the request being served
        :return: an event generator
        :raises ValueError: if no FET could be sampled for the replica's node
        """
        token = self.queue.request()
        yield token  # wait for access

        # from here on we hold a slot of the queue and must give it back under all circumstances
        try:
            # because of GIL and Threads, we can easily estimate the additional time caused by concurrent requests
            # to the same Function
            factor = max(1, self.scale(self.queue.count, self.queue.capacity))

            fet = self.deployment.sample_fet(replica.node.name)
            if fet is None:
                logging.error(
                    f"FET for node {replica.node.name} for function {self.deployment.image} was not found"
                )
                raise ValueError(f'{replica.node.name}')
            fet = float(fet) * factor
            yield env.timeout(fet)
        except KeyError as e:
            # best effort: the invocation is dropped, but leave a trace of what was missing
            logging.warning('invocation on node %s aborted, missing key: %s',
                            replica.node.name, e)
        finally:
            self.queue.release(token)
Example #17
0
    def solve(self,
              env: Environment) -> Generator[simpy.events.Event, Any, Any]:
        """
        Compute pod requirements with the genetic algorithm and apply them per cluster.

        Clusters, devices and the derived state are created on first use (or whenever no clusters are known).
        The wall-clock time the computation took is then consumed as simulation time.

        :param env: the simulation environment
        :return: an event generator
        """
        logging.info('Calculating Pod Labels')
        wall_clock_start = time.time()

        if not self.clusters:
            # TODO caching because this may bottleneck - needs to figure out if clusters/devices have changed
            self.clusters = create_clusters(env)
            self.devices = get_devices(env)
            self.state = State(self.devices, self.clusters)

        # the executors fill the passed list in place
        results = []
        if self.settings.parallel:
            run_ga = self.execute_ga_parallel
        else:
            run_ga = self.execute_ga_single_threaded
        run_ga(results)

        for result in results:
            cluster = result.instance.cluster
            set_reqs_for_cluster(cluster, result.requirements, env)

        elapsed = time.time() - wall_clock_start
        logging.info("Done calculating pods")
        yield env.timeout(elapsed)
Example #18
0
def function_trigger(env: Environment,
                     deployment: FunctionDeployment,
                     ia_generator,
                     max_requests=None):
    """
    Fire requests at ``deployment``, spaced by the interarrival times drawn from ``ia_generator``.

    Each request is started as its own process, the trigger does not wait for it to finish. The trigger ends
    silently when it is interrupted, and logs an error when ``ia_generator`` runs out of values.

    :param env: the simulation environment
    :param deployment: the deployment whose name is used for the requests
    :param ia_generator: an iterator yielding interarrival times
    :param max_requests: number of requests to send, or None to send requests until stopped
    :return: an event generator
    """
    # None means unbounded, otherwise count down the requests that are still to be sent
    remaining = max_requests
    try:
        while remaining is None or remaining > 0:
            interarrival = next(ia_generator)
            yield env.timeout(interarrival)
            env.process(env.faas.invoke(FunctionRequest(deployment.name)))
            if remaining is not None:
                remaining -= 1

    except simpy.Interrupt:
        pass
    except StopIteration:
        logging.error(f'{deployment.name} gen has finished')
Example #19
0
 def deploy(self, env: Environment, replica: FunctionReplica):
     """
     No-op deployment: yields a single zero-length timeout.

     :param env: the simulation environment
     :param replica: the replica being deployed (unused)
     :return: an event generator
     """
     no_delay = env.timeout(0)
     yield no_delay
Example #20
0
# Keyword arguments passed to the Scheduler below; `priorities` and `predicates` are defined earlier in the script.
# NOTE(review): 'percentage_of_nodes_to_score': 100 presumably makes the scheduler consider all nodes - confirm
sched_params = {
    'percentage_of_nodes_to_score': 100,
    'priorities': priorities,
    'predicates': predicates
}

# Set arrival profiles/workload pattern
benchmark = ConstantBenchmark('mixed', duration=200, rps=50)

# Initialize topology
storage_index = StorageIndex()
topology = urban_sensing_topology(ether_nodes, storage_index)

# Initialize environment
env = Environment()

# Attach the simulation components to the environment. Several of them receive `env` (or `env.cluster`) in their
# constructor, so keep the order of these assignments unless you have checked what each constructor reads.
env.simulator_factory = AIPythonHTTPSimulatorFactory(
    get_raith21_function_characterizations(resource_oracle, fet_oracle))
env.metrics = Metrics(env, log=RuntimeLogger(SimulatedClock(env)))
env.topology = topology
env.faas = DefaultFaasSystem(env, scale_by_requests=True)
env.container_registry = ContainerRegistry()
env.storage_index = storage_index
env.cluster = SimulationClusterContext(env)
env.scheduler = Scheduler(env.cluster, **sched_params)

# Run the benchmark on the prepared environment
sim = Simulation(env.topology, benchmark, env=env)
result = sim.run()

dfs = {
Example #21
0
 def setup(self, env: Environment, replica: FunctionReplica):
     """
     No setup routine: yields a single zero-length timeout.

     :param env: the simulation environment
     :param replica: the replica being set up (unused)
     :return: an event generator
     """
     no_delay = env.timeout(0)
     yield no_delay
Example #22
0
 def release_resources(self, env: Environment, replica: FunctionReplica,
                       request: FunctionRequest):
     """
     Give back the 0.2 'cpu' that is accounted to ``replica`` for a request.

     :param env: the simulation environment
     :param replica: the replica whose resource usage is reduced
     :param request: the request that has finished (unused)
     :return: an event generator (takes no simulation time)
     """
     resource_state = env.resource_state
     resource_state.remove_resource(replica, 'cpu', 0.2)

     no_delay = env.timeout(0)
     yield no_delay
Example #23
0
 def claim_resources(self, env: Environment, replica: FunctionReplica,
                     request: FunctionRequest):
     """
     Account 0.2 'cpu' to ``replica`` for a request.

     no setup time, no memory because everything is cached - only cpu usage

     :param env: the simulation environment
     :param replica: the replica whose resource usage is increased
     :param request: the request about to be executed (unused)
     :return: an event generator (takes no simulation time)
     """
     resource_state = env.resource_state
     resource_state.put_resource(replica, 'cpu', 0.2)

     no_delay = env.timeout(0)
     yield no_delay
Example #24
0
 def execute(self, env: Environment, replica: FunctionReplica,
             request: FunctionRequest):
     """
     Simulate the execution of a request as a fixed delay of 0.2.

     :param env: the simulation environment
     :param replica: the replica executing the request (unused)
     :param request: the request being executed (unused)
     :return: an event generator
     """
     execution = env.timeout(0.2)
     yield execution
Example #25
0
 def solve(self,
           env: Environment) -> Generator[simpy.events.Event, Any, Any]:
     """
     No-op solver: yields a single zero-length timeout.

     :param env: the simulation environment
     :return: an event generator
     """
     no_delay = env.timeout(0)
     yield no_delay
Example #26
0
 def invoke(self, env: Environment, replica: FunctionReplica,
            request: FunctionRequest):
     """
     No-op invocation: yields a single zero-length timeout.

     :param env: the simulation environment
     :param replica: the replica that handles the request (unused)
     :param request: the request being served (unused)
     :return: an event generator
     """
     no_delay = env.timeout(0)
     yield no_delay
Example #27
0
 def startup(self, env: Environment, replica: FunctionReplica):
     """
     No-op startup: yields a single zero-length timeout.

     :param env: the simulation environment
     :param replica: the replica being started (unused)
     :return: an event generator
     """
     no_delay = env.timeout(0)
     yield no_delay
Example #28
0
 def teardown(self, env: Environment, replica: FunctionReplica):
     """
     No-op teardown: yields a single zero-length timeout.

     :param env: the simulation environment
     :param replica: the replica being torn down (unused)
     :return: an event generator
     """
     no_delay = env.timeout(0)
     yield no_delay
Example #29
0
 def run(self, env: Environment):
     """
     Empty benchmark: yields a single zero-length timeout.

     :param env: the simulation environment
     :return: an event generator
     """
     no_delay = env.timeout(0)
     yield no_delay
Example #30
0
 def claim_resources(self, env: Environment, replica: FunctionReplica,
                     request: FunctionRequest):
     """
     Account 0.7 'cpu' and 0.3 'memory' to ``replica`` for a request.

     :param env: the simulation environment
     :param replica: the replica whose resource usage is increased
     :param request: the request about to be executed (unused)
     :return: an event generator (takes no simulation time)
     """
     claims = (('cpu', 0.7), ('memory', 0.3))
     for resource, amount in claims:
         env.resource_state.put_resource(replica, resource, amount)

     yield env.timeout(0)