def __init__(self, topology: Topology, cluster_context: ClusterContext = None, initial_time=0,
             scheduler_params=None):
    super().__init__(initial_time)
    self.request_generator = object  # placeholder; a real request generator is assigned later
    self.request_queue = simpy.Store(self)
    self.scheduler_queue = simpy.Store(self)

    self.topology: Topology = topology
    topology.create_index()

    # allows us to inject a pre-calculated bandwidth graph that was cached
    if cluster_context is None:
        self.cluster: ClusterContext = topology.create_cluster_context()
    else:
        self.cluster: ClusterContext = cluster_context

    if scheduler_params:
        self.scheduler = Scheduler(self.cluster, **scheduler_params)
    else:
        self.scheduler = Scheduler(self.cluster)

    self.faas_gateway = FaasGateway(self)
    self.execution_simulator = ExecutionSimulator(self)
    self.clock = SimulatedClock(self, start=datetime(1970, 1, 1, 0, 0, 0))
    self.metrics = Metrics(self, RuntimeLogger(self.clock))
    self.execution_time_oracle = oracles.FittedExecutionTimeOracle()
    self.startup_time_oracle = oracles.HackedFittedStartupTimeOracle()
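# Hedged usage sketch, not from the source: this __init__ appears to belong to a
# simpy.Environment subclass (called `Simulation` here, an assumed name). Given a
# Topology instance `topology` built elsewhere, construction and a timed run would be:
sim = Simulation(topology, scheduler_params=None)  # None falls back to Scheduler(self.cluster)
sim.run(until=3600)                                # simpy.Environment.run, one simulated hour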
def run_scheduler_worker(env: simpy.Environment, queue: simpy.Store, context: ClusterContext,
                         scheduler: Scheduler, oracles: List[Oracle], log: List[LoggingRow]):
    while True:
        logging.debug('Scheduler waiting for pod...')
        pod = yield queue.get()

        # TODO fix time not changing (env.now)
        logging.debug('Pod received by scheduler at %.2f', env.now)
        log.append(LoggingRow(env.now, EventType.POD_RECEIVED, pod.name))

        # execute scheduling algorithm
        then = time.time()
        result = scheduler.schedule(pod)
        duration = time.time() - then
        yield env.timeout(duration)
        logging.debug('Pod scheduling took %.2f ms, and yielded %s', duration * 1000, result)

        # weight the startup
        metadata = dict([o.estimate(context, pod, result) for o in oracles])

        # also add the image name and the selected node to the metadata
        metadata['image'] = pod.spec.containers[0].image
        metadata['suggested_host'] = None if result.suggested_host is None else result.suggested_host.name

        log.append(LoggingRow(env.now, EventType.POD_SCHEDULED, pod.name, metadata))
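# Hedged wiring sketch: run_scheduler_worker is a simpy process generator, so it is
# started via env.process(). The names `cluster_context`, `scheduler` and `my_oracles`
# below are placeholders for objects built elsewhere, not part of the source:
env = simpy.Environment()
scheduler_queue = simpy.Store(env)
log: List[LoggingRow] = []
env.process(run_scheduler_worker(env, scheduler_queue, cluster_context, scheduler, my_oracles, log))
env.run(until=100)  # pods put into scheduler_queue while running are scheduled and logged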
def configure_env(settings: EnvSettings, topology: Topology):
    env = Environment()
    env.simulator_factory = settings.simulator_factory

    if settings.null_logger:
        env.metrics = OurMetrics(env, log=NullLogger())
    else:
        env.metrics = OurMetrics(env, log=RuntimeLogger(SimulatedClock(env)))

    env.topology = topology
    env.faas = OurFaas(env, settings.scale_by_requests, settings.scale_by_requests_per_replica,
                       settings.scale_by_queue_requests_per_replica)
    env.container_registry = ContainerRegistry()
    env.storage_index = settings.storage_index
    env.cluster = SimulationClusterContext(env)
    env.scheduler = Scheduler(env.cluster, **settings.sched_params)
    env.metrics_server = MetricsServer()

    # TODO inject resource oracle
    resource_monitor = ResourceMonitor(env, ResourceOracle(resources_per_node_image))
    env.background_processes.append(lambda env: resource_monitor.run())

    if settings.scale_by_resources:
        hpa_settings = settings.hpaSettings
        hpa = HorizontalPodAutoscaler(env, average_window=hpa_settings.average_window,
                                      reconcile_interval=hpa_settings.reconcile_interval,
                                      target_tolerance=hpa_settings.target_tolerance)
        env.background_processes.append(lambda env: hpa.run())

    if settings.label_problem_solver_settings is not None:
        solver = LabelSolverProcess(settings.label_problem_solver_settings)
        env.background_processes.append(lambda env: solver.solve(env))

    return env
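# Hedged driver sketch for configure_env: background processes are registered as
# callables taking the environment, so a runner has to start them explicitly.
# `settings` and `topology` are assumed to come from the surrounding experiment code:
env = configure_env(settings, topology)
for process in env.background_processes:
    env.process(process(env))  # ResourceMonitor, HPA, label solver, ...
env.run(until=1000)            # assumption: Environment is a simpy.Environment subclass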
def run_experiment(num_nodes, sched_params):
    synth = CloudRegionsSynthesizer(regions=10, vms_per_region=int(num_nodes / 10))
    t = synth.create_topology()
    # inject a synthetic bandwidth graph instead of pre-calculating real bandwidths
    t._bandwidth_graph = RandomBwGraph()
    ctx = t.create_cluster_context()
    scheduler = Scheduler(ctx, **sched_params)
    run_loop(ctx, pod_gen(ctx), scheduler)
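# Hedged sweep sketch: run_experiment keeps regions fixed at 10, so num_nodes is spread
# evenly across regions. The node counts and empty sched_params below are illustrative:
if __name__ == '__main__':
    for num_nodes in (100, 1000, 10000):
        run_experiment(num_nodes, sched_params={})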
def create_scheduler(self, env):
    return Scheduler(env.cluster)
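# Hedged variant: since create_scheduler is a factory hook, a subclass can swap in
# weighted priorities. The base class name `DefaultSchedulerFactory` is an assumption;
# BalancedResourcePriority and ImageLocalityPriority appear in main() below:
class WeightedSchedulerFactory(DefaultSchedulerFactory):
    def create_scheduler(self, env):
        priorities = [(1.0, BalancedResourcePriority()),
                      (1.0, ImageLocalityPriority())]  # illustrative weights
        return Scheduler(env.cluster, priorities=priorities)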
def main():
    # Parse the arguments
    parser = argparse.ArgumentParser(description='Skippy - Navigating functions to the edge of the world (i.e. K8s)')
    parser.add_argument('-s', '--scheduler-name', action='store', dest='scheduler_name',
                        help='Change the name of the scheduler. New pods which should be placed by this scheduler '
                             'need to define this name. Set \'None\' to disable the name check (if this scheduler '
                             'is completely replacing the kube-scheduler).',
                        default='skippy-scheduler')
    parser.add_argument('-n', '--namespace', action='store', dest='namespace',
                        help='Only watch pods of a specific namespace.')
    parser.add_argument('-w', '--weights', action='store', dest='weights',
                        help='An array of floats defining the weights of the different priority functions',
                        default=None)
    parser.add_argument('-u', '--use-default', action='store_true', dest='default',
                        help='Use the predicate and priority functions of the default scheduler.',
                        default=False)
    parser.add_argument('-c', '--kube-config', action='store_true', dest='kube_config',
                        help='Load kube-config from home dir instead of in-cluster-config from envs.',
                        default=False)
    parser.add_argument('-d', '--debug', action='store_true', dest='debug',
                        help='Enable debug logs.',
                        default=False)
    args = parser.parse_args()

    level = logging.DEBUG if args.debug else logging.INFO
    scheduler_name = None if args.scheduler_name == 'None' else args.scheduler_name
    namespace = None if args.namespace == 'None' else args.namespace
    weights = None if args.weights is None else [float(x) for x in ast.literal_eval(args.weights)]

    # Set the log level
    logging.getLogger().setLevel(level)

    # Load the kubernetes API config
    if args.kube_config:
        # Load the configuration from ~/.kube
        logging.debug('Loading kube config...')
        config.load_kube_config()
    else:
        # Load the configuration when running inside the cluster (by reading envs set by k8s)
        logging.debug('Loading in-cluster config...')
        config.load_incluster_config()

    if weights:
        logging.info('Using custom weights: %s', weights)

    # Initialize the API, context and scheduler
    cluster_context = KubeClusterContext()
    api = client.CoreV1Api()
    if args.default:
        logging.debug('Using default scheduler priority functions')
        priorities = [(1.0, BalancedResourcePriority()),
                      (1.0, ImageLocalityPriority())] if weights is None \
            else [(weights[0], BalancedResourcePriority()),
                  (weights[1], ImageLocalityPriority())]
    else:
        priorities = None if weights is None \
            else [(weights[0], BalancedResourcePriority()),
                  (weights[1], LatencyAwareImageLocalityPriority()),
                  (weights[2], LocalityTypePriority()),
                  (weights[3], DataLocalityPriority()),
                  (weights[4], CapabilityPriority())]
    scheduler = Scheduler(cluster_context, priorities=priorities)

    # Either watch all namespaces or only the one set as argument
    w = watch.Watch()
    if namespace is not None:
        logging.debug('Watching for new pod events in namespace %s...', args.namespace)
        stream = w.stream(api.list_namespaced_pod, args.namespace)
    else:
        logging.debug('Watching for new pod events across all namespaces...')
        stream = w.stream(api.list_pod_for_all_namespaces)

    if scheduler_name:
        logging.debug('Watching for new pods with defined scheduler-name \'%s\'...', scheduler_name)
    else:
        logging.debug('Watching for pods without caring about defined scheduler-name...')

    # Start the liveness probe (used by kubernetes to restart the service if it's not responding anymore)
    logging.debug('Starting liveness / readiness probe...')
    LivenessProbe.start()

    # Main event loop watching for new pods
    logging.info('Everything is in place for new pods to be scheduled. Waiting for new events...')
    try:
        for event in stream:
            # noinspection PyBroadException
            try:
                if event['object'].status.phase == 'Pending' and \
                        (scheduler_name is None or event['object'].spec.scheduler_name == scheduler_name) and \
                        event['type'] == 'ADDED':
                    pod = create_pod(event['object'])
                    logging.debug('There\'s a new pod to schedule: ' + pod.name)
                    result = scheduler.schedule(pod)
                    logging.debug('Pod yielded %s', result)
            except ApiException as e:
                # Parse the JSON message body of the exception
                logging.exception('ApiExceptionMessage: %s', json.loads(e.body)['message'])
            except Exception:
                # We really don't want the scheduler to die, therefore we catch Exception here
                logging.exception('Exception in outer event loop caught. '
                                  'It will be ignored to make sure the scheduler continues to run.')
    except KeyboardInterrupt:
        logging.info('Shutting down after receiving a keyboard interrupt.')
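# Example invocations, derived from the argparse flags defined above (the script name
# `main.py` is an assumption; weight values are illustrative):
#
#   python main.py --kube-config --debug
#   python main.py --weights '[1.0, 2.0, 1.0, 1.0, 1.0]'   # five weights: one per non-default priority
#   python main.py --scheduler-name None                   # disable the scheduler-name check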
# Initialize topology
storage_index = StorageIndex()
topology = urban_sensing_topology(ether_nodes, storage_index)

# Initialize environment
env = Environment()
env.simulator_factory = AIPythonHTTPSimulatorFactory(
    get_raith21_function_characterizations(resource_oracle, fet_oracle))
env.metrics = Metrics(env, log=RuntimeLogger(SimulatedClock(env)))
env.topology = topology
env.faas = DefaultFaasSystem(env, scale_by_requests=True)
env.container_registry = ContainerRegistry()
env.storage_index = storage_index
env.cluster = SimulationClusterContext(env)
env.scheduler = Scheduler(env.cluster, **sched_params)

sim = Simulation(env.topology, benchmark, env=env)
result = sim.run()

dfs = {
    "invocations_df": sim.env.metrics.extract_dataframe('invocations'),
    "scale_df": sim.env.metrics.extract_dataframe('scale'),
    "schedule_df": sim.env.metrics.extract_dataframe('schedule'),
    "replica_deployment_df": sim.env.metrics.extract_dataframe('replica_deployment'),
    "function_deployments_df": sim.env.metrics.extract_dataframe('function_deployments'),
    # (further dataframe extractions elided in this excerpt)
}
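# Hedged follow-up sketch: the extracted pandas DataFrames can be persisted for offline
# analysis. The output directory is illustrative; any entries of `dfs` elided above
# would be written the same way:
import os

os.makedirs('results', exist_ok=True)
for name, df in dfs.items():
    df.to_csv(f'results/{name}.csv', index=False)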