Exemple #1
0
    def handle_container_allocation(self, node, allocated, job, task, time_millis):
        """Record the memory offer, fork a state for new offers, then allocate.

        When the allocated memory is smaller than the memory requested by the
        task, the offered amount is remembered per job and per container id in
        ``self.state.container_offers``. The first time a given amount is
        offered, ``self.state`` is deep-copied, a SymbexDecisionResumeStateEvent
        is added to the copy and the copy is appended to ``self.new_states``.

        In every case the allocation is finally delegated to
        ``YarnSmartgScheduler.handle_container_allocation``.
        """
        offered_mb = allocated.memory_mb
        if offered_mb < task.resource.memory_mb:
            # Offers are tracked as: job yarn id -> container id -> set of memory sizes.
            offers_by_container = self.state.container_offers.setdefault(job.yarn_id, {})
            seen_offers = offers_by_container.setdefault(job.next_container_id, set())

            if offered_mb not in seen_offers:
                # Remember the offer BEFORE copying, so the copy carries it too.
                seen_offers.add(offered_mb)
                forked_state = copy.deepcopy(self.state)
                # The resume event starts at the queue position right after this job.
                resume_index = self.job_queue.index(job) + 1
                forked_state.add_event(
                    SymbexDecisionResumeStateEvent(forked_state, node.name, resume_index))
                self.new_states.append(forked_state)

        # Finish this run
        YarnSmartgScheduler.handle_container_allocation(self, node, allocated, job, task, time_millis)
Exemple #2
0
 def schedule(self, node, queue_start=0):
     """Run the SMARTG scheduling pass and report any states forked during it.

     ``self.new_states`` is reset before delegating to
     ``YarnSmartgScheduler.schedule``. If the list is still empty afterwards,
     the delegate's result is returned unchanged. Otherwise the result is a
     pair: whether the node has allocated containers, and an
     ``(EventResult.PAUSE, new_states)`` tuple.
     """
     self.new_states = []
     base_result = YarnSmartgScheduler.schedule(self, node, queue_start=queue_start)
     if self.new_states:
         pause_request = (EventResult.PAUSE, self.new_states)
         return bool(node.allocated_containers), pause_request
     return base_result
Exemple #3
0
 def _get_scheduler(self, user_config, yarn_topo):
     """Create the scheduler matching ``self.state.scheduler_type``.

     For the RACE_* and PEEK types, ``self.state.oracle_type`` is also set to
     the scheduler type before the scheduler is created.

     :param user_config: configuration object; its ``ep`` attribute is passed
         as ``elastic_pool`` to the GREEDY and SMARTG schedulers.
     :param yarn_topo: indexable object; ``len(yarn_topo[1])`` is passed as
         ``node_count`` to the GREEDY and SMARTG schedulers.
     :return: the newly created scheduler instance.
     :raises Exception: if the scheduler type matches none of the known types.
     """
     # ``ep`` is forwarded as-is; a value of None simply stays None.
     elastic_pool = user_config.ep
     scheduler_type = self.state.scheduler_type
     if scheduler_type is YarnSchedulerType.REGULAR:
         return YarnRegularScheduler(state=self.state)
     elif scheduler_type is YarnSchedulerType.SRTF:
         return YarnSRTFScheduler(state=self.state)
     elif scheduler_type is YarnSchedulerType.GREEDY:
         return YarnGreedyScheduler(state=self.state,
                                    node_count=len(yarn_topo[1]),
                                    elastic_pool=elastic_pool)
     elif scheduler_type is YarnSchedulerType.SMARTG:
         return YarnSmartgScheduler(state=self.state,
                                    node_count=len(yarn_topo[1]),
                                    elastic_pool=elastic_pool)
     elif scheduler_type is YarnSchedulerType.SYMBEX:
         return YarnSymbexScheduler(state=self.state)
     elif scheduler_type is YarnSchedulerType.RACE_LOCKSTEP or \
             scheduler_type is YarnSchedulerType.RACE_CONTINUOUS:
         self.state.oracle_type = scheduler_type
         return YarnGlobalDecisionScheduler(state=self.state)
     elif scheduler_type is YarnSchedulerType.RACE_JOB:
         self.state.oracle_type = scheduler_type
         return YarnJobDecisionScheduler(state=self.state)
     elif scheduler_type is YarnSchedulerType.RACE_NODEG:
         self.state.oracle_type = scheduler_type
         return YarnNodeGDecisionScheduler(state=self.state)
     elif scheduler_type is YarnSchedulerType.PEEK:
         self.state.oracle_type = scheduler_type
         return YarnPeekScheduler(state=self.state)
     else:
         # str() is required: concatenating a non-string scheduler type would
         # raise a TypeError and hide the message we actually want to report.
         raise Exception("Invalid scheduler model specified: " +
                         str(scheduler_type))
Exemple #4
0
    def allocate_on_node(self, node, task):
        """Try a regular allocation first, then a SMARTG (elastic) one.

        The SMARTG allocation is attempted only when the regular allocation
        returns None and the task's job has SMARTG behavior in
        ``self.job_behaviors``. If the job's entry in
        ``self.elastic_time_limits`` equals ``sys.maxint``, the SMARTG result
        is returned directly; otherwise it is returned only when
        ``self.elastic_will_pushback`` reports False for it.

        :param node: node on which the allocation is attempted.
        :param task: task to allocate; ``task.job.job_id`` selects the job's
            behavior and elastic time limit.
        :return: the allocation from the regular or SMARTG scheduler, or None
            when no allocation is made.
        """
        # First try being regular
        allocated = YarnRegularScheduler.allocate_on_node(self, node, task)
        if allocated is not None:
            return allocated

        job_id = task.job.job_id
        if self.job_behaviors[job_id] is not YarnSchedulerType.SMARTG:
            return None

        allocated = YarnSmartgScheduler.allocate_on_node(self, node, task)
        # Compare the limit by value: an identity test ("is") on integers only
        # succeeds when both sides happen to be the very same object.
        if allocated is None or \
                self.elastic_time_limits[job_id] == sys.maxint:
            return allocated

        # Check if there is enough time remaining to run this container elastically.
        # NOTE(review): allocated[1] is presumably the container duration in
        # milliseconds -- confirm against YarnSmartgScheduler.allocate_on_node.
        elastic_finish_time = allocated[
            1] + self.state.simulator.clock_millis
        LOG.info(
            "PEEK: Job {} finish time: {}, task finish time: {}".format(
                job_id, self.elastic_time_limits[job_id],
                elastic_finish_time))
        if not self.elastic_will_pushback(job_id, allocated, node,
                                          elastic_finish_time):
            return allocated

        LOG.info("PEEK: Job " + str(job_id) +
                 ", ELASTIC not possible.")
        return None
Exemple #5
0
    def allocate_on_node(self, node, task):
        """Allocate regularly, falling back to SMARTG when the job allows it.

        The SMARTG scheduler is consulted only if the regular allocation
        returned None and ``self.job_behaviors`` marks the task's job as
        SMARTG. Returns the resulting allocation, or None if there is none.
        """
        allocation = YarnRegularScheduler.allocate_on_node(self, node, task)
        # The behavior lookup only happens once the regular attempt has failed.
        if allocation is None and \
                self.job_behaviors[task.job.job_id] is YarnSchedulerType.SMARTG:
            allocation = YarnSmartgScheduler.allocate_on_node(self, node, task)
        return allocation
Exemple #6
0
    def allocate_on_node(self, node, task):
        """Allocate regularly, falling back to SMARTG when the node allows it.

        The SMARTG scheduler is consulted only if the regular allocation
        returned None and ``self.node_behaviors`` marks the node (looked up by
        ``node.name``) as SMARTG. Returns the resulting allocation, or None if
        there is none.
        """
        allocation = YarnRegularScheduler.allocate_on_node(self, node, task)
        # The behavior lookup only happens once the regular attempt has failed.
        if allocation is None and \
                self.node_behaviors[node.name] is YarnSchedulerType.SMARTG:
            allocation = YarnSmartgScheduler.allocate_on_node(self, node, task)
        return allocation
Exemple #7
0
    def allocate_on_node(self, node, task):
        """Try a regular allocation first, then a SMARTG (elastic) one.

        The SMARTG allocation is attempted only when the regular allocation
        returns None and the task's job has SMARTG behavior in
        ``self.job_behaviors``. If the job's entry in
        ``self.elastic_time_limits`` equals ``sys.maxint``, the SMARTG result
        is returned directly; otherwise it is returned only when
        ``self.elastic_will_pushback`` reports False for it.

        :param node: node on which the allocation is attempted.
        :param task: task to allocate; ``task.job.job_id`` selects the job's
            behavior and elastic time limit.
        :return: the allocation from the regular or SMARTG scheduler, or None
            when no allocation is made.
        """
        # First try being regular
        allocated = YarnRegularScheduler.allocate_on_node(self, node, task)
        if allocated is not None:
            return allocated

        job_id = task.job.job_id
        if self.job_behaviors[job_id] is not YarnSchedulerType.SMARTG:
            return None

        allocated = YarnSmartgScheduler.allocate_on_node(self, node, task)
        # Compare the limit by value: an identity test ("is") on integers only
        # succeeds when both sides happen to be the very same object.
        if allocated is None or self.elastic_time_limits[job_id] == sys.maxint:
            return allocated

        # Check if there is enough time remaining to run this container elastically.
        # NOTE(review): allocated[1] is presumably the container duration in
        # milliseconds -- confirm against YarnSmartgScheduler.allocate_on_node.
        elastic_finish_time = allocated[1] + self.state.simulator.clock_millis
        LOG.info("PEEK: Job {} finish time: {}, task finish time: {}".format(
          job_id, self.elastic_time_limits[job_id], elastic_finish_time
        ))
        if not self.elastic_will_pushback(job_id, allocated, node, elastic_finish_time):
            return allocated

        LOG.info("PEEK: Job " + str(job_id) + ", ELASTIC not possible.")
        return None
Exemple #8
0
 def __init__(self, state):
     """Initialise the SMARTG base with no nodes and no elastic pool.

     Also starts with an empty ``new_states`` list.
     """
     # Base class set-up: a node count of 0 and no elastic pool.
     YarnSmartgScheduler.__init__(self, state=state, node_count=0,
                                  elastic_pool=None)
     self.new_states = []
Exemple #9
0
    def generate_oracle_runner(state, log, simulator_clock, scheduler_type,
                               simulation_type):
        """Build a duplicate of ``state`` for running an oracle simulation.

        A fresh state object is created and populated with copies of the
        running jobs, the nodes, the scheduler, the generator and the
        simulator event queue of ``state``. The whole duplication is retried
        with a doubled recursion limit whenever it fails with a "recursion
        depth" RuntimeError.

        :param state: the state to duplicate.
        :param log: logger used for progress and warning messages.
        :param simulator_clock: value assigned to the new simulator's
            ``clock_millis``.
        :param scheduler_type: for REGULAR and SMARTG a new scheduler of that
            type is built and filled from the old one; for any other type the
            old scheduler is deep-copied.
        :param simulation_type: stored on the new state; RACE selects a
            ``YarnRaceState``, anything else a ``YarnState``.
        :return: the new state, with ``use_gaps`` set to False.
        """
        generate_node_heartbeats = False
        # Duplicate all of the current state
        while True:
            try:
                # The memo maps id(old object) -> new object, so that deep
                # copies below reuse the replacements registered here instead
                # of copying the old objects again.
                deepcopy_memo = {}
                if simulation_type is YarnSimulation.RACE:
                    new_state = YarnRaceState(user_config=state.user_config)
                else:
                    new_state = YarnState(user_config=state.user_config)

                new_state.simulation_type = simulation_type

                deepcopy_memo[id(state)] = new_state
                # Add the simulator & scheduler references (many other objects reference it).
                # SIMULATOR (initial)
                new_state.simulator = Simulator()
                new_state.simulator.clock_millis = simulator_clock
                deepcopy_memo[id(state.simulator)] = new_state.simulator

                # JOBS (only the jobs currently running in the old scheduler)
                # noinspection PyArgumentList
                new_state.jobs = list(
                    copy.deepcopy(job, deepcopy_memo)
                    for job in state.scheduler.running_jobs)

                # NODES (keyed by node name)
                # noinspection PyArgumentList
                new_state.nodes = dict(
                    (node.name, copy.deepcopy(node, deepcopy_memo))
                    for node in state.nodes.values())

                # SCHEDULER
                # NOTE(review): the old scheduler is registered in the memo
                # only after jobs and nodes were copied; if those reference
                # the scheduler they were copied without this mapping --
                # confirm this ordering is intended.
                old_scheduler = state.scheduler
                if scheduler_type is YarnSchedulerType.REGULAR or \
                        scheduler_type is YarnSchedulerType.SMARTG:

                    # Generate the correct schedulers for each state.
                    if scheduler_type is YarnSchedulerType.REGULAR:
                        new_state.scheduler = YarnRegularScheduler(new_state)
                    else:
                        # Only SMARTG can reach this branch.
                        new_state.scheduler = YarnSmartgScheduler(
                            new_state, old_scheduler.node_count,
                            old_scheduler.elastic_pool)

                    # Copy the scheduler state to the new scheduler
                    new_scheduler = new_state.scheduler
                    new_scheduler.next_job_id = old_scheduler.next_job_id
                    new_scheduler.running_jobs = set(
                        new_state.jobs
                    )  # This works since we only copy running jobs.
                    new_scheduler.completed_jobs = old_scheduler.completed_jobs.copy(
                    )
                    # noinspection PyArgumentList
                    new_scheduler.allocated_containers = \
                        dict((job_id, set(copy.deepcopy(container, deepcopy_memo)
                                          for container in old_scheduler.allocated_containers[job_id]))
                             for job_id in old_scheduler.allocated_containers)
                    # noinspection PyArgumentList
                    new_scheduler.job_queue = list(
                        copy.deepcopy(job, deepcopy_memo)
                        for job in old_scheduler.job_queue)
                else:
                    # noinspection PyArgumentList
                    new_state.scheduler = copy.deepcopy(
                        state.scheduler, deepcopy_memo)

                deepcopy_memo[id(old_scheduler)] = new_state.scheduler

                # GENERATOR
                # A new generator of the same class is constructed with the
                # old state and then re-pointed at the new state.
                old_generator = state.generator
                new_generator = old_generator.__class__(state)
                new_generator.state = new_state
                deepcopy_memo[id(old_generator)] = new_generator
                new_state.generator = new_generator

                # SIMULATOR (copy state)
                # The tie-breaker counter continues from the old queue's next value.
                new_state.simulator.queue.counter = peekable(
                    count(state.simulator.queue.counter.peek()))
                new_state.simulator.queue.pq = []
                # Also eliminate all the task duration errors injected
                # (the simulation needs to be unaware of them)
                queue_needs_resorting = False
                for queue_el in state.simulator.queue.pq:
                    # queue_el = [ time_millis, tie_breaker_count, event ]
                    event_time = queue_el[0]
                    event_count = queue_el[1]
                    event = queue_el[2]
                    new_event = None
                    if (issubclass(type(event), YarnJobArriveEvent)) or \
                            issubclass(type(event), YarnOracleSimulationEvent) or \
                            type(event) is YarnOccupancyStatsEvent or \
                            type(event) is YarnResumeSchedulingEvent:
                        # These events are not copied: their queue entry is
                        # kept, but with None in place of the event.
                        if type(event) is YarnResumeSchedulingEvent:
                            generate_node_heartbeats = True
                    else:
                        # noinspection PyArgumentList
                        new_event = copy.deepcopy(event, deepcopy_memo)
                        if type(
                                event
                        ) is YarnNodeContainerFinishEvent and event.duration_error != 0:
                            # Undo the duration error on both the queue key
                            # and the copied event.
                            event_time -= event.duration_error
                            new_event.time_millis -= event.duration_error
                            queue_needs_resorting = True

                    new_state.simulator.queue.pq.append(
                        [event_time, event_count, new_event])

                # Changed event times may have broken the heap ordering.
                if queue_needs_resorting:
                    heapq.heapify(new_state.simulator.queue.pq)

                # Copy elements which do not require duplication
                new_state.user_config = state.user_config
                break
            except RuntimeError as e:
                if "recursion depth" in str(e):
                    # Deep object graphs can exhaust the recursion limit during
                    # deepcopy: double the limit and redo the duplication.
                    old_limit = sys.getrecursionlimit()
                    log.warning("Increasing recursion limit from " +
                                str(old_limit) + " to " + str(old_limit * 2))
                    sys.setrecursionlimit(old_limit * 2)
                else:
                    # Bare raise keeps the original traceback.
                    raise

        if generate_node_heartbeats:
            # Generate NodeHeartbeat events for all the nodes whose current
            # heartbeat was already handled. An explicit loop is used because
            # map() is lazy on Python 3 and would never perform the calls.
            for node in new_state.nodes.values():
                if node.next_heartbeat.handled:
                    node.next_heartbeat.generate_next_heartbeat()

        # Disable the use of gaps.
        new_state.use_gaps = False

        log.info("ORACLE_STATE_DUPLICATION_DONE")

        return new_state