Example #1
0
    def __init__(self):
        # observation and action space
        self.setup_space()

        # random seed
        self.seed(config.seed)

        # global timer
        self.wall_time = WallTime()

        # uses priority queue
        self.timeline = Timeline()

        # executors
        self.executors = OrderedSet()
        for exec_id in range(config.exec_cap):
            self.executors.add(Executor(exec_id))

        # free executors
        self.free_executors = FreeExecutors(self.executors)

        # moving executors
        self.moving_executors = MovingExecutors()

        # executor commit
        self.exec_commit = ExecutorCommit()

        # prevent the agent from repeatedly selecting the same node
        self.node_selected = set()

        # for computing reward at each step
        self.reward_calculator = RewardCalculator()
Example #2
0
    def reset(self, max_time=np.inf):
        # reset observation and action space
        self.setup_space()

        self.max_time = max_time
        self.wall_time.reset()
        self.timeline.reset()
        self.exec_commit.reset()
        self.moving_executors.reset()
        self.reward_calculator.reset()
        self.finished_job_dags = OrderedSet()
        self.node_selected.clear()
        for executor in self.executors:
            executor.reset()
        self.free_executors.reset(self.executors)
        # generate a set of new jobs
        self.job_dags = generate_jobs(
            self.np_random, self.timeline, self.wall_time)
        # map action to dag_idx and node_idx
        self.action_map = compute_act_map(self.job_dags)
        # add initial set of jobs in the system
        for job_dag in self.job_dags:
            self.add_job(job_dag)
        # put all executors as source executors initially
        self.source_job = None
        self.num_source_exec = len(self.executors)
        self.exec_to_schedule = OrderedSet(self.executors)

        return self.observe()
Example #3
0
    def __init__(self, dag_db):

        self.dag_db = dag_db

        self.job_dags = OrderedSet()
        self.action_map = {}  # action index -> node
        self.available_executors = {}
        self.last_trigger = None

        # executors
        self.executors = {}
        for exec_id in range(config.exec_cap):
            self.executors[exec_id] = Executor(exec_id)

        # dynamically bind {app_id -> job_dag}
        self.spark_dag_map = {}
        # dynamically bind {job_dag -> app_id}
        self.spark_inverse_dag_map = {}
        # dynamically bind {(app_id, stage_id) -> node}
        self.spark_node_map = {}
        # dynamically bind {node -> (app_id, stage_id)}
        self.spark_inverse_node_map = {}

        # dynamically bind {app_id -> {exec_id -> re-usable track_id}}
        self.exec_id_track_id_map = {}
Example #4
0
def generate_jobs(np_random, timeline, wall_time):

    job_dags = OrderedSet()
    tpch_size = ['2g','5g','10g','20g','50g','80g','100g']
    tpch_num = 22
    t = 0

    for _ in range(config.num_init_dags):
        # generate query
        query_size = tpch_size[np_random.randint(len(tpch_size))]
        query_idx = str(np_random.randint(tpch_num) + 1)
        # generate job
        job_dag = load_job(
            query_size, query_idx, wall_time, np_random)
        # job already arrived, put in job_dags
        job_dag.start_time = t
        job_dag.arrived = True
        job_dags.add(job_dag)

    for _ in range(config.num_stream_dags):
        # poisson process
        t += int(np_random.exponential(config.stream_interval))
        # uniform distribution
        query_size = tpch_size[np_random.randint(len(tpch_size))]
        query_idx = str(np_random.randint(tpch_num) + 1)
        # generate job
        job_dag = load_job(
            query_size, query_idx, wall_time, np_random)
        # push into timeline
        job_dag.start_time = t
        timeline.push(t, job_dag)

    return job_dags
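
As a quick sanity check on the generator above: the first loop returns jobs that have already arrived (start_time 0, arrived True), while the second loop only pushes the streaming jobs into the timeline, to be delivered later as JobDAG arrival events. A minimal usage sketch, assuming config, the timeline, wall_time and a seeded np_random are already set up and the timeline starts empty:

# minimal usage sketch for generate_jobs; assumes config.num_init_dags and
# config.num_stream_dags are set and np_random is a seeded RandomState
job_dags = generate_jobs(np_random, timeline, wall_time)
# job_dags holds only the initial (already arrived) jobs
assert all(job_dag.arrived for job_dag in job_dags)
# the streaming jobs sit in the timeline and surface later as JobDAG events
assert len(timeline) == config.num_stream_dags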
Example #5
0
 def reset(self):
     for node in self.nodes:
         node.reset()
     self.num_nodes_done = 0
     self.executors = OrderedSet()
     self.frontier_nodes = OrderedSet()
     for node in self.nodes:
         if node.is_schedulable():
             self.frontier_nodes.add(node)
     self.arrived = False
     self.completed = False
     self.completion_time = np.inf
Example #6
0
    def get_frontier_nodes(self):
        # frontier nodes := unsaturated nodes with all parent nodes saturated
        frontier_nodes = OrderedSet()
        for job_dag in self.job_dags:
            for node in job_dag.nodes:
                if node not in self.node_selected and not self.saturated(node):
                    parents_saturated = True
                    for parent_node in node.parent_nodes:
                        if not self.saturated(parent_node):
                            parents_saturated = False
                            break
                    if parents_saturated:
                        frontier_nodes.add(node)

        return frontier_nodes
Example #7
0
    def __init__(self, nodes, adj_mat, name):
        # nodes: list of N nodes
        # adj_mat: N by N 0-1 adjacency matrix, e_ij = 1 -> edge from i to j
        assert len(nodes) == adj_mat.shape[0]
        assert adj_mat.shape[0] == adj_mat.shape[1]

        self.name = name

        self.nodes = nodes
        self.adj_mat = adj_mat

        self.num_nodes = len(self.nodes)
        self.num_nodes_done = 0

        # set of executors currently running on the job
        self.executors = OrderedSet()

        # the computation graph needs to be a DAG
        assert is_dag(self.num_nodes, self.adj_mat)

        # get the set of schedulable nodes
        self.frontier_nodes = OrderedSet()
        for node in self.nodes:
            if node.is_schedulable():
                self.frontier_nodes.add(node)

        # assign job dag to node
        self.assign_job_dag_to_node()

        # whether the dag has arrived
        self.arrived = False

        # dag is completed
        self.completed = False

        # dag start time
        self.start_time = None

        # dag completion time
        self.completion_time = np.inf

        # map an executor number to an interval
        self.executor_interval_map = \
            self.get_executor_interval_map()
Example #8
0
 def assign_executor(self, executor, frontier_changed):
     if executor.node is not None and not executor.node.no_more_tasks:
         # keep working on the previous node
         task = executor.node.schedule(executor)
         self.timeline.push(task.finish_time, task)
     else:
         # need to move on to other nodes
         if frontier_changed:
             # frontier changed, need to consult all free executors
             # note: executor.job_dag might change after self.schedule()
             source_job = executor.job_dag
             if len(self.exec_commit[executor.node]) > 0:
                 # directly fulfill the commitment
                 self.exec_to_schedule = {executor}
                 self.schedule()
             else:
                 # free up the executor
                 self.free_executors.add(source_job, executor)
             # then consult all free executors
             self.exec_to_schedule = OrderedSet(self.free_executors[source_job])
             self.source_job = source_job
             self.num_source_exec = len(self.free_executors[source_job])
         else:
             # just need to schedule one current executor
             self.exec_to_schedule = {executor}
             # only care about executors on the node
             if len(self.exec_commit[executor.node]) > 0:
                 # directly fulfill the commitment
                 self.schedule()
             else:
                 # need to consult for ALL executors on the node
                 # Note: self.exec_to_schedule is immediate
                 #       self.num_source_exec is for commit
                 #       so len(self.exec_to_schedule) !=
                 #       self.num_source_exec can happen
                 self.source_job = executor.job_dag
                 self.num_source_exec = len(executor.node.executors)
Example #9
0
    def __init__(self, idx, tasks, task_duration, wall_time, np_random):
        self.idx = idx
        self.tasks = tasks
        self.wall_time = wall_time
        self.np_random = np_random

        self.task_duration = task_duration

        self.num_tasks = len(tasks)
        self.num_finished_tasks = 0
        self.next_task_idx = 0
        self.no_more_tasks = False
        self.tasks_all_done = False
        self.node_finish_time = np.inf

        self.executors = OrderedSet()

        # uninitialized
        self.parent_nodes = []
        self.child_nodes = []
        self.descendant_nodes = []
        self.job_dag = None

        self.assign_node_to_tasks()
Example #10
0
    def run(self):

        # set up ipc communication
        context = zmq.Context()
        socket = context.socket(zmq.REP)
        ipc_msg = IPCMessage()
        ipc_reply = IPCReply()

        os.system('rm /tmp/spark_scheduling_java_python_ipc')
        socket.bind("ipc:///tmp/spark_scheduling_java_python_ipc")

        # for reward computation
        num_active_jobs = 0
        prev_time = time.time()

        while not self.exit.is_set():
            msg = socket.recv()
            ipc_msg.ParseFromString(msg)

            if ipc_msg.msg_type == 'register':
                self.dag_db.add_new_app(ipc_msg.app_name, ipc_msg.app_id)
                job_dag = self.env.add_job_dag(ipc_msg.app_id)
                add_job_in_graph(self.graph, job_dag)
                ipc_reply.msg = \
                    "external scheduler register app " + str(ipc_msg.app_name)

            elif ipc_msg.msg_type == 'bind':
                self.env.bind_exec_id(ipc_msg.app_id, ipc_msg.exec_id,
                                      ipc_msg.track_id)
                ipc_reply.msg = \
                    "external scheduler bind app_id " + \
                    str(ipc_msg.app_id) + " exec_id " + \
                    str(ipc_msg.exec_id) + " on track_id " + \
                    str(ipc_msg.track_id)

            elif ipc_msg.msg_type == 'inform':
                self.env.complete_tasks(ipc_msg.app_id, ipc_msg.stage_id,
                                        ipc_msg.num_tasks_left)
                ipc_reply.msg = \
                    "external scheduler updated app_id " + \
                    str(ipc_msg.app_id) + \
                    " stage_id " + \
                    str(ipc_msg.stage_id) + \
                    " with " + str(ipc_msg.num_tasks_left) + " tasks left"

            elif ipc_msg.msg_type == 'update':
                frontier_nodes_changed = \
                    self.env.complete_stage(ipc_msg.app_id, ipc_msg.stage_id)

                ipc_reply.msg = \
                    "external scheduler updated app_id " + \
                    str(ipc_msg.app_id) + \
                    " stage_id " + \
                    str(ipc_msg.stage_id)

            elif ipc_msg.msg_type == 'tracking':
                # master asks which app it should assign the executor to
                ipc_reply.app_id, ipc_reply.num_executors_to_take = \
                    self.exec_tracker.pop_executor_flow(ipc_msg.num_available_executors)
                ipc_reply.msg = \
                    "external scheduler moves " + \
                    str(ipc_reply.num_executors_to_take) + \
                    " executor to app " + ipc_reply.app_id

            elif ipc_msg.msg_type == 'consult':

                # convert ipc_msg.app_id and ipc_msg.stage_id to the corresponding
                # executors in the virtual environment and then invoke the
                # scheduling agent

                # 1. translate the raw information into observation space
                # sort out the exec_map (where the executors are)
                exec_map = {job_dag: 0 for job_dag in self.env.job_dags}
                for app_id in self.dag_db.apps_map:
                    if app_id in self.exec_tracker.executor_flow:
                        job_dag = self.dag_db.apps_map[app_id]
                        exec_map[job_dag] = self.exec_tracker.executor_flow[
                            app_id]

                source_job = self.dag_db.apps_map[ipc_msg.app_id]

                frontier_nodes = OrderedSet()
                for job_dag in self.env.job_dags:
                    for node in job_dag.frontier_nodes:
                        frontier_nodes.add(node)

                for job_dag in self.env.job_dags:
                    for node in job_dag.nodes:
                        feature = np.zeros([6])
                        # number of executors already in the job
                        feature[0] = exec_map[job_dag]
                        # source executor is from the current job (locality)
                        feature[1] = job_dag is source_job
                        # number of source executors
                        feature[2] = 1
                        # remaining number of tasks in the node
                        feature[3] = node.num_tasks - node.next_task_idx
                        # average task duration of the node
                        feature[4] = node.tasks[-1].duration
                        # is the current node valid
                        feature[5] = node in frontier_nodes

                        # update feature in observation
                        self.graph.update_nodes({node: feature})

                # update mask in the action space
                self.action_space.update_valid_set(frontier_nodes)

                # 2. gather feedback for the previous action
                curr_time = time.time()
                elapsed_time = curr_time - prev_time
                prev_reward = num_active_jobs * elapsed_time
                prev_done = False  # spark can be long running
                prev_info = {'elapsed_time': elapsed_time}
                num_active_jobs = len(self.env.job_dags)
                prev_time = curr_time

                # 3. get the action from the agent
                node = self.agent.get_action(self.graph, prev_reward,
                                             prev_done, prev_info)

                # 4. translate the action to ipc reply
                if node is None:
                    # no-action was made
                    ipc_reply.app_id = 'void'
                    ipc_reply.stage_id = -1
                else:
                    ipc_reply.app_id, ipc_reply.stage_id = self.env.spark_inverse_node_map[
                        node]
                    if node.idx not in node.job_dag.frontier_nodes:
                        # move (or stay) the executor to the job only
                        ipc_reply.stage_id = -1

                if ipc_msg.app_id != 'void' and \
                   ipc_reply.app_id != 'void' and \
                   ipc_msg.app_id != ipc_reply.app_id:
                    # executor needs to move to another job, keep track of it
                    self.exec_tracker.add_executor_flow(ipc_reply.app_id, 1)

                ipc_reply.msg = \
                    "external scheduler return app_id " + str(ipc_reply.app_id) + \
                    " stage_id " + str(ipc_reply.stage_id) + \
                    " for exec_id " + str(ipc_msg.exec_id)

            elif ipc_msg.msg_type == 'deregister':
                job_dag = self.env.remove_job_dag(ipc_msg.app_id)
                remove_job_from_graph(self.graph, job_dag)
                self.dag_db.remove_app(ipc_msg.app_id)
                self.exec_tracker.remove_app(ipc_msg.app_id)
                ipc_reply.msg = \
                    "external scheduler deregister app " + ipc_msg.app_id

            print("time:", datetime.now())
            print("msg_type:", ipc_msg.msg_type)
            print("app_name:", ipc_msg.app_name)
            print("app_id:", ipc_msg.app_id)
            print("stage_id:", ipc_msg.stage_id)
            print("executor_id:", ipc_msg.exec_id)
            print("track_id:", ipc_msg.track_id)
            print("num_available_executors:", ipc_msg.num_available_executors)
            print("num_tasks_left", ipc_msg.num_tasks_left)
            print("reply_msg:", ipc_reply.msg)
            print("")
            sys.stdout.flush()

            socket.send(ipc_reply.SerializeToString())
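
The loop above is the server half of a protobuf-over-ZeroMQ request/reply protocol: each Spark-side component connects with a REQ socket, sends one IPCMessage, and blocks on the IPCReply. A minimal client-side sketch of the 'register' handshake (field values are placeholders; IPCMessage and IPCReply are the same generated protobuf classes used by the server):

import zmq
# IPCMessage / IPCReply come from the generated protobuf module used above
# (import omitted); the field values below are placeholders

context = zmq.Context()
socket = context.socket(zmq.REQ)
socket.connect("ipc:///tmp/spark_scheduling_java_python_ipc")

msg = IPCMessage()
msg.msg_type = 'register'
msg.app_name = 'tpch-query-2'  # placeholder
msg.app_id = 'app-0001'        # placeholder
socket.send(msg.SerializeToString())

reply = IPCReply()
reply.ParseFromString(socket.recv())
print(reply.msg)  # e.g. "external scheduler register app tpch-query-2"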
Example #11
0
class JobDAG(object):
    def __init__(self, nodes, adj_mat, name):
        # nodes: list of N nodes
        # adj_mat: N by N 0-1 adjacency matrix, e_ij = 1 -> edge from i to j
        assert len(nodes) == adj_mat.shape[0]
        assert adj_mat.shape[0] == adj_mat.shape[1]

        self.name = name

        self.nodes = nodes
        self.adj_mat = adj_mat

        self.num_nodes = len(self.nodes)
        self.num_nodes_done = 0

        # set of executors currently running on the job
        self.executors = OrderedSet()

        # the computation graph needs to be a DAG
        assert is_dag(self.num_nodes, self.adj_mat)

        # get the set of schedulable nodes
        self.frontier_nodes = OrderedSet()
        for node in self.nodes:
            if node.is_schedulable():
                self.frontier_nodes.add(node)

        # assign job dag to node
        self.assign_job_dag_to_node()

        # whether the dag has arrived
        self.arrived = False

        # dag is completed
        self.completed = False

        # dag start time
        self.start_time = None

        # dag completion time
        self.completion_time = np.inf

        # map an executor number to an interval
        self.executor_interval_map = \
            self.get_executor_interval_map()

    def assign_job_dag_to_node(self):
        for node in self.nodes:
            node.job_dag = self

    def get_executor_interval_map(self):
        executor_interval_map = {}
        executor_data_point = [5, 10, 20, 40, 50, 60, 80, 100]
        entry_pt = 0

        # get the left most map
        for e in range(executor_data_point[0] + 1):
            executor_interval_map[e] = \
                (executor_data_point[0],
                 executor_data_point[0])

        # get the center map
        for i in range(len(executor_data_point) - 1):
            for e in range(executor_data_point[i] + 1,
                           executor_data_point[i + 1]):
                executor_interval_map[e] = \
                    (executor_data_point[i],
                     executor_data_point[i + 1])
            # at the data point
            e = executor_data_point[i + 1]
            executor_interval_map[e] = \
                (executor_data_point[i + 1],
                 executor_data_point[i + 1])

        # get the residual map
        if config.exec_cap > executor_data_point[-1]:
            for e in range(executor_data_point[-1] + 1, config.exec_cap + 1):
                executor_interval_map[e] = \
                    (executor_data_point[-1],
                     executor_data_point[-1])

        return executor_interval_map

    def get_nodes_duration(self):
        # Warning: this is slow O(num_nodes * num_tasks)
        # get the duration over all nodes
        duration = 0
        for node in self.nodes:
            duration += node.get_node_duration()
        return duration

    def reset(self):
        for node in self.nodes:
            node.reset()
        self.num_nodes_done = 0
        self.executors = OrderedSet()
        self.frontier_nodes = OrderedSet()
        for node in self.nodes:
            if node.is_schedulable():
                self.frontier_nodes.add(node)
        self.arrived = False
        self.completed = False
        self.completion_time = np.inf

    def update_frontier_nodes(self, node):
        frontier_nodes_changed = False
        for child in node.child_nodes:
            if child.is_schedulable():
                if child.idx not in self.frontier_nodes:
                    self.frontier_nodes.add(child)
                    frontier_nodes_changed = True
        return frontier_nodes_changed
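
For intuition, get_executor_interval_map() above snaps every possible executor count onto the two profiled data points that bracket it, which sample_executor_key() (next example) then uses to pick a duration key. A small sketch of the resulting map, assuming the executor_data_point list above and config.exec_cap >= 100 (job_dag is any constructed JobDAG instance):

# a sketch of the map built by get_executor_interval_map(), assuming
# executor_data_point = [5, 10, 20, 40, 50, 60, 80, 100]; job_dag is any
# constructed JobDAG instance
interval_map = job_dag.executor_interval_map
assert interval_map[3] == (5, 5)        # below the smallest data point
assert interval_map[10] == (10, 10)     # exactly at a data point
assert interval_map[13] == (10, 20)     # bracketed by two data points
assert interval_map[100] == (100, 100)  # at the largest data point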
Example #12
0
class Node(object):
    def __init__(self, idx, tasks, task_duration, wall_time, np_random):
        self.idx = idx
        self.tasks = tasks
        self.wall_time = wall_time
        self.np_random = np_random

        self.task_duration = task_duration

        self.num_tasks = len(tasks)
        self.num_finished_tasks = 0
        self.next_task_idx = 0
        self.no_more_tasks = False
        self.tasks_all_done = False
        self.node_finish_time = np.inf

        self.executors = OrderedSet()

        # uninitialized
        self.parent_nodes = []
        self.child_nodes = []
        self.descendant_nodes = []
        self.job_dag = None

        self.assign_node_to_tasks()

    def assign_node_to_tasks(self):
        for task in self.tasks:
            task.node = self

    def get_node_duration(self):
        # Warning: this is slow O(num_tasks)
        # get the total duration over all tasks
        duration = 0
        for task in self.tasks:
            duration += task.get_duration()
        return duration

    def is_schedulable(self):
        if self.no_more_tasks:  # no more tasks
            return False
        if self.tasks_all_done:  # node done
            return False
        for node in self.parent_nodes:
            if not node.tasks_all_done:  # a parent node not done
                return False
        return True

    def reset(self):
        for task in self.tasks:
            task.reset()
        self.executors.clear()
        self.num_finished_tasks = 0
        self.next_task_idx = 0
        self.no_more_tasks = False
        self.tasks_all_done = False
        self.node_finish_time = np.inf

    def sample_executor_key(self, num_executors):
        (left_exec, right_exec) = \
            self.job_dag.executor_interval_map[num_executors]

        executor_key = None

        if left_exec == right_exec:
            executor_key = left_exec

        else:
            rand_pt = self.np_random.randint(1, right_exec - left_exec + 1)
            if rand_pt <= num_executors - left_exec:
                executor_key = left_exec
            else:
                executor_key = right_exec

        if executor_key not in self.task_duration['first_wave']:
            # more executors than number of tasks in the job
            largest_key = 0
            for e in self.task_duration['first_wave']:
                if e > largest_key:
                    largest_key = e
            executor_key = largest_key

        return executor_key

    def schedule(self, executor):
        assert self.next_task_idx < self.num_tasks
        task = self.tasks[self.next_task_idx]

        # task duration is determined by wave
        num_executors = len(self.job_dag.executors)
        assert num_executors > 0

        # sample an executor point in the data
        executor_key = self.sample_executor_key(num_executors)

        if executor.task is None or \
            executor.task.node.job_dag != task.node.job_dag:
            # the executor never runs a task in this job
            # a fresh executor incurs a warmup delay
            if len(self.task_duration['fresh_durations'][executor_key]) > 0:
                # (1) try to directly retrieve the warmup delay from data
                fresh_durations = \
                    self.task_duration['fresh_durations'][executor_key]
                i = self.np_random.randint(len(fresh_durations))
                duration = fresh_durations[i]
            else:
                # (2) use first wave but deliberately add in a warmup delay
                first_wave = \
                    self.task_duration['first_wave'][executor_key]
                i = self.np_random.randint(len(first_wave))
                duration = first_wave[i] + config.warmup_delay

        elif executor.task is not None and \
                executor.task.node == task.node and \
                len(self.task_duration['rest_wave'][executor_key]) > 0:
            # executor was working on this node
            # the task duration should be retrieved from rest wave
            rest_wave = self.task_duration['rest_wave'][executor_key]
            i = self.np_random.randint(len(rest_wave))
            duration = rest_wave[i]
        else:
            # executor is fresh to this node, use first wave
            if len(self.task_duration['first_wave'][executor_key]) > 0:
                # (1) try to retrieve first wave from data
                first_wave = \
                    self.task_duration['first_wave'][executor_key]
                i = self.np_random.randint(len(first_wave))
                duration = first_wave[i]
            else:
                # (2) first wave doesn't exist, use fresh durations instead
                # (should happen very rarely)
                fresh_durations = \
                    self.task_duration['fresh_durations'][executor_key]
                i = self.np_random.randint(len(fresh_durations))
                duration = fresh_durations[i]

        # # Hack! only use first/fresh duration
        # # executor is fresh to this node, use first wave
        # if len(self.task_duration['first_wave'][executor_key]) > 0:
        #     # (1) try to retrieve first wave from data
        #     first_wave = \
        #         self.task_duration['first_wave'][executor_key]
        #     i = self.np_random.randint(len(first_wave))
        #     duration = first_wave[i]
        # else:
        #     # (2) first wave doesn't exist, use fresh durations instead
        #     # (should happen very rarely)
        #     fresh_durations = \
        #         self.task_duration['fresh_durations'][executor_key]
        #     i = self.np_random.randint(len(fresh_durations))
        #     duration = fresh_durations[i]

        # detach the executor from old node
        # the executor can run task means it is local
        # to the job at this point
        executor.detach_node()

        # schedule the task
        task.schedule(self.wall_time.curr_time, duration, executor)

        # mark executor as running in the node
        self.executors.add(executor)
        executor.node = self

        self.next_task_idx += 1
        self.no_more_tasks = (self.next_task_idx >= self.num_tasks)

        if self.no_more_tasks:
            if self in self.job_dag.frontier_nodes:
                self.job_dag.frontier_nodes.remove(self)

        return task
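
schedule() above looks task durations up by wave type and executor key, so the per-node task_duration data is presumably a nested dict of duration samples. A sketch of the assumed shape (the keys follow the code above; the numbers are placeholders, not real trace data):

# assumed shape of the task_duration dict consumed by schedule() above;
# the duration samples below are placeholders, not real trace data
task_duration = {
    # first task run by an executor that is fresh to this job (includes warmup)
    'fresh_durations': {5: [1200, 1350], 10: [1100]},
    # first wave of tasks on an executor that is new to this node
    'first_wave': {5: [900, 950], 10: [850]},
    # subsequent waves on an executor already working on this node
    'rest_wave': {5: [400, 420], 10: [390]},
}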
Example #13
0
 def add_job(self, job):
     self.free_executors[job] = OrderedSet()
Example #14
0
    def step(self, action):

        assert self.action_space.contains(action)

        next_node, limit_idx = action

        # index starts from 0 but degree of parallelism starts with 1
        limit = limit_idx + 1

        # mark the node as selected
        assert next_node not in self.node_selected
        self.node_selected.add(next_node)
        # commit the source executor
        executor = next(iter(self.exec_to_schedule))
        source = executor.job_dag if executor.node is None else executor.node

        # compute number of valid executors to assign
        if next_node is not None:
            use_exec = min(next_node.num_tasks - next_node.next_task_idx - \
                           self.exec_commit.node_commit[next_node] - \
                           self.moving_executors.count(next_node), limit,
                           self.num_source_exec)
        else:
            use_exec = self.num_source_exec

        assert use_exec > 0

        self.exec_commit.add(source, next_node, use_exec)
        # deduct the executors that know the destination
        self.num_source_exec -= use_exec
        assert self.num_source_exec >= 0

        if self.num_source_exec == 0:
            # now a new scheduling round, clean up node selection
            self.node_selected.clear()
            # all commitments are made, now schedule free executors
            self.schedule()

        # Now run to the next event in the virtual timeline
        while len(self.timeline) > 0 and self.num_source_exec == 0:
            # consult agent by putting executors in source_exec

            new_time, obj = self.timeline.pop()
            self.wall_time.update_time(new_time)

            # case task: a task completion event, which frees up an executor
            # case query: a new job arrives
            # case executor: an executor arrives at certain job

            if isinstance(obj, Task):  # task completion event
                finished_task = obj
                node = finished_task.node
                node.num_finished_tasks += 1

                # bookkeeping for node completion
                frontier_changed = False
                if node.num_finished_tasks == node.num_tasks:
                    assert not node.tasks_all_done  # only complete once
                    node.tasks_all_done = True
                    node.job_dag.num_nodes_done += 1
                    node.node_finish_time = self.wall_time.curr_time

                    frontier_changed = node.job_dag.update_frontier_nodes(node)

                # assign new destination for the job
                self.assign_executor(finished_task.executor, frontier_changed)

                # bookkeeping for job completion
                if node.job_dag.num_nodes_done == node.job_dag.num_nodes:
                    assert not node.job_dag.completed  # only complete once
                    node.job_dag.completed = True
                    node.job_dag.completion_time = self.wall_time.curr_time
                    self.remove_job(node.job_dag)

            elif isinstance(obj, JobDAG):  # new job arrival event
                job_dag = obj
                # the job should be arriving for the first time
                assert not job_dag.arrived
                job_dag.arrived = True
                # inform agent about job arrival when stream is enabled
                self.job_dags.add(job_dag)
                self.add_job(job_dag)
                self.action_map = compute_act_map(self.job_dags)
                # assign free executors (if any) to the new job
                if len(self.free_executors[None]) > 0:
                    self.exec_to_schedule = \
                        OrderedSet(self.free_executors[None])
                    self.source_job = None
                    self.num_source_exec = \
                        len(self.free_executors[None])

            elif isinstance(obj, Executor):  # executor arrival event
                executor = obj
                # pop destination from the tracking record
                node = self.moving_executors.pop(executor)

                if node is not None:
                    # the job is not yet done when executor arrives
                    executor.job_dag = node.job_dag
                    node.job_dag.executors.add(executor)

                if node is not None and not node.no_more_tasks:
                    # the node is still schedulable
                    if node in node.job_dag.frontier_nodes:
                        # node is immediately runnable
                        task = node.schedule(executor)
                        self.timeline.push(task.finish_time, task)
                    else:
                        # free up the executor in this job
                        self.free_executors.add(executor.job_dag, executor)
                else:
                    # the node is saturated or the job is done
                    # by the time the executor arrives, use
                    # backup logic
                    self.backup_schedule(executor)

            else:
                print("illegal event type")
                exit(1)

        # compute reward
        reward = self.reward_calculator.get_reward(
            self.job_dags, self.wall_time.curr_time)

        # no more decision to make, jobs all done or time is up
        done = (self.num_source_exec == 0) and \
               ((len(self.timeline) == 0) or \
               (self.wall_time.curr_time >= self.max_time))

        if done:
            assert self.wall_time.curr_time >= self.max_time or \
                   len(self.job_dags) == 0

        return self.observe(), reward, done, None
Example #15
0
class SparkSimEnv(core.Env):
    """
    A trace-driven simulator for the dynamics of the scheduling module in Apache Spark.
    The main intricacies in closely simulating the real system are (1) the "moving
    cost" of executors across jobs (due to the overhead of starting a new JVM); (2) the
    wave effect of running tasks of the same stage on an executor (overhead of loading
    data in the first wave of tasks); (3) the diminishing speedup in job runtime when
    assigning more executors to a job. See the reference for more details.

    * STATE *
        Graph type of observation. It consists of features associated with each node (
        a tensor of dimension n * m, where n is number of nodes, m is number of features),
        and adjacency matrix (a sparse 0-1 matrix of dimension n * n).
        The features on each node are
        [number_of_executors_currently_in_this_job, is_current_executor_local_to_this_job,
         number_of_source_executors, number_of_remaining_tasks_in_this_node,
         task_duration_of_this_node, is_this_node_schedulable]

    * ACTIONS *
        Two dimensional action, [node_idx_to_schedule_next, number_of_executors_to_assign]
        Note: the set of available nodes has to contain node_idx, and the number of
        executors to assign must not exceed the limit. Both the available set and the limit
        are provided in the (auxiliary) state.

    * REWARD *
        Negative time elapsed for each job in the system since the last action.
        For example, suppose the virtual time was 0 at the last action, 4 jobs
        were in the system (either waiting in the queue or being processed),
        job 1 finished at time 1, job 2 finished at time 2.4, and jobs 3 and 4
        are still running at the next action. The next action is taken at
        time 5. Then the reward is - (1 * 1 + 1 * 2.4 + 2 * 5) = -13.4.
        Thus, the sum of the rewards is the negative of the total
        (waiting + processing) time of all jobs.
    
    * REFERENCE *
        Section 6.2
        Learning Scheduling Algorithms for Data Processing Clusters
        H Mao, M Schwarzkopf, SB Venkatakrishnan, M Alizadeh
        https://arxiv.org/pdf/1810.01963.pdf
    """
    def __init__(self):
        # observation and action space
        self.setup_space()

        # random seed
        self.seed(config.seed)

        # global timer
        self.wall_time = WallTime()

        # uses priority queue
        self.timeline = Timeline()

        # executors
        self.executors = OrderedSet()
        for exec_id in range(config.exec_cap):
            self.executors.add(Executor(exec_id))

        # free executors
        self.free_executors = FreeExecutors(self.executors)

        # moving executors
        self.moving_executors = MovingExecutors()

        # executor commit
        self.exec_commit = ExecutorCommit()

        # prevent the agent from repeatedly selecting the same node
        self.node_selected = set()

        # for computing reward at each step
        self.reward_calculator = RewardCalculator()

    def add_job(self, job_dag):
        self.moving_executors.add_job(job_dag)
        self.free_executors.add_job(job_dag)
        self.exec_commit.add_job(job_dag)
        add_job_in_graph(self.graph, job_dag)

    def assign_executor(self, executor, frontier_changed):
        if executor.node is not None and not executor.node.no_more_tasks:
            # keep working on the previous node
            task = executor.node.schedule(executor)
            self.timeline.push(task.finish_time, task)
        else:
            # need to move on to other nodes
            if frontier_changed:
                # frontier changed, need to consult all free executors
                # note: executor.job_dag might change after self.schedule()
                source_job = executor.job_dag
                if len(self.exec_commit[executor.node]) > 0:
                    # directly fulfill the commitment
                    self.exec_to_schedule = {executor}
                    self.schedule()
                else:
                    # free up the executor
                    self.free_executors.add(source_job, executor)
                # then consult all free executors
                self.exec_to_schedule = OrderedSet(self.free_executors[source_job])
                self.source_job = source_job
                self.num_source_exec = len(self.free_executors[source_job])
            else:
                # just need to schedule one current executor
                self.exec_to_schedule = {executor}
                # only care about executors on the node
                if len(self.exec_commit[executor.node]) > 0:
                    # directly fulfill the commitment
                    self.schedule()
                else:
                    # need to consult for ALL executors on the node
                    # Note: self.exec_to_schedule is immediate
                    #       self.num_source_exec is for commit
                    #       so len(self.exec_to_schedule) !=
                    #       self.num_source_exec can happen
                    self.source_job = executor.job_dag
                    self.num_source_exec = len(executor.node.executors)

    def backup_schedule(self, executor):
        backup_scheduled = False
        if executor.job_dag is not None:
            # first try to schedule on current job
            for node in executor.job_dag.frontier_nodes:
                if not self.saturated(node):
                    # greedily schedule a frontier node
                    task = node.schedule(executor)
                    self.timeline.push(task.finish_time, task)
                    backup_scheduled = True
                    break
        # then try to schedule on any available node
        if not backup_scheduled:
            schedulable_nodes = self.get_frontier_nodes()
            if len(schedulable_nodes) > 0:
                node = next(iter(schedulable_nodes))
                self.timeline.push(
                    self.wall_time.curr_time + config.moving_delay, executor)
                # keep track of moving executors
                self.moving_executors.add(executor, node)
                backup_scheduled = True
        # at this point if nothing available, leave executor idle
        if not backup_scheduled:
            self.free_executors.add(executor.job_dag, executor)

    def get_frontier_nodes(self):
        # frontier nodes := unsaturated nodes with all parent nodes saturated
        frontier_nodes = OrderedSet()
        for job_dag in self.job_dags:
            for node in job_dag.nodes:
                if node not in self.node_selected and not self.saturated(node):
                    parents_saturated = True
                    for parent_node in node.parent_nodes:
                        if not self.saturated(parent_node):
                            parents_saturated = False
                            break
                    if parents_saturated:
                        frontier_nodes.add(node)

        return frontier_nodes

    def get_executor_limits(self):
        # "minimum executor limit" for each job
        # executor limit := {job_dag -> int}
        executor_limit = {}

        for job_dag in self.job_dags:

            if self.source_job == job_dag:
                curr_exec = self.num_source_exec
            else:
                curr_exec = 0

            # note: this does not count committed or moving executors
            executor_limit[job_dag] = len(job_dag.executors) - curr_exec

        return executor_limit

    def observe(self):
        # valid set of nodes
        frontier_nodes = self.get_frontier_nodes()

        # sort out the exec_map (where the executors are)
        exec_map = {}
        for job_dag in self.job_dags:
            exec_map[job_dag] = len(job_dag.executors)
        # account for moving executors
        for node in self.moving_executors.moving_executors.values():
            exec_map[node.job_dag] += 1
        # account for committed executors
        for s in self.exec_commit.commit:
            if isinstance(s, JobDAG):
                j = s
            elif isinstance(s, Node):
                j = s.job_dag
            elif s is None:
                j = None
            else:
                print('source', s, 'unknown')
                exit(1)
            for n in self.exec_commit.commit[s]:
                if n is not None and n.job_dag != j:
                    exec_map[n.job_dag] += self.exec_commit.commit[s][n]

        for job_dag in self.job_dags:
            for node in job_dag.nodes:
                feature = np.zeros([6])
                # number of executors already in the job
                feature[0] = exec_map[job_dag]
                # source executor is from the current job (locality)
                feature[1] = job_dag is self.source_job
                # number of source executors
                feature[2] = self.num_source_exec
                # remaining number of tasks in the node
                feature[3] = node.num_tasks - node.next_task_idx
                # average task duration of the node
                feature[4] = node.tasks[-1].duration
                # is the current node valid
                feature[5] = node in frontier_nodes

                # update feature in observation
                self.graph.update_nodes({node: feature})

        # update mask in the action space
        self.action_space[0].update_valid_set(frontier_nodes)

        # return the graph as observation
        obs = self.graph
        assert self.observation_space.contains(obs)

        return obs

    def saturated(self, node):
        # a node is saturated when committed and moving executors cover all its remaining tasks
        anticipated_task_idx = node.next_task_idx + \
           self.exec_commit.node_commit[node] + \
           self.moving_executors.count(node)
        # note: anticipated_task_idx can be larger than node.num_tasks
        # when the tasks finish very fast before commitments are fulfilled
        return anticipated_task_idx >= node.num_tasks

    def schedule(self):
        executor = next(iter(self.exec_to_schedule))
        source = executor.job_dag if executor.node is None else executor.node

        # schedule executors from the source until the commitment is fulfilled
        while len(self.exec_commit[source]) > 0 and \
              len(self.exec_to_schedule) > 0:

            # keep fulfilling the commitment using free executors
            node = self.exec_commit.pop(source)
            executor = self.exec_to_schedule.pop()

            # mark the executor as in use if it was previously a free executor
            if self.free_executors.contain_executor(executor.job_dag, executor):
                self.free_executors.remove(executor)

            if node is None:
                # the next node is explicitly silent, make the executor idle
                if executor.job_dag is not None and \
                   any([not n.no_more_tasks for n in \
                        executor.job_dag.nodes]):
                    # mark executor as idle in its original job
                    self.free_executors.add(executor.job_dag, executor)
                else:
                    # nowhere to assign, put the executor in the null pool
                    self.free_executors.add(None, executor)


            elif not node.no_more_tasks:
                # node is not currently saturated
                if executor.job_dag == node.job_dag:
                    # executor local to the job
                    if node in node.job_dag.frontier_nodes:
                        # node is immediately runnable
                        task = node.schedule(executor)
                        self.timeline.push(task.finish_time, task)
                    else:
                        # put executor back in the free pool
                        self.free_executors.add(executor.job_dag, executor)

                else:
                    # need to move executor
                    self.timeline.push(
                        self.wall_time.curr_time + config.moving_delay, executor)
                    # keep track of moving executors
                    self.moving_executors.add(executor, node)

            else:
                # node is already saturated, use backup logic
                self.backup_schedule(executor)

    def step(self, action):

        assert self.action_space.contains(action)

        next_node, limit_idx = action

        # index starts from 0 but degree of parallelism starts with 1
        limit = limit_idx + 1

        # mark the node as selected
        assert next_node not in self.node_selected
        self.node_selected.add(next_node)
        # commit the source executor
        executor = next(iter(self.exec_to_schedule))
        source = executor.job_dag if executor.node is None else executor.node

        # compute number of valid executors to assign
        if next_node is not None:
            use_exec = min(next_node.num_tasks - next_node.next_task_idx - \
                           self.exec_commit.node_commit[next_node] - \
                           self.moving_executors.count(next_node), limit,
                           self.num_source_exec)
        else:
            use_exec = self.num_source_exec

        assert use_exec > 0

        self.exec_commit.add(source, next_node, use_exec)
        # deduct the executors that know the destination
        self.num_source_exec -= use_exec
        assert self.num_source_exec >= 0

        if self.num_source_exec == 0:
            # now a new scheduling round, clean up node selection
            self.node_selected.clear()
            # all commitments are made, now schedule free executors
            self.schedule()

        # Now run to the next event in the virtual timeline
        while len(self.timeline) > 0 and self.num_source_exec == 0:
            # consult agent by putting executors in source_exec

            new_time, obj = self.timeline.pop()
            self.wall_time.update_time(new_time)

            # case task: a task completion event, which frees up an executor
            # case query: a new job arrives
            # case executor: an executor arrives at certain job

            if isinstance(obj, Task):  # task completion event
                finished_task = obj
                node = finished_task.node
                node.num_finished_tasks += 1

                # bookkeeping for node completion
                frontier_changed = False
                if node.num_finished_tasks == node.num_tasks:
                    assert not node.tasks_all_done  # only complete once
                    node.tasks_all_done = True
                    node.job_dag.num_nodes_done += 1
                    node.node_finish_time = self.wall_time.curr_time

                    frontier_changed = node.job_dag.update_frontier_nodes(node)

                # assign new destination for the job
                self.assign_executor(finished_task.executor, frontier_changed)

                # bookkeeping for job completion
                if node.job_dag.num_nodes_done == node.job_dag.num_nodes:
                    assert not node.job_dag.completed  # only complete once
                    node.job_dag.completed = True
                    node.job_dag.completion_time = self.wall_time.curr_time
                    self.remove_job(node.job_dag)

            elif isinstance(obj, JobDAG):  # new job arrival event
                job_dag = obj
                # the job should be arriving for the first time
                assert not job_dag.arrived
                job_dag.arrived = True
                # inform agent about job arrival when stream is enabled
                self.job_dags.add(job_dag)
                self.add_job(job_dag)
                self.action_map = compute_act_map(self.job_dags)
                # assign free executors (if any) to the new job
                if len(self.free_executors[None]) > 0:
                    self.exec_to_schedule = \
                        OrderedSet(self.free_executors[None])
                    self.source_job = None
                    self.num_source_exec = \
                        len(self.free_executors[None])

            elif isinstance(obj, Executor):  # executor arrival event
                executor = obj
                # pop destination from the tracking record
                node = self.moving_executors.pop(executor)

                if node is not None:
                    # the job is not yet done when executor arrives
                    executor.job_dag = node.job_dag
                    node.job_dag.executors.add(executor)

                if node is not None and not node.no_more_tasks:
                    # the node is still schedulable
                    if node in node.job_dag.frontier_nodes:
                        # node is immediately runnable
                        task = node.schedule(executor)
                        self.timeline.push(task.finish_time, task)
                    else:
                        # free up the executor in this job
                        self.free_executors.add(executor.job_dag, executor)
                else:
                    # the node is saturated or the job is done
                    # by the time the executor arrives, use
                    # backup logic
                    self.backup_schedule(executor)

            else:
                print("illegal event type")
                exit(1)

        # compute reward
        reward = self.reward_calculator.get_reward(
            self.job_dags, self.wall_time.curr_time)

        # no more decision to make, jobs all done or time is up
        done = (self.num_source_exec == 0) and \
               ((len(self.timeline) == 0) or \
               (self.wall_time.curr_time >= self.max_time))

        if done:
            assert self.wall_time.curr_time >= self.max_time or \
                   len(self.job_dags) == 0

        return self.observe(), reward, done, None

    def remove_job(self, job_dag):
        for executor in list(job_dag.executors):
            executor.detach_job()
        self.exec_commit.remove_job(job_dag)
        self.free_executors.remove_job(job_dag)
        self.moving_executors.remove_job(job_dag)
        self.job_dags.remove(job_dag)
        self.finished_job_dags.add(job_dag)
        remove_job_from_graph(self.graph, job_dag)
        self.action_map = compute_act_map(self.job_dags)

    def reset(self, max_time=np.inf):
        # reset observation and action space
        self.setup_space()

        self.max_time = max_time
        self.wall_time.reset()
        self.timeline.reset()
        self.exec_commit.reset()
        self.moving_executors.reset()
        self.reward_calculator.reset()
        self.finished_job_dags = OrderedSet()
        self.node_selected.clear()
        for executor in self.executors:
            executor.reset()
        self.free_executors.reset(self.executors)
        # generate a set of new jobs
        self.job_dags = generate_jobs(
            self.np_random, self.timeline, self.wall_time)
        # map action to dag_idx and node_idx
        self.action_map = compute_act_map(self.job_dags)
        # add initial set of jobs in the system
        for job_dag in self.job_dags:
            self.add_job(job_dag)
        # put all executors as source executors initially
        self.source_job = None
        self.num_source_exec = len(self.executors)
        self.exec_to_schedule = OrderedSet(self.executors)

        return self.observe()

    def seed(self, seed):
        self.np_random = seeding.np_random(seed)

    def setup_space(self):
        # Set up the observation and action space
        # The boundary of the space may change if the dynamics change;
        # a warning message will show up every time, e.g., when the observation
        # falls outside the observation space
        self.graph = DirectedGraph()
        self.obs_node_low = np.array([0] * 6)
        self.obs_node_high = np.array([config.exec_cap, 1, config.exec_cap, 1000, 100000, 1])
        self.obs_edge_low = self.obs_edge_high = np.array([])  # features on nodes only
        self.observation_space = spaces.Graph(
            node_feature_space=spaces.MultiBox(
                low=self.obs_node_low,
                high=self.obs_node_high,
                dtype=np.float32),
            edge_feature_space=spaces.MultiBox(
                low=self.obs_edge_low,
                high=self.obs_edge_high,
                dtype=np.float32))
        self.action_space = spaces.Tuple(
            (spaces.NodeInGraph(self.graph),
            spaces.MaskedDiscrete(config.num_servers)))
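
End to end, a minimal driver for SparkSimEnv might look like the sketch below; the policy is a placeholder (greedily pick the first schedulable frontier node, or None when nothing is schedulable, and always ask for the maximum parallelism limit), not the learned agent from the reference.

# minimal driver sketch for SparkSimEnv; the greedy policy is a placeholder
env = SparkSimEnv()
obs = env.reset(max_time=2e7)  # placeholder horizon
done = False
total_reward = 0

while not done:
    # pick any schedulable frontier node; None means "no assignment this round"
    frontier_nodes = env.get_frontier_nodes()
    node = next(iter(frontier_nodes)) if len(frontier_nodes) > 0 else None
    # always request the largest parallelism limit
    limit_idx = config.num_servers - 1
    obs, reward, done, _ = env.step((node, limit_idx))
    total_reward += reward

print('total reward:', total_reward)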
Example #16
0
class Environment(object):
    def __init__(self, dag_db):

        self.dag_db = dag_db

        self.job_dags = OrderedSet()
        self.action_map = {}  # action index -> node
        self.available_executors = {}
        self.last_trigger = None

        # executors
        self.executors = {}
        for exec_id in range(config.exec_cap):
            self.executors[exec_id] = Executor(exec_id)

        # dynamically bind {app_id -> job_dag}
        self.spark_dag_map = {}
        # dynamically bind {job_dag -> app_id}
        self.spark_inverse_dag_map = {}
        # dynamically bind {(app_id, stage_id) -> node}
        self.spark_node_map = {}
        # dynamically bind {node -> (app_id, stage_id)}
        self.spark_inverse_node_map = {}

        # dynamically bind {app_id -> {exec_id -> re-usable track_id}}
        self.exec_id_track_id_map = {}

    def add_job_dag(self, app_id):
        job_dag = self.dag_db.apps_map[app_id]
        job_dag.arrived = True

        self.job_dags.add(job_dag)

        # update map for job_dag
        self.spark_dag_map[app_id] = job_dag
        self.spark_inverse_dag_map[job_dag] = app_id

        # update exec_id track_id bind map
        self.exec_id_track_id_map[app_id] = {}

        # update map for node
        node_idx_to_stage_id_map = self.dag_db.stage_map[app_id]
        for node in job_dag.nodes:
            stage_id = node_idx_to_stage_id_map[node.idx]
            self.spark_node_map[(app_id, stage_id)] = node
            self.spark_inverse_node_map[node] = (app_id, stage_id)

        # update map for actions
        self.action_map.clear()
        self.action_map.update(self.pre_compute_action_map())

        return job_dag

    def bind_exec_id(self, app_id, exec_id, track_id):
        assert 0 <= track_id < config.exec_cap
        self.exec_id_track_id_map[app_id][exec_id] = track_id

    def complete_stage(self, app_id, stage_id):
        node = self.spark_node_map[(app_id, stage_id)]
        # bookkeeping for node completion
        assert not node.tasks_all_done  # only complete once
        node.tasks_all_done = True
        node.job_dag.update_frontier_nodes(node)
        node.job_dag.num_nodes_done += 1

        # bookkeeping for job completion
        if node.job_dag.num_nodes_done == node.job_dag.num_nodes:
            assert not node.job_dag.completed  # only complete once
            node.job_dag.completed = True

    def complete_tasks(self, app_id, stage_id, num_tasks_left):
        node = self.spark_node_map[(app_id, stage_id)]
        prev_finished_tasks = node.num_finished_tasks
        # update number of finished tasks for the node
        node.num_finished_tasks = node.num_tasks - num_tasks_left
        # update the next task index of the node
        node.next_task_idx += node.num_finished_tasks - prev_finished_tasks
        # remove node from frontier node if it is saturated
        node.no_more_tasks = (node.next_task_idx >= node.num_tasks)
        if node.no_more_tasks:
            if node.idx in node.job_dag.frontier_nodes:
                del node.job_dag.frontier_nodes[node.idx]

    def pre_compute_action_map(self):
        # translate action ~ [0, num_nodes_in_all_dags) to node object
        action_map = {}
        action = 0
        for job_dag in self.job_dags:
            for node in job_dag.nodes:
                action_map[action] = node
                action += 1
        return action_map

    def remove_job_dag(self, app_id):
        job_dag = self.dag_db.apps_map[app_id]

        self.job_dags.remove(job_dag)

        # free up the executors held by the job's stages
        for executor in job_dag.executors:
            executor.task = None
            executor.job_dag = None

        # update exec_id track_id map
        del self.exec_id_track_id_map[app_id]

        # update map for job_dag
        del self.spark_dag_map[app_id]
        del self.spark_inverse_dag_map[job_dag]

        # update map for node
        node_idx_to_stage_id_map = self.dag_db.stage_map[app_id]
        for node in job_dag.nodes:
            stage_id = node_idx_to_stage_id_map[node.idx]
            del self.spark_node_map[(app_id, stage_id)]
            del self.spark_inverse_node_map[node]

        # update map for actions
        self.action_map.clear()
        self.action_map.update(self.pre_compute_action_map())

        return job_dag
Example #17
0
 def reset(self, executors):
     self.free_executors = {}
     self.free_executors[None] = OrderedSet()
     for executor in executors:
         self.free_executors[None].add(executor)
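
Example #17 shows only the reset path of the free-executor pool. Pieced together from how the pool is used in the environment above (indexing by job or None, add, remove, contain_executor, add_job, remove_job), a minimal sketch of what the rest of FreeExecutors might look like is below; this is an inference from its call sites, not the original implementation.

# a minimal FreeExecutors sketch inferred from its call sites above;
# not the original implementation
class FreeExecutors(object):
    def __init__(self, executors):
        self.free_executors = {None: OrderedSet()}
        for executor in executors:
            self.free_executors[None].add(executor)

    def __getitem__(self, job):
        return self.free_executors[job]

    def contain_executor(self, job, executor):
        return executor in self.free_executors[job]

    def add(self, job, executor):
        self.free_executors[job].add(executor)

    def remove(self, executor):
        self.free_executors[executor.job_dag].remove(executor)

    def add_job(self, job):
        self.free_executors[job] = OrderedSet()

    def remove_job(self, job):
        # release the job's idle executors back to the null pool
        for executor in self.free_executors[job]:
            self.free_executors[None].add(executor)
        del self.free_executors[job]

    def reset(self, executors):
        self.free_executors = {None: OrderedSet()}
        for executor in executors:
            self.free_executors[None].add(executor)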