Esempio n. 1
0
class HeftExecutor(FailRandom, BaseExecutor):
    """Executor that runs a HEFT-produced schedule and injects random node failures.

    The class only implements the event handlers. ``post``, ``_check_fail``,
    ``_post_new_events`` and ``_reschedule`` are not defined here; presumably
    they come from ``FailRandom`` / ``BaseExecutor`` -- TODO confirm.
    """

    def __init__(self, heft_planner, base_fail_duration, base_fail_dispersion,
                 initial_schedule=None, logger=None):
        """Store the planner, the failure parameters and the optional start schedule.

        :param heft_planner: planner used to build the schedule in ``init``.
        :param base_fail_duration: constant part of a simulated node downtime.
        :param base_fail_dispersion: scale of the uniformly random part added
            to the downtime (``dispersion * random.random()``).
        :param initial_schedule: schedule to start from; when ``None`` the
            planner is asked for one in ``init``.
        :param logger: stored as ``self.logger``; not used inside this class.
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule

        self.logger = logger

    def init(self):
        """Prepare ``current_schedule`` and post the events for it.

        Without an initial schedule the planner is run on an empty schedule
        over its nodes. Otherwise the initial schedule is copied item by item,
        re-binding every job (by id) to the task object of the planner's
        workflow.
        """
        if self.initial_schedule is None:
            self.current_schedule = Schedule({node: [] for node in self.heft_planner.get_nodes()})
            self.current_schedule = self.heft_planner.run(self.current_schedule)
        else:
            id_to_task = {tsk.id: tsk for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow)}
            mapping = {
                node: [ScheduleItem(id_to_task[item.job.id], item.start_time, item.end_time)
                       for item in items]
                for (node, items) in self.initial_schedule.mapping.items()
            }
            self.current_schedule = Schedule(mapping)
        self._post_new_events()

    def _generate_failtime_and_duration(self, item):
        """Draw a failure for ``item``.

        :return: ``(time_of_fail, duration)`` where ``time_of_fail`` is an
            offset from ``self.current_time`` scaled by the time left until
            ``item.end_time``, and ``duration`` is the node downtime.
        """
        # The downtime is drawn first, then the failure moment; keep this order
        # so that seeded runs stay reproducible.
        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
        time_of_fail = (item.end_time - self.current_time) * random.random()
        return (time_of_fail, duration)

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is chosen to fail, schedule the failure.

        On failure a ``NodeFailed`` and a matching ``NodeUp`` event are posted
        and the pending ``TaskFinished`` event of the task is dropped from the
        queue.
        """
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        (time_of_fail, duration) = self._generate_failtime_and_duration(item)
        # A non-positive offset is replaced by a small positive one so the
        # failure never lands at or before the current time.
        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)
        # The task will not finish normally: remove its TaskFinished event.
        self.queue = deque([ev for ev in self.queue
                            if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)])

    def _task_finished_handler(self, event):
        """Mark the finished task as ``FINISHED`` in the current schedule."""
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)

    def _node_failed_handler(self, event):
        """Mark the node as down, fail the task executing on it and reschedule.

        :raises IndexError: if the node has no ``EXECUTING`` item for
            ``event.task`` in the current schedule.
        """
        self.heft_planner.resource_manager.node(event.node).state = Node.Down
        ## TODO: ambigious choice
        executing = [item for item in self.current_schedule.mapping[event.node]
                     if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING]
        if not executing:
            # Previously this surfaced as a bare "list index out of range".
            raise IndexError("no EXECUTING schedule item for task {0} on node {1}"
                             .format(event.task.id, event.node))
        # More than one match is unexpected; as before, only the first one is
        # marked as failed.
        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        failed_item.end_time = self.current_time

        self._reschedule(event)

    def _node_up_handler(self, event):
        """Reset the node state to ``Node.Unknown`` and reschedule."""
        self.heft_planner.resource_manager.node(event.node).state = Node.Unknown
        self._reschedule(event)
Esempio n. 2
0
class GAExecutor(FailRandom, BaseExecutor):
    """Executor that replays a fixed (GA-produced) schedule with random node failures.

    Tasks are started in the order given by ``initial_schedule``; the actual
    run is recorded in ``current_schedule``. ``post`` and ``_check_fail`` are
    not defined here; presumably they are inherited from
    ``FailRandom`` / ``BaseExecutor`` -- TODO confirm.
    """

    def __init__(self, workflow, resource_manager, estimator,
                 base_fail_duration, base_fail_dispersion, initial_schedule):
        """Store the collaborators and create an empty ``current_schedule``.

        :param workflow: workflow to execute; its ``head_task`` is treated as
            already finished.
        :param resource_manager: used to look up nodes and set their state.
        :param estimator: used for ``estimate_transfer_time``.
        :param base_fail_duration: constant part of a simulated node downtime.
        :param base_fail_dispersion: scale of the uniformly random part added
            to the downtime.
        :param initial_schedule: fixed schedule whose order and placement are
            followed.
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        self.workflow = workflow
        self.resource_manager = resource_manager
        self.estimator = estimator
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        # Same nodes as the fixed schedule, but nothing executed yet.
        self.current_schedule = Schedule(
            {key: [] for key in initial_schedule.mapping.keys()})

        # Ids of finished tasks; seeded with the head task of the workflow.
        self.finished_tasks = [self.workflow.head_task.id]

        ## TODO: correct this stub later
        self.logger = None

    def init(self):
        """Post start/finish events for the tasks that can run right after the head task."""
        unstarted_tasks = self.get_ready_tasks(self.workflow.head_task, None)
        self.post_new_events(unstarted_tasks)

    def is_ready(self, task):
        """Return True when every parent of ``task`` is in ``finished_tasks``."""
        return all(p.id in self.finished_tasks for p in task.parents)

    def is_next_to_run(self, task):
        """Return True when all items placed before ``task`` on its node are finished.

        "Before" means a strictly smaller ``start_time`` in ``initial_schedule``.
        """
        (node, item) = self.initial_schedule.place(task)
        return all(it.job.id in self.finished_tasks
                   for it in self.initial_schedule.mapping[node]
                   if it.start_time < item.start_time)

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is chosen to fail, schedule the failure.

        On failure a ``NodeFailed`` and a matching ``NodeUp`` event are posted
        and the pending ``TaskFinished`` event of the task is dropped from the
        queue.
        """
        (node, item) = self.current_schedule.place_by_time(event.task,
                                                           event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        # The downtime is drawn first, then the failure moment; keep this order
        # so that seeded runs stay reproducible.
        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
        time_of_fail = (item.end_time - self.current_time) * random.random()
        # A non-positive offset is replaced by a small positive one so the
        # failure never lands at or before the current time.
        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)
        # The task will not finish normally: remove its TaskFinished event.
        self.queue = deque([ev for ev in self.queue
                            if not (isinstance(ev, TaskFinished)
                                    and ev.task.id == event.task.id)])

    def _task_finished_handler(self, event):
        """Record the task as finished and start whatever became runnable."""
        self.current_schedule.change_state_executed(event.task,
                                                    ScheduleItem.FINISHED)
        self.finished_tasks.append(event.task.id)

        unstarted_items = self.get_ready_tasks(event.task, event.node)
        self.post_new_events(unstarted_items)

    def _node_failed_handler(self, event):
        """Mark the node as down and the task executing on it as failed.

        :raises IndexError: if the node has no ``EXECUTING`` item for
            ``event.task`` in the current schedule.
        """
        self.resource_manager.node(event.node).state = Node.Down
        ## TODO: ambigious choice
        executing = [item for item in self.current_schedule.mapping[event.node]
                     if item.job.id == event.task.id
                     and item.state == ScheduleItem.EXECUTING]
        if not executing:
            # Previously this surfaced as a bare "list index out of range".
            raise IndexError("no EXECUTING schedule item for task {0} on node {1}"
                             .format(event.task.id, event.node))
        # More than one match is unexpected; as before, only the first one is
        # marked as failed.
        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        failed_item.end_time = self.current_time

    def _node_up_handler(self, event):
        """Reset the node state and restart the first unfinished item planned on it.

        The restarted item keeps its planned runtime but starts at the current
        time. If every item planned on the node is already finished, nothing
        is posted.
        """
        self.resource_manager.node(event.node).state = Node.Unknown
        next_sched_item = next(
            (item for item in self.initial_schedule.mapping[event.node]
             if item.job.id not in self.finished_tasks),
            None)
        if next_sched_item is None:
            # Nothing left to run here; the old code crashed on this case.
            return

        runtime = next_sched_item.end_time - next_sched_item.start_time
        start_time = self.current_time
        end_time = start_time + runtime

        actual_sched_item = ScheduleItem(next_sched_item.job, start_time,
                                         end_time)
        self.post_new_events([actual_sched_item])

    def get_ready_tasks(self, ptask, pnode):
        """Build schedule items for the tasks that may start after ``ptask`` finished.

        Candidates are the children of ``ptask`` plus the item that follows
        ``ptask`` in ``initial_schedule``; each must have all parents finished
        and be next in line on its node. Tasks that are finished, executing,
        or placed on a node that is down are dropped.

        :param ptask: task that has just finished (or the head task).
        :param pnode: node ``ptask`` ran on; currently unused.
        :return: list of ``ScheduleItem`` with actual start and end times.
        """
        next_for_ptask = self.initial_schedule.get_next_item(ptask)
        tsks = [tsk for tsk in ptask.children
                if self.is_ready(tsk) and self.is_next_to_run(tsk)]
        ##TODO: refactor it later
        if (next_for_ptask is not None
                and next_for_ptask.job not in tsks
                and self.is_ready(next_for_ptask.job)
                and self.is_next_to_run(next_for_ptask.job)):
            tsks.append(next_for_ptask.job)

        # tsks mustn't be finished, executing or their node is Down
        def appropriate_to_run(tsk):
            if tsk.id in self.finished_tasks:
                return False
            if self.current_schedule.is_executing(tsk):
                return False
            nd = self.initial_schedule.place(tsk)[0]
            if self.resource_manager.node(nd).state == Node.Down:
                return False
            return True

        tsks = [tsk for tsk in tsks if appropriate_to_run(tsk)]

        unstarted_items = []
        for child in tsks:
            (node, item) = self.initial_schedule.place(child)

            # Finished runs of the parents, keyed by job id (a later entry
            # for the same id replaces an earlier one).
            pids = [p.id for p in child.parents]
            mp = {it.job.id: (pnd, it)
                  for (pnd, items) in self.current_schedule.mapping.items()
                  for it in items
                  if (it.job.id in pids) and (it.state == ScheduleItem.FINISHED)}
            # Moment at which the data of each finished parent arrives on ``node``.
            estms = [it.end_time
                     + self.estimator.estimate_transfer_time(pnd, node, it.job, child)
                     for (pnd, it) in mp.values()]
            transf_end = max(estms) if estms else 0

            runtime = item.end_time - item.start_time
            start_time = max(self.current_time, transf_end)
            end_time = start_time + runtime

            unstarted_items.append(ScheduleItem(item.job, start_time, end_time))
        return unstarted_items

    def post_new_events(self, unstarted_items):
        """Post ``TaskStart``/``TaskFinished`` events for the items and record them.

        Every item is appended to ``current_schedule`` on the node that
        ``initial_schedule`` assigns to its job.
        """
        for item in unstarted_items:
            node = self.initial_schedule.place(item.job)[0]

            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

            self.current_schedule.mapping[node].append(item)
Esempio n. 3
0
class GAExecutor(FailRandom, BaseExecutor):
    """Executor that replays a fixed (GA-produced) schedule with random node failures.

    Tasks are started in the order given by ``initial_schedule``; the actual
    run is recorded in ``current_schedule``. ``post`` and ``_check_fail`` are
    not defined here; presumably they are inherited from
    ``FailRandom`` / ``BaseExecutor`` -- TODO confirm.
    """

    def __init__(self,
                 workflow,
                 resource_manager,
                 estimator,
                 base_fail_duration,
                 base_fail_dispersion,
                 initial_schedule):
        """Store the collaborators and create an empty ``current_schedule``.

        :param workflow: workflow to execute; its ``head_task`` is treated as
            already finished.
        :param resource_manager: used to look up nodes and set their state.
        :param estimator: used for ``estimate_transfer_time``.
        :param base_fail_duration: constant part of a simulated node downtime.
        :param base_fail_dispersion: scale of the uniformly random part added
            to the downtime.
        :param initial_schedule: fixed schedule whose order and placement are
            followed.
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        self.workflow = workflow
        self.resource_manager = resource_manager
        self.estimator = estimator
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        # Same nodes as the fixed schedule, but nothing executed yet.
        self.current_schedule = Schedule({key: [] for key in initial_schedule.mapping.keys()})

        # Ids of finished tasks; seeded with the head task of the workflow.
        self.finished_tasks = [self.workflow.head_task.id]

        ## TODO: correct this stub later
        self.logger = None

    def init(self):
        """Post start/finish events for the tasks that can run right after the head task."""
        unstarted_tasks = self.get_ready_tasks(self.workflow.head_task, None)
        self.post_new_events(unstarted_tasks)

    def is_ready(self, task):
        """Return True when every parent of ``task`` is in ``finished_tasks``."""
        return all(p.id in self.finished_tasks for p in task.parents)

    def is_next_to_run(self, task):
        """Return True when all items placed before ``task`` on its node are finished.

        "Before" means a strictly smaller ``start_time`` in ``initial_schedule``.
        """
        (node, item) = self.initial_schedule.place(task)
        return all(it.job.id in self.finished_tasks
                   for it in self.initial_schedule.mapping[node]
                   if it.start_time < item.start_time)

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is chosen to fail, schedule the failure.

        On failure a ``NodeFailed`` and a matching ``NodeUp`` event are posted
        and the pending ``TaskFinished`` event of the task is dropped from the
        queue.
        """
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        # The downtime is drawn first, then the failure moment; keep this order
        # so that seeded runs stay reproducible.
        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
        time_of_fail = (item.end_time - self.current_time) * random.random()
        # A non-positive offset is replaced by a small positive one so the
        # failure never lands at or before the current time.
        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)
        # The task will not finish normally: remove its TaskFinished event.
        self.queue = deque([ev for ev in self.queue
                            if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)])

    def _task_finished_handler(self, event):
        """Record the task as finished and start whatever became runnable."""
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
        self.finished_tasks.append(event.task.id)

        unstarted_items = self.get_ready_tasks(event.task, event.node)
        self.post_new_events(unstarted_items)

    def _node_failed_handler(self, event):
        """Mark the node as down and the task executing on it as failed.

        :raises IndexError: if the node has no ``EXECUTING`` item for
            ``event.task`` in the current schedule.
        """
        self.resource_manager.node(event.node).state = Node.Down
        ## TODO: ambigious choice
        executing = [item for item in self.current_schedule.mapping[event.node]
                     if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING]
        if not executing:
            # Previously this surfaced as a bare "list index out of range".
            raise IndexError("no EXECUTING schedule item for task {0} on node {1}"
                             .format(event.task.id, event.node))
        # More than one match is unexpected; as before, only the first one is
        # marked as failed.
        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        failed_item.end_time = self.current_time

    def _node_up_handler(self, event):
        """Reset the node state and restart the first unfinished item planned on it.

        The restarted item keeps its planned runtime but starts at the current
        time. If every item planned on the node is already finished, nothing
        is posted.
        """
        self.resource_manager.node(event.node).state = Node.Unknown
        next_sched_item = next(
            (item for item in self.initial_schedule.mapping[event.node]
             if item.job.id not in self.finished_tasks),
            None)
        if next_sched_item is None:
            # Nothing left to run here; the old code crashed on this case.
            return

        runtime = next_sched_item.end_time - next_sched_item.start_time
        start_time = self.current_time
        end_time = start_time + runtime

        actual_sched_item = ScheduleItem(next_sched_item.job, start_time, end_time)
        self.post_new_events([actual_sched_item])

    def get_ready_tasks(self, ptask, pnode):
        """Build schedule items for the tasks that may start after ``ptask`` finished.

        Candidates are the children of ``ptask`` plus the item that follows
        ``ptask`` in ``initial_schedule``; each must have all parents finished
        and be next in line on its node. Tasks that are finished, executing,
        or placed on a node that is down are dropped.

        :param ptask: task that has just finished (or the head task).
        :param pnode: node ``ptask`` ran on; currently unused.
        :return: list of ``ScheduleItem`` with actual start and end times.
        """
        next_for_ptask = self.initial_schedule.get_next_item(ptask)
        tsks = [tsk for tsk in ptask.children if self.is_ready(tsk) and self.is_next_to_run(tsk)]
        ##TODO: refactor it later
        if (next_for_ptask is not None
                and next_for_ptask.job not in tsks
                and self.is_ready(next_for_ptask.job)
                and self.is_next_to_run(next_for_ptask.job)):
            tsks.append(next_for_ptask.job)

        # tsks mustn't be finished, executing or their node is Down
        def appropriate_to_run(tsk):
            if tsk.id in self.finished_tasks:
                return False
            if self.current_schedule.is_executing(tsk):
                return False
            nd = self.initial_schedule.place(tsk)[0]
            if self.resource_manager.node(nd).state == Node.Down:
                return False
            return True

        tsks = [tsk for tsk in tsks if appropriate_to_run(tsk)]

        unstarted_items = []
        for child in tsks:
            (node, item) = self.initial_schedule.place(child)

            # Finished runs of the parents, keyed by job id (a later entry
            # for the same id replaces an earlier one).
            pids = [p.id for p in child.parents]
            mp = {it.job.id: (pnd, it)
                  for (pnd, items) in self.current_schedule.mapping.items()
                  for it in items
                  if (it.job.id in pids) and (it.state == ScheduleItem.FINISHED)}
            # Moment at which the data of each finished parent arrives on ``node``.
            estms = [it.end_time + self.estimator.estimate_transfer_time(pnd, node, it.job, child)
                     for (pnd, it) in mp.values()]
            transf_end = max(estms) if estms else 0

            runtime = item.end_time - item.start_time
            start_time = max(self.current_time, transf_end)
            end_time = start_time + runtime

            unstarted_items.append(ScheduleItem(item.job, start_time, end_time))
        return unstarted_items

    def post_new_events(self, unstarted_items):
        """Post ``TaskStart``/``TaskFinished`` events for the items and record them.

        Every item is appended to ``current_schedule`` on the node that
        ``initial_schedule`` assigns to its job.
        """
        for item in unstarted_items:
            node = self.initial_schedule.place(item.job)[0]

            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

            self.current_schedule.mapping[node].append(item)
Esempio n. 4
0
class CloudHeftExecutor(EventMachine):
    """HEFT executor that duplicates tasks of dedicated nodes on public (cloud) nodes.

    When a task starts on a dedicated node for the first time, copies of it
    are started on free cloud nodes until the combined reliability estimate
    reaches ``desired_reliability``. ``register`` remembers, per task, whether
    it is still running or has finished somewhere. ``post`` is not defined
    here; presumably it is inherited from ``EventMachine`` -- TODO confirm.
    """

    STATUS_RUNNING = 'running'
    STATUS_FINISHED = 'finished'

    def __init__(self, heft_planner, base_fail_duration, base_fail_dispersion, desired_reliability, public_resource_manager, initial_schedule=None):
        """Store the planner, failure parameters and the cloud resource manager.

        :param heft_planner: planner used to build and rebuild the schedule.
        :param base_fail_duration: constant part of a simulated node downtime.
        :param base_fail_dispersion: scale of the uniformly random part added
            to the downtime.
        :param desired_reliability: threshold at which no more cloud copies
            of a task are added.
        :param public_resource_manager: manager of the cloud nodes (stored as
            ``public_resources_manager``).
        :param initial_schedule: schedule to start from; when ``None`` the
            planner is asked for one in ``init``.
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.desired_reliability = desired_reliability
        self.public_resources_manager = public_resource_manager
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule

        # task -> STATUS_RUNNING / STATUS_FINISHED
        self.register = dict()

    def init(self):
        """Prepare ``current_schedule`` and post the events for it.

        Without an initial schedule the planner is run on an empty schedule
        over its nodes. Otherwise the initial schedule is copied item by item,
        re-binding every job (by id) to the task object of the planner's
        workflow.
        """
        if self.initial_schedule is None:
            self.current_schedule = Schedule({node: [] for node in self.heft_planner.get_nodes()})
            self.current_schedule = self.heft_planner.run(self.current_schedule)
        else:
            id_to_task = {tsk.id: tsk for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow)}
            mapping = {
                node: [ScheduleItem(id_to_task[item.job.id], item.start_time, item.end_time)
                       for item in items]
                for (node, items) in self.initial_schedule.mapping.items()
            }
            self.current_schedule = Schedule(mapping)
        self.post_new_events()

    def event_arrived(self, event):
        """Handle one event: ``TaskStart``, ``TaskFinished``, ``NodeFailed`` or ``NodeUp``.

        Events on cloud nodes only update the cloud bookkeeping (busy/down
        flags, ``register``); events on dedicated nodes update the schedule
        and may trigger a rescheduling. Always returns ``None``.

        :raises IndexError: on ``NodeFailed`` for a dedicated node that has no
            ``EXECUTING`` item for the failed task.
        """

        def reschedule(event):
            # Drop everything not yet started, let the planner fill the
            # schedule again from the current time and post the new events.
            self.heft_planner.current_time = self.current_time
            current_cleaned_schedule = self.clean_events(event)
            self.current_schedule = self.heft_planner.run(current_cleaned_schedule)
            self.post_new_events()

        def check_fail(reliability):
            # True (failure) when a uniform draw exceeds the reliability.
            return random.random() > reliability

        if isinstance(event, TaskStart):

            # TaskStart events for cloud nodes need no handling.
            prm = self.public_resources_manager
            if prm.isCloudNode(event.node):
                return None

            (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
            item.state = ScheduleItem.EXECUTING

            # First start of this task: try to duplicate it in the cloud.
            if event.task not in self.register:

                proper_nodes = prm.get_by_softreq(event.task.soft_reqs)
                proper_nodes = [cnode for cnode in proper_nodes if not prm.isBusy(cnode)]
                sorted_proper_nodes = sorted(proper_nodes, key=lambda x: prm.get_reliability(x.name))
                current_set = []

                base_reliability = self.heft_planner.estimator.estimate_reliability(event.task, event.node)
                dt = item.end_time - item.start_time

                def calc(cloud_node, interval):
                    # Returns (node, fp, cp): fp is the node reliability
                    # reported by the cloud manager, cp the value of its
                    # probability_estimator for the given interval.
                    # TODO: add proper transfer time here
                    fp = prm.get_reliability(cloud_node.name)
                    comp_time = self.heft_planner.estimator.estimate_runtime(event.task, cloud_node)
                    cp = prm.probability_estimator(interval, comp_time, 0)
                    return (cloud_node, fp, cp)

                # Add cloud nodes one by one until the combined estimate
                # reaches the desired reliability.
                for pnode in sorted_proper_nodes:
                    common_reliability = 1 - base_reliability
                    #TODO: refactor this later
                    # NOTE: kept as "1 - (1 - base)" on purpose; it is not
                    # bit-identical to base_reliability in floating point.
                    if 1 - common_reliability >= self.desired_reliability:
                        break
                    current_set.append(calc(pnode, dt))
                    #TODO: add dencity law of probability for dedicated resource

                    for (_, fp, cp) in current_set:
                        common_reliability *= (1 - fp*cp)
                    common_reliability = 1 - common_reliability
                    if common_reliability >= self.desired_reliability:
                        break

                def frange(x, y, jump):
                    while x < y:
                        yield x
                        x += jump

                for (nd, fp, cp) in current_set:
                    comp_time = self.heft_planner.estimator.estimate_runtime(event.task, nd)

                    # Sample the run time on the cloud node: the first grid
                    # point (step 5% of comp_time, up to 120%) whose estimator
                    # value exceeds a uniform draw; comp_time if none does.
                    ints = [(i, calc(nd, i)) for i in frange(0, comp_time + 0.2*comp_time, 0.05*comp_time)]
                    rd = random.random()
                    generated_comp_time = comp_time
                    for (i, p) in ints:
                        if p[2] > rd:
                            generated_comp_time = i
                            break

                    replica_fails = check_fail(fp)

                    # The copy starts now on the cloud node in both outcomes.
                    event_start = TaskStart(event.task)
                    event_start.time_happened = self.current_time
                    event_start.node = nd
                    self.post(event_start)

                    if replica_fails:
                        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
                        time_of_fail = generated_comp_time * random.random()
                        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

                        event_failed = NodeFailed(nd, event.task)
                        event_failed.time_happened = time_of_fail

                        event_nodeup = NodeUp(nd)
                        event_nodeup.time_happened = time_of_fail + duration

                        self.post(event_failed)
                        self.post(event_nodeup)
                    else:
                        event_finish = TaskFinished(event.task)
                        event_finish.time_happened = self.current_time + generated_comp_time
                        event_finish.node = nd

                        self.post(event_finish)

                    prm.checkBusy(nd, True)

                self.register[event.task] = CloudHeftExecutor.STATUS_RUNNING

            # Failure of the dedicated node itself.
            reliability = self.heft_planner.estimator.estimate_reliability(event.task, node)
            if check_fail(reliability):
                duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
                time_of_fail = (item.end_time - self.current_time) * random.random()
                time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

                event_failed = NodeFailed(node, event.task)
                event_failed.time_happened = time_of_fail

                event_nodeup = NodeUp(node)
                event_nodeup.time_happened = time_of_fail + duration

                self.post(event_failed)
                self.post(event_nodeup)
                # Remove the TaskFinished event of the dedicated node only;
                # the ones posted for cloud copies stay.
                self.queue = deque([ev for ev in self.queue
                                    if not (isinstance(ev, TaskFinished)
                                            and ev.task.id == event.task.id
                                            and not prm.isCloudNode(ev.node))])
            return None

        if isinstance(event, TaskFinished):

            # if task cloud and first: register as finished, check node in dedicated as finish, remove appropriate event of failure or task finished for dedicated, free cloud node, reschedule, end_of_function
            # if task cloud and not first: free cloud node, end_of_function
            # if task not cloud and first: register as finished, check node in dedicated as finish, end_of_function
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)
            if from_cloud and self.register[event.task] == CloudHeftExecutor.STATUS_RUNNING:
                self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED
                ## TODO: correct it
                ## if event.task failed and went through rescheduling,
                ## it would be possible that currently ScheduleItem of event.task on dedicated resource
                ## has UNSTARTED state.
                ## TODO: add additional functional to schedule to record such situations and validate it after
                self.current_schedule.change_state_executed_with_end_time(event.task, ScheduleItem.FINISHED, self.current_time)
                pair = self.current_schedule.place_single(event.task)
                if pair is not None:
                    ## TODO: The bug is here. Fix it later.
                    ## the unstarted case must be taken into account in schedule and in the validity check procedure too
                    (nd, item) = pair
                    if item.state == ScheduleItem.EXECUTING:
                        item.start_time = event.time_happened
                        item.end_time = event.time_happened
                        item.state = ScheduleItem.FINISHED
                        # Keep the queue a deque, as everywhere else in the class.
                        self.queue = deque([ev for ev in self.queue
                                            if not (not isinstance(ev, NodeUp) and ev.task.id == event.task.id)])
                    else:
                        prm.checkBusy(event.node, False)
                        return None

                def check(ev):
                    # Drop the finish/failure events of this task that
                    # belong to a dedicated node.
                    if isinstance(ev, TaskFinished) or isinstance(ev, NodeFailed):
                        if ev.task.id == event.task.id and not prm.isCloudNode(ev.node):
                            return False
                    ## TODO: make it later
                    ##if isinstance(ev, NodeUp):
                    return True

                self.queue = deque([ev for ev in self.queue if check(ev)])
                prm.checkBusy(event.node, False)
                reschedule(event)
                return None
            if from_cloud and self.register[event.task] == CloudHeftExecutor.STATUS_FINISHED:
                prm.checkBusy(event.node, False)
                return None

            # Finished on a dedicated node.
            self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED
            self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
            return None

        if isinstance(event, NodeFailed):

            # if cloud node: check as down, free node, end_of_function
            # if not cloud node: check as down, reschedule, end_of_function
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)

            if from_cloud:
                prm.checkDown(event.node.name, True)
                prm.checkBusy(event.node, False)
                return None

            self.heft_planner.resource_manager.node(event.node).state = Node.Down
            ## TODO: ambigious choice
            executing = [item for item in self.current_schedule.mapping[event.node]
                         if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING]
            if not executing:
                # Previously this surfaced as a bare "list index out of range".
                raise IndexError("no EXECUTING schedule item for task {0} on node {1}"
                                 .format(event.task.id, event.node))
            # More than one match is unexpected; as before, only the first
            # one is marked as failed.
            failed_item = executing[0]
            failed_item.state = ScheduleItem.FAILED
            failed_item.end_time = self.current_time

            reschedule(event)
            return None

        if isinstance(event, NodeUp):

            # if cloud: check node up, end_of_function
            # if not cloud: check as up, reschedule end_of_function
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)
            if from_cloud:
                prm.checkDown(event.node.name, False)
                return None

            self.heft_planner.resource_manager.node(event.node).state = Node.Unknown
            reschedule(event)
            return None
        return None

    def post_new_events(self):
        """Post ``TaskStart``/``TaskFinished`` events for every ``UNSTARTED`` schedule item."""
        # Collected in a set, so the posting order follows set iteration order.
        unstarted_items = set()
        for (node, items) in self.current_schedule.mapping.items():
            for item in items:
                if item.state == ScheduleItem.UNSTARTED:
                    unstarted_items.add((node, item))

        for (node, item) in unstarted_items:
            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

    def clean_events(self, event):
        """Drop unstarted work from the schedule and the matching queued events.

        Builds a new schedule without the ``UNSTARTED`` items, then removes
        from the queue the ``TaskStart``/``TaskFinished`` events of the
        dropped tasks (and of ``event.task`` for a ``NodeFailed`` event),
        except those that belong to cloud nodes.

        :return: the cleaned ``Schedule``.
        """
        cleaned_task = set()
        if isinstance(event, NodeFailed):
            cleaned_task = {event.task}

        new_mapping = dict()
        for (node, items) in self.current_schedule.mapping.items():
            new_mapping[node] = []
            for item in items:
                if item.state != ScheduleItem.UNSTARTED:
                    new_mapping[node].append(item)
                else:
                    cleaned_task.add(item.job)
        clean_schedule = Schedule(new_mapping)

        prm = self.public_resources_manager

        def keep(queued):
            if isinstance(queued, TaskStart) and queued.task in cleaned_task and not prm.isCloudNode(queued.node):
                return False
            if isinstance(queued, TaskFinished) and queued.task in cleaned_task and not prm.isCloudNode(queued.node):
                return False
            return True

        self.queue = deque([evnt for evnt in self.queue if keep(evnt)])
        return clean_schedule
Esempio n. 5
0
class GaHeftExecutor(FailRandom, BaseExecutor):
    #@trace
    def __init__(self, **kwargs):
        """Configure the executor from keyword arguments.

        Required keys: ``wf``, ``resource_manager``, ``heft_planner``,
        ``base_fail_duration``, ``base_fail_dispersion``,
        ``fixed_interval_for_ga`` and ``ga_builder``.  ``replace_anyway`` is
        optional and defaults to ``True``.  The full ``kwargs`` dict is also
        forwarded to the parent initialiser.
        """
        super().__init__(**kwargs)

        # Problem definition.
        self.workflow = kwargs["wf"]
        self.resource_manager = kwargs["resource_manager"]

        # DynamicHeft; both planners have access to resource manager and estimator.
        self.heft_planner = kwargs["heft_planner"]

        # Parameters of the generated node failures.
        self.base_fail_duration = kwargs["base_fail_duration"]
        self.base_fail_dispersion = kwargs["base_fail_dispersion"]

        # GA settings.
        self.fixed_interval_for_ga = kwargs["fixed_interval_for_ga"]
        self.ga_builder = kwargs["ga_builder"]
        self.replace_anyway = kwargs.get("replace_anyway", True)

        # Runtime state: filled in by init() and _run_ga_in_background().
        self.current_schedule = None
        self.back_cmp = None

    def init(self):
        """Build the initial schedule and post its events.

        Runs the HEFT planner on an empty schedule, then calls the GA planner
        obtained from ``self.ga_builder()`` with the (empty) current schedule
        and the HEFT result.  ``_apply_mh_if_better`` decides whether the GA
        schedule is installed; if it is not, the HEFT schedule becomes the
        current one and its events are posted.

        Returns:
            The raw result of the GA planner call.
        """
        planner = self.heft_planner
        self.current_schedule = Schedule({node: [] for node in planner.get_nodes()})
        heft_schedule = planner.run(Schedule({node: [] for node in planner.get_nodes()}))

        # TODO: change these two ugly records
        result = self.ga_builder()(self.current_schedule, heft_schedule)

        ga_applied = self._apply_mh_if_better(
            None,
            heuristic_resulted_schedule=heft_schedule,
            metaheuristic_resulted_schedule=result[0][2])
        if not ga_applied:
            self.current_schedule = heft_schedule
            self._post_new_events()

        return result

    def _task_start_handler(self, event):
        """Handle a TaskStart event.

        If a background GA result is due now and gets applied, processing
        stops there.  Otherwise the matching schedule item is marked as
        ``EXECUTING`` and, when ``_check_fail`` reports a failure, a
        ``NodeFailed`` event (at a random moment before the item's end) and a
        ``NodeUp`` event (after a random downtime) are posted for the node.
        """
        if self._check_event_for_ga_result(event):
            return

        node, item = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._is_a_fail_possible():
            return
        if not self._check_fail(event.task, node):
            return

        # Draw order matters for reproducibility: downtime first, then offset.
        downtime = self.base_fail_duration + self.base_fail_dispersion * random.random()
        offset = (item.end_time - self.current_time) * random.random()
        if not offset > 0:
            # Never schedule the failure at (or before) the current moment.
            offset = 0.01
        fail_at = self.current_time + offset

        failure = NodeFailed(node, event.task)
        failure.time_happened = fail_at

        recovery = NodeUp(node)
        recovery.time_happened = fail_at + downtime

        self.post(failure)
        self.post(recovery)

    def _task_finished_handler(self, event):
        """Mark the event's task as finished, then check for a due GA result."""
        schedule = self.current_schedule
        schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
        # The outcome of the GA check is not used here.
        self._check_event_for_ga_result(event)

    def _node_failed_handler(self, event):
        """Handle a NodeFailed event.

        Filters the queued events with a predicate that rejects the
        ``TaskFinished`` event of the failed task, stops the background GA,
        marks the node as ``Down``, marks the executing schedule item as
        ``FAILED`` (ending at the current time), reschedules and finally
        starts a new background GA computation.

        Raises:
            Exception: if the failed node does not hold exactly one
                ``EXECUTING`` item for the event's task.
        """
        if not self._is_a_fail_possible():
            return

        def keep(queued):
            # Everything stays except the finish event of the failed task.
            if isinstance(queued, TaskFinished) and queued.task.id == event.task.id:
                return False
            return True

        self._remove_events(keep)

        ## interrupt ga
        self._stop_ga()

        self.resource_manager.node(event.node).state = Node.Down

        ## TODO: ambigious choice
        executing = []
        for item in self.current_schedule.mapping[event.node]:
            if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING:
                executing.append(item)
        if len(executing) != 1:
            raise Exception(" Trouble in finding of the task: count of found tasks {0}".format(len(executing)))

        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        failed_item.end_time = self.current_time

        # run HEFT, then GA
        self._reschedule(event)
        self._run_ga_in_background(event)

    def _node_up_handler(self, event):
        """Handle a NodeUp event.

        Stops the background GA, sets the node's state to ``Node.Unknown``,
        reschedules and starts a new background GA computation.
        """
        ## interrupt ga
        self._stop_ga()

        recovered = self.heft_planner.resource_manager.node(event.node)
        recovered.state = Node.Unknown

        self._reschedule(event)
        # run GA
        self._run_ga_in_background(event)

    def _stop_ga(self):
        """Drop the pending background GA computation, if any."""
        self.back_cmp = None

    def _actual_ga_run(self):
        """Run the GA planner for the pending background computation.

        The planner from ``self.ga_builder()`` is called with the fixed
        schedule and the current schedule stored in ``self.back_cmp``, plus
        the current time.  The makespan of ``result[0][2]`` is printed.

        Returns:
            The raw planner result.
        """
        ## The stored fixed schedule is still valid here: an invalidating
        ## event (such as a node failure) would have dropped the pending
        ## computation, and this method would not have been reached.
        planner = self.ga_builder()
        pending = self.back_cmp
        result = planner(pending.fixed_schedule,
                         pending.current_schedule,
                         self.current_time)
        print("CURRENT MAKESPAN: {0}".format(Utility.makespan(result[0][2])))
        return result

    def _check_event_for_ga_result(self, event):
        """Collect the background GA result if it is due at the current time.

        Returns:
            ``False`` when no computation is pending, when its stop time
            differs from ``self.current_time``, or when the GA run returns
            ``None``; otherwise the value of ``_apply_mh_if_better``.
        """
        pending = self.back_cmp
        if pending is None or pending.time_to_stop != self.current_time:
            return False

        print("Event {0}".format(event))
        if isinstance(event, TaskStart):
            print("Task id {0}".format(event.task.id))

        result = self._actual_ga_run()
        if result is None:
            return False

        return self._apply_mh_if_better(
            event,
            heuristic_resulted_schedule=self.current_schedule,
            metaheuristic_resulted_schedule=result[0][2])

    def _replace_current_schedule(self, event, new_schedule):
        """Install ``new_schedule`` as the current schedule.

        When ``event`` is not ``None`` the events of the old schedule are
        cleaned first.  Events for the new schedule are then posted and the
        pending background computation is dropped.  ``new_schedule`` is
        assumed to be already synchronised with the fixed part of the old one.
        """
        has_trigger = event is not None
        if has_trigger:
            self._clean_events(event)

        self.current_schedule = new_schedule
        self._post_new_events()

        self.back_cmp = None

    def _apply_mh_if_better(self, event, heuristic_resulted_schedule, metaheuristic_resulted_schedule):
        """Install the metaheuristic schedule when it should replace the heuristic one.

        The replacement happens when ``self.replace_anyway`` is ``True`` or
        when the metaheuristic makespan is strictly smaller than the heuristic
        makespan.  Otherwise the pending background computation is dropped.

        Returns:
            ``True`` if the schedule was replaced (for a TaskStart event this
            means further processing is skipped), ``False`` otherwise.
        """
        mh_makespan = Utility.makespan(metaheuristic_resulted_schedule)
        heuristic_makespan = Utility.makespan(heuristic_resulted_schedule)
        print("Replace anyway - {0}".format(self.replace_anyway))

        if self.replace_anyway is not True and not mh_makespan < heuristic_makespan:
            ## TODO: run_ga_yet_another_with_old_genome
            self.back_cmp = None
            return False

        ## generate new events
        self._replace_current_schedule(event, metaheuristic_resulted_schedule)
        return True

    # def _is_a_fail_possible(self):
    #     if len([nd for nd in self.resource_manager.get_nodes() if nd.state != Node.Down]) == 1:
    #         print("DECLINE NODE DOWN")
    #         st = functools.reduce(operator.add, (" {0} - {1}".format(nd.name, nd.state) for nd in self.resource_manager.get_nodes()), "")
    #         print("STATE INFORMATION: " + st)
    #         return False
    #     return True

    def _is_a_fail_possible(self):
        """Tell whether a node failure may be processed; always ``True`` here."""
        return True



    def _run_ga_in_background(self, event):
        """Register a new background GA computation for the current schedule.

        Any previously registered computation is discarded.  A ``BackCmp``
        record is stored in ``self.back_cmp`` holding the fixed part of the
        schedule, the current schedule, the triggering event, the current time
        and the end time of the chosen front item.

        Nothing is registered when every node is ``Down``, when no schedule
        item qualifies as a front item, or when the fixed part already
        contains as many task ids as the workflow has tasks.

        Raises:
            Exception: if a node of the resource manager is missing from the
                fixed schedule's mapping.
        """
        if not any(nd.state != Node.Down for nd in self.resource_manager.get_nodes()):
            return

        schedule_now = self.current_schedule
        time_now = self.current_time
        ## TODO: replace by log call
        print("Time: " + str(time_now) + " Creating reschedule point ")

        ## there can be several events in one time
        ## we choose the first to handle background GA run
        def find_front_item(schedule, now, interval):
            # On each node take the first item whose end time lies strictly
            # between the GA deadline and the best end time found so far.
            deadline = now + interval
            front = ScheduleItem.MIN_ITEM()

            for items in schedule.mapping.values():
                for candidate in items:
                    ## Covers the case when the deadline falls into a transfer gap
                    ## (rare situation for all nodes)
                    ## TODO: compare with some precison
                    if deadline < candidate.end_time < front.end_time:
                        front = candidate
                        break

            # The initial MIN_ITEM is still in place: nothing qualified.
            if front.job is None:
                return None
            print("Time: " + str(now) + " reschedule point have been founded st:" + str(front.start_time) + " end:" + str(front.end_time))
            return front

        def build_fixed_schedule(schedule, front):
            def belongs_to_fixed_part(item):
                # hard to resolve corner case. The simulator doesn't guarantee the order of appearing events.
                if item.start_time < front.end_time:
                    return True
                ## TODO: Urgent!!! experimental change. Perhaps, It should be removed from here later.
                return item.state in (ScheduleItem.FINISHED, ScheduleItem.FAILED)

            def frozen_copy(item):
                # Work on a copy so the live schedule item is left untouched.
                clone = ScheduleItem.copy(item)
                ## TODO: Urgent!: dangerous place
                if clone.state in (ScheduleItem.EXECUTING, ScheduleItem.UNSTARTED):
                    if clone.end_time <= front.end_time:
                        clone.state = ScheduleItem.FINISHED
                    elif clone.end_time > front.end_time:
                        clone.state = ScheduleItem.EXECUTING
                return clone

            fixed = {}
            for node, items in schedule.mapping.items():
                fixed[node] = [frozen_copy(it) for it in items if belongs_to_fixed_part(it)]
            return Schedule(fixed)

        ## TODO: make previous_result used
        def register_ga(schedule):
            front = find_front_item(schedule, time_now, self.fixed_interval_for_ga)
            # we can't meet the end of computation so we do nothing
            if front is None:
                print("GA's computation isn't able to meet the end of computation")
                return
            fixed_schedule = build_fixed_schedule(schedule, front)

            # TODO: It isn't a good reliable solution. It should be reconsidered later.
            fixed_ids = set(fixed_schedule.get_all_unique_tasks_id())
            all_ids = {task.id for task in self.workflow.get_all_unique_tasks()}

            ## TODO: urgent bugfix to correctly run GaHeftvsHeft
            if len(fixed_ids) == len(all_ids):
                print("Fixed schedule is complete. There is no use to run ga.")
                return

            # Every node known to the resource manager must be a key of the fixed mapping.
            known_hashes = {hash(key) for key in fixed_schedule.mapping.keys()}
            node_hashes = [hash(node) for node in self.resource_manager.get_nodes()]
            for node_hash in node_hashes:
                if node_hash not in known_hashes:
                    raise Exception("Fixed schedule is broken")

            self.back_cmp = BackCmp(fixed_schedule, None, self.current_schedule, event, time_now, front.end_time)

        # A computation that is still pending is simply replaced.
        self.back_cmp = None
        register_ga(schedule_now)


        ## TODO: only for debug. remove it later.
        # print("==================FIXED SCHEDULE PART=================")
        # print(self.back_cmp.fixed_schedule)
        # print("======================================================")

    pass
Esempio n. 6
0
class CloudHeftExecutor(EventMachine):

    STATUS_RUNNING = 'running'
    STATUS_FINISHED = 'finished'

    def __init__(self,
                 heft_planner,
                 base_fail_duration,
                 base_fail_dispersion,
                 desired_reliability,
                 public_resource_manager,
                 initial_schedule=None):
        """Store the planner, failure parameters and cloud resource manager.

        ``initial_schedule`` is optional; when it is ``None`` the schedule is
        produced by the HEFT planner in ``init()``.
        """
        ## TODO: remake it later
        # Event-machine state.
        self.queue = deque()
        self.current_time = 0

        # DynamicHeft
        self.heft_planner = heft_planner

        # Parameters of the generated node failures.
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion

        # Cloud replication settings.
        self.desired_reliability = desired_reliability
        self.public_resources_manager = public_resource_manager

        # Both start out pointing at the supplied schedule (possibly None).
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule

        # Maps a task to STATUS_RUNNING / STATUS_FINISHED.
        self.register = dict()

    def init(self):
        """Prepare the current schedule and post its events.

        With an initial schedule, every item is rebuilt as a fresh
        ``ScheduleItem`` bound to the task object of the planner's workflow
        that has the same id.  Without one, the HEFT planner is run on an
        empty schedule.
        """
        planner = self.heft_planner
        seed = self.initial_schedule

        if seed is not None:
            tasks_by_id = {}
            for tsk in HeftHelper.get_all_tasks(planner.workflow):
                tasks_by_id[tsk.id] = tsk

            rebuilt = {}
            for node, items in seed.mapping.items():
                rebuilt[node] = [
                    ScheduleItem(tasks_by_id[entry.job.id], entry.start_time, entry.end_time)
                    for entry in items
                ]
            self.current_schedule = Schedule(rebuilt)
        else:
            self.current_schedule = Schedule({node: [] for node in planner.get_nodes()})
            self.current_schedule = planner.run(self.current_schedule)

        self.post_new_events()

    def event_arrived(self, event):
        """Process one simulation event.

        Four event kinds are handled; anything else is ignored.

        * ``TaskStart`` on a cloud node: ignored.
        * ``TaskStart`` on a dedicated node: the schedule item is marked
          ``EXECUTING``.  The first time a task is seen, free cloud nodes
          matching its soft requirements are added (in ascending
          ``get_reliability`` order) until the combined reliability reaches
          ``self.desired_reliability``; for each chosen node a replica
          ``TaskStart`` plus either ``NodeFailed``/``NodeUp`` or
          ``TaskFinished`` is posted and the node is marked busy.  Then a
          failure of the dedicated node is drawn; on failure
          ``NodeFailed``/``NodeUp`` are posted and the dedicated
          ``TaskFinished`` of the task is removed from the queue.
        * ``TaskFinished``: the first cloud replica to finish a running task
          marks it finished, removes the related events, frees the cloud node
          and reschedules; later cloud replicas only free their node; a
          dedicated finish marks the task and its schedule item finished.
        * ``NodeFailed`` / ``NodeUp``: cloud nodes only update the public
          resource manager; dedicated nodes update their state and reschedule.

        Returns:
            Always ``None``.

        Raises:
            Exception: when a ``NodeFailed`` event for a dedicated node does
                not match exactly one ``EXECUTING`` item on that node.
        """

        def reschedule(event):
            # Re-plan everything that has not started yet, from the current time.
            self.heft_planner.current_time = self.current_time
            current_cleaned_schedule = self.clean_events(event)
            self.current_schedule = self.heft_planner.run(
                current_cleaned_schedule)
            self.post_new_events()

        def check_fail(reliability):
            # A failure happens when a uniform draw exceeds the reliability.
            return random.random() > reliability

        if isinstance(event, TaskStart):

            # TODO: if node is cloud node, do nothing
            prm = self.public_resources_manager
            if prm.isCloudNode(event.node):
                return None

            (node, item) = self.current_schedule.place_by_time(
                event.task, event.time_happened)
            item.state = ScheduleItem.EXECUTING

            # Replicas in the cloud are created only once per task.
            if event.task not in self.register:

                proper_nodes = prm.get_by_softreq(event.task.soft_reqs)
                proper_nodes = [
                    candidate for candidate in proper_nodes
                    if not prm.isBusy(candidate)
                ]
                sorted_proper_nodes = sorted(
                    proper_nodes, key=lambda x: prm.get_reliability(x.name))
                current_set = []

                base_reliability = self.heft_planner.estimator.estimate_reliability(
                    event.task, event.node)
                dt = item.end_time - item.start_time

                def calc(cloud_node, dt):
                    # Returns (node, its reliability, probability estimate for dt).
                    # TODO: add proper transfer time here
                    fp = prm.get_reliability(cloud_node.name)
                    comp_time = self.heft_planner.estimator.estimate_runtime(
                        event.task, cloud_node)
                    cp = prm.probability_estimator(dt, comp_time, 0)
                    return (cloud_node, fp, cp)

                # Grow the replica set until the desired reliability is reached.
                for pnode in sorted_proper_nodes:
                    common_reliability = 1 - base_reliability
                    #TODO: refactor this later
                    if 1 - common_reliability >= self.desired_reliability:
                        break
                    current_set.append(calc(pnode, dt))
                    #TODO: add density law of probability for dedicated resource

                    for (nd, fp, cp) in current_set:
                        common_reliability *= (1 - fp * cp)
                    common_reliability = 1 - common_reliability
                    if common_reliability >= self.desired_reliability:
                        break

                def frange(x, y, jump):
                    while x < y:
                        yield x
                        x += jump

                for (nd, fp, cp) in current_set:
                    comp_time = self.heft_planner.estimator.estimate_runtime(
                        event.task, nd)

                    # Pick the first candidate duration whose probability
                    # estimate exceeds a uniform draw; default is comp_time.
                    ints = [(i, calc(nd, i))
                            for i in frange(0, comp_time +
                                            0.2 * comp_time, 0.05 * comp_time)]
                    rd = random.random()
                    generated_comp_time = comp_time
                    for (i, p) in ints:
                        if p[2] > rd:
                            generated_comp_time = i
                            break

                    if check_fail(fp):

                        event_start = TaskStart(event.task)
                        event_start.time_happened = self.current_time
                        event_start.node = nd
                        self.post(event_start)

                        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
                        time_of_fail = generated_comp_time * random.random()
                        # Never schedule the failure at the current moment.
                        time_of_fail = self.current_time + (
                            time_of_fail if time_of_fail > 0 else 0.01)

                        event_failed = NodeFailed(nd, event.task)
                        event_failed.time_happened = time_of_fail

                        event_nodeup = NodeUp(nd)
                        event_nodeup.time_happened = time_of_fail + duration

                        self.post(event_failed)
                        self.post(event_nodeup)
                    else:
                        event_start = TaskStart(event.task)
                        event_start.time_happened = self.current_time
                        event_start.node = nd

                        event_finish = TaskFinished(event.task)
                        event_finish.time_happened = self.current_time + generated_comp_time
                        event_finish.node = nd

                        self.post(event_start)
                        self.post(event_finish)

                    prm.checkBusy(nd, True)

                self.register[event.task] = CloudHeftExecutor.STATUS_RUNNING

            # Failure of the dedicated node itself.
            reliability = self.heft_planner.estimator.estimate_reliability(
                event.task, node)
            if check_fail(reliability):
                # generate fail time, post it
                duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
                time_of_fail = (item.end_time -
                                self.current_time) * random.random()
                time_of_fail = self.current_time + (
                    time_of_fail if time_of_fail > 0 else 0.01)

                event_failed = NodeFailed(node, event.task)
                event_failed.time_happened = time_of_fail

                event_nodeup = NodeUp(node)
                event_nodeup.time_happened = time_of_fail + duration

                self.post(event_failed)
                self.post(event_nodeup)
                # remove TaskFinished event
                self.queue = deque([
                    ev for ev in self.queue
                    if not (isinstance(ev, TaskFinished) and ev.task.id ==
                            event.task.id and not prm.isCloudNode(ev.node))
                ])
            return None

        if isinstance(event, TaskFinished):

            # if task cloud and first: register as finished, mark the dedicated item as finished,
            #   remove the failure / finish events of the dedicated node, free cloud node, reschedule
            # if task cloud and not first: free cloud node
            # if task not cloud: register as finished, mark the dedicated item as finished
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)
            if from_cloud and self.register[
                    event.task] == CloudHeftExecutor.STATUS_RUNNING:
                self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED
                ## TODO: correct it
                ## if event.task failed and went through rescheduling,
                ## it would be possible that currently ScheduleItem of event.task on dedicated resource
                ## has UNSTARTED state.
                ## TODO: add additional functional to schedule to record such situations and validate it after
                self.current_schedule.change_state_executed_with_end_time(
                    event.task, ScheduleItem.FINISHED, self.current_time)
                pair = self.current_schedule.place_single(event.task)
                if pair is not None:
                    ## TODO: The bug is here. Fix it later.
                    ## the unstarted case must be taken into account in schedule and in the validity check procedure too
                    (nd, item) = pair
                    if item.state == ScheduleItem.EXECUTING:
                        item.start_time = event.time_happened
                        item.end_time = event.time_happened
                        item.state = ScheduleItem.FINISHED
                        # Kept as a deque, like every other place that rebuilds the queue.
                        self.queue = deque([
                            ev for ev in self.queue
                            if not (not isinstance(ev, NodeUp)
                                    and ev.task.id == event.task.id)
                        ])
                    else:
                        prm.checkBusy(event.node, False)
                        return None

                def check(ev):
                    # Drop finish / failure events of this task on dedicated nodes.
                    if isinstance(ev, TaskFinished) or isinstance(
                            ev, NodeFailed):
                        if ev.task.id == event.task.id and not prm.isCloudNode(
                                ev.node):
                            return False
                    ## TODO: make it later
                    ##if isinstance(ev, NodeUp):
                    return True

                self.queue = deque([ev for ev in self.queue if check(ev)])
                prm.checkBusy(event.node, False)
                reschedule(event)
                return None
            if from_cloud and self.register[
                    event.task] == CloudHeftExecutor.STATUS_FINISHED:
                prm.checkBusy(event.node, False)
                return None

            # check task finished
            self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED
            self.current_schedule.change_state_executed(
                event.task, ScheduleItem.FINISHED)
            return None

        if isinstance(event, NodeFailed):

            # if cloud node: check as down, free node
            # if not cloud node: check as down, reschedule
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)

            if from_cloud:
                prm.checkDown(event.node.name, True)
                prm.checkBusy(event.node, False)
                return None

            # check node down
            self.heft_planner.resource_manager.node(
                event.node).state = Node.Down
            # check failed event in schedule
            ## TODO: ambigious choice
            it = [
                item for item in self.current_schedule.mapping[event.node]
                if item.job.id == event.task.id
                and item.state == ScheduleItem.EXECUTING
            ]
            if len(it) != 1:
                # Fail with a clear message instead of an IndexError (or of
                # silently picking one of several candidates).
                raise Exception(
                    "Trouble in finding of the task: count of found tasks {0}"
                    .format(len(it)))

            it[0].state = ScheduleItem.FAILED
            it[0].end_time = self.current_time

            reschedule(event)
            return None

        if isinstance(event, NodeUp):

            # if cloud: check node up
            # if not cloud: check as up, reschedule
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)
            if from_cloud:
                prm.checkDown(event.node.name, False)
                return None

            # check node up
            self.heft_planner.resource_manager.node(
                event.node).state = Node.Unknown
            reschedule(event)
            return None

        return None

    def post_new_events(self):
        """Post a start/finish event pair for every unstarted schedule item.

        Scans ``self.current_schedule.mapping`` and, for each item whose state
        is ``ScheduleItem.UNSTARTED``, posts a ``TaskStart`` event at the
        item's ``start_time`` and a ``TaskFinished`` event at its ``end_time``.
        Both events carry the node the item is mapped to.
        """
        # Still gathered into a set (not a list) so the collection semantics,
        # and therefore the order in which pairs are posted, stay as they were.
        unstarted_items = {
            (node, item)
            for (node, items) in self.current_schedule.mapping.items()
            for item in items
            if item.state == ScheduleItem.UNSTARTED
        }

        for (node, item) in unstarted_items:
            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

    def clean_events(self, event):
        """Strip unstarted work from the schedule and from the event queue.

        Builds a new ``Schedule`` that contains every item of
        ``self.current_schedule`` except those in the ``UNSTARTED`` state, and
        removes from ``self.queue`` the ``TaskStart``/``TaskFinished`` events
        of the dropped tasks, unless the event belongs to a cloud node.  When
        ``event`` is a ``NodeFailed``, its task is treated as dropped as well.

        ``self.current_schedule`` itself is not reassigned here.

        Returns:
            The cleaned ``Schedule``.
        """
        dropped_jobs = {event.task} if isinstance(event, NodeFailed) else set()

        kept_mapping = {}
        for node, items in self.current_schedule.mapping.items():
            kept = []
            for item in items:
                if item.state == ScheduleItem.UNSTARTED:
                    dropped_jobs.add(item.job)
                else:
                    kept.append(item)
            kept_mapping[node] = kept
        clean_schedule = Schedule(kept_mapping)

        prm = self.public_resources_manager

        def is_stale(queued):
            # Start/finish events of dropped tasks on non-cloud nodes.
            return (isinstance(queued, (TaskStart, TaskFinished))
                    and queued.task in dropped_jobs
                    and not prm.isCloudNode(queued.node))

        self.queue = deque(queued for queued in self.queue if not is_stale(queued))
        return clean_schedule
Esempio n. 7
0
class GaOldPopExecutor(FailOnce, BaseExecutor):
    """Executor that re-plans with a GA seeded from its previous population.

    On every node-failed / node-up event the remaining work is planned
    twice: once with the cleaned population kept from the previous GA run
    and once with no initial population. The schedule produced from the old
    population is adopted as the current one; the figures of both runs are
    handed to ``stat_saver`` when it is not ``None``.
    """

    def __init__(self, **kwargs):
        """Read the executor's collaborators from keyword arguments.

        Every key is mandatory (a missing one raises ``KeyError``):
        ``estimator``, ``base_fail_duration``, ``base_fail_dispersion``,
        ``wf``, ``resource_manager``, ``stat_saver``, ``task_id_to_fail``
        and ``ga_builder``.
        """
        super().__init__()

        self.estimator = kwargs["estimator"]
        self.base_fail_duration = kwargs["base_fail_duration"]
        self.base_fail_dispersion = kwargs["base_fail_dispersion"]
        self.workflow = kwargs["wf"]
        self.resource_manager = kwargs["resource_manager"]
        self.stat_saver = kwargs["stat_saver"]
        self.task_id_to_fail = kwargs["task_id_to_fail"]
        self.ga_builder = kwargs["ga_builder"]

        # Both are filled in by init().
        self.current_schedule = None
        self.past_pop = None

    def init(self):
        """Build the initial schedule with the GA and post its events."""
        ## TODO: replace it with logging
        print("Working with initial state of nodes: {0}".format([n.flops for n in self.resource_manager.get_nodes()]))

        ga_planner = self.ga_builder()
        self.current_schedule = Schedule({node: [] for node in self.resource_manager.get_nodes()})
        (result, _logbook) = ga_planner(self.current_schedule, None)
        # Keep the final population: it seeds the next rescheduling.
        self.past_pop = ga_planner.get_pop()
        print("Result makespan: " + str(Utility.makespan(result[2])))
        self.current_schedule = result[2]
        self._post_new_events()

        self.failed_once = False

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is to fail, schedule that.

        When ``self._check_fail`` says the task fails, a ``NodeFailed``
        event is posted at a random moment before the item's end time and
        the task's pending ``TaskFinished`` event is removed from the queue.
        No ``NodeUp`` event is posted by this executor.
        """
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        # The outage duration is not used here (no NodeUp event is posted),
        # but it is still drawn so the sequence of random numbers consumed
        # per failure stays the same.
        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
        fail_offset = (item.end_time - self.current_time) * random.random()
        # A non-positive offset is replaced by a small positive delay.
        time_of_fail = self.current_time + (fail_offset if fail_offset > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail
        self.post(event_failed)

        # The failing task never finishes: remove its TaskFinished event.
        ##TODO: make a function for this purpose in the base class
        self.queue = deque([ev for ev in self.queue if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)])

    def _task_finished_handler(self, event):
        """Mark the finished task's schedule item as ``FINISHED``."""
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)

    def _node_failed_handler(self, event):
        """Mark the node as down, fail its executing item and reschedule.

        :raises Exception: if the failed node does not hold exactly one
            executing item for ``event.task``.
        """
        self.resource_manager.node(event.node).state = Node.Down
        executing = [item for item in self.current_schedule.mapping[event.node]
                     if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING]
        if len(executing) != 1:
            # Covers both "nothing found" and "several found".
            raise Exception(
                "expected exactly one executing item for task {0} on the failed node, found {1}".format(
                    event.task.id, len(executing)))

        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        # The item ends at the moment of the failure.
        failed_item.end_time = self.current_time

        self._reschedule(event)

    def _node_up_handler(self, event):
        """Set the node's state to ``Node.Unknown`` and reschedule."""
        self.resource_manager.node(event.node).state = Node.Unknown
        self._reschedule(event)

    #@timing
    def _clean_chromosome(self, chromosome, event, current_cleaned_schedule):
        """Adapt a chromosome of the previous population to the new state.

        The chromosome (a mapping of node name to a list of task ids) is
        modified in place and also returned:

        * ids of tasks that are ``FINISHED`` or ``EXECUTING`` in
          ``current_cleaned_schedule`` are removed from every node;
        * for a ``NodeFailed`` event, the ids left on the failed node are
          moved to random positions on randomly chosen other nodes and the
          failed node's list is emptied.

        Any other event (e.g. ``NodeUp``) needs no redistribution.

        ``random.randint`` raises ``ValueError`` if ids have to be moved
        while the failed node is the only one in the chromosome.
        """
        # Set membership is O(1); task ids are hashable (they are put into
        # a frozenset in _reschedule).
        settled_ids = {
            item.job.id
            for items in current_cleaned_schedule.mapping.values()
            for item in items
            if item.state in (ScheduleItem.FINISHED, ScheduleItem.EXECUTING)
        }

        for task_ids in chromosome.values():
            # Slice assignment filters the very same list object in place.
            task_ids[:] = [task_id for task_id in task_ids if task_id not in settled_ids]

        if isinstance(event, NodeFailed):
            failed_name = event.node.name
            orphaned = chromosome[failed_name]
            ## TODO: here must be a procedure of getting currently alive nodes
            # Chromosome order is kept so a seeded run is reproducible.
            working_nodes = [name for name in chromosome if name != failed_name]
            last_index = len(working_nodes) - 1
            for task_id in orphaned:
                node_index = 0 if last_index == 0 else random.randint(0, last_index)
                target = chromosome[working_nodes[node_index]]
                # TODO: correct 0 and length
                # randint is inclusive, so len(target) means "append".
                position = 0 if len(target) == 0 else random.randint(0, len(target))
                target.insert(position, task_id)
            chromosome[failed_name] = []

        return chromosome

    def _reschedule(self, event):
        """Re-plan the unstarted work after ``event`` and post new events.

        Nothing is re-planned when the cleaned schedule already contains
        every unique task of the workflow. Otherwise the GA is run with the
        cleaned previous population (its result becomes the current
        schedule and its population is kept for the next time) and once
        more with no initial population, for comparison only.
        """
        current_cleaned_schedule = self._clean_events(event)

        task_id = "" if not hasattr(event, 'task') else " " + str(event.task.id)
        ## scheduling with initial population created of the previous population by moving elements from a downed node
        print("Scheduling with the old pop: " + str(event.__class__.__name__) + task_id)
        ga_planner = self.ga_builder()

        cleaned_chromosomes = [self._clean_chromosome(ch, event, current_cleaned_schedule) for ch in self.past_pop]
        # Chromosomes with no task left on any node are discarded.
        cleaned_chromosomes = [ch for ch in cleaned_chromosomes
                               if any(len(items) > 0 for items in ch.values())]
        # None is passed instead of an empty population.
        if len(cleaned_chromosomes) == 0:
            cleaned_chromosomes = None

        curr_ids = frozenset(current_cleaned_schedule.get_all_unique_tasks_id())
        all_ids = frozenset(t.id for t in self.workflow.get_all_unique_tasks())
        if all_ids == curr_ids:
            print("Schedule alleady has all unique tasks")
            return

        ((_, _, resulted_schedule, iter_old_pop), logbook_old_pop) = ga_planner(
            current_cleaned_schedule, None, self.current_time, initial_population=cleaned_chromosomes)
        #checking
        Utility.check_and_raise_for_fixed_part(resulted_schedule, current_cleaned_schedule, self.current_time)
        makespan_old_pop = Utility.makespan(resulted_schedule)
        print("Result makespan: " + str(makespan_old_pop))

        self.current_schedule = resulted_schedule
        self.past_pop = ga_planner.get_pop()

        ## scheduling with random initial population
        print("Scheduling with a random pop: " + str(event.__class__.__name__) + task_id)
        ga_planner_with_random_init_population = self.ga_builder()
        ((_, _, schedule_with_random, iter_random), logbook_random) = ga_planner_with_random_init_population(
            current_cleaned_schedule, None, self.current_time, initial_population=None)

        Utility.check_and_raise_for_fixed_part(schedule_with_random, current_cleaned_schedule, self.current_time)
        makespan_random = Utility.makespan(schedule_with_random)
        print("Result makespan: " + str(makespan_random))

        # creating and writing some stat data
        # Note: it can be rewritten with using of events
        if self.stat_saver is not None:
            stat_data = {
                "wf_name": self.workflow.name,
                "event_name": event.__class__.__name__,
                "task_id": task_id,
                "with_old_pop": {
                    "iter": iter_old_pop,
                    "makespan": makespan_old_pop,
                    "pop_aggr": logbook_old_pop
                },
                "with_random": {
                    "iter": iter_random,
                    "makespan": makespan_random,
                    "pop_aggr": logbook_random
                }
            }
            self.stat_saver(stat_data)

        self._post_new_events()
Esempio n. 8
0
class HeftExecutor(FailRandom, BaseExecutor):
    """Event-driven executor that plans and re-plans with a HEFT planner.

    Task-start events may turn into node failures (decided by
    ``self._check_fail``); a failure is followed by a node-up event, and
    both trigger ``self._reschedule``.
    """

    def __init__(self,
                 resource_manager,
                 heft_planner,
                 base_fail_duration,
                 base_fail_dispersion,
                 fail_count_upper_limit=None,
                 initial_schedule=None,
                 logger=None):
        """Store the planner, the failure parameters and the start schedule.

        :param resource_manager: stored as ``self.resource_manager``.
        :param heft_planner: planner used by ``init``; its
            ``resource_manager`` is the one the node handlers update.
        :param base_fail_duration: minimum length of a node outage.
        :param base_fail_dispersion: upper bound of the random extra
            outage length added to ``base_fail_duration``.
        :param fail_count_upper_limit: stored as
            ``self._fail_count_upper_limit``; also forwarded to the base
            initialiser.
        :param initial_schedule: schedule to start from, or ``None`` to let
            the planner build one in ``init``.
        :param logger: stored as ``self.logger``.
        """
        super().__init__(heft_planner, base_fail_duration,
                         base_fail_dispersion, fail_count_upper_limit,
                         initial_schedule, logger)

        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule

        # NOTE(review): the node handlers go through
        # self.heft_planner.resource_manager, not through this attribute;
        # confirm both refer to the same manager.
        self.resource_manager = resource_manager
        self._fail_count_upper_limit = fail_count_upper_limit

        self.logger = logger

    def init(self):
        """Create the current schedule and post the events for it.

        Without an initial schedule the planner is run on an empty schedule
        over its nodes. Otherwise the initial schedule is rebuilt item by
        item, binding each item to the task object with the same id from
        the planner's workflow while keeping its start and end times.
        """
        if self.initial_schedule is None:
            empty_schedule = Schedule(
                {node: [] for node in self.heft_planner.get_nodes()})
            self.current_schedule = empty_schedule
            self.current_schedule = self.heft_planner.run(
                self.current_schedule)
        else:
            id_to_task = {
                tsk.id: tsk
                for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow)
            }
            mapping = {
                node: [
                    ScheduleItem(id_to_task[item.job.id], item.start_time,
                                 item.end_time) for item in items
                ]
                for (node, items) in self.initial_schedule.mapping.items()
            }
            self.current_schedule = Schedule(mapping)
        self._post_new_events()

    def _generate_failtime_and_duration(self, item):
        """Draw a failure offset and an outage duration for ``item``.

        :return: ``(time_of_fail, duration)`` where ``time_of_fail`` is a
            random fraction of the time left until ``item.end_time``
            (relative to ``self.current_time``) and ``duration`` is
            ``base_fail_duration`` plus a random share of
            ``base_fail_dispersion``.
        """
        # The duration is drawn first, then the offset.
        duration = (self.base_fail_duration
                    + self.base_fail_dispersion * random.random())
        time_of_fail = (item.end_time - self.current_time) * random.random()
        return (time_of_fail, duration)

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is to fail, schedule that.

        When ``self._check_fail`` says the task fails, a ``NodeFailed``
        event and a later ``NodeUp`` event are posted and the task's
        pending ``TaskFinished`` event is removed from the queue.
        """
        (node,
         item) = self.current_schedule.place_by_time(event.task,
                                                     event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        (fail_offset,
         duration) = self._generate_failtime_and_duration(item)
        # A non-positive offset is replaced by a small positive delay.
        time_of_fail = self.current_time + (
            fail_offset if fail_offset > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        # The node comes back once the outage duration has passed.
        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)
        # The failing task never finishes: remove its TaskFinished event.
        self.queue = deque([
            ev for ev in self.queue
            if not (isinstance(ev, TaskFinished)
                    and ev.task.id == event.task.id)
        ])

    def _task_finished_handler(self, event):
        """Mark the finished task's schedule item as ``FINISHED``."""
        self.current_schedule.change_state_executed(event.task,
                                                    ScheduleItem.FINISHED)

    def _node_failed_handler(self, event):
        """Mark the node as down, fail its executing item and reschedule.

        :raises IndexError: if the failed node holds no executing item for
            ``event.task``.
        :raises Exception: if it holds more than one such item.
        """
        # check node down
        self.heft_planner.resource_manager.node(event.node).state = Node.Down
        # check failed event in schedule
        executing = [
            item for item in self.current_schedule.mapping[event.node]
            if item.job.id == event.task.id
            and item.state == ScheduleItem.EXECUTING
        ]
        if not executing:
            # Same exception type the unchecked executing[0] would give,
            # but with a message that names the problem.
            raise IndexError(
                "no executing item found for task {0} on the failed node".
                format(event.task.id))
        if len(executing) > 1:
            # Failing only the first match would leave the others marked
            # as executing.
            raise Exception(
                "several executing items found for task {0} on the failed node"
                .format(event.task.id))

        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        # The item ends at the moment of the failure.
        failed_item.end_time = self.current_time

        self._reschedule(event)

    def _node_up_handler(self, event):
        """Set the node's state to ``Node.Unknown`` and reschedule."""
        # check node up
        self.heft_planner.resource_manager.node(
            event.node).state = Node.Unknown
        self._reschedule(event)