Example #1
    def __init__(self, vm: VirtualMachine, loader: Loader):
        self.loader = loader

        self.vm: VirtualMachine = vm  # Class that controls a virtual machine in the cloud
        # self.queue = queue  # Class with the scheduling plan

        # Control Flags
        self.working = False
        # flag to indicate that the instance is ready to execute
        self.ready = False
        # flag to indicate that the VM has hibernated
        self.interrupted = False

        # debug flag indicates that the dispatcher should wait for the shutdown command
        self.debug_wait_command = self.loader.debug_conf.debug_mode

        # migration count
        self.migration_count = 0
        '''
        List that determines the execution order of the
        tasks that will be executed by this dispatcher
        '''

        # threading event used to wait for tasks to execute
        # self.waiting_work = threading.Event()
        self.semaphore = threading.Semaphore()

        self.main_thread = threading.Thread(target=self.__execution_loop,
                                            daemon=True)

        self.repo = PostgresRepo()
        self.least_status = None
        self.timestamp_status_update = None
Example #2
    def __init__(self, loader: Loader):

        self.loader = loader

        # load the Scheduler that will be used
        self.scheduler = SimpleScheduler(instance_types=self.loader.env)
        # self.__load_scheduler()

        # read expected_makespan on build_dispatcher()
        # self.expected_makespan_seconds = None
        # self.deadline_timestamp = None

        '''
           If the execution uses simulation,
           prepare the simulation environment
        '''
        if self.loader.simulation_conf.with_simulation:
            # start simulator
            self.simulator = RevocationSim(self.loader.revocation_rate)

        # Keep Used EBS Volume
        self.ebs_volume_id = None

        # Datetime vars to keep track of global execution time
        self.start_timestamp = None
        self.end_timestamp = None
        self.elapsed_time = None

        self.repo = PostgresRepo()

        # Semaphore
        # self.semaphore = threading.Semaphore()
        # self.semaphore_count = threading.Semaphore()

        # TRACKERS VALUES
        self.n_interruptions = 0
        self.n_sim_interruptions = 0

        self.timeout = False

        ''' ABORT FLAG'''
        self.abort = False

        self.task_dispatcher: Dispatcher
        self.terminated_dispatchers = []
        self.task_status = Task.WAITING

        '''
        Build the initial dispatcher.
        The Dispatcher class is responsible for managing the execution steps.
        '''
        self.__build_dispatcher()

        # Prepare the control database and the folders structure in S3
        try:
            self.__prepare_execution()
        except Exception as e:
            logging.error(e)
            raise e
Example #3
    def __stop(self):
        # STOP task execution

        self.repo = PostgresRepo()

        action = Daemon.STOP

        try:
            self.communicator.send(action=action, value=self.dict_info)
        except Exception as e:
            logging.error(e)
            self.__stopped(Task.ERROR)
            return
Example #4
    def __get_execution_id(self):
        # """
        # Get next execution_id
        # :return: int
        # """
        # self.execution_id = 1
        """
        Read the database to get the next execution_id
        """

        repo = PostgresRepo()
        row = repo.get_execution(filter={
            'task_id': self.cudalign_task.task_id,
            'limit': 1,
            'order': 'desc'
        })
        if len(row) == 0:
            self.execution_id = 0
        else:
            # continue from the latest execution id
            self.execution_id = row[0].execution_id + 1
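
A minimal standalone sketch of the same lookup, reusing only the PostgresRepo.get_execution call shown above; the helper name next_execution_id is hypothetical and added here purely for illustration:

def next_execution_id(repo: PostgresRepo, task_id) -> int:
    # Fetch the most recent execution row for this task (newest first, single row).
    rows = repo.get_execution(filter={
        'task_id': task_id,
        'limit': 1,
        'order': 'desc'
    })
    # No previous execution: start at 0; otherwise continue from the latest id.
    return 0 if len(rows) == 0 else rows[0].execution_id + 1
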
Example #5
class ScheduleManager:
    task_dispatcher: Dispatcher
    task_status = Task.WAITING

    def __init__(self, loader: Loader):

        self.loader = loader

        # load the Scheduler that will be used
        self.scheduler = SimpleScheduler(instance_types=self.loader.env)
        # self.__load_scheduler()

        # read expected_makespan on build_dispatcher()
        # self.expected_makespan_seconds = None
        # self.deadline_timestamp = None

        '''
           If the execution uses simulation,
           prepare the simulation environment
        '''
        if self.loader.simulation_conf.with_simulation:
            # start simulator
            self.simulator = RevocationSim(self.loader.revocation_rate)

        # Keep Used EBS Volume
        self.ebs_volume_id = None

        # Datetime vars to keep track of global execution time
        self.start_timestamp = None
        self.end_timestamp = None
        self.elapsed_time = None

        self.repo = PostgresRepo()

        # Semaphore
        # self.semaphore = threading.Semaphore()
        # self.semaphore_count = threading.Semaphore()

        # TRACKERS VALUES
        self.n_interruptions = 0
        self.n_sim_interruptions = 0

        self.timeout = False

        ''' ABORT FLAG'''
        self.abort = False

        self.task_dispatcher: Dispatcher
        self.terminated_dispatchers = []
        self.task_status = Task.WAITING

        '''
        Build the initial dispatcher.
        The Dispatcher class is responsible for managing the execution steps.
        '''
        self.__build_dispatcher()

        # Prepare the control database and the folders structure in S3
        try:
            self.__prepare_execution()
        except Exception as e:
            logging.error(e)
            raise e

    # # PRE-EXECUTION FUNCTIONS

    # def __load_scheduler(self):
    #
    #     if self.loader.scheduler_name.upper() == Scheduler.CC:
    #         self.scheduler = CCScheduler(loader=self.loader)
    #
    #     elif self.loader.scheduler_name.upper() == Scheduler.IPDPS:
    #         self.scheduler = IPDPS(loader=self.loader)
    #
    #     if self.scheduler is None:
    #         logging.error("<Scheduler Manager {}_{}>: "
    #                       "ERROR - Scheduler {} not found".format(self.loader.job.job_id,
    #                                                               self.loader.execution_id,
    #                                                               self.loader.scheduler_name))
    #         Exception("<Scheduler Manager {}_{}>:  "
    #                   "ERROR - Scheduler {} not found".format(self.loader.job.job_id,
    #                                                           self.loader.execution_id,
    #                                                           self.loader.scheduler_name))

    def __build_dispatcher(self):

        instance_type, market = self.scheduler.choose_initial_best_instance_type(self.loader.cudalign_task,
                                                                                 self.loader.deadline_seconds)

        # Create the Vm that will be used by the dispatcher
        vm = VirtualMachine(
            instance_type=instance_type,
            market=market,
            loader=self.loader
        )

        # then a dispatcher, which will execute the tasks, is created

        dispatcher = Dispatcher(vm=vm, loader=self.loader)

        # check if the VM needs to be registered on the simulator
        if self.loader.simulation_conf.with_simulation and vm.market == CloudManager.PREEMPTIBLE:
            self.simulator.register_vm(vm)

        # self.semaphore.acquire()

        self.task_dispatcher = dispatcher

        # self.semaphore.release()

    def __prepare_execution(self):
        """
           Prepare control database and all directories to start the execution process
        """
        # get job from control database
        tasks_repo = self.repo.get_tasks(
            filter={
                'task_id': self.loader.cudalign_task.task_id
            }
        )

        # Check if Job is already in the database
        if len(tasks_repo) == 0:
            # add task to database
            self.__add_task_to_database()
        else:
            # Task is already in database
            # Check task and instance consistency
            logging.info("<Scheduler Manager {}_{}>: - "
                         "Checking database consistency...".format(self.loader.cudalign_task.task_id,
                                                                   self.loader.execution_id))

            task_repo = tasks_repo[0]

            assert task_repo.task_name == self.loader.cudalign_task.task_name, "Consistency error (task name): " \
                                                                               "{} <> {}"\
                .format(task_repo.task_name, self.loader.cudalign_task.task_name)

            assert task_repo.command == self.loader.cudalign_task.simple_command, "Consistency error (task command): " \
                                                                                  "{} <> {} "\
                .format(task_repo.command, self.loader.cudalign_task.command)

        # Check Instances Type
        for key, instance_type in self.loader.env.items():

            types = self.repo.get_instance_type(filter={
                'instance_type': key
            })

            if len(types) == 0:
                # add instance to control database
                self.__add_instance_type_to_database(instance_type)
            # else:
            #     # check instance type consistency
            #     inst_type_repo = types[0]
            #     assert inst_type_repo.vcpu == instance_type.vcpu, "Consistency error (vcpu instance {}): " \
            #                                                       "{} <> {} ".format(key,
            #                                                                          inst_type_repo.vcpu,
            #                                                                          instance_type.vcpu)
            #
            #     assert inst_type_repo.memory == instance_type.memory, "Consistency error (memory instance {}):" \
            #                                                           "{} <> {}".format(key,
            #                                                                             inst_type_repo.memory,
            #                                                                             instance_type.memory)

    def __add_task_to_database(self):
        """Record a Task to the controlgpu database"""

        task_repo = TaskRepo(
            task_id=self.loader.cudalign_task.task_id,
            task_name=self.loader.cudalign_task.task_name,
            command=self.loader.cudalign_task.simple_command
        )

        self.repo.add_task(task_repo)

    def __add_instance_type_to_database(self, instance_type):
        self.repo.add_instance_type(
            InstanceTypeRepo(
                type=instance_type.type,
                provider=instance_type.provider
            )
        )

    '''
    HANDLES FUNCTIONS
    '''

    def __interruption_handle(self):

        # Move task to other VM
        # self.semaphore.acquire()

        if not self.loader.cudalign_task.has_task_finished():
            self.loader.cudalign_task.stop_execution()

        # logging.info("Entrou no interruption_handle")

        # getting volume-id
        if self.loader.file_system_conf.type == EC2Manager.EBS:
            self.ebs_volume_id = self.task_dispatcher.vm.volume_id

        # logging.info("Pegou o id do EBS: {}".format(self.ebs_volume_id))

        # See on which VM we will restart
        current_time = datetime.now() - self.start_timestamp

        instance_type, market = self.scheduler.choose_restart_best_instance_type(
            cudalign_task=self.loader.cudalign_task,
            deadline=self.loader.deadline_seconds,
            current_time=current_time.total_seconds()
        )

        # logging.info("Escolheu instancia {} do tipo {}".format(instance_type.type, market))

        if self.loader.cudalign_task.has_task_finished():
            new_vm = VirtualMachine(
                instance_type=instance_type,
                market=market,
                loader=self.loader,
                volume_id=self.ebs_volume_id
            )

            # logging.info("Criou a nova vm!")

            dispatcher = Dispatcher(vm=new_vm, loader=self.loader)

            # check if the VM needs to be registered on the simulator
            if self.loader.simulation_conf.with_simulation and new_vm.market == CloudManager.PREEMPTIBLE:
                self.simulator.register_vm(new_vm)

            # self.semaphore.acquire()

            self.terminated_dispatchers.append(self.task_dispatcher)
            self.task_dispatcher = dispatcher

            # self.semaphore.release()

            self.__start_dispatcher()

        # self.semaphore.release()

    def __terminated_handle(self):
        # Move task to others VM
        # self.semaphore.acquire()

        if not self.loader.cudalign_task.has_task_finished():
            self.loader.cudalign_task.stop_execution()

        # logging.info("Entrou no terminated_handle")

        # getting volume-id
        if self.loader.file_system_conf.type == EC2Manager.EBS:
            self.ebs_volume_id = self.task_dispatcher.vm.volume_id

        # logging.info("Pegou o id do EBS: {}".format(self.ebs_volume_id))

        # See on which VM we will restart
        current_time = datetime.now() - self.start_timestamp

        instance_type, market = self.scheduler.choose_restart_best_instance_type(
            cudalign_task=self.loader.cudalign_task,
            deadline=self.loader.deadline_seconds,
            current_time=current_time.total_seconds()
        )

        # logging.info("Escolheu instancia {} do tipo {}".format(instance_type.type, market))

        if not self.loader.cudalign_task.has_task_finished():
            new_vm = VirtualMachine(
                instance_type=instance_type,
                market=market,
                loader=self.loader,
                volume_id=self.ebs_volume_id
            )

            # logging.info("Criou a nova vm!")

            dispatcher = Dispatcher(vm=new_vm, loader=self.loader)

            # check if the VM needs to be registered on the simulator
            if self.loader.simulation_conf.with_simulation and new_vm.market == CloudManager.PREEMPTIBLE:
                self.simulator.register_vm(new_vm)

            # self.semaphore.acquire()

            self.terminated_dispatchers.append(self.task_dispatcher)
            self.task_dispatcher = dispatcher

            # self.semaphore.release()

            self.__start_dispatcher()

        # self.semaphore.release()

    def __event_handle(self, event):

        logging.info("<Scheduler Manager {}_{}>: - EVENT_HANDLE "
                     "Instance: '{}', Type: '{}', Market: '{}',"
                     "Event: '{}'".format(self.loader.cudalign_task.task_id,
                                          self.loader.execution_id,
                                          self.task_dispatcher.vm.instance_id,
                                          self.task_dispatcher.vm.type,
                                          self.task_dispatcher.vm.market,
                                          event.value))

        if event.value == CloudManager.IDLE:
            logging.info("<Scheduler Manager {}_{}>: - Calling Idle Handle".format(self.loader.cudalign_task.task_id,
                                                                                   self.loader.execution_id))

            self.loader.cudalign_task.finish_execution()
            self.task_status = Task.FINISHED
        # elif event.value == CloudManager.STOPPING:
        #     # self.semaphore_count.acquire()
        #     self.n_interruptions += 1
        #     # self.semaphore_count.release()
        #
        #     logging.info("<Scheduler Manager {}_{}>: - Calling Interruption Handle"
        #                  .format(self.loader.cudalign_task.task_id, self.loader.execution_id))
        #     self.__interruption_handle()
        elif event.value == CloudManager.STOPPED:
            # self.semaphore_count.acquire()
            self.n_interruptions += 1
            # self.semaphore_count.release()

            self.task_dispatcher.vm.terminate(delete_volume=self.loader.file_system_conf.ebs_delete)

            logging.info("<Scheduler Manager {}_{}>: - Calling Interruption Handle"
                         .format(self.loader.cudalign_task.task_id, self.loader.execution_id))
            # self.__interruption_handle()

        elif event.value in [CloudManager.TERMINATED, CloudManager.ERROR]:
            logging.info("<Scheduler Manager {}_{}>: - Calling Terminate Handle"
                         .format(self.loader.cudalign_task.task_id, self.loader.execution_id))
            if not self.task_dispatcher.vm.marked_to_interrupt:
                self.n_sim_interruptions += 1
            self.__terminated_handle()

        elif event.value == CloudManager.ABORT:
            self.abort = True

    '''
    CHECKERS FUNCTIONS
    '''

    def __checkers(self):
        # Checker loop
        # Check if all dispatchers have finished the execution
        while self.task_status != Task.FINISHED:

            if self.abort:
                break

            time.sleep(5)

    '''
    Manager Functions
    '''

    def __start_dispatcher(self):
        # self.semaphore.acquire()

        # Start the working dispatcher
        self.task_dispatcher.main_thread.start()
        # self.task_dispatcher.waiting_work.set()

        # self.semaphore.release()

    def __terminate_dispatcher(self):

        if self.loader.debug_conf.debug_mode:
            logging.warning(100 * "#")
            logging.warning("\t<DEBUG MODE>: WAITING COMMAND TO TERMINATE -  PRESS ENTER")
            logging.warning(100 * "#")

            input("")

        logging.info("")
        logging.info("<Scheduler Manager {}_{}>: - Start termination process... "
                     .format(self.loader.cudalign_task.task_id, self.loader.execution_id))

        # terminate simulation
        if self.loader.simulation_conf.with_simulation:
            self.simulator.stop_simulation()

        # self.semaphore.acquire()

        # Terminate DISPATCHER
        logging.info("<Scheduler Manager {}_{}>: - "
                     "Terminating Dispatcher".format(self.loader.cudalign_task.task_id,
                                                     self.loader.execution_id))

        self.task_dispatcher.debug_wait_command = False

        self.task_dispatcher.working = False
        # self.task_dispatcher.waiting_work.set()

        # Confirm Termination
        logging.info("<Scheduler Manager {}_{}>: - Waiting Termination process..."
                     .format(self.loader.cudalign_task.task_id, self.loader.execution_id))

        self.task_dispatcher.debug_wait_command = False
        # wait for the thread to terminate

        self.task_dispatcher.main_thread.join()

        # getting volume-id
        if self.loader.file_system_conf.type == EC2Manager.EBS:
            self.ebs_volume_id = self.task_dispatcher.vm.volume_id

        self.terminated_dispatchers.append(self.task_dispatcher)

        # self.semaphore.release()

    def __end_of_execution(self):

        # end of execution
        self.end_timestamp = datetime.now()
        self.elapsed_time = (self.end_timestamp - self.start_timestamp)

        logging.info("<Scheduler Manager {}_{}>: - Waiting Termination...".format(self.loader.cudalign_task.task_id,
                                                                                  self.loader.execution_id))

        cost = 0.0
        on_demand_count = 0
        preemptible_count = 0

        for dispatcher in self.terminated_dispatchers:
            if not dispatcher.vm.failed_to_created:

                if dispatcher.vm.market == CloudManager.ON_DEMAND:
                    on_demand_count += 1
                else:
                    preemptible_count += 1

                cost += dispatcher.vm.uptime.seconds * \
                    (dispatcher.vm.price / 3600.0)  # hourly price converted to per-second

        logging.info("")

        if not self.abort:
            execution_info = "    Task: {} Execution: {} Scheduler: SimpleScheduler    "\
                .format(self.loader.cudalign_task.task_id, self.loader.execution_id)
        else:
            execution_info = "    Task: {} Execution: {} Scheduler: SimpleScheduler" \
                             " - EXECUTION ABORTED    ".format(self.loader.cudalign_task.task_id,
                                                               self.loader.execution_id)

        execution_info = 20 * "#" + execution_info + 20 * "#"

        logging.info(execution_info)
        logging.info("")
        total = self.n_sim_interruptions + self.n_interruptions

        logging.info("\t AWS interruption: {} Simulation interruption: {} "
                     "Total interruption: {}".format(self.n_interruptions, self.n_sim_interruptions, total))

        total = on_demand_count + preemptible_count
        logging.info(
            "\t On-demand: {} Preemptible: {} Total: {}".format(on_demand_count,
                                                                preemptible_count,
                                                                total))
        logging.info("")
        logging.info("")
        logging.info("\t Start Time: {}  End Time: {}".format(self.start_timestamp, self.end_timestamp))
        logging.info("\t Elapsed Time: {}".format(self.elapsed_time))
        logging.info("\t Deadline: {}".format(timedelta(seconds=self.loader.deadline_seconds)))
        logging.info("")
        logging.info("")
        logging.info("\t Execution Total Estimated monetary Cost: {}".format(cost))
        logging.info("")

        if self.loader.file_system_conf.type == CloudManager.EBS and not self.loader.file_system_conf.ebs_delete:
            logging.warning("The following EBS VOLUMES will note be deleted by HADS: ")
            logging.warning("\t-> {}".format(self.ebs_volume_id))

        logging.info("")
        logging.info(len(execution_info) * "#")

        status = 'success'

        if self.abort:
            status = 'aborted'

        self.repo.add_statistic(
            StatisticRepo(execution_id=self.loader.execution_id,
                          task_id=self.loader.cudalign_task.task_id,
                          status=status,
                          start=self.start_timestamp,
                          end=self.end_timestamp,
                          deadline=self.loader.deadline_timedelta,
                          cost=cost)
        )

        self.repo.close_session()

        if self.abort:
            error_msg = "<Scheduler Manager {}_{}>: - " \
                        "Check all log-files. Execution Aborted".format(self.loader.cudalign_task.task_id,
                                                                        self.loader.execution_id)
            logging.error(error_msg)
            raise Exception(error_msg)

    def start_execution(self):
        # subscriber events_handle
        subscribers.append(self.__event_handle)

        self.start_timestamp = datetime.now()
        # UPDATE DATETIME DEADLINE

        logging.info("<Scheduler Manager {}_{}>: - Starting Execution.".format(self.loader.cudalign_task.task_id,
                                                                               self.loader.execution_id))
        logging.info("")

        self.__start_dispatcher()

        # Call checkers loop
        self.__checkers()

        self.__terminate_dispatcher()

        self.__end_of_execution()
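
A minimal usage sketch for the class above, assuming a Loader instance has already been built elsewhere from the job configuration (its constructor arguments are not shown in these examples); only ScheduleManager(loader=...) and start_execution() are taken from the code above, the rest is hypothetical glue:

# Hypothetical entry point; ScheduleManager raises when the execution is aborted.
manager = ScheduleManager(loader=loader)  # assumption: `loader` built from CLI/config
try:
    manager.start_execution()  # blocks until the task finishes or the execution is aborted
except Exception:
    logging.exception("Execution aborted, check all log files")
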
Example #6
    def __run(self):
        # START task execution

        # logging.info("<Executor {}-{}>: __run function".format(self.task.task_id, self.vm.instance_id))

        self.repo = PostgresRepo()
        current_time = None
        action = Daemon.START

        # if self.task.has_checkpoint:
        #     action = Daemon.RESTART
        try:
            self.communicator.send(action=action, value=self.dict_info)
            current_time = datetime.now()
            # logging.info("<Executor {}-{}>: Action Daemon.START sent".format(self.task.task_id, self.vm.instance_id))
        except Exception as e:
            logging.error(e)
            self.__stopped(Task.ERROR)
            return

        # if the task was started successfully,
        # start the execution loop
        if self.communicator.response['status'] == 'success':

            self.status = Task.EXECUTING

            # if action == Daemon.START:
            #     self.status = Task.EXECUTING
            # else:
            #     self.status = Task.RESTARTED

            # self.update_status_table()

            # self.stop_signal = True

            logging.info("<Executor {}-{}>: Begin execution loop".format(
                self.task.task_id, self.vm.instance_id))

            # start the task execution loop
            while (self.status == Task.EXECUTING
                   and not self.stop_signal
                   and self.vm.state == CloudManager.RUNNING):

                try:
                    # logging.info(
                    # "<Executor {}-{}>: Trying to get task status".format(self.task.task_id, self.vm.instance_id))
                    command_status, current_stage = self.__get_task_status()
                    # logging.info(
                    #     "<Executor {}-{}>: Command status {}".format(self.task.task_id, self.vm.instance_id,
                    #                                                  command_status))

                    instance_action = None
                    if self.vm.market == CloudManager.PREEMPTIBLE:
                        # logging.info(
                        # "<Executor {}-{}>: Trying to get instance action".format(self.task.task_id,
                        #                                                          self.vm.instance_id))
                        instance_action = self.__get_instance_action()
                        # logging.info(
                        #     "<Executor {}-{}>: Instance action {}".format(self.task.task_id, self.vm.instance_id,
                        #                                                   instance_action))

                    # if self.loader.checkpoint_conf.with_checkpoint \
                    #     and self.vm.market == CloudManager.PREEMPTIBLE and self.task.do_checkpoint:
                    #     self.__checkpoint_task()

                except Exception as e:
                    logging.error(e)
                    self.__stopped(Task.ERROR)
                    return

                # check task status
                if command_status is not None and command_status == 'finished':

                    self.status = status = Task.FINISHED

                    self.loader.cudalign_task.finish_execution()
                    self.__stopped(status)
                    return

                if command_status is not None and command_status == 'running':
                    elapsed_time = datetime.now() - current_time
                    current_time = current_time + elapsed_time
                    self.loader.cudalign_task.update_execution_time(
                        elapsed_time.total_seconds())

                if instance_action is not None and instance_action != 'none':
                    self.vm.interrupt()
                    self.__stopped(Task.INTERRUPTED)
                    return

                if command_status is not None and command_status != 'running':
                    self.loader.cudalign_task.stop_execution()
                    self.__stopped(Task.RUNTIME_ERROR)
                    return

                time.sleep(1)

            if self.status != Task.FINISHED:
                self.loader.cudalign_task.stop_execution()
Example #7
class Dispatcher:
    executor: Executor

    def __init__(self, vm: VirtualMachine, loader: Loader):
        self.loader = loader

        self.vm: VirtualMachine = vm  # Class that controls a virtual machine in the cloud
        # self.queue = queue  # Class with the scheduling plan

        # Control Flags
        self.working = False
        # flag to indicate that the instance is ready to execute
        self.ready = False
        # flag to indicate that the VM has hibernated
        self.interrupted = False

        # debug flag indicates that the dispatcher should wait for the shutdown command
        self.debug_wait_command = self.loader.debug_conf.debug_mode

        # migration count
        self.migration_count = 0
        '''
        List that determines the execution order of the
        tasks that will be executed by this dispatcher
        '''

        # threading event used to wait for tasks to execute
        # self.waiting_work = threading.Event()
        self.semaphore = threading.Semaphore()

        self.main_thread = threading.Thread(target=self.__execution_loop,
                                            daemon=True)

        self.repo = PostgresRepo()
        self.least_status = None
        self.timestamp_status_update = None

        # self.stop_period = None

    # def __get_instance_usage(self):
    #     memory = 0
    #     cpu = 0
    #
    #     communicator = Communicator(self.vm.instance_ip,
    #                                 self.loader.communication_conf.socket_port)
    #
    #     info = {
    #         "task_id": 0,
    #         "command": '',
    #         'cpu_quota': 0
    #     }
    #
    #     max_attempt = 1
    #
    #     for i in range(max_attempt):
    #         try:
    #             communicator.send(action=Daemon.INSTANCE_USAGE, value=info)
    #
    #             result = communicator.response
    #
    #             if result['status'] == 'success':
    #                 memory = float(result['value']['memory'])
    #                 cpu = float(result['value']['cpu'])
    #         except:
    #             logging.error("<Dispatcher {}>: Get Instance Usage {}/{}".format(self.vm.instance_id,
    #                                                                              i + 1,
    #                                                                              max_attempt))
    #
    #     return cpu, memory

    # def __update_instance_status_table(self, state=None):
    #     """
    #     Update Instance Status table
    #     """
    #     if state is None:
    #         state = self.vm.state
    #
    #     # Check if the update have to be done due to the time
    #
    #     time_diff = None
    #     if self.timestamp_status_update is not None:
    #         time_diff = datetime.now() - self.timestamp_status_update
    #
    #     if self.least_status is None or self.least_status != state or \
    #         time_diff > timedelta(seconds=self.loader.scheduler_conf.status_update_time):
    #         cpu = 0.0
    #         memory = 0.0
    #         # cpu, memory = self.__get_instance_usage()
    #         # Update Instance_status Table
    #         self.repo.add_instance_status(InstanceStatusRepo(instance_id=self.vm.instance_id,
    #                                                          timestamp=datetime.now(),
    #                                                          status=state,
    #                                                          memory_footprint=memory,
    #                                                          cpu_usage=cpu,
    #                                                          cpu_credit=self.vm.get_cpu_credits()))
    #
    #         self.timestamp_status_update = datetime.now()
    #         self.least_status = self.vm.state

    # def __update_instance_statistics_table(self):
    #
    #     self.repo.add_instance_status(InstanceStatisticRepo(instance_id=self.vm.instance_id,
    #                                                         deploy_overhead=self.vm.deploy_overhead.seconds,
    #                                                         termination_overhead=self.vm.terminate_overhead.seconds,
    #                                                         uptime=self.vm.uptime.seconds))

    def __notify(self, value):

        kwargs = {'instance_id': self.vm.instance_id, 'dispatcher': self}

        notify(Event(event_type=Event.INSTANCE_EVENT, value=value, **kwargs))

    def __prepare_daemon(self):
        attempt = 1
        while True:
            time.sleep(self.loader.communication_conf.retry_interval)

            try:
                communicator = Communicator(
                    host=self.vm.instance_ip,
                    port=self.loader.communication_conf.socket_port)
                communicator.send(action=Daemon.TEST,
                                  value={
                                      'task_id': None,
                                      'command': None
                                  })

                if communicator.response['status'] == 'success':
                    return True

            except Exception as e:
                if attempt > self.loader.communication_conf.repeat:
                    logging.error(e)
                    return False

            if attempt <= self.loader.communication_conf.repeat:
                logging.info(
                    '<Dispatcher {}>: Trying Daemon handshake... attempt {}/{}'
                    ''.format(self.vm.instance_id, attempt,
                              self.loader.communication_conf.repeat))
            else:
                logging.info(
                    '<Dispatcher {}>: Daemon handshake MAX ATTEMPT ERROR'
                    ''.format(self.vm.instance_id))

            attempt += 1

    def __execution_loop(self):

        # Start the VM in the cloud
        status = self.vm.deploy()

        # self.expected_makespan_timestamp = self.vm.start_time + timedelta(seconds=self.queue.makespan_seconds)

        # update instance_repo
        self.repo.add_instance(
            InstanceRepo(id=self.vm.instance_id,
                         type=self.vm.instance_type.type,
                         region=self.vm.instance_type.region,
                         zone=self.vm.instance_type.zone,
                         market=self.vm.market,
                         ebs_volume=self.vm.volume_id,
                         price=self.vm.price))

        # self.__update_instance_status_table()

        if status:

            self.working = True

            try:
                self.vm.prepare_vm()
                self.__prepare_daemon()
            except Exception as e:
                logging.error(e)

                # stop working process
                # self.waiting_work.clear()
                # Notify abort!
                self.__notify(CloudManager.ABORT)

            # indicate that the VM is ready to execute
            self.vm.ready = self.ready = True
            cuda_task = self.loader.cudalign_task

            if not cuda_task.has_task_finished() and self.working:
                if self.vm.state == CloudManager.RUNNING:

                    self.semaphore.acquire()
                    # # check running tasks
                    # self.__update_running_executors()

                    if not cuda_task.is_running():
                        self.executor = Executor(task=cuda_task,
                                                 vm=self.vm,
                                                 loader=self.loader)
                        # start the executor loop to execute the task
                        self.executor.thread.start()
                        self.loader.cudalign_task.start_execution(
                            self.vm.instance_type.type)

                    self.semaphore.release()

            while (self.working
                   and not self.loader.cudalign_task.has_task_finished()):
                # waiting for work
                # self.waiting_work.wait()
                #
                # self.waiting_work.clear()
                if not self.working:
                    break

                # # execution loop
                # self.semaphore.acquire()
                # self.semaphore.release()

                # Error: instance was not deployed or was terminated
                if self.vm.state in (CloudManager.ERROR,
                                     CloudManager.SHUTTING_DOWN,
                                     CloudManager.TERMINATED):
                    # waiting running tasks
                    self.executor.thread.join()
                    # VM errored or was terminated, raise an event
                    self.__notify(CloudManager.TERMINATED)

                    break

                # elif self.vm.state == CloudManager.STOPPING:
                #     # waiting running tasks
                #     self.executor.thread.join()
                #
                #     self.resume = False
                #
                #     self.__notify(CloudManager.STOPPING)

                elif self.vm.state == CloudManager.STOPPED:
                    # STOP and CHECKPOINT all tasks
                    self.executor.stop_signal = True

                    # waiting running tasks
                    self.executor.thread.join()

                    self.resume = False

                    self.__notify(CloudManager.STOPPED)

                    # break

            if self.vm.state == CloudManager.RUNNING:

                # self.__update_instance_status_table(state=CloudManager.IDLE)
                self.__notify(CloudManager.IDLE)

                while self.debug_wait_command:
                    time.sleep(5)

                self.vm.terminate(
                    delete_volume=self.loader.file_system_conf.ebs_delete)

            self.repo.close_session()

        else:
            # Error to start VM
            logging.error(
                "<Dispatcher> Instance type: {} was not started".format(
                    self.vm.instance_type.type))
            self.__notify(CloudManager.ERROR)
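
For reference, this is how ScheduleManager (Example #5) wires a VirtualMachine to a Dispatcher and starts it; the snippet below is condensed from __build_dispatcher and __start_dispatcher above, with instance_type, market, loader and simulator coming from the surrounding manager context:

# Condensed from ScheduleManager.__build_dispatcher / __start_dispatcher above.
vm = VirtualMachine(instance_type=instance_type, market=market, loader=loader)
dispatcher = Dispatcher(vm=vm, loader=loader)

# Preemptible VMs are also registered with the revocation simulator when simulating.
if loader.simulation_conf.with_simulation and vm.market == CloudManager.PREEMPTIBLE:
    simulator.register_vm(vm)

# The dispatcher deploys the VM and runs __execution_loop on a daemon thread.
dispatcher.main_thread.start()
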
Example #8
class Executor:
    def __init__(self, task: Task, vm: VirtualMachine, loader: Loader):

        self.loader = loader

        self.task = task
        self.vm = vm

        self.repo = None
        # Execution Status
        self.status = Task.WAITING

        # socket communicator
        # used to send commands to the EC2 instance
        self.communicator = Communicator(
            host=self.vm.instance_ip,
            port=self.loader.communication_conf.socket_port)
        """Track INFO """
        # used to abort the execution loop
        self.stop_signal = False
        # checkpoint tracker
        self.next_checkpoint_datetime = None

        self.thread = threading.Thread(target=self.__run, daemon=True)
        self.thread_executing = False

    def update_status_table(self):
        """
        Update the Execution table.
        Called when the task status changes.
        """
        # Update Execution Status Table
        self.repo.add_execution(
            ExecutionRepo(execution_id=self.loader.execution_id,
                          task_id=self.task.task_id,
                          instance_id=self.vm.instance_id,
                          timestamp=datetime.now(),
                          status=self.status))

        # repo.close_session()

    def __run(self):
        # START task execution

        # logging.info("<Executor {}-{}>: __run function".format(self.task.task_id, self.vm.instance_id))

        self.repo = PostgresRepo()
        current_time = None
        action = Daemon.START

        # if self.task.has_checkpoint:
        #     action = Daemon.RESTART
        try:
            self.communicator.send(action=action, value=self.dict_info)
            current_time = datetime.now()
            # logging.info("<Executor {}-{}>: Action Daemon.START sent".format(self.task.task_id, self.vm.instance_id))
        except Exception as e:
            logging.error(e)
            self.__stopped(Task.ERROR)
            return

        # if the task was started successfully,
        # start the execution loop
        if self.communicator.response['status'] == 'success':

            self.status = Task.EXECUTING

            # if action == Daemon.START:
            #     self.status = Task.EXECUTING
            # else:
            #     self.status = Task.RESTARTED

            # self.update_status_table()

            # self.stop_signal = True

            logging.info("<Executor {}-{}>: Begin execution loop".format(
                self.task.task_id, self.vm.instance_id))

            # start the task execution loop
            while (self.status == Task.EXECUTING
                   and not self.stop_signal
                   and self.vm.state == CloudManager.RUNNING):

                try:
                    # logging.info(
                    # "<Executor {}-{}>: Trying to get task status".format(self.task.task_id, self.vm.instance_id))
                    command_status, current_stage = self.__get_task_status()
                    # logging.info(
                    #     "<Executor {}-{}>: Command status {}".format(self.task.task_id, self.vm.instance_id,
                    #                                                  command_status))

                    instance_action = None
                    if self.vm.market == CloudManager.PREEMPTIBLE:
                        # logging.info(
                        # "<Executor {}-{}>: Trying to get instance action".format(self.task.task_id,
                        #                                                          self.vm.instance_id))
                        instance_action = self.__get_instance_action()
                        # logging.info(
                        #     "<Executor {}-{}>: Instance action {}".format(self.task.task_id, self.vm.instance_id,
                        #                                                   instance_action))

                    # if self.loader.checkpoint_conf.with_checkpoint \
                    #     and self.vm.market == CloudManager.PREEMPTIBLE and self.task.do_checkpoint:
                    #     self.__checkpoint_task()

                except Exception as e:
                    logging.error(e)
                    self.__stopped(Task.ERROR)
                    return

                # check task status
                if command_status is not None and command_status == 'finished':

                    self.status = status = Task.FINISHED

                    self.loader.cudalign_task.finish_execution()
                    self.__stopped(status)
                    return

                if command_status is not None and command_status == 'running':
                    elapsed_time = datetime.now() - current_time
                    current_time = current_time + elapsed_time
                    self.loader.cudalign_task.update_execution_time(
                        elapsed_time.total_seconds())

                if instance_action is not None and instance_action != 'none':
                    self.vm.interrupt()
                    self.__stopped(Task.INTERRUPTED)
                    return

                if command_status is not None and command_status != 'running':
                    self.loader.cudalign_task.stop_execution()
                    self.__stopped(Task.RUNTIME_ERROR)
                    return

                time.sleep(1)

            if self.status != Task.FINISHED:
                self.loader.cudalign_task.stop_execution()

        # if kill signal than checkpoint task (SIMULATION)
        # if self.stop_signal:
        #     # check is task is running
        #     try:
        #         command_status, current_stage = self.__get_task_status()
        #         if command_status is not None and command_status == 'running':
        #             self.__stop()  # Checkpoint and stop task
        #             # self.__stopped(Task.HIBERNATED)
        #         # else:
        #             # self.__stopped(Task.FINISHED)
        #
        #     except Exception as e:
        #         logging.error(e)
        #         # self.__stopped(Task.STOP_SIGNAL)
        #
        #     return

        # self.__stopped(Task.ERROR)

    def __stop(self):
        # STOP task execution

        self.repo = PostgresRepo()

        action = Daemon.STOP

        try:
            self.communicator.send(action=action, value=self.dict_info)
        except Exception as e:
            logging.error(e)
            self.__stopped(Task.ERROR)
            return

    def __stopped(self, status):
        self.status = status
        #     # update execution time
        #
        #     # if task had Migrated, not to do
        #     if self.status == Task.MIGRATED:
        #         self.repo.close_session()
        #         return
        #
        self.status = status

        self.update_status_table()
        # close repo
        self.repo.close_session()

        # Check if condition is true to checkpoint the task

    # def __checkpoint_task(self):
    #
    #     if self.next_checkpoint_datetime is None:
    #         # compute next_checkpoint datetime
    #         self.next_checkpoint_datetime = datetime.now() + timedelta(seconds=self.task.checkpoint_interval)
    #
    #     elif datetime.now() > self.next_checkpoint_datetime:
    #
    #         self.__checkpoint()
    #         self.next_checkpoint_datetime = datetime.now() + timedelta(seconds=self.task.checkpoint_interval)

    # def __checkpoint(self, stop_task=False):
    #
    #     for i in range(3):
    #         try:
    #
    #             action = Daemon.CHECKPOINT_STOP if stop_task else Daemon.CHECKPOINT
    #
    #             logging.info("<Executor {}-{}>: Checkpointing task...".format(self.task.task_id,
    #                                                                           self.vm.instance_id))
    #
    #             start_ckp = datetime.now()
    #             self.communicator.send(action, value=self.dict_info)
    #
    #             if self.communicator.response['status'] == 'success':
    #                 end_ckp = datetime.now()
    #
    #                 logging.info("<Executor {}-{}>: Checkpointed with success. Time: {}".format(self.task.task_id,
    #                                                                                             self.vm.instance_id,
    #                                                                                             end_ckp - start_ckp))
    #                 self.task.has_checkpoint = True
    #                 self.task.update_task_time()
    #
    #             return
    #         except:
    #             pass
    #
    #     raise Exception("<Executor {}-{}>: Checkpoint error".format(self.task.task_id, self.vm.instance_id))

    def __get_task_status(self):

        for i in range(3):

            try:

                self.communicator.send(action=Daemon.STATUS,
                                       value=self.dict_info)

                result = self.communicator.response

                command_status = result['value']['status']
                current_stage = result['value']['current_stage']

                return command_status, current_stage
            except:
                logging.error("<Executor {}-{}>: Get task Status {}/3".format(
                    self.task.task_id, self.vm.instance_id, i + 1))
                time.sleep(1)

        raise Exception("<Executor {}-{}>: Get task status error".format(
            self.task.task_id, self.vm.instance_id))

    def __get_instance_action(self):

        for i in range(3):

            try:

                self.communicator.send(action=Daemon.INSTANCE_ACTION,
                                       value=self.dict_info)

                result = self.communicator.response

                instance_action = result['value']

                return instance_action
            except:
                logging.error(
                    "<Executor {}-{}>: Get instance action {}/3".format(
                        self.task.task_id, self.vm.instance_id, i + 1))
                time.sleep(1)

        raise Exception("<Executor {}-{}>: Get instance action error".format(
            self.task.task_id, self.vm.instance_id))

    # def __get_task_usage(self):
    #     for i in range(3):
    #         try:
    #             self.communicator.send(action=Daemon.TASK_USAGE,
    #                                    value=self.dict_info)
    #
    #             result = self.communicator.response
    #
    #             usage = None
    #
    #             if result['status'] == 'success':
    #                 usage = result['value']
    #
    #             return usage
    #         except:
    #             logging.error(
    #                 "<Executor {}-{}>: Get task Usage {}/3".format(self.task.task_id, self.vm.instance_id, i + 1))
    #             time.sleep(1)
    #
    #     raise Exception("<Executor {}-{}>: Get task usage error".format(self.task.task_id, self.vm.instance_id))

    # def __to_megabyte(self, str):
    #
    #     pos = str.find('MiB')
    #
    #     if pos == -1:
    #         pos = str.find('GiB')
    #     if pos == -1:
    #         pos = str.find('KiB')
    #     if pos == -1:
    #         pos = str.find('B')
    #
    #     memory = float(str[:pos])
    #     index = str[pos:]
    #
    #     to_megabyte = {
    #         "GiB": 1073.742,
    #         "MiB": 1.049,
    #         "B": 1e+6,
    #         "KiB": 976.562
    #     }
    #
    #     return to_megabyte[index] * memory

    @property
    def dict_info(self):

        info = {"task_id": self.task.task_id, "command": self.task.command}

        return info
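
For context, the Executor is not started directly: the Dispatcher (Example #7) launches it once the VM reports RUNNING and the task is not already running. A condensed sketch of that hand-off, with loader and vm taken from the dispatcher's context:

# Condensed from Dispatcher.__execution_loop above.
cuda_task = loader.cudalign_task
if vm.state == CloudManager.RUNNING and not cuda_task.is_running():
    executor = Executor(task=cuda_task, vm=vm, loader=loader)
    executor.thread.start()  # runs __run() on a daemon thread, polling the remote daemon
    cuda_task.start_execution(vm.instance_type.type)
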