Ejemplo n.º 1
0
    def __init__(self, sender, tid):
        """Build a TASK_FINISHED message.

        The message body is ``sender`` and ``tid`` concatenated with this
        object's ``field_separator`` in between.

        Args:
            sender: Sender field of the message; must be truthy.
            tid: Task ID field of the message; must be truthy.

        Raises:
            RuntimeError: If ``sender`` or ``tid`` is falsy.
        """
        # Validate in the same order as the fields appear in the body.
        if not sender:
            raise RuntimeError('No sender is set!')
        if not tid:
            raise RuntimeError('No tid is set!')

        payload = sender + self.field_separator + tid
        message_type = MessageType.TASK_FINISHED()

        super().__init__(message_type, payload)
    def create(message):
        """Create the message object that corresponds to a raw message string.

        The string is split on ``BaseMessage.field_separator``. The first
        item selects the message type and the number of remaining items must
        match what that type expects (TASK_ASSIGN is the exception: it gets
        the complete raw string and is not checked for its field count here).

        Args:
            message: Raw message string; must be truthy.

        Returns:
            A TaskRequest, TaskFinished, Acknowledge, WaitCommand, Heartbeat,
            ExitCommand or TaskAssign instance.

        Raises:
            RuntimeError: If ``message`` is falsy, or if no message type
                matches the type field together with the field count.
        """
        if not message:
            raise RuntimeError('Message is not set!')

        # str.split() with an explicit separator always returns at least one
        # item, so the type field is always present and needs no extra guard.
        msg_type, *fields = message.split(BaseMessage.field_separator)
        field_count = len(fields)

        if msg_type == MessageType.TASK_REQUEST() and field_count == 1:
            return TaskRequest(fields[0])

        if msg_type == MessageType.TASK_FINISHED() and field_count == 2:
            return TaskFinished(fields[0], fields[1])

        if msg_type == MessageType.ACKNOWLEDGE() and field_count == 0:
            return Acknowledge()

        if msg_type == MessageType.WAIT_COMMAND() and field_count == 1:
            return WaitCommand(fields[0])

        if msg_type == MessageType.HEARTBEAT() and field_count == 1:
            return Heartbeat(fields[0])

        if msg_type == MessageType.EXIT_COMMAND() and field_count == 0:
            return ExitCommand()

        # TaskAssign receives the complete raw message, not the split fields.
        if msg_type == MessageType.TASK_ASSIGN():
            return TaskAssign(message)

        raise RuntimeError(f"No message could be created from: {message}")
    def __init__(self, sender):
        """Build a TASK_REQUEST message whose body is ``sender``.

        Args:
            sender: Sender field of the message; must be truthy.

        Raises:
            RuntimeError: If ``sender`` is falsy.
        """
        if not sender:
            raise RuntimeError('No sender is set!')

        message_type = MessageType.TASK_REQUEST()

        super().__init__(message_type, sender)
Ejemplo n.º 4
0
def main():
    """Run the master: serve controller messages until shutdown.

    Parses the command line, reads the master configuration, initialises
    logging, takes the PID lock and starts the task generator. It then
    processes incoming messages in a loop:

    * TASK_REQUEST  -> replies with a TaskAssign or a WaitCommand.
    * TASK_FINISHED -> marks the task as finished, pushes its TID to the
      result queue and replies with an Acknowledge.
    * HEARTBEAT     -> replies with an Acknowledge.

    As soon as TASK_DISTRIBUTION is off, every incoming message is answered
    with an ExitCommand and the sender is dropped from the heartbeat table.
    On a poll timeout, controllers whose last heartbeat is older than the
    controller timeout are dropped as well. The loop ends when
    check_all_controller_down() says so or when the maximum error count is
    reached.

    Finally a still running task generator is signalled with SIGUSR1 and,
    if it has not stopped after about ten seconds, terminated.

    This function never returns normally: it always ends with sys.exit(),
    using status 1 if any error was counted or the PID lock could not be
    taken, and status 0 otherwise.
    """

    MinimalPython.check()

    error_count = 0
    max_error_count = 100

    task_generator = None

    try:

        args = init_arg_parser()

        if args.print_version:
            print(f"Version {cyclone.VERSION}")
            sys.exit()

        config_file_reader = MasterConfigFileReader(args.config_file)

        init_logging(config_file_reader.log_filename, args.enable_debug)

        with PIDControl(config_file_reader.pid_file) as pid_control, \
                MasterCommHandler(config_file_reader.comm_target,
                                  config_file_reader.comm_port,
                                  config_file_reader.poll_timeout) as comm_handler, \
                SharedQueue() as task_queue, \
                SharedQueue() as result_queue:

            if pid_control.lock():

                logging.info("Started")
                logging.info(f"Master PID: {pid_control.pid()}")
                logging.info(f"Version: {cyclone.VERSION}")

                signal.signal(signal.SIGHUP, signal_handler)
                signal.signal(signal.SIGINT, signal_handler)
                signal.signal(signal.SIGTERM, signal_handler)

                # Let these signals interrupt blocking system calls instead
                # of having them restarted.
                signal.siginterrupt(signal.SIGHUP, True)
                signal.siginterrupt(signal.SIGINT, True)
                signal.siginterrupt(signal.SIGTERM, True)

                comm_handler.connect()

                # Sender -> timestamp of the last message received from it.
                controller_heartbeat_dict = dict()
                # TID -> TaskStatusItem of the last assignment of that task.
                task_status_dict = dict()

                controller_timeout = config_file_reader.controller_timeout
                controller_wait_duration = config_file_reader.controller_wait_duration
                task_resend_timeout = config_file_reader.task_resend_timeout

                task_generator = create_task_generator(task_queue,
                                                       result_queue,
                                                       config_file_reader)
                task_generator.start()

                # TODO: Make a class for the master.
                global TASK_DISTRIBUTION

                run_flag = True

                while run_flag:

                    try:

                        last_exec_timestamp = int(time.time())

                        recv_data = comm_handler.recv_string()

                        send_msg = None

                        if recv_data:

                            logging.debug("Retrieved message: %s", recv_data)

                            recv_msg = MessageFactory.create(recv_data)
                            recv_msg_type = recv_msg.type()

                            # TODO: Caution, sender is not set everywhere!
                            controller_heartbeat_dict[recv_msg.sender] = int(
                                time.time())

                            if TASK_DISTRIBUTION:

                                if recv_msg_type == MessageType.TASK_REQUEST():

                                    task = None

                                    with CriticalSection(
                                            task_queue.lock,
                                            timeout=1) as critical_section:

                                        if critical_section.is_locked():

                                            if not task_queue.is_empty():
                                                task = task_queue.pop_nowait()

                                            else:

                                                if not task_generator.is_alive(
                                                ):

                                                    TASK_DISTRIBUTION = False
                                                    controller_wait_duration = 0

                                                    # Allow a TaskGenerator to quit itself without notifying the master.
                                                    logging.info(
                                                        "Task Generator is not alive."
                                                    )

                                    if task:

                                        do_task_assign = False

                                        if task.tid in task_status_dict:

                                            task_resend_threshold = \
                                                (task_status_dict[task.tid].timestamp + task_resend_timeout)

                                            # A known task is handed out again
                                            # only if it was finished before
                                            # or its resend timeout elapsed.
                                            if task_status_dict[task.tid].state == TaskState.finished() \
                                                    or last_exec_timestamp >= task_resend_threshold:

                                                do_task_assign = True

                                            elif task_status_dict[task.tid].state == TaskState.assigned() \
                                                    and last_exec_timestamp < task_resend_threshold:

                                                logging.debug(
                                                    "Ignoring task to assign..."
                                                    " - Waiting for task with TID to finish: %s",
                                                    task.tid)

                                                send_msg = WaitCommand(
                                                    controller_wait_duration)

                                            else:
                                                raise RuntimeError(
                                                    f"Undefined state processing task: {task.tid}"
                                                )

                                        else:
                                            do_task_assign = True

                                        # TODO: Could be a method to be called instead of `do_task_assign = True`
                                        if do_task_assign:

                                            task_status_dict[task.tid] = \
                                                TaskStatusItem(task.tid,
                                                               TaskState.assigned(),
                                                               recv_msg.sender,
                                                               int(time.time()))

                                            send_msg = TaskAssign(task)

                                    else:
                                        send_msg = WaitCommand(
                                            controller_wait_duration)

                                    logging.debug("Sending message: %s",
                                                  send_msg.to_string())
                                    comm_handler.send_string(
                                        send_msg.to_string())

                                elif recv_msg_type == MessageType.TASK_FINISHED(
                                ):

                                    tid = recv_msg.tid

                                    if tid in task_status_dict:

                                        # Only the controller the task was
                                        # assigned to may report it finished.
                                        if recv_msg.sender == task_status_dict[
                                                tid].controller:

                                            logging.debug(
                                                "Retrieved finished message for TID: %s",
                                                tid)
                                            task_status_dict[
                                                tid].state = TaskState.finished(
                                                )
                                            task_status_dict[
                                                tid].timestamp = int(
                                                    time.time())

                                            logging.debug(
                                                "Pushing TID to result queue: %s",
                                                tid)
                                            result_queue.push(tid)

                                        else:
                                            logging.warning(
                                                "Retrieved task finished from different controller!"
                                            )

                                    else:
                                        raise RuntimeError(
                                            "Inconsistency detected on task finished!"
                                        )

                                    send_msg = Acknowledge()

                                    if logging.root.isEnabledFor(
                                            logging.DEBUG):
                                        logging.debug("Sending message: %s",
                                                      send_msg.to_string())

                                    comm_handler.send_string(
                                        send_msg.to_string())

                                elif recv_msg_type == MessageType.HEARTBEAT():

                                    send_msg = Acknowledge()

                                    if logging.root.isEnabledFor(
                                            logging.DEBUG):
                                        logging.debug("Sending message: %s",
                                                      send_msg.to_string())

                                    comm_handler.send_string(
                                        send_msg.to_string())

                                else:
                                    raise RuntimeError(
                                        f"Undefined type found in message: {recv_msg.to_string()}"
                                    )

                            else:  # Do graceful shutdown, since task distribution is off!

                                send_msg = ExitCommand()

                                if logging.root.isEnabledFor(logging.DEBUG):
                                    logging.debug("Sending message: %s",
                                                  send_msg.to_string())

                                comm_handler.send_string(
                                    send_msg.to_string())  # Does not block.

                                controller_heartbeat_dict.pop(
                                    recv_msg.sender, None)

                                if check_all_controller_down(
                                        len(controller_heartbeat_dict)):
                                    run_flag = False

                        else:  # POLL-TIMEOUT

                            logging.debug('RECV-MSG TIMEOUT')

                            # This gives controllers the last chance to quit themselves until a timeout is reached.
                            if not TASK_DISTRIBUTION:

                                # Iterate over a snapshot: removing entries
                                # while iterating the dict itself raises
                                # "dictionary changed size during iteration".
                                for controller_name, last_heartbeat in list(
                                        controller_heartbeat_dict.items()):

                                    controller_threshold = \
                                        last_heartbeat + controller_timeout

                                    if last_exec_timestamp >= controller_threshold:
                                        controller_heartbeat_dict.pop(
                                            controller_name, None)

                                if check_all_controller_down(
                                        len(controller_heartbeat_dict)):
                                    run_flag = False

                    except Exception as err:

                        # Any error in the loop stops the task distribution;
                        # the loop itself keeps running until the error
                        # limit is hit or all controllers are gone.
                        error_count += 1
                        _, _, exc_tb = sys.exc_info()
                        filename = os.path.split(
                            exc_tb.tb_frame.f_code.co_filename)[1]
                        logging.error(
                            f"Caught exception in main loop: {err} - {filename} (line: {exc_tb.tb_lineno})"
                        )

                        stop_task_distribution()

                        if error_count == max_error_count:
                            run_flag = False

            else:

                logging.error(
                    f"Another instance might be already running (PID file: {config_file_reader.pid_file})!"
                )
                sys.exit(1)

    except Exception as err:

        error_count += 1
        _, _, exc_tb = sys.exc_info()
        filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logging.error(
            f"Caught exception in main block: {err} - {filename} (line: {exc_tb.tb_lineno})"
        )

    try:

        if task_generator and task_generator.is_alive():

            # Ask the task generator to stop, then give it up to ten
            # one-second waits before terminating it.
            os.kill(task_generator.pid, signal.SIGUSR1)

            for _ in range(10):

                if task_generator.is_alive():
                    logging.debug("Waiting for Task Generator to finish...")
                    time.sleep(1)
                else:
                    break

            if task_generator.is_alive():
                task_generator.terminate()
                task_generator.join()

    except Exception as err:

        error_count += 1
        _, _, exc_tb = sys.exc_info()
        filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logging.error(
            f"Exception in {filename} (line: {exc_tb.tb_lineno}): {err}")

    logging.info("Finished")

    if error_count:
        sys.exit(1)

    sys.exit(0)
Ejemplo n.º 5
0
    def __init__(self, sender):
        """Build a HEARTBEAT message whose body is ``sender``.

        Args:
            sender: Sender field of the message; must be truthy.

        Raises:
            RuntimeError: If ``sender`` is falsy.
        """
        if not sender:
            raise RuntimeError('No sender is set!')

        message_type = MessageType.HEARTBEAT()

        super().__init__(message_type, sender)
 def __init__(self, duration):
     """Build a WAIT_COMMAND message whose body is ``str(duration)``.

     Args:
         duration: Wait duration; it is converted with ``str()`` and not
             validated here.
     """
     message_type = MessageType.WAIT_COMMAND()
     body = str(duration)

     super().__init__(message_type, body)
 def __init__(self):
     """Build an ACKNOWLEDGE message with an empty body."""
     message_type = MessageType.ACKNOWLEDGE()
     empty_body = ''

     super().__init__(message_type, empty_body)
Ejemplo n.º 8
0
 def __init__(self):
     """Build an EXIT_COMMAND message with an empty body."""
     message_type = MessageType.EXIT_COMMAND()
     empty_body = ''

     super().__init__(message_type, empty_body)
Ejemplo n.º 9
0
def main():
    """Run the controller: request tasks from the master and feed workers.

    Parses the command line, reads the controller configuration, initialises
    logging, takes the PID lock, creates and starts the workers and then
    loops while the global RUN_CONDITION is true. Each iteration builds at
    most one outgoing message, in this order of priority:

    1. TaskFinished, if the result queue holds a task ID.
    2. TaskRequest, if at least one live worker is found in READY state.
    3. Heartbeat, if the live workers are busy and the result queue is
       still empty after waiting on the result-queue condition.

    After sending, the response is processed: TASK_ASSIGN pushes the task to
    the task queue, ACKNOWLEDGE is a no-op, WAIT_COMMAND sleeps for the
    given duration and EXIT_COMMAND clears RUN_CONDITION. Without a
    response the controller waits, tries to receive once more and otherwise
    reconnects and counts a retry.

    When the loop has ended, every live worker gets SIGUSR1 and a PoisenPill
    via the task queue, repeated once per second until no worker is alive.

    This function never returns normally: it ends with sys.exit(0) on the
    regular path and sys.exit(1) if the PID lock cannot be taken or an
    exception reaches the outer handler.
    """

    MinimalPython.check()

    try:

        args = init_arg_parser()

        if args.print_version:
            print(f"Version {cyclone.VERSION}")
            sys.exit()

        config_file_reader = ControllerConfigFileReader(args.config_file)

        init_logging(config_file_reader.log_filename, args.enable_debug)

        with PIDControl(config_file_reader.pid_file) as pid_control, \
                ControllerCommHandler(config_file_reader.comm_target,
                                      config_file_reader.comm_port,
                                      config_file_reader.poll_timeout) as comm_handler, \
                SharedQueue() as result_queue, \
                SharedQueue() as task_queue:

            if pid_control.lock():

                logging.info("Started")
                logging.info(f"Controller PID: {pid_control.pid()}")

                logging.debug("Version: %s", cyclone.VERSION)

                # SIGINT is ignored; SIGHUP and SIGTERM go to signal_handler.
                signal.signal(signal.SIGINT, signal.SIG_IGN)
                signal.signal(signal.SIGHUP, signal_handler)
                signal.signal(signal.SIGTERM, signal_handler)

                # Let these signals interrupt blocking system calls instead
                # of having them restarted.
                signal.siginterrupt(signal.SIGHUP, True)
                signal.siginterrupt(signal.SIGINT, True)
                signal.siginterrupt(signal.SIGTERM, True)

                comm_handler.connect()

                # Number of consecutive reconnects without any response.
                request_retry_count = 0
                request_retry_wait_duration = config_file_reader.request_retry_wait_duration
                max_num_request_retries = config_file_reader.max_num_request_retries

                lock_worker_state_table = multiprocessing.Lock()
                lock_result_queue = multiprocessing.Lock()

                # The condition shares the result-queue lock, so holding the
                # condition also guards access to the result queue.
                cond_result_queue = multiprocessing.Condition(
                    lock_result_queue)

                worker_count = config_file_reader.worker_count
                worker_ids = create_worker_ids(worker_count)
                worker_state_table = create_worker_state_table(worker_ids)

                worker_handle_dict = \
                    create_worker(worker_state_table,
                                  lock_worker_state_table,
                                  task_queue,
                                  result_queue,
                                  cond_result_queue)

                global RUN_CONDITION

                if not start_worker(worker_handle_dict, worker_state_table):

                    logging.error("Not all worker are ready!")
                    RUN_CONDITION = False

                while RUN_CONDITION:

                    try:

                        send_msg = None

                        # 1st priority: report a finished task.
                        if not send_msg:

                            with CriticalSection(cond_result_queue):

                                if not result_queue.is_empty():

                                    task_id = result_queue.pop_nowait()

                                    if task_id:

                                        logging.debug("Finished task: %s",
                                                      task_id)
                                        send_msg = TaskFinished(
                                            comm_handler.fqdn, task_id)

                        # 2nd priority: request a task for a ready worker.
                        if not send_msg:

                            found_ready_worker = False

                            with CriticalSection(lock_worker_state_table):

                                for worker_id in worker_state_table.keys():

                                    # NOTE(review): get_state is compared
                                    # without being called - this only works
                                    # if it is a property/attribute; confirm
                                    # against the worker state table item.
                                    if worker_handle_dict[worker_id].is_alive() \
                                            and worker_state_table[worker_id].get_state == WorkerState.READY:

                                        found_ready_worker = True
                                        break

                            if found_ready_worker:

                                logging.debug('Requesting a task...')

                                send_msg = TaskRequest(comm_handler.fqdn)

                            else:

                                worker_count = len(worker_state_table)
                                worker_count_not_active = 0

                                for worker_id in worker_state_table.keys():

                                    if not worker_handle_dict[
                                            worker_id].is_alive():
                                        worker_count_not_active += 1

                                if worker_count == worker_count_not_active:

                                    logging.error('No worker are alive!')
                                    RUN_CONDITION = False

                                else:  # Available worker are busy

                                    with CriticalSection(cond_result_queue):

                                        wait_timeout_result_queue = 1

                                        # Wait on the condition with a
                                        # timeout; if no result has shown up
                                        # afterwards, send a heartbeat.
                                        cond_result_queue.wait(
                                            wait_timeout_result_queue)

                                        if result_queue.is_empty():
                                            send_msg = Heartbeat(
                                                comm_handler.fqdn)

                        if send_msg:

                            if logging.root.isEnabledFor(logging.DEBUG):
                                # TODO: remove redundant call of send_msg.to_string()
                                logging.debug("Sending message to master: %s",
                                              send_msg.to_string())

                            comm_handler.send_string(send_msg.to_string())

                            # Check for response and process it.
                            # Used redundant - TODO: make a class method.
                            ################################################################################
                            in_raw_data = comm_handler.recv_string()

                            if in_raw_data:

                                logging.debug(
                                    "Retrieved message (raw data): %s",
                                    in_raw_data)

                                in_msg = MessageFactory.create(in_raw_data)
                                in_msg_type = in_msg.type()

                                if MessageType.TASK_ASSIGN() == in_msg_type:

                                    task = in_msg.to_task()
                                    logging.debug(
                                        "Retrieved task assign for: %s",
                                        task.tid)
                                    task_queue.push(task)
                                    logging.debug(
                                        "Pushed task to task queue: %s",
                                        task.tid)

                                elif MessageType.ACKNOWLEDGE() == in_msg_type:
                                    pass

                                elif MessageType.WAIT_COMMAND() == in_msg_type:

                                    #TODO: Implement it on the master side!
                                    wait_duration = in_msg.duration
                                    logging.debug(
                                        "Retrieved Wait Command with duration: %fs",
                                        wait_duration)
                                    time.sleep(wait_duration)

                                elif MessageType.EXIT_COMMAND() == in_msg_type:

                                    RUN_CONDITION = False
                                    logging.info(
                                        'Retrieved exit message from master...'
                                    )

                                # Any response resets the retry counter.
                                if request_retry_count:
                                    request_retry_count = 0
################################################################################

                            else:

                                # NOTE(review): after disconnect() the code
                                # still falls through to the sleep and the
                                # second recv_string() below - confirm that
                                # this is intended.
                                if request_retry_count == max_num_request_retries:

                                    logging.info(
                                        'Exiting, since maximum retry count is reached!'
                                    )
                                    comm_handler.disconnect()
                                    RUN_CONDITION = False

                                time.sleep(request_retry_wait_duration)

                                # Check for response and process it.
                                # Used redundant - TODO: make a class method.
                                ################################################################################
                                in_raw_data = comm_handler.recv_string()

                                if in_raw_data:

                                    logging.debug(
                                        "Retrieved message (raw data): %s",
                                        in_raw_data)

                                    in_msg = MessageFactory.create(in_raw_data)
                                    in_msg_type = in_msg.type()

                                    if MessageType.TASK_ASSIGN(
                                    ) == in_msg_type:

                                        task = in_msg.to_task()
                                        logging.debug(
                                            "Retrieved task assign for: %s",
                                            task.tid)
                                        task_queue.push(task)
                                        logging.debug(
                                            "Pushed task to task queue: %s",
                                            task.tid)

                                    elif MessageType.ACKNOWLEDGE(
                                    ) == in_msg_type:
                                        pass

                                    elif MessageType.WAIT_COMMAND(
                                    ) == in_msg_type:

                                        #TODO: Implement it on the master side!
                                        wait_duration = in_msg.duration
                                        logging.debug(
                                            "Retrieved Wait Command with duration: %fs",
                                            wait_duration)
                                        time.sleep(wait_duration)

                                    elif MessageType.EXIT_COMMAND(
                                    ) == in_msg_type:

                                        RUN_CONDITION = False
                                        logging.info(
                                            'Retrieved exit message from master...'
                                        )

                                    # Any response resets the retry counter.
                                    if request_retry_count:
                                        request_retry_count = 0


################################################################################

                                else:

                                    # Still nothing received: reconnect and
                                    # count this as one more retry.
                                    logging.debug(
                                        'No response retrieved - Reconnecting...'
                                    )
                                    comm_handler.reconnect()
                                    request_retry_count += 1

                    except Exception as err:

                        # Any exception inside the loop ends the loop.
                        RUN_CONDITION = False
                        exc_type, _, exc_tb = sys.exc_info()
                        filename = os.path.split(
                            exc_tb.tb_frame.f_code.co_filename)[1]
                        logging.error(
                            f"Caught exception (type: {exc_type}) in main loop: {err} "
                            f"- {filename} (line: {exc_tb.tb_lineno})")

                if not RUN_CONDITION:

                    try:

                        logging.info("Shutting down all worker...")

                        all_worker_down = False

                        # Repeat once per second until no worker is alive.
                        while not all_worker_down:

                            found_active_worker = False

                            for worker_id in worker_state_table.keys():

                                if worker_handle_dict[worker_id].is_alive():

                                    # Signal the worker and also queue a
                                    # PoisenPill on the task queue.
                                    os.kill(worker_handle_dict[worker_id].pid,
                                            signal.SIGUSR1)

                                    task_queue.push(PoisenPill())

                                    logging.debug(
                                        "Waiting for worker to complete: %s",
                                        worker_handle_dict[worker_id].name)

                                    found_active_worker = True

                            if not found_active_worker:
                                all_worker_down = True
                                logging.debug('All worker are down.')

                            else:
                                logging.debug(
                                    'Waiting for worker to shutdown...')
                                time.sleep(1)

                    except Exception as err:
                        logging.error(
                            f"Caught exception terminating Worker: {err}")

            else:

                logging.error(
                    f"Another instance might be already running (PID file: {config_file_reader.pid_file})!"
                )
                sys.exit(1)

    except Exception as err:

        exc_type, _, exc_tb = sys.exc_info()
        filename = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logging.error(
            f"Exception in {filename} (line: {exc_tb.tb_lineno}): {err}")
        sys.exit(1)

    logging.info('Finished')
    sys.exit(0)