Example 1
class Reclaimer:

    def __init__(self, com: ICommunication_Controller, logger: Logger = None):
        self.__com = com
        if logger is None:
            self.__log = Logger(title_info='Retrieve', log_to_file=True)
        else:
            self.__log = logger

    def require_client_log(self):
        """
            Require client_log file from all workers.
        :return: None
        """
        # send request
        for id in self.__com.available_clients:
            self.__com.send_one(id, RequestWorkingLog())
            self.__log.log_message('Acquire log file from worker({}).'.format(id))

        try:
            nodes_ready = set()
            total_nodes = set(self.__com.available_clients)
            while nodes_ready != total_nodes:

                id_from, log = self.__com.get_one()

                if isinstance(log, DoneType):
                    log.restore()
                    file_format = "\n\t\t--> ".join([filename for filename in log.file_list])
                    self.__log.log_message('Save file for {}.\n\tList:\n\t\t--> {}'.format(id_from, file_format))
                    nodes_ready.add(id_from)
                    self.__log.log_message('Node({}) is done, {} is done.'.format(id_from, nodes_ready))

        except Exception as e:
            # print DEBUG message
            import sys
            import traceback
            exc_type, exc_value, exc_tb = sys.exc_info()
            exc_tb = traceback.format_exception(exc_type, exc_value, exc_tb)
            exc_format = "".join(exc_tb)
            self.__log.log_error('Exception occurred: {}\n\t{}'.format(e, exc_format))
            # print DEBUG message

        self.__log.log_message('Done.')
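
A minimal usage sketch for Reclaimer follows. It assumes the project's ICommunication_Controller and Logger classes are importable; the module paths in the import lines are placeholders, not the repository's actual layout.

# Hypothetical driver for Reclaimer; import paths are assumptions.
from network.interfaces import ICommunication_Controller  # assumed location
from utils.log import Logger                               # assumed location
from roles.reclaimer import Reclaimer                      # assumed location


def collect_logs(com: ICommunication_Controller) -> None:
    # Pull log files from every connected worker through the given controller.
    reclaimer = Reclaimer(com, Logger(title_info='Retrieve', log_to_file=True))
    reclaimer.require_client_log()
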
Example 2
class PSGD_Worker:

    Training_TimeOut_Limit = 180

    def __init__(self):
        self.__running_thread = None
        self.client_logger = Logger(title_info='Worker-{}'.format(get_repr()),
                                    log_to_file=True)
        self.__training_log = None

        self.client_logger.log_message(
            'Worker started and ready for job submission.')

    def slave_forever(self):
        # set up listening port
        constructor = Worker_Communication_Constructor(
            '0.0.0.0',
            STAR_NET_WORKING_PORTS,
            worker_register=CLZ_WORKER_REGISTER())
        while True:
            com = None
            try:
                self.client_logger.log_message(
                    'Worker started, prepare for connection...')
                register = constructor.buildCom()
                com = Communication_Controller(CLZ_COM_PROCESS(register))
                com.establish_communication()

                self.client_logger.log_message(
                    'Job submission received. Node assigned node_id({})'.
                    format(com.Node_Id))

                if self.init_PSGD(com):
                    self.do_training(com)

                GlobalSettings.clear_default()
                self.client_logger.log_message(
                    'Current session closed, node_id({}).'.format(com.Node_Id))

            except Exception as e:
                self.client_logger.log_error(
                    'Exception occurred: {}'.format(e))

                # print DEBUG message
                import sys
                import traceback
                exc_type, exc_value, exc_tb = sys.exc_info()
                exc_tb = traceback.format_exception(exc_type, exc_value,
                                                    exc_tb)
                for line in exc_tb:
                    self.client_logger.log_message(line)
                # print DEBUG message

            except KeyboardInterrupt:
                self.client_logger.log_error(
                    'Worker shutdown by interruption.')
                constructor.close()
                break
            finally:
                time.sleep(10)
                if isinstance(com, Communication_Controller):
                    com.close()

            self.client_logger.log_message('Worker restarting...')
            # wait for safe closure

    def init_PSGD(self, com: Communication_Controller) -> bool:
        self.client_logger.log_message(
            'ACK job submission and request global settings.')

        # ignore other data
        def acquire(com):
            id_from, data = com.get_one()
            while id_from != Initialization_Server:
                id_from, data = com.get_one()
            return data

        # initialize global settings
        com.send_one(Initialization_Server, Init.GlobalSettings)
        # get data
        data = acquire(com)
        # restore global settings
        if not isinstance(data, Reply.global_setting_package):
            if data == Reply.I_Need_Your_Working_Log:
                self.client_logger.log_message(
                    'Nothing needs to be done, send back logfile and exit process.'
                )
                com.send_one(Initialization_Server,
                             Binary_File_Package(self.client_logger.File_Name))
                if isinstance(self.__training_log, Logger):
                    com.send_one(
                        Initialization_Server,
                        Binary_File_Package(self.__training_log.File_Name))
                if isinstance(self.__running_thread, PSGDTraining_Client):
                    com.send_one(
                        Initialization_Server,
                        Binary_File_Package(self.__running_thread.Trace_Eval))
                    com.send_one(
                        Initialization_Server,
                        Binary_File_Package(self.__running_thread.Trace_Train))
                com.send_one(Initialization_Server, Done_Type())
            return False

        data.restore()

        self.client_logger.log_message('Request codec and sgd class.')
        # initialize codec and sgd type
        com.send_one(Initialization_Server, Init.Codec_And_SGD_Type)

        data = acquire(com)
        assert isinstance(data, Reply.codec_and_sgd_package)

        codec, sgd = data.restore()

        self.client_logger.log_message('Request weights and layer type.')
        # initialize weights and layer
        com.send_one(Initialization_Server, Init.Weights_And_Layers)
        data = acquire(com)
        assert isinstance(data, Reply.weights_and_layers_package)

        layers = data.restore()

        self.client_logger.log_message('Request other stuff.')
        # others
        com.send_one(Initialization_Server, Init.MISC)
        data = acquire(com)
        assert isinstance(data, Reply.misc_package)

        loss_t = data.loss_type
        target_acc = data.target_acc
        epoch = data.epoch
        learn_rate = data.learn_rate
        w_type = data.w_types
        op = data.optimizer
        metric = data.metric

        self.__training_log = Logger('Training log @ node-{}'.format(
            com.Node_Id),
                                     log_to_file=True)

        if com.Node_Id != Parameter_Server:

            self.client_logger.log_message('Request data samples.')
            # initialize dataset
            com.send_one(Initialization_Server, Init.Samples)
            data = acquire(com)
            # restore
            assert isinstance(data, Reply.data_sample_package)

            train_x, train_y, eval_x, eval_y = data.restore()

            self.__running_thread = PSGDTraining_Client(
                model_init=layers,
                loss=loss_t,
                codec_type=codec,
                sync_class=sgd,
                com=com,
                w_types=w_type,
                tags=build_tags(node_id=com.Node_Id),
                train_x=train_x,
                train_y=train_y,
                eval_x=eval_x,
                eval_y=eval_y,
                optimizer=op,
                batch_size=GlobalSettings.get_default().batch.batch_size,
                epochs=epoch,
                logger=self.__training_log,
                learn_rate=learn_rate,
                target_acc=target_acc,
                metrics=metric)
        else:
            self.__running_thread = PSGDTraining_Parameter_Server(
                model_init=layers,
                ps_codec=codec,
                ps_sgd_type=sgd,
                com=com,
                w_types=w_type,
                logger=self.__training_log)

        self.client_logger.log_message(
            'Submit stage complete, Total bytes sent: {}'.format(
                com.Com.bytes_sent))
        self.client_logger.log_message(
            'Submit stage complete, Total bytes read: {}'.format(
                com.Com.bytes_read))
        return True

    def do_training(self, com: Communication_Controller):
        self.client_logger.log_message('Prepare to start training process.')
        # check
        assert isinstance(self.__running_thread, Thread)
        assert isinstance(self.__training_log, Logger)

        ready_state = {}
        self.client_logger.log_message('Synchronize timeline with cluster.')

        len_ready = len(com.available_clients())
        time_count = 0
        # check ready states
        while len(ready_state) != len_ready:
            # require
            n, d = com.get_one(False)
            if isinstance(d, Ready_Type):
                ready_state[n] = True
                time_count = 0
            if len(com.available_clients()) < len_ready:
                raise OSError('Minimum number of clients cannot be satisfied.')
            if time_count > PSGD_Worker.Training_TimeOut_Limit:
                raise AssertionError(
                    'Maximum waiting time exceeded, giving up and resetting environment.'
                )
            for node_id in com.available_clients():
                com.send_one(node_id, Ready_Type())
            time.sleep(1)
            time_count += 1

        try:
            self.client_logger.log_message('Execution process started.')
            data_sent_mark = com.Com.bytes_sent
            data_recv_mark = com.Com.bytes_read
            begin = time.time()
            self.__running_thread.start()
            self.__running_thread.join()
            end = time.time()

            self.__training_log.log_message(
                'Execution complete, time:{}'.format(end - begin))
            self.__training_log.log_message(
                'Bytes sent: {}'.format(com.Com.bytes_sent - data_sent_mark))
            self.__training_log.log_message(
                'Bytes read: {}'.format(com.Com.bytes_read - data_recv_mark))

            self.client_logger.log_message(
                'Execution complete, time:{}'.format(end - begin))
            self.client_logger.log_message(
                'Training stage complete, Total bytes sent: {}'.format(
                    com.Com.bytes_sent))
            self.client_logger.log_message(
                'Training stage complete, Total bytes read: {}'.format(
                    com.Com.bytes_read))

            if isinstance(self.__running_thread, PSGDTraining_Client):
                train_csv = Binary_File_Package(
                    self.__running_thread.Trace_Train)
                eval_csv = Binary_File_Package(
                    self.__running_thread.Trace_Eval)

                self.client_logger.log_message('Post training log.')
                com.send_one(Initialization_Server, train_csv)
                com.send_one(Initialization_Server, eval_csv)

        except Exception as error:
            self.client_logger.log_error(
                'Error encountered while executing : {}'.format(error))
            self.__training_log.log_error(
                'Error encountered while executing : {}'.format(error))

        self.client_logger.log_message('Training process exited.')
        log_file = Binary_File_Package(self.__training_log.File_Name)
        com.send_one(Initialization_Server, log_file)
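
A hedged entry-point sketch for PSGD_Worker: the worker simply listens forever and serves one job submission per session. The import path below is an assumption.

# Hypothetical worker entry point; the import path is an assumption.
from psgd.worker import PSGD_Worker  # assumed location

if __name__ == '__main__':
    # Blocks forever: binds STAR_NET_WORKING_PORTS, accepts a job submission,
    # runs init_PSGD/do_training, then restarts for the next session.
    PSGD_Worker().slave_forever()
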
Example 3
class Coordinator:
    def __init__(self, hyper_model: IServerModel, logger=None):
        self.__com = None
        self.__model = hyper_model
        if logger is None:
            self.__log = Logger(title_info='Coordinator-{}'.format(get_repr()),
                                log_to_file=True)
        else:
            self.__log = logger

    def set_workers(self, works: list, nodes_required) -> bool:
        """
            Set worker list.
        :param works: list of tuples,
                        like: [ (rule1, address1), (rule2, address2), ... ]
        :param nodes_required: number of worker nodes required for this task
        :return: None, raises an exception if two workers are assigned the same id.
        """
        pkg = IPA()
        uuid_for_this_task = str(random.randint(0, 0x7fffffff))
        current_node_id_assigned = 0
        # set all address
        for rule, addr in works:
            # Stop connecting if the required node count has been satisfied.
            if current_node_id_assigned >= nodes_required and rule == "Worker":
                self.__log.log_message('Number of nodes satisfied.')
                break
            if rule == "PS":
                _id = Parameter_Server
            else:
                _id = current_node_id_assigned
                current_node_id_assigned += 1
            pkg.put(_id, uuid_for_this_task, addr)
            self.__log.log_message(
                'Add worker (Rule: {}, Id: {}, Address: {}).'.format(
                    rule, _id, addr))

        self.__log.log_message('Try connecting to the cluster.')
        self.__com = NET(pkg)
        self.__com = Communication_Controller(self.__com)
        self.__com.establish_communication()
        self.__log.log_message('Connection with cluster established.')

        return True

    def resources_dispatch(self):
        """
            Reply to workers' requirements and prepare for the job.
        :return:
        """

        # assertion
        assert isinstance(self.__com, Communication_Controller)
        assert isinstance(self.__model, IServerModel)

        total_node_count = len(self.__com.available_clients())
        node_ready = set()
        key_interrupted_before = False

        while not self.__com.is_closed():
            try:
                id_from, data = self.__com.get_one()

                if isinstance(data, Init):
                    if data == Init.GlobalSettings:
                        reply = Reply.global_setting_package(
                            GlobalSettings.get_default())

                    elif data == Init.Weights_And_Layers:
                        reply = Reply.weights_and_layers_package(
                            self.__model.getWeightsInit())

                    elif data == Init.Codec_And_SGD_Type:
                        if id_from != Parameter_Server:
                            reply = Reply.codec_and_sgd_package(
                                self.__model.codec_ctrl(),
                                self.__model.psgd_type())
                        else:
                            reply = Reply.codec_and_sgd_package(
                                self.__model.psgd_server_codec(),
                                self.__model.psgd_server_type())

                    elif data == Init.Samples:
                        reply = Reply.data_sample_package(
                            *self.__model.train_data(),
                            *self.__model.eval_data())

                    elif data == Init.MISC:
                        reply = Reply.misc_package(
                            self.__model.epoches(), self.__model.loss_type(),
                            self.__model.learn_rate(),
                            self.__model.target_acc(),
                            self.__model.weights_types(),
                            self.__model.optimizer_type(),
                            self.__model.metric())

                    else:
                        reply = None

                    self.__log.log_message(
                        'Reply requirements to node({}), type({}).'.format(
                            id_from, reply.__class__.__name__))
                    self.__com.send_one(id_from, reply)

                elif isinstance(data, Ready_Type):
                    self.__com.send_one(id_from, Ready_Type())
                    if id_from in node_ready:
                        continue
                    node_ready.add(id_from)
                    self.__log.log_message(
                        'Node({}) is ready, {} nodes total, {} is ready.'.
                        format(id_from, total_node_count, node_ready))

                elif isinstance(data, Binary_File_Package):
                    self.__log.log_message(
                        'Restoring data ({}) from {}.'.format(
                            data.filename, id_from))
                    data.restore()

            except KeyboardInterrupt:
                if len(node_ready) < total_node_count:
                    self.__log.log_error(
                        'Some workers are not ready, close anyway?')
                    self.__log.log_message(
                        'Press Ctrl+C again to shutdown immediately.')
                    key_interrupted_before = True
                if key_interrupted_before or len(
                        node_ready) >= total_node_count:
                    self.__log.log_error('Coordinator closed by user.')
                    break

        self.__com.close()
        self.__log.log_message('Dispatcher closed.')

    def require_client_log(self):
        """
            Require client_log file from all workers.
        :return: None
        """
        assert isinstance(self.__com, Communication_Controller)
        # self.__log.log_message('Acquire log file from each worker.')
        # take all ACK
        for id in self.__com.available_clients():
            _, _ = self.__com.get_one()

        # send request
        for id in self.__com.available_clients():
            self.__com.send_one(id, Reply.I_Need_Your_Working_Log)

        try:
            # get result
            for id in self.__com.available_clients():
                self.__log.log_message(
                    'Acquire log file from worker({}).'.format(id))
                log = None
                while not isinstance(log, Done_Type):
                    _, log = self.__com.get_one()
                    if isinstance(log, Binary_File_Package):
                        log.restore()
                        self.__log.log_message(
                            'Save log file for worker({}).'.format(id))
        except:
            self.__log.log_error('Connection lost.')

        self.__com.close()
        self.__log.log_message('Done.')

        return
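
A minimal submission sketch for this Coordinator, assuming an IServerModel implementation is available; the addresses, the import paths and the model class are placeholders.

# Hypothetical submission script; imports and addresses are assumptions.
from coordinator import Coordinator   # assumed location
from models import MyServerModel      # assumed IServerModel implementation

if __name__ == '__main__':
    workers = [
        ("PS", "192.168.1.2"),        # one parameter server
        ("Worker", "192.168.1.10"),   # worker nodes
        ("Worker", "192.168.1.11"),
    ]
    co = Coordinator(MyServerModel())
    co.set_workers(workers, nodes_required=2)
    # Serve Init requests and collect uploaded files until closed by the user.
    co.resources_dispatch()
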
Example 4
class Coordinator:

    def __init__(self, com: ICommunication_Controller, estimate_bandwidth: int = 10, logger: IPrinter = None):
        """
            Coordinator
        :param com: Communication Thread
        :param estimate_bandwidth: bandwidth estimation, Bytes per second
        :param logger: IPrinter
        """
        self.__com = com
        if logger is None:
            self.__log = Logger(title_info='Coordinator', log_to_file=True)
        else:
            self.__log = logger
        self.__estimate_bandwidth = estimate_bandwidth
        self.__group_allocated = set()
        self.__global_allocated = set()
        self.__log.log_message("Coordinator version: {}.".format(VERSION))

    @property
    def allocated_nodes(self):
        return self.__global_allocated | self.__group_allocated

    def resources_dispatch(self, dispatch_map: Callable[[int, object], IReplyPackage]):
        """
            Reply to workers' requirements and prepare for the job.
        :param dispatch_map: Callable object; receives a node id and the content of an IRequestPackage
                            and returns an IReplyPackage instance for reply.
        :return:
        """
        # dispatch to certain group
        node_ready = set()

        while node_ready != self.allocated_nodes:

            try:
                id_from, data = self.__com.get_one()
                reply = None

                if isinstance(data, IRequestPackage):
                    reply = dispatch_map(id_from, data.content())

                    self.__log.log_message(
                        'Reply requirements to node({}), type({}).'.format(id_from, reply.__class__.__name__))

                elif isinstance(data, ReadyType):
                    reply = ReadyType(node_ready)

                    if id_from in node_ready:
                        continue

                    node_ready.add(id_from)
                    self.__log.log_message('Node({}) is ready, {} is ready.'.format(id_from, node_ready))

                elif isinstance(data, Version):
                    reply = Version(Initialization_Server)

                    self.__log.log_message("{}".format(data))

                self.__com.send_one(id_from, reply)

            except KeyboardInterrupt:
                if len(node_ready) < len(self.allocated_nodes):
                    self.__log.log_error('Some workers are not ready.')
                self.__log.log_error('Coordinator closed by user.')
                break

        self.__log.log_message('Dispatch complete.')

    def join(self) -> Dict[int, object]:
        """
            Join all workers, wait for all tasks.
            :return: a dict indicating what has been returned from the executor on each worker.
        """
        # Join all nodes.
        node_ready = set()
        # Collect result.
        results: Dict[int, object] = {}

        self.__log.log_message("Waiting for ({}) ...".format(self.allocated_nodes))

        while node_ready != self.allocated_nodes:

            id_from, data = self.__com.get_one()

            if isinstance(data, IReplyPackage):
                data.restore()
                self.__log.log_message('Restoring data ({}) from {}.'.format(data, id_from))

            if isinstance(data, DoneType):
                file_format = "\n\t\t--> ".join([filename for filename in data.file_list])
                self.__log.log_message('Save file for {}.\n\tList:\n\t\t--> {}'.format(id_from, file_format))

                node_ready.add(id_from)
                self.__log.log_message('Node({}) is done, {} is done.'.format(id_from, node_ready))

                results[id_from] = data.result

        self.__log.log_message("All task is complete.")
        return results

    def submit_group(self, worker_executor: Type[IExecutor], working_group: Iterable[int] = None, package_size: int = 1e9):
        """
            Submit a job to a specified worker group.
            Nodes inside this group will wait for each other and synchronize start time.
            The group will also wait until all single nodes are ready.
        :param worker_executor: executor class, implementation of IExecutor
        :param working_group: Worker group list, iterable object containing the id of each worker in the group.
        :param package_size: Package size in transmission. Potentially required by executor, and provided by dispatch.
        :return: None
        """
        # set work group
        if working_group is None:
            working_group = set(self.__com.available_clients)
        if not isinstance(working_group, set):
            working_group = set(working_group)
        # check for duplication
        assert len(self.__group_allocated & working_group) == 0, "Cannot submit a task to node which already has a job."
        # calculate data size
        dataset_ett = self.__com.available_clients_count * package_size / self.__estimate_bandwidth + 1
        # send request
        for _id in working_group:
            self.__com.send_one(_id, SubmitJob(working_group | self.__global_allocated, dataset_ett, worker_executor))

        self.__group_allocated = self.__group_allocated | working_group
        self.__log.log_message("Group submission complete ({}).".format(working_group))

    def submit_single(self, worker_executor: Type[IExecutor], worker_id: int, package_size: int = 1e9):
        """
            Submit a job to a specified node.
            This global node will start execution immediately once it is ready.
        :param worker_executor: executor class, implementation of IExecutor
        :param worker_id: Worker id.
        :param package_size: Package size in transmission. Potentially required by executor, and provided by dispatch.
        :return:
        """
        # check for duplication
        assert worker_id not in self.__global_allocated, "Cannot submit a task to node which already has a job."
        # calculate data size
        dataset_ett = self.__com.available_clients_count * package_size / self.__estimate_bandwidth + 0.6
        # send request
        self.__com.send_one(worker_id, SubmitJob({worker_id}, dataset_ett, worker_executor))

        self.__global_allocated.add(worker_id)
        self.__log.log_message("Single node submission complete.")
Example 5
    # get assignment class for batch_size calculation
    from server_util.init_model import get_assignment
    ass = get_assignment(arg.assignment)
    assignment = ass(arg.n, arg.r)
    # set up full batch_size
    batch_size = arg.b * assignment.block_count
    GlobalSettings.set_default(arg.n, arg.r, batch_size, assignment)
    # get dataset
    if arg.dataset == 'mnist':
        from dataset.mnist_input import load
    elif arg.dataset == 'cifar':
        from dataset.cifar import load
    elif arg.dataset == 'simlin':
        from dataset.simdata import load
    else:
        logger.log_error("Input dataset type cannot find any matches.")
        exit(1)

    # load dataset
    train_x, train_y, test_x, test_y = load()

    # make iid
    if arg.make_iid_dataset:
        from utils.partition_helper import make_non_iid_distribution
        train_x, train_y = make_non_iid_distribution(train_x, train_y,
                                                     batch_size)

    # make format
    if arg.is_img_cls:
        from dataset.utils import make_image_scale, make_onehot
        train_x = make_image_scale(train_x)
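
The fragment above reads its configuration from an arg namespace. A hedged sketch of the argument parsing it appears to expect is shown below; the flag names, types and help texts are assumptions, only the attribute names (dest=...) mirror the fragment.

import argparse

# Hypothetical parser producing the `arg` namespace used above;
# flag names and help texts are assumptions, dest names match the fragment.
parser = argparse.ArgumentParser()
parser.add_argument('--assignment', dest='assignment', type=str,
                    help='block assignment strategy name')
parser.add_argument('-n', dest='n', type=int, help='number of worker nodes')
parser.add_argument('-r', dest='r', type=int, help='redundancy per data block')
parser.add_argument('-b', dest='b', type=int, help='batch size per block')
parser.add_argument('--dataset', dest='dataset', type=str,
                    choices=['mnist', 'cifar', 'simlin'])
parser.add_argument('--make-iid-dataset', dest='make_iid_dataset',
                    action='store_true',
                    help='redistribute samples before training')
parser.add_argument('--is-img-cls', dest='is_img_cls', action='store_true',
                    help='scale images and one-hot encode labels')
arg = parser.parse_args()
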
Example 6
class Worker:
    def __init__(self):
        self.client_logger = Logger(title_info='Worker-{}'.format(get_repr()),
                                    log_to_file=True)
        self.client_logger.log_message('Worker version: {}.'.format(VERSION))
        self.__job_executor: Optional[IExecutor] = None  # requires: from typing import Optional

    def slave_forever(self):
        # set up listening port
        listener = Serve(net_type='fcnet')
        try:
            while True:
                self.client_logger.log_message(
                    'Worker started with network type \'FCNet\'.')
                try:
                    with listener.acquire() as com:
                        self.client_logger.log_message(
                            'Job submission received. Node assigned node_id({})'
                            .format(com.Node_Id))

                        self.dispatch(com)

                        self.client_logger.log_message(
                            'Current session closed, node_id({}).'.format(
                                com.Node_Id))
                        self.client_logger.log_message('Worker restarting...')
                        time.sleep(1)
                except OSError:
                    # ConnectionResetError is a subclass of OSError,
                    # so both failure modes are handled here.
                    self.client_logger.log_message(
                        "Initialization server exited without report.")

        except KeyboardInterrupt:
            self.client_logger.log_error('Worker shutdown by interruption.')
            listener.close()

    @staticmethod
    def __recv_pack(com: ICommunication_Controller, timeout: int = 100):
        data = None
        id_from = None
        time_out_end = time.time() + timeout
        # requests with timeout check
        while data is None:
            id_from, data = com.get_one(blocking=False)
            time.sleep(0.01)
            # Assertion: this node counts as one
            assert Initialization_Server in com.available_clients, "Initialization server exited without finishing the initialization."
            assert time.time() < time_out_end, "Maximum waiting time exceeded."
        return id_from, data

    def dispatch(self, com: ICommunication_Controller):
        """
            Get the first package and find out what to do.
            All exceptions will be handled here, and traceback information will
            be recorded to client_logger.
            Use job_submit.py --retrieve to get the traceback log.
        :param com:
        :return:
        """
        results = None
        try:
            id_from = com.Node_Id
            req = None
            while id_from != Initialization_Server:
                id_from, req = Worker.__recv_pack(
                    com, Init_Job_Submission_Timeout_Limit_Sec)

            if isinstance(req, SubmitJob):
                self.client_logger.log_message('ACK job submission.')
                if self.initialize(com, req):
                    results = self.do_training(com)

            if isinstance(req, RequestWorkingLog):
                self.client_logger.log_message('ACK logfile reclaim.')

        except Exception as e:
            # print DEBUG message
            import sys
            import traceback
            exc_type, exc_value, exc_tb = sys.exc_info()
            exc_tb = traceback.format_exception(exc_type, exc_value, exc_tb)
            exc_format = "".join(exc_tb)
            self.client_logger.log_error('Exception occurred: {}\n\t{}'.format(
                e, exc_format))
            # print DEBUG message

        self.post_log(com, results)

    def post_log(self, com: ICommunication_Controller, other_contents: object):
        """
            Post worker log file to coordinator.
        :param other_contents: other content can be attached
        :param com:
        :return:
        """
        posting_files = [self.client_logger.File_Name]
        if isinstance(self.__job_executor, AbsExecutor):
            for filename in self.__job_executor.trace_files():
                posting_files.append(filename)

        # Post files
        com.send_one(Initialization_Server,
                     DoneType(com.Node_Id, posting_files, other_contents))

    def initialize(self, com: ICommunication_Controller,
                   job_info: SubmitJob) -> bool:
        """
            Initialize execution environment
        :param com: Communication process
        :param job_info: job info
        :return:
        """
        # restoring data
        job_info.restore()
        # get info
        ready_state = set()
        total_nodes = job_info.work_group
        eta_waiting_time = job_info.waiting_time

        self.__job_executor: AbsExecutor = job_info.executioner(
            com.Node_Id, job_info.work_group)

        # Report Version
        com.send_one(Initialization_Server, Version(node_id=com.Node_Id))
        # Acknowledge requests
        requests = self.__job_executor.requests()
        replies = []
        # Ask for replies
        for req in requests:
            com.send_one(Initialization_Server, RequestPackage(req))

        req_format = "\tRequests List:\n\t\t--> {}".format("\n\t\t--> ".join(
            [str(req) for req in requests]))
        self.client_logger.log_message('Request data: ({})\n{}'.format(
            len(requests), req_format))
        self.client_logger.log_message('ETA: ({})'.format(eta_waiting_time))
        # Set job executor to ready state
        while not self.__job_executor.ready():

            id_from, data = Worker.__recv_pack(com, eta_waiting_time)

            self.client_logger.log_message('Ack package, type: ({})'.format(
                data.__class__.__name__))
            # restoring data
            if isinstance(data, IReplyPackage):
                data.restore()
                replies.append(data)

                if len(replies) == len(requests):
                    requests = self.__job_executor.satisfy(replies)
                    for req in requests:
                        com.send_one(Initialization_Server,
                                     RequestPackage(req))
                    self.client_logger.log_message(
                        'Request data: ({}).'.format(requests))
                    self.client_logger.log_message(
                        'ETA: ({})'.format(eta_waiting_time))
                    replies.clear()

            # pass to sync
            elif isinstance(data, ReadyType):
                ready_state = ready_state | data.current_ready()

        self.client_logger.log_message(
            'Submit stage complete, Total bytes sent: {}'.format(
                com.Com.bytes_sent))
        self.client_logger.log_message(
            'Submit stage complete, Total bytes read: {}'.format(
                com.Com.bytes_read))

        self.client_logger.log_message('Synchronize timeline with cluster.')

        Worker.synchronize(com, ready_state, total_nodes, eta_waiting_time)

        return True

    @staticmethod
    def synchronize(com: ICommunication_Controller, ready_state: set,
                    total_nodes: set, timeout: int):
        """
            Synchronize timeline with the cluster.
            Make sure all nodes exit this method at (approximately) the same time.
        :param com: communication controller
        :param ready_state: set of nodes that are ready now
        :param total_nodes: set of nodes required for the job
        :param timeout: timeout limit in seconds (approximate)
        :return:
        """
        dead_line = time.time() + timeout

        ready_state.add(com.Node_Id)
        for id in com.available_clients:
            com.send_one(id, ReadyType(ready_state))

        while ready_state & total_nodes != total_nodes:
            assert time.time() < dead_line, "Maximum waiting time exceeded."

            current_active = set(com.available_clients) | {com.Node_Id}
            assert current_active & total_nodes == total_nodes, \
                "Current nodes: {}, required nodes: {}.".format(current_active, total_nodes)
            # inc time clock
            time.sleep(0.01)

            # check ready state
            id_from, data = com.get_one(blocking=False)
            if isinstance(data, ReadyType):
                ready_state = ready_state | data.current_ready()

    def do_training(self, com: ICommunication_Controller) -> object:
        """
            Execute job.
        """
        self.client_logger.log_message('Execution process started.')
        begin = time.time()
        result = self.__job_executor.start(com)
        end = time.time()

        self.client_logger.log_message(
            'Execution complete, time:{}'.format(end - begin))
        self.client_logger.log_message(
            'Execution stage complete, Total bytes sent: {}'.format(
                com.Com.bytes_sent))
        self.client_logger.log_message(
            'Execution stage complete, Total bytes read: {}'.format(
                com.Com.bytes_read))
        self.client_logger.log_message('Execution process exited.')

        return result
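
A hedged entry-point sketch for this Worker, mirroring the one for PSGD_Worker above; the import path is an assumption.

# Hypothetical worker entry point; the import path is an assumption.
from worker import Worker  # assumed location

if __name__ == '__main__':
    # Serves one job submission per FCNet session and restarts until Ctrl+C.
    Worker().slave_forever()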