Example #1
    def __init__(self, params, lr=required, n_push=required, n_pull=required, model=required):
        """__init__

        :param params:
        :param lr:
        :param freq:
        :param model:
        """
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))

        defaults = dict(lr=lr,)
        self.accumulated_gradients = torch.zeros(ravel_model_params(model).size())
        self.n_pull = n_pull
        self.n_push = n_push

        self.model = model
        # this sets the initial model parameters
        send_message(MessageCode.ParameterUpdate, ravel_model_params(self.model))
        self.idx = 0

        listener = DownpourListener(self.model)
        listener.start()

        super(DownpourSGD, self).__init__(params, defaults)
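
For context, here is a minimal usage sketch of the optimizer above, assuming a worker process where torch.distributed is already initialized; the model, data loader, and hyperparameter values are illustrative and not part of the original code.

import torch.nn.functional as F

def train_worker(model, data_loader, epochs=1):
    # DownpourSGD is the optimizer whose __init__ is shown in Example #1
    optimizer = DownpourSGD(model.parameters(), lr=0.01,
                            n_push=5, n_pull=5, model=model)
    model.train()
    for _ in range(epochs):
        for data, target in data_loader:
            optimizer.zero_grad()
            loss = F.cross_entropy(model(data), target)
            loss.backward()
            optimizer.step()  # pushes/pulls on the n_push/n_pull schedule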
Example #2
    def __init__(self,
                 params,
                 lr=required,
                 n_push=0,
                 n_pull=0,
                 model=required):
        """__init__

        :param params:
        :param lr:
        :param freq:
        :param model:
        """
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        print('I am node rank:%d' % dist.get_rank())
        defaults = dict(lr=lr, )
        self.accumulated_gradients = torch.zeros(
            ravel_model_params(model).size())
        self.model = model
        # this sets the initial model parameters
        # send_message(MessageCode.ParameterUpdate, ravel_model_params(self.model))
        self.idx = 0
        self.version = 0
        self.queue = Queue(maxsize=1)
        self.gradient_warehouse = WorkerGradientWarehouse()
        self.listener = GradientListener(self.model, self.queue,
                                         self.gradient_warehouse)
        self.listener.start()
        self.sender = GradientMessageSender(self.queue)
        self.sender.start()
        super(GradientSGD, self).__init__(params, defaults)
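
The GradientMessageSender started above is not shown in these excerpts. The sketch below is one plausible shape for such a sender thread, draining the shared queue and forwarding each item; the tuple field names and the send_message signature are inferred from Example #7 and its commented-out call, so treat this as an assumption rather than the project's actual class.

import threading

class SimpleGradientSender(threading.Thread):
    """Illustrative stand-in for GradientMessageSender (not the original class)."""

    def __init__(self, queue):
        super(SimpleGradientSender, self).__init__(daemon=True)
        self.queue = queue

    def run(self):
        while True:
            # mirrors the 6-tuple put on the queue in Example #7's step();
            # the names of the last two fields are a guess based on receive()
            code, payload, dst, version, trigger, fast_flag = self.queue.get()
            send_message(code, payload, dst=dst, gradient_version=version)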
Example #3
    def __init__(self, model):
        _LOGGER.info("Creating ParameterServer")
        print("Creating ParameterServer")
        self.parameter_shard = torch.rand(ravel_model_params(model).numel())
        self.model = model
        # init superclass
        super(ParameterServer, self).__init__(model)
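
All of these examples lean on a ravel_model_params helper (and Example #9 on unravel_model_params) whose definitions are not included. The sketch below shows what such helpers plausibly do, flattening parameters or gradients into one 1-D tensor and copying a flat tensor back; it is an assumption, not the project's actual implementation.

import torch

def ravel_model_params(model, grads=False):
    # concatenate every parameter (or its gradient) into one flat tensor
    tensors = []
    for p in model.parameters():
        tensors.append((p.grad if grads else p).data.view(-1))
    return torch.cat(tensors)

def unravel_model_params(model, flat_tensor):
    # copy slices of the flat tensor back into the model's parameters
    offset = 0
    for p in model.parameters():
        numel = p.data.numel()
        p.data.copy_(flat_tensor[offset:offset + numel].view_as(p.data))
        offset += numel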
Example #4
    def __init__(self, model):
        """__init__

        :param model: nn.Module to be defined by the user
        """
        _LOGGER.info("Setting m_parameter")
        # flattened model parameters plus 5 extra slots for the message header
        # (cf. the receive() signature in Example #9)
        self.m_parameter = torch.zeros(ravel_model_params(model).numel() + 5)
        print('Tensor size : %d' % self.m_parameter.numel())
        super(GradientMessageListener, self).__init__()
Example #5
    def __init__(self, model):
        """__init__

        :param model: nn.Module to be defined by the user
        """
        self.model = model
        _LOGGER.info("Setting m_parameter")
        # flattened model parameters plus 2 extra slots for the message header
        # (presumably the message code and the sender rank)
        self.m_parameter = torch.zeros(ravel_model_params(model).numel() + 2)
        super(MessageListener, self).__init__()
Example #6
    def __init__(self, model, gradient_warehouse, storage_num=10, rank=0):
        _LOGGER.info("Creating GradientServer")
        print("Creating GradientServer")
        # self.parameter_shard = torch.rand(ravel_model_params(model).numel())
        # self.model = model
        # the server keeps the model as a single flat tensor rather than an nn.Module
        self.model = torch.zeros(ravel_model_params(model).numel())
        self.gradient_warehouse = gradient_warehouse
        self.rank = rank
        super(GradientServer, self).__init__(model)
        self.gradient_warehouse.model = self.model
Example #7
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # increase version No.
        # self.version += 1
        if dist.get_rank() == 1:
            time.sleep(0.04)

        # get the lr
        lr = self.param_groups[0]['lr']
        self.listener.lr = lr
        # flatten the current gradients into a single tensor so they can be sent
        gradients = ravel_model_params(self.model, grads=True)
        self.listener.worker_ahead_count += 1
        self.listener.version += 1
        current_version = self.listener.version
        # busy-wait while this worker is too far ahead; the listener resets
        # worker_ahead_count when a server update arrives (see Example #9)
        while self.listener.worker_ahead_count >= self.listener.waiting_bound:
            pass
        self.gradient_warehouse.push(gradients, current_version)
        # send message
        self.queue.put((GSMessageCode.GradientUpdate, gradients, 0,
                        current_version, 0, 0))
        # send_thread = threading.Thread(target=send_message,
        #                                args=(GSMessageCode.GradientUpdate, gradients, 0, current_version))
        # send_message(GSMessageCode.GradientUpdate, gradients, dst=0, gradient_version=self.listener.version + 1)
        # send_thread.start()
        # block until the listener has applied the server's update for this step
        # (receive() in Example #9 puts the gradient version into lock_queue)
        lock_queue.get()
        if self.idx % 100 == 1:
            self.gradient_warehouse.clean_redundant()

        self.idx += 1
        if lock.locked():
            # a model sync is in progress; wait for the listener to release the
            # lock (on ModelUpdate) before returning
            lock.acquire()
            lock.release()
            return loss
        return loss
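
lock and lock_queue are module-level objects shared between step() above and receive() in Example #9, but their definitions are not part of these excerpts. A minimal sketch of what they would look like, stated as an assumption:

import threading
from queue import Queue

# shared between the optimizer's step() and the listener's receive(): the
# listener puts the applied gradient version into lock_queue after each
# GradientUpdate, and holds `lock` for the ModelRequest/ModelUpdate round trip
lock = threading.Lock()
lock_queue = Queue()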
Example #8
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # send parameter request every N iterations
        if self.idx % self.n_pull == 0:
            send_message(MessageCode.ParameterRequest,
                         self.accumulated_gradients)  # dummy val

        # get the lr
        lr = self.param_groups[0]['lr']
        # accumulate the lr-scaled gradients so they can be sent to the server
        gradients = ravel_model_params(self.model, grads=True)
        self.accumulated_gradients.add_(-lr, gradients)

        # send gradient update every N iterations
        if self.idx % self.n_push == 0:
            send_message(
                MessageCode.GradientUpdate,
                self.accumulated_gradients)  # send gradients to the server
            self.accumulated_gradients.zero_()

        # internal sgd update
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                p.data.add_(-group['lr'], d_p)

        self.idx += 1
        return loss
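
send_message is another helper whose implementation is not shown. Given that the listeners in Examples #4 and #5 allocate a few extra header slots in front of the flattened parameters, one plausible sketch packs a small header before the payload and ships it with torch.distributed; the header layout and the use of isend here are assumptions.

import torch
import torch.distributed as dist

def send_message(message_code, payload, dst=0):
    # illustrative only: prepend a [code, sender-rank] header to the flat
    # payload and send it asynchronously to the destination rank
    header = torch.Tensor([message_code.value, dist.get_rank()])
    dist.isend(tensor=torch.cat((header, payload)), dst=dst)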
Example #9
    def receive(self, sender, message_code, gradient_version, trigger,
                fast_flag, parameter):
        """receive parameter updates from the server and reflect them into the client's model."""
        _LOGGER.info("Processing message: {}, version: {}, lr: {}".format(
            message_code.name, gradient_version, self.lr))
        if message_code == GSMessageCode.GradientUpdate:

            if not fast_flag:
                # this version of the gradient should not be stored by the worker
                # because this worker is not a fast node
                self.gradient_warehouse.remove(self.version)
            # print(len(self.gradient_warehouse.gradient_storage), self.gradient_warehouse.gradient_storage.keys())
            self.version = max(self.version, gradient_version)

            if trigger == 0:
                update_model_params(self.model, parameter, self.lr)
            elif trigger != 0 and trigger in self.gradient_warehouse.gradient_storage:
                # received lower nodes' gradient
                # pass
                update_model_params(self.model, parameter, self.lr)
                # update_model_params(self.model, self.gradient_warehouse.pop(trigger), -self.lr)
                # print("Sync-fast, Received version %d from other nodes" % trigger)
            lock_queue.put(gradient_version)
        elif message_code == GSMessageCode.ModelRequest:
            lock.acquire()
            model = ravel_model_params(self.model, grads=False)
            print(model)
            self.queue.put((GSMessageCode.ModelUpdate, model, 0, 0, 0,
                            0))  # send current model
            print('send model to server')
        elif message_code == GSMessageCode.ModelUpdate:
            print(parameter)
            unravel_model_params(self.model, parameter)
            self.version = max(self.version, gradient_version)
            print('unravel_model_params')
            lock.release()
        self.worker_ahead_count = 0
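
Examples #7 and #9 reference GSMessageCode members (GradientUpdate, ModelRequest, ModelUpdate), and Examples #1 and #8 use MessageCode members (ParameterUpdate, ParameterRequest, GradientUpdate); neither enum is defined in these excerpts. The minimal definitions below are consistent with the names used here but are only a sketch; the real enums may define additional codes and different values.

from enum import Enum

class MessageCode(Enum):
    ParameterRequest = 0
    GradientUpdate = 1
    ParameterUpdate = 2

class GSMessageCode(Enum):
    GradientUpdate = 0
    ModelRequest = 1
    ModelUpdate = 2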