Example No. 1
    def train(self, t, train_loader, model, optimizer, epoch, Tasks):
        """Train for one epoch on the training set"""
        batch_time = AverageMeter()
        losses = AverageMeter()
        accuracy = AverageMeter()
        # switch to train mode
        model.train()

        world_size = dist.get_world_size()
        rank = dist.get_rank()

        end = time.time()
        batch_cnt = int(len(train_loader))
        for i, (input, target) in enumerate(train_loader):
            target = target.cuda(non_blocking=True)
            input = input.cuda()
            input_var = torch.autograd.Variable(input)
            target_var = torch.autograd.Variable(target)

            # compute output
            output = model(input_var)
            output = torch.nn.functional.sigmoid(output)

            loss = self.criterion(output[:, Tasks[t]['subset']],
                                  target_var) / world_size

            # measure accuracy and record loss
            (accu, accus) = self.cleba_accuracy(t, output.data, target, Tasks)

            reduced_loss = loss.data.clone()
            reduced_accu = accu.clone() / world_size

            dist.all_reduce_multigpu([reduced_loss])
            dist.all_reduce_multigpu([reduced_accu])

            losses.update(reduced_loss[0], input.size(0))
            accuracy.update(reduced_accu[0], input.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            average_gradients(model)
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % self.print_freq == 0 and rank == 0:
                print(
                    'Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Accuracy {accuracy.val:.3f} ({accuracy.avg:.3f})'.format(
                        epoch,
                        i,
                        batch_cnt,
                        batch_time=batch_time,
                        loss=losses,
                        accuracy=accuracy))
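
Every example below calls an average_gradients helper right after loss.backward(), but its definition is not part of these snippets. A minimal sketch, assuming it only all-reduces each parameter's gradient; since the loss is already divided by world_size before backward(), a plain SUM gives the average over the global batch:

import torch.distributed as dist

def average_gradients(model):
    """All-reduce (sum) every parameter gradient across ranks.

    The callers already scale the loss by 1/world_size, so summing the
    per-rank gradients yields the average over the global batch.
    """
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)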
Example No. 2
    def compute_pre_param(self, t, memory_cache, epoch, Tasks):
        # if self.rank == 0:
        #     print("== BEGIN: compute grad for pre observed tasks: {task}".format(task=t))
        # end = time.time()
        self.optimizer.zero_grad()
        mem_batch_cnt = int(len(memory_cache))
        for input, target in memory_cache:
            # target = target.cuda(async=True)
            # input = input.cuda(async=True)
            # input, target already loaded into GPU
            input_var = torch.autograd.Variable(input)
            target_var = torch.autograd.Variable(target)

            # compute output
            output = self.model(input_var)
            output = torch.nn.functional.sigmoid(output)
            # compute loss divided by world_size and mem_batch_cnt
            loss = self.criterion(
                output[:, Tasks[t]['subset']],
                target_var) / (self.world_size * mem_batch_cnt)
            # compute gradient for each batch of memory and accumulate
            loss.backward()

        average_gradients(self.model)
        # if self.rank == 0:
        #     print("== END: compute grad for pre observed task: {task} | TIME: {time} ".\
        #         format(task=t, time=(time.time()-end)) )
        return self.model.parameters
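
compute_pre_param returns self.model.parameters (the bound method, not a call) so the caller can hand it to store_grad, which flattens the accumulated gradients into one column of a preallocated gradient matrix. A sketch of a GEM-style store_grad, assuming grad_dims holds the element count of each parameter tensor and grads has shape (total_params, n_tasks); treat it as illustrative, not the exact helper used here:

import torch

def store_grad(pp, grads, grad_dims, tid):
    """Copy the current gradients into column `tid` of `grads`.

    pp        -- a callable returning an iterator over model parameters
    grads     -- tensor of shape (sum(grad_dims), n_tasks)
    grad_dims -- number of elements of each parameter tensor
    tid       -- task id whose gradient column is being written
    """
    grads[:, tid].fill_(0.0)
    cnt = 0
    for param in pp():
        if param.grad is not None:
            beg = 0 if cnt == 0 else sum(grad_dims[:cnt])
            end = sum(grad_dims[:cnt + 1])
            grads[beg:end, tid].copy_(param.grad.data.view(-1))
        cnt += 1

# Typical setup (e.g. in __init__):
#   self.grad_dims = [p.data.numel() for p in self.model.parameters()]
#   self.grads = torch.zeros(sum(self.grad_dims), n_tasks).cuda()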
Example No. 3
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    world_size = dist.get_world_size()
    rank = dist.get_rank()

    end = time.time()

    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(input.cuda())
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)

        # measure accuracy and record loss
        loss = criterion(output, target_var) / world_size
        prec1 = accuracy(output.data, target, topk=(1,))[0]  # top-1 precision

        reduced_loss = loss.data.clone()
        reduced_prec1 = prec1.clone() / world_size

        dist.all_reduce_multigpu([reduced_loss])
        dist.all_reduce_multigpu([reduced_prec1])

        losses.update(reduced_loss[0], input.size(0))
        top1.update(reduced_prec1[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        average_gradients(model)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0 and rank == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'.format(epoch, i, len(train_loader), batch_time=batch_time,
                                                                    data_time=data_time, loss=losses, top1=top1))
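
AverageMeter is used in every example but never defined in these snippets. A minimal sketch consistent with how it is called above, where update(val, n) accumulates a weighted running sum, .val holds the latest value and .avg the running average:

class AverageMeter(object):
    """Tracks the most recent value and a running (weighted) average."""

    def __init__(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count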
Example No. 4
    def train(self, t, train_loader, epoch, Tasks):
        """Train for one epoch on the training set"""
        batch_time = AverageMeter()
        losses = AverageMeter()
        accuracy = AverageMeter()
        # switch to train mode
        self.model.train()

        end = time.time()
        batch_cnt = int(len(train_loader))
        for i, (input, target) in enumerate(train_loader):
            # ================================================================= #
            # compute grad for data at previous tasks
            if len(self.solved_tasks) > 0:
                if self.rank == 0:
                    print(
                        "====== compute grad for pre observed tasks: {tasks}".
                        format(tasks=self.solved_tasks))
                # compute grad for pre observed tasks
                for pre_t in self.solved_tasks:
                    ## sample a few examples from previous tasks
                    # memory_sampler = Tasks[pre_t]['memory_sampler']
                    # memory_sampler.set_epoch(epoch) # random or fix sample?
                    # memory_loader = Tasks[pre_t]['memory_loader']
                    memory_cache = self.memory_caches[
                        pre_t]  # memory_cache is a list of tensors already loaded onto the GPU
                    ## compute gradient for few samples in previous tasks
                    if self.rank == 0:
                        print(
                            "== BEGIN: compute grad for pre observed tasks: {task}"
                            .format(task=pre_t))
                    end_pre = time.time()
                    #
                    # pre_param = self.compute_pre_param(pre_t, memory_loader, epoch, Tasks)
                    pre_param = self.compute_pre_param(pre_t, memory_cache,
                                                       epoch, Tasks)
                    #
                    if self.rank == 0:
                        print("== END: compute grad for pre observed task: {task} | TIME: {time} ".\
                            format(task=pre_t, time=(time.time()-end_pre)) )
                    ## copy previous grad to tensor
                    store_grad(pre_param, self.grads, self.grad_dims, pre_t)
            # ================================================================= #
            # compute grad for data at current task
            target = target.cuda(non_blocking=True)
            input = input.cuda(non_blocking=True)
            input_var = torch.autograd.Variable(input)
            target_var = torch.autograd.Variable(target)

            # compute output
            output = self.model(input_var)
            output = torch.nn.functional.sigmoid(output)

            loss = self.criterion(output[:, Tasks[t]['subset']],
                                  target_var) / self.world_size

            # compute gradient within constraints and backprop errors
            self.optimizer.zero_grad()
            loss.backward()
            average_gradients(self.model)
            # ================================================================== #
            # check grad and get new grad
            if len(self.solved_tasks) > 0:
                if self.rank == 0:
                    print(
                        "== BEGIN: check constraints; if violate, get surrogate grad."
                    )
                end_opt = time.time()
                ## copy gradient for data at current task to a tensor and clear grad
                store_grad(self.model.parameters, self.grads, self.grad_dims,
                           t)
                ## check if current step gradient violate constraints
                indx = torch.cuda.LongTensor(self.solved_tasks)
                dotp = torch.mm(self.grads[:, t].unsqueeze(0),
                                self.grads.index_select(1, indx))
                if (dotp < 0).sum() != 0:
                    violate_constr = True
                else:
                    violate_constr = False
                ## use convex quadratic programming to get a surrogate grad
                if violate_constr:
                    # if violate, use quadprog to get new grad
                    self.optimizer.zero_grad()
                    project2cone2(self.grads[:, t].unsqueeze(1),
                                  self.grads.index_select(1, indx),
                                  self.margin)
                    ## copy surrogate grad back to model gradient parameters
                    overwrite_grad(self.model.parameters, self.grads[:, t],
                                   self.grad_dims)
                if self.rank == 0:
                    print("== END: violate constraints? : {vio_constr} | TIME: {time}".\
                        format(vio_constr=violate_constr, time=(time.time()-end_opt))
                        )
            # ================================================================= #
            # then do SGD step
            self.optimizer.step()

            # measure accuracy and record loss
            accu, _ = self.cleba_accuracy(t, output.data, target, Tasks)

            reduced_loss = loss.data.clone()
            reduced_accu = accu.clone() / self.world_size

            dist.all_reduce_multigpu([reduced_loss])
            dist.all_reduce_multigpu([reduced_accu])

            losses.update(reduced_loss[0], input.size(0))
            accuracy.update(reduced_accu[0], input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            if i % self.print_freq == 0 and self.rank == 0:
                print(
                    'Training Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Accuracy {accuracy.val:.3f} ({accuracy.avg:.3f})'.format(
                        epoch,
                        i,
                        batch_cnt,
                        batch_time=batch_time,
                        loss=losses,
                        accuracy=accuracy))

            end = time.time()
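
Example No. 4 also relies on two helpers that are not shown: project2cone2, which projects the current-task gradient so that it no longer conflicts with the stored gradients of previous tasks (non-negative dot products, up to a margin) by solving a small quadratic program, and overwrite_grad, which writes the projected gradient back into the model. A sketch closely following the public GEM reference implementation (requires the quadprog package); treat it as illustrative rather than the exact code used here:

import numpy as np
import quadprog
import torch

def project2cone2(gradient, memories, margin=0.5, eps=1e-3):
    """Project `gradient` (D x 1) onto the cone defined by the columns of
    `memories` (D x n_prev), staying as close as possible to the original
    gradient. Solved as a small QP in the dual with quadprog."""
    memories_np = memories.cpu().t().double().numpy()
    gradient_np = gradient.cpu().contiguous().view(-1).double().numpy()
    n = memories_np.shape[0]
    P = np.dot(memories_np, memories_np.transpose())
    P = 0.5 * (P + P.transpose()) + np.eye(n) * eps  # keep P symmetric PSD
    q = -np.dot(memories_np, gradient_np)
    G = np.eye(n)
    h = np.zeros(n) + margin
    v = quadprog.solve_qp(P, q, G, h)[0]
    x = np.dot(v, memories_np) + gradient_np
    gradient.copy_(torch.Tensor(x).view(-1, 1))

def overwrite_grad(pp, newgrad, grad_dims):
    """Write the flat vector `newgrad` back into each parameter's .grad."""
    cnt = 0
    for param in pp():
        if param.grad is not None:
            beg = 0 if cnt == 0 else sum(grad_dims[:cnt])
            end = sum(grad_dims[:cnt + 1])
            param.grad.data.copy_(
                newgrad[beg:end].contiguous().view(param.grad.data.size()))
        cnt += 1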
Example No. 5
def train(epoch, op_explore):
    """ train model on each epoch in trainset
    """

    global trainloader
    global testloader
    global net
    global criterion
    global optimizer
    global rank, world_size

    if rank == 0:
        logger.debug("Epoch: %d", epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0

    optimizer = op_explore
    f11 = open('/root/log', 'a+')
    f11.write('### ready to train \n')
    f11.close()

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        f11 = open('/root/log', 'a+')
        f11.write('### loop to train \n')
        f11.close()
        targets = targets.cuda(non_blocking=True)
        #inputs, targets = inputs.to(device), targets.to(device)
        input_var = torch.autograd.Variable(inputs.cuda())
        target_var = torch.autograd.Variable(targets)

        optimizer.zero_grad()
        outputs = net(input_var)
        loss = criterion(outputs, target_var) / world_size

        loss.backward()
        average_gradients(net)
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.data.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        # Because all_reduce is expensive, we don't call it every batch to compute accuracy.
        """
        if rank == 0:
            logger.debug(
                "Loss: %.3f | Acc: %.3f%% (%d/%d)",
                train_loss / (batch_idx + 1),
                100.0 * tmp_correct / tmp_total,
                tmp_correct,
                tmp_total,
            )
        """
    reduced_total = torch.Tensor([total])
    reduced_correct = torch.Tensor([correct])
    reduced_total = reduced_total.cuda()
    reduced_correct = reduced_correct.cuda()
    dist.all_reduce(reduced_total)
    dist.all_reduce(reduced_correct)

    tmp_total = int(reduced_total[0])
    tmp_correct = int(reduced_correct[0])
    acc = 100.0 * tmp_correct / tmp_total

    return acc
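
All five examples assume that torch.distributed has already been initialized and that each process is bound to its own GPU. A minimal sketch of that setup, assuming the launcher (e.g. torch.distributed.launch / torchrun) exports RANK and WORLD_SIZE in the environment:

import torch
import torch.distributed as dist

def init_distributed(backend='nccl'):
    """Join the default process group and bind this process to one GPU."""
    dist.init_process_group(backend=backend, init_method='env://')
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    torch.cuda.set_device(rank % torch.cuda.device_count())
    return rank, world_size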