import time

import numpy as np
import torch

# AverageMeter, mixup_data, mixup_criterion, accuracy, adjust_rew_learning_rate2, use_gpu,
# rew_milestone and penalty_para are assumed to be provided by the surrounding project;
# hedged sketches of some of these helpers are included further down for reference.


def train(train_loader, model, criterion, optimizer, epoch, scheduler, args):
    '''Train the model on the data in train_loader for a single epoch.'''
    print('Starting training epoch {}'.format(epoch))

    # Prepare value counters and timers
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    # Switch model to train mode
    model.train()

    # Train for a single epoch
    end = time.time()
    for i, (input_gray, input_ab, target) in enumerate(train_loader):

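        # Optionally apply mixup: convexly combine pairs of inputs and keep both sets of
        # targets, together with the mixing weight lam, for the loss computation below.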
        if args.mixup:
            input_gray, target_a, target_b, lam = mixup_data(
                input_gray, target, args.alpha)

        # Move tensors to the GPU when one is available
        # (the deprecated Variable wrapper is a no-op in modern PyTorch and is dropped here)
        input_gray_variable = input_gray.cuda() if use_gpu else input_gray
        input_ab_variable = input_ab.cuda() if use_gpu else input_ab
        target_variable = target.cuda() if use_gpu else target

        # Record time to load data (above)
        data_time.update(time.time() - end)

        # Run forward pass
        output_ab = model(input_gray_variable)  # throw away class predictions

        if args.mixup:
            # The mixup targets must live on the same device as the model output
            if use_gpu:
                target_a, target_b = target_a.cuda(), target_b.cuda()
            loss = mixup_criterion(criterion, output_ab, target_a, target_b,
                                   lam, args.smooth)
        else:
            loss = criterion(output_ab, input_ab_variable)  # MSE

        # Record loss for this batch
        losses.update(loss.item(), input_gray.size(0))

        # Compute gradient and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
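        # Step the LR scheduler once per batch (this assumes a per-iteration scheduler such as
        # a cosine or one-cycle schedule; an epoch-wise scheduler would step outside this loop)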
        scheduler.step()

        # Record time to do forward and backward passes
        batch_time.update(time.time() - end)
        end = time.time()

        # Print training status -- in the code below, val refers to value, not validation
        if i % args.print_freq == 0:
            for param_group in optimizer.param_groups:
                current_lr = param_group['lr']
            print('({0}) lr:[{1}]  '
                  'Epoch: [{2}][{3}/{4}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.6f} ({loss.avg:.6f})\t'.format(
                      args.optmzr,
                      current_lr,
                      epoch,
                      i,
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses))

    print('Finished training epoch {}'.format(epoch))
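

# --- Hedged sketch: the mixup helpers assumed above are not shown in this listing. ---
# The signatures below match the call sites (mixup_data(x, y, alpha) -> mixed_x, y_a, y_b, lam;
# mixup_criterion(criterion, pred, y_a, y_b, lam, smooth)) and follow the standard mixup recipe;
# the project's own implementations, and whether its criterion accepts a smooth kwarg, may differ.
def mixup_data(x, y, alpha=1.0):
    # Sample the mixing coefficient from Beta(alpha, alpha)
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    # Pair each sample with a randomly chosen partner from the same batch
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


def mixup_criterion(criterion, pred, y_a, y_b, lam, smooth):
    # The loss is the convex combination of the losses against both targets
    return (lam * criterion(pred, y_a, smooth=smooth)
            + (1 - lam) * criterion(pred, y_b, smooth=smooth))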
# Example #2
def qtrain(train_loader, criterion, optimizer, scheduler, epoch, model, args,
           layers, rew_layers, eps):
    '''Train the model for one epoch with optional reweighted-L1 sparsity
    regularization (args.rew) and masked re-training (args.masked_retrain /
    args.combine_progressive).'''
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    idx_loss_dict = {}

    # switch to train mode
    model.train()

    if args.masked_retrain and not args.combine_progressive:
        print("full acc re-train masking")
        masks = {}
        for name, W in model.named_parameters():
            # Freeze the current sparsity pattern: 1.0 where a weight is non-zero, 0.0 elsewhere
            masks[name] = (W.detach() != 0).type(torch.float32).cuda()
    elif args.combine_progressive:
        print("progressive rew-train/re-train masking")
        masks = {}
        for name, W in model.named_parameters():
            masks[name] = (W.detach() != 0).type(torch.float32).cuda()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # adjust learning rate
        if args.masked_retrain:
            scheduler.step()

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        if args.mixup:
            input, target_a, target_b, lam = mixup_data(
                input, target, args.alpha)

        # compute output
        output = model(input)

        if args.mixup:
            ce_loss = mixup_criterion(criterion, output, target_a, target_b,
                                      lam, args.smooth)
        else:
            ce_loss = criterion(output, target, smooth=args.smooth)

        if args.rew:
            if i == 0:
                print("reweighted l1 training...\n")
                adjust_rew_learning_rate2(optimizer, epoch, rew_milestone,
                                          args)

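            # Reweighted L1 (see the update below): rew_layers[j] caches per-weight or per-group
            # factors of roughly 1 / (current magnitude + eps), refreshed at rew_milestone epochs,
            # so that weights or groups that are already small are pushed harder toward zero.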
            l1_loss = 0
            # add reweighted l1 loss
            if i == 0 and epoch - 1 in rew_milestone:
                print("reweighted l1 update")
                for j in range(len(layers)):
                    if args.sparsity_type == "irregular":
                        rew_layers[j] = (1 / (layers[j].data + eps))
                    elif args.sparsity_type == "column":
                        rew_layers[j] = (
                            1 / (torch.norm(layers[j].data, dim=0) + eps))
                    elif args.sparsity_type == "kernel":
                        rew_layers[j] = (
                            1 / (torch.norm(layers[j].data, dim=[2, 3]) + eps))
                    elif args.sparsity_type == "filter":
                        rew_layers[j] = (
                            1 / (torch.norm(torch.norm(layers[j].data, dim=1),
                                            dim=[1, 2]) + eps))

            for j in range(len(layers)):
                rew = rew_layers[j]
                conv_layer = layers[j]
                if args.sparsity_type == "irregular":
                    l1_loss = l1_loss + 1e-6 * torch.sum(
                        (torch.abs(rew * conv_layer)))
                elif args.sparsity_type == "column":
                    l1_loss = l1_loss + penalty_para[j] * torch.sum(
                        rew * torch.norm(conv_layer, dim=0))
                elif args.sparsity_type == "kernel":
                    l1_loss = l1_loss + 1e-5 * torch.sum(
                        rew * torch.norm(conv_layer, dim=[2, 3]))
                elif args.sparsity_type == "filter":
                    l1_loss = l1_loss + 1e-3 * torch.sum(rew * torch.norm(
                        torch.norm(conv_layer, dim=1), dim=[1, 2]))
            ce_loss = l1_loss + ce_loss

        # measure accuracy and record loss
        acc1, _ = accuracy(output, target, topk=(1, 5))

        losses.update(ce_loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        ce_loss.backward()

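        # When a sparsity mask is active, zero out the gradients of pruned weights so that
        # the pruned positions stay exactly zero during re-training.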
        if args.combine_progressive or args.masked_retrain:
            with torch.no_grad():
                for name, W in model.named_parameters():
                    if name in masks:
                        W.grad *= masks[name]
                        # W.grad = hard_quant(W.grad, args.param_bits)  # make sure gradient is quantized

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.log_interval == 0:
            for param_group in optimizer.param_groups:
                current_lr = param_group['lr']
            print('({0}) lr:[{1:.5f}]  '
                  'Epoch: [{2}][{3}/{4}]\t'
                  'Status: rew-[{5}] retrain-[{6}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc@1 {top1.val:.3f}% ({top1.avg:.3f}%)\t'.format(
                      args.optmzr,
                      current_lr,
                      epoch,
                      i,
                      len(train_loader),
                      args.rew,
                      args.masked_retrain,
                      batch_time=batch_time,
                      loss=losses,
                      top1=top1))
        if i % 100 == 0:
            idx_loss_dict[i] = losses.avg
    return idx_loss_dict
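

# --- Hedged sketch: minimal versions of the bookkeeping helpers used above. ---
# These follow the conventions of the standard PyTorch ImageNet example; the project's own
# AverageMeter and accuracy implementations may differ in details.
class AverageMeter(object):
    '''Tracks the most recent value and the running average of a quantity.'''

    def __init__(self):
        self.val, self.avg, self.sum, self.count = 0.0, 0.0, 0.0, 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    '''Computes the top-k accuracies (in percent) for the given logits and labels.'''
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        # Take the k highest-scoring classes per sample and compare against the labels
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res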