Example #1
def train(train_queue, model, criterion, optimizer, epoch, init_lr,
          warmup_epochs, global_step):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()

    model.train()
    for step, (data, target) in enumerate(train_queue):
        n = data.size(0)
        data = data.cuda()
        target = target.cuda()

        # Linear learning-rate warmup: ramp the LR from ~0 up to init_lr
        # over the first warmup_epochs epochs.
        if epoch < warmup_epochs:
            len_epoch = len(train_queue)
            scale = float(1 + step + epoch * len_epoch) / \
                (warmup_epochs * len_epoch)
            lr = init_lr * scale
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # Forward.
        optimizer.zero_grad()
        logits, logits_aux = model(data)
        loss = criterion(logits, target)
        if args.auxiliary:
            loss_aux = criterion(logits_aux, target)
            loss += args.auxiliary_weight * loss_aux

        # Backward and step.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        ############# APEX #############
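        # Average loss and accuracy across all distributed workers so the
        # logged metrics reflect the global batch rather than this rank's shard.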
        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        reduced_loss = utils.reduce_tensor(loss.data, args.world_size)
        prec1 = utils.reduce_tensor(prec1, args.world_size)
        prec5 = utils.reduce_tensor(prec5, args.world_size)

        objs.update(to_python_float(reduced_loss), n)
        top1.update(to_python_float(prec1), n)
        top5.update(to_python_float(prec5), n)
        ################################

        if step % args.report_freq == 0:
            current_lr = optimizer.param_groups[0]['lr']
            logging.info('train %03d %e %f %f lr: %e', step, objs.avg,
                         top1.avg, top5.avg, current_lr)
            writer.add_scalar('train/loss', objs.avg, global_step)
            writer.add_scalar('train/acc_top1', top1.avg, global_step)
            writer.add_scalar('train/acc_top5', top5.avg, global_step)
            writer.add_scalar('train/lr', current_lr, global_step)
        global_step += 1

    return top1.avg, objs.avg, global_step
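
The snippets above call distributed helpers (utils.reduce_tensor, to_python_float) that are not shown. As a rough reference, here is a minimal sketch of what such helpers usually look like in APEX-style distributed training code (an assumption, not code taken from these examples):

import torch.distributed as dist


def reduce_tensor(tensor, world_size):
    # Average a tensor across all distributed workers (assumed helper).
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= world_size
    return rt


def to_python_float(t):
    # Convert a zero-dim tensor (or a plain number) to a Python float.
    return t.item() if hasattr(t, 'item') else float(t)
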
Example #2
def train(train_queue, model, criterion, optimizer, global_step):
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()

    model.train()
    for step, (data, target) in enumerate(train_queue):
        n = data.size(0)
        data = data.cuda()
        target = target.cuda()

        # Forward.
        optimizer.zero_grad()
        logits = model(data)
        loss = criterion(logits, target)

        # Backward and step.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        reduced_loss = utils.reduce_tensor(loss.data, args.world_size)
        prec1 = utils.reduce_tensor(prec1, args.world_size)
        prec5 = utils.reduce_tensor(prec5, args.world_size)

        objs.update(to_python_float(reduced_loss), n)
        top1.update(to_python_float(prec1), n)
        top5.update(to_python_float(prec5), n)

        if (step + 1) % args.report_freq == 0:
            current_lr = optimizer.param_groups[0]['lr']
            logging.info('train %03d %e %f %f lr: %e', step, objs.avg,
                         top1.avg, top5.avg, current_lr)
        global_step += 1

    return top1.avg, top5.avg, objs.avg, global_step
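
utils.accuracy in these loops computes top-k accuracy; it typically follows the helper from the PyTorch ImageNet example. A minimal sketch under that assumption (not code taken from these examples):

def accuracy(output, target, topk=(1,)):
    """Compute the precision@k for the specified values of k."""
    maxk = max(topk)
    batch_size = target.size(0)

    # Indices of the top-k predictions, shape (maxk, batch_size).
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
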
Example #3
def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0 and args.local_rank == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                to_python_float(loss.data)))
Example #4
def test():
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        with torch.no_grad():
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            test_loss += to_python_float(
                F.nll_loss(output, target,
                           reduction='sum').data)  # sum up batch loss
            pred = output.data.max(
                1, keepdim=True)[1]  # get the index of the max log-probability
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    print(
        '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
Example #5
def train(train_loader, model, criterion, optimizer, epoch, use_cuda, logger):
    global batch_time_global, data_time_global
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    train_loader_len = len(train_loader)
    # print('Length of train loader = %i\n'%train_loader_len)
    bar = Bar('Processing', max=train_loader_len)
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # measure data loading time
        data_time_lap = time.time() - end
        data_time.update(data_time_lap)
        if epoch > 0:
            data_time_global.update(data_time_lap)

        n = inputs.size(0)
        if use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()

        # print('input size = %i, device %s\n'%(inputs.size(0), inputs.device))
        # compute output
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward and step.
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs, targets, topk=(1, 5))
        reduced_loss = reduce_tensor(loss.data, args.world_size)
        prec1 = reduce_tensor(prec1, args.world_size)
        prec5 = reduce_tensor(prec5, args.world_size)

        losses.update(to_python_float(reduced_loss), n)
        top1.update(to_python_float(prec1), n)
        top5.update(to_python_float(prec5), n)

        # Track iteration counts for scheduled-restart optimizers.
        if args.optimizer.lower() in ('srsgd', 'sradam', 'sradamw', 'srradam'):
            iter_count, iter_total = optimizer.update_iter()

        # measure elapsed time
        batch_time_lap = time.time() - end
        batch_time.update(batch_time_lap)
        if epoch > 0:
            batch_time_global.update(batch_time_lap)
        end = time.time()

        # plot progress
        bar.suffix = '(Epoch {epoch}, {batch}/{size}) Data: {data:.3f}s/{data_global:.3f}s | Batch: {bt:.3f}s/{bt_global:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format(
            epoch=epoch,
            batch=batch_idx + 1,
            size=train_loader_len,
            data=data_time.val,
            data_global=data_time_global.avg,
            bt=batch_time.val,
            bt_global=batch_time_global.avg,
            total=bar.elapsed_td,
            eta=bar.eta_td,
            loss=losses.avg,
            top1=top1.avg,
            top5=top5.avg,
        )
        bar.next()
        if args.local_rank == 0:
            logger.file.write(bar.suffix)
    bar.finish()
    if args.optimizer.lower() in ('srsgd', 'sradam', 'sradamw', 'srradam'):
        return (losses.avg, top1.avg, top5.avg, iter_count)
    else:
        return (losses.avg, top1.avg, top5.avg)
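
The AverageMeter used throughout these loops is usually the small running-average tracker from the PyTorch ImageNet example. A minimal sketch under that assumption (not code taken from these examples):

class AverageMeter(object):
    """Track the latest value, running sum, count, and average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
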
Example #6
    def step(self,
             input_valid,
             target_valid,
             global_step,
             weights,
             input_valid2=None,
             target_valid2=None,
             model_opt=None):
        """Optimizer for the architecture params."""
        self.arch_optimizer.zero_grad()
        if self.meta_loss == 'default':
            loss, accuracy, loss1, loss2 = self.training_obj(
                input_valid, target_valid, weights, model_opt, input_valid2,
                target_valid2, global_step)
            loss, loss1, loss2 = torch.mean(loss), torch.mean(
                loss1), torch.mean(loss2)
        elif self.meta_loss == 'rebar':
            # compute loss with discrete weights
            with torch.no_grad():
                disc_weights = {
                    'normal': weights['dis_normal'],
                    'reduce': weights['dis_reduce']
                }

                loss_disc, accuracy, loss1, loss2 = self.training_obj(
                    input_valid, target_valid, disc_weights, model_opt,
                    input_valid2, target_valid2, global_step)

            # compute baseline
            loss_cont, _, _, _ = self.training_obj(input_valid, target_valid,
                                                   weights, model_opt,
                                                   input_valid2, target_valid2,
                                                   global_step)

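            # REBAR-style update: the relaxed (continuous) loss acts as a control
            # variate for the discrete-sample loss in the score-function term,
            # while loss_cont also contributes a pathwise gradient.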
            reward = (loss_disc - loss_cont).detach()
            log_q_d = self.alpha.module.log_prob(weights)
            loss = torch.mean(log_q_d * reward) + torch.mean(loss_cont)
            loss1, loss2 = torch.mean(loss1), torch.mean(loss2)

            if self.latency_cost:
                # train the surrogate function initially.
                if self.surrogate_not_train:
                    self.train_surrogate(input_valid)

                # sample a single architecture sample
                weight_lat = self.alpha(1)
                disc_weights_lat = {
                    'normal': weight_lat['dis_normal'],
                    'reduce': weight_lat['dis_reduce']
                }

                # compute latency for the discrete weights.
                elapsed_time = self.compute_latency(input_valid,
                                                    disc_weights_lat)
                # latency prediction for continuous weights
                self.surrogate.eval()
                alphas = self.alpha.module.get_arch_sample(weight_lat)
                latency_cont = self.surrogate(alphas)
                # latency prediction for discrete weights
                alphas = self.alpha.module.get_arch_sample(disc_weights_lat)
                latency_discrete = self.surrogate(alphas)
                surrogate_loss = torch.mean(
                    torch.abs(elapsed_time - latency_discrete.squeeze(1)))

                self.latency_coeff_curr = self.latency_coeff * max(
                    min(global_step / self.args.latency_iter, 1.0), 0.)
                loss_disc_lat = self.latency_coeff_curr * torch.relu(
                    torch.Tensor([elapsed_time]).cuda() - self.target_latency)
                loss_cont_lat = self.latency_coeff_curr * torch.relu(
                    latency_cont[0] - self.target_latency)

                # collect latency information
                self.latency_pred_loss.update(
                    utils.reduce_tensor(surrogate_loss.data,
                                        self.args.world_size))
                self.latency_value.update(elapsed_time)

                self.latency_actual.append(elapsed_time)
                self.latency_estimate.append(
                    latency_discrete.squeeze(1).data.cpu().numpy()[0])

                if global_step % 50 == 0:
                    self.logging.info('latency_pred_loss %f' % np.mean(
                        np.abs(
                            np.array(self.latency_actual)[-50:] -
                            np.array(self.latency_estimate)[-50:])))

                # saving some latency info
                if global_step % 1000 == 100 and self.args.local_rank == 0:
                    import pickle
                    print('saving')
                    with open(os.path.join(self.args.save, 'latency.pkl'),
                              'wb') as f:
                        pickle.dump([
                            self.latency_actual, self.latency_estimate,
                            global_step
                        ], f)

                reward = (loss_disc_lat - loss_cont_lat).detach()
                log_q_d = self.alpha.module.log_prob(weight_lat)
                loss = loss + torch.mean(
                    log_q_d * reward) + torch.mean(loss_cont_lat)

        elif self.meta_loss == 'reinforce':
            # compute loss with discrete weights
            with torch.no_grad():
                disc_weights = self.alpha.module.discretize(weights)
                loss_disc, accuracy, loss1, loss2 = self.training_obj(
                    input_valid, target_valid, disc_weights, model_opt,
                    input_valid2, target_valid2, global_step)

            reduce_loss_disc = utils.reduce_tensor(loss_disc.data,
                                                   self.args.world_size)
            avg = torch.mean(reduce_loss_disc).detach()
            baseline = self.exp_avg1.avg
            # update the moving average
            self.exp_avg1.update(avg)
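            # REINFORCE with a moving-average baseline: the advantage is the
            # discrete-sample loss minus the running mean of past losses.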
            reward = (loss_disc - baseline).detach()
            log_q_d = self.alpha.module.log_prob(weights)
            loss = torch.mean(log_q_d * reward) + baseline
            loss1, loss2 = torch.mean(loss1), torch.mean(loss2)

            if self.latency_cost:
                weight_lat = self.alpha(1)
                disc_weights_lat = self.alpha.module.discretize(weights)
                elapsed_time = self.compute_latency(input_valid,
                                                    disc_weights_lat)
                self.latency_coeff_curr = self.latency_coeff * min(
                    global_step / self.args.latency_iter, 1.0)
                loss_disc_lat = self.latency_coeff_curr * elapsed_time
                self.latency_value.update(elapsed_time)

                baseline = self.exp_avg2.avg
                # update the moving average
                self.exp_avg2.update(float(loss_disc_lat))
                reward = loss_disc_lat - baseline
                log_q_d = self.alpha.module.log_prob(weight_lat)
                loss = loss + torch.mean(log_q_d * reward) + baseline
                loss1, loss2 = torch.mean(loss1), torch.mean(loss2)

        entropy_loss = self.alpha.module.entropy_loss(weights)

        # Backward pass and update.
        loss.backward()
        self.arch_optimizer.step()
        # Logging.
        reduced_loss = utils.reduce_tensor(loss.data, self.args.world_size)
        accuracy = utils.reduce_tensor(accuracy, self.args.world_size)

        self.loss.update(to_python_float(reduced_loss), 1)
        self.accuracy.update(to_python_float(accuracy), 1)
        self.count += 1
        if self.count % self.report_freq == 0:
            self.logging.info('Meta Loss:%s %03d  %e %f', self.meta_loss,
                              self.count, self.loss.avg, self.accuracy.avg)
            self.writer.add_scalar('meta/loss', self.loss.avg, global_step)
            self.writer.add_scalar('meta/acc', self.accuracy.avg, global_step)
            self.writer.add_scalar(
                'meta/lr',
                self.arch_optimizer.state_dict()['param_groups'][0]['lr'],
                global_step)
            self.writer.add_scalar('meta/entropy', entropy_loss, global_step)

            if self.gen_error_alpha:
                self.writer.add_scalar('meta/loss_val', loss1, global_step)
                self.writer.add_scalar('meta/loss_cov', loss2, global_step)
                self.writer.add_scalar('meta/loss_diff_sign',
                                       self.loss_diff_sign.avg, global_step)

            if self.latency_cost:
                self.writer.add_scalar('meta/latency_time',
                                       self.latency_value.avg, global_step)
                self.writer.add_scalar('meta/latency_prediction_loss',
                                       self.latency_pred_loss.avg, global_step)
                self.writer.add_scalar('meta/latency_coeff',
                                       self.latency_coeff_curr, global_step)