Example #1
def valid(epoch):
    net.eval()
    valid_loss = 0.0
    correct = 0.0
    total = 0.0

    with torch.no_grad():
        for step, data in enumerate(valid_loader):
            x, y = data
            x, y = x.to(device), y.to(device)
            out = net(x)
            loss = criterion(out, y)

            _, pred = torch.max(out.data, 1)
            valid_loss += loss.item()
            total += y.size(0)
            correct += (pred == y).squeeze().sum().cpu().numpy()
    valid_acc = correct / total
    print("valid accuracy", valid_acc)
    logx.metric('val', {
        'loss': valid_loss,
        'accuracy': valid_acc
    },
                epoch=epoch)
    return valid_acc
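
All of these examples assume that runx's global logger has been imported and initialized once before training; a minimal setup might look like the following (the logdir path is an assumption):

from runx.logx import logx

# one-time setup: creates the log directory and, if requested, the
# TensorBoard writer that logx.metric / logx.add_scalar write to
logx.initialize(logdir='./logs/exp1', coolname=True, tensorboard=True)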
Example #2
def train(epoch):
    net.train()
    train_loss = 0.0
    correct = 0.0
    total = 0.0
    for step, data in enumerate(train_loader):
        x, y = data
        x, y = x.to(device), y.to(device)
        out = net(x)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, pred = torch.max(out.data, 1)
        total += y.size(0)
        correct += (pred == y).squeeze().sum().cpu().numpy()
        train_loss += loss.item()

        if step % 100 == 0:
            print("epoch", epoch, "step", step, "loss", loss.item())

    train_acc = correct / total
    print("train accuracy", train_acc)
    logx.metric('train', {
        'loss': train_loss,
        'accuracy': train_acc
    },
                epoch=epoch)
Example #3
def __train_per_epoch(self, epoch_idx: int, steps_per_eval: int):
        with tqdm(total=len(self.train_dataloader),
                  desc=f"Epoch {epoch_idx}") as pbar:
            for batch_idx, batch in enumerate(self.train_dataloader):
                global_step = epoch_idx * len(
                    self.train_dataloader) + batch_idx
                loss = self.__training_step(batch)
                if self.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                loss.backward()
                logx.metric(
                    'train', {
                        "tr_loss": loss.item(),
                        "learning_rate": self.scheduler.get_last_lr()[0]
                    }, global_step)
                pbar.set_postfix_str(f"tr_loss: {loss.item():.5f}")
                # update weights, then reset gradients for the next step
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.scheduler.step()  # update learning rate schedule
                if batch_idx % steps_per_eval == 0:
                    # validate and save checkpoints
                    # downsample a subset of dev dataset
                    eval_dataset = self.dev_dataloader.dataset
                    subset_size = len(eval_dataset) // 500
                    eval_sampled_dataloader = DataLoader(
                        Subset(
                            self.dev_dataloader.dataset,
                            random.sample(range(len(eval_dataset)),
                                          subset_size)),
                        shuffle=True,
                        batch_size=self.batch_size,
                        pin_memory=True)
                    mean_loss, metrics_scores, _, _ = self.validate(
                        eval_sampled_dataloader)
                    logx.metric('val', metrics_scores, global_step)
                    # save the unwrapped model when using DataParallel
                    model_to_save = (self.model.module
                                     if self.n_gpu > 1 else self.model)
                    save_dict = {
                        "model_construct_params_dict":
                        model_to_save.param_dict(),
                        "state_dict": model_to_save.state_dict(),
                        "solver_construct_params_dict": self.state_dict(),
                        "optimizer": self.optimizer.state_dict()
                    }

                    logx.save_model(save_dict,
                                    metric=mean_loss,
                                    epoch=global_step,
                                    higher_better=False)
                pbar.update(1)
Example #4
def train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            input = input.cuda(args.gpu, non_blocking=True)
        target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        metrics = {
            'loss': losses.avg,
            'top1': float(top1.avg),
            'top5': float(top5.avg)
        }
        logx.metric('train', metrics, i + epoch * len(train_loader))

        if i % args.print_freq == 0:
            logx.msg('Epoch: [{0}][{1}/{2}]\t'
                     'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                     'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                     'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                     'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                     'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                         epoch,
                         i,
                         len(train_loader),
                         batch_time=batch_time,
                         data_time=data_time,
                         loss=losses,
                         top1=top1,
                         top5=top5))
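
AverageMeter is not defined in this snippet; a minimal sketch consistent with how .val, .avg, and update(val, n) are used above:

class AverageMeter:
    """Tracks the latest value and a running average."""

    def __init__(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count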
Example #5
def validation(args, model, device, val_loader, optimizer, epoch, criterion):
    model.eval()
    n_val = len(val_loader)
    val_loss = 0
    val_psnr = 0
    for batch_idx, batch_data in enumerate(val_loader):
        batch_ldr0 = batch_data['input0'].to(device)
        batch_ldr1 = batch_data['input1'].to(device)
        batch_ldr2 = batch_data['input2'].to(device)
        label = batch_data['label'].to(device)

        with torch.no_grad():
            pred = model(batch_ldr0, batch_ldr1, batch_ldr2)
            pred = range_compressor_tensor(pred)
            pred = torch.clamp(pred, 0., 1.)

        loss = criterion(pred, label)
        psnr = batch_PSNR(pred, label, 1.0)
        logx.msg('Validation set: PSNR: {:.4f}'.format(psnr))

        iteration = (epoch - 1) * len(val_loader) + batch_idx
        if epoch % 100 == 0:
            logx.add_image('val/input1', batch_ldr0[0][[2, 1, 0], :, :],
                           iteration)
            logx.add_image('val/input2', batch_ldr1[0][[2, 1, 0], :, :],
                           iteration)
            logx.add_image('val/input3', batch_ldr2[0][[2, 1, 0], :, :],
                           iteration)
            logx.add_image('val/pred', pred[0][[2, 1, 0], :, :], iteration)
            logx.add_image('val/gt', label[0][[2, 1, 0], :, :], iteration)

        val_loss += loss.item()
        val_psnr += psnr

    val_loss /= n_val
    val_psnr /= n_val
    logx.msg('Validation set: Average loss: {:.4f}'.format(val_loss))
    logx.msg('Validation set: Average PSNR: {:.4f}\n'.format(val_psnr))

    # capture metrics
    metrics = {'psnr': val_psnr}
    logx.metric('val', metrics, epoch)
    # save_model
    save_dict = {
        'epoch': epoch + 1,
        'arch': 'AHDRNet',
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }

    logx.save_model(save_dict,
                    epoch=epoch,
                    metric=val_loss,
                    higher_better=False)  # lower loss is better
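
range_compressor_tensor and batch_PSNR are external helpers; plausible sketches, assuming the mu-law tone mapping (mu=5000) commonly used in HDR training and a per-batch mean PSNR:

import math
import torch

def range_compressor_tensor(x, mu=5000.0):
    # mu-law tone mapping: log(1 + mu*x) / log(1 + mu)
    return torch.log(1.0 + mu * x) / math.log(1.0 + mu)

def batch_PSNR(img, target, data_range):
    # mean PSNR over the batch, one MSE per image
    mse = torch.mean((img - target) ** 2, dim=[1, 2, 3])
    return (10.0 * torch.log10(data_range ** 2 / mse)).mean().item()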
Example #6
def validate(val_loader, model, criterion, args, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (input, target) in enumerate(val_loader):
            if args.gpu is not None:
                input = input.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(acc1[0], input.size(0))
            top5.update(acc5[0], input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                logx.msg('Test: [{0}/{1}]\t'
                         'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                         'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                         'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                         'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                             i,
                             len(val_loader),
                             batch_time=batch_time,
                             loss=losses,
                             top1=top1,
                             top5=top5))

        logx.msg(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
            top1=top1, top5=top5))

        metrics = {'top1': float(top1.avg), 'top5': float(top5.avg)}
        logx.metric('val', metrics, epoch)

    return top1.avg
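
The accuracy helper is assumed; a sketch matching the acc1[0] / acc5[0] indexing above (it returns one-element tensors holding percentages):

def accuracy(output, target, topk=(1,)):
    """Computes the top-k accuracy (percent) for the given logits."""
    with torch.no_grad():
        maxk = max(topk)
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / target.size(0)))
        return res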
Example #7
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            logx.msg('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

            # capture metrics
            metrics = {'loss': loss.item()}
            iteration = epoch * len(train_loader) + batch_idx
            logx.metric('train', metrics, iteration)
Example #8
def test_epoch(epoch):
    model.eval()
    losses = 0.0
    total, correct = 0.0, 0.0
    with torch.no_grad():
        for step, (x, y) in enumerate(val_loader):
            x, y = x.to(config.device), y.to(config.device)
            out = model(x)
            loss = criterion(out, y)
            losses += loss.cpu().detach().numpy()
            _, pred = torch.max(out.data, 1)
            total += y.size(0)
            correct += (pred == y).squeeze().sum().cpu().numpy()
    save_dict = {
        'state_dict': model.state_dict()
    }
    logx.msg("epoch {} validation loss {} validation acc {}".format(epoch, losses / (step + 1), correct / total))
    logx.metric('val', {'loss': losses / (step + 1), 'acc': correct / total})
    logx.save_model(save_dict, losses, epoch, higher_better=False, delete_old=True)
Example #9
def train(args, model, device, train_loader, optimizer, epoch, criterion):
    model.train()
    epoch_loss = 0
    for batch_idx, batch_data in enumerate(train_loader):
        batch_ldr0 = batch_data['input0'].to(device)
        batch_ldr1 = batch_data['input1'].to(device)
        batch_ldr2 = batch_data['input2'].to(device)
        label = batch_data['label'].to(device)

        pred = model(batch_ldr0, batch_ldr1, batch_ldr2)
        pred = range_compressor_tensor(pred)
        pred = torch.clamp(pred, 0., 1.)
        loss = criterion(pred, label)
        psnr = batch_PSNR(pred, label, 1.0)
        # psnr = batch_PSNR(torch.clamp(pred, 0., 1.), label, 1.0)

        epoch_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        # nn.utils.clip_grad_value_(model.parameters(), 0.01)
        optimizer.step()

        iteration = (epoch - 1) * len(train_loader) + batch_idx
        if batch_idx % args.log_interval == 0:
            logx.msg('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * label.size(0), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            logx.add_scalar('train/learning_rate',
                            optimizer.param_groups[0]['lr'], iteration)
            logx.add_scalar('train/psnr', psnr, iteration)
            logx.add_image('train/input1', batch_ldr0[0][[2, 1, 0], :, :],
                           iteration)
            logx.add_image('train/input2', batch_ldr1[0][[2, 1, 0], :, :],
                           iteration)
            logx.add_image('train/input3', batch_ldr2[0][[2, 1, 0], :, :],
                           iteration)
            logx.add_image('train/pred', pred[0][[2, 1, 0], :, :], iteration)
            logx.add_image('train/gt', label[0][[2, 1, 0], :, :], iteration)

        # capture metrics
        metrics = {'loss': loss.item()}
        logx.metric('train', metrics, iteration)
Example #10
def train_epoch(epoch):
    model.train()
    losses = 0.0
    total, correct = 0.0, 0.0
    for step, (x, y) in enumerate(train_loader):
        x, y = x.to(config.device), y.to(config.device)
        out = model(x)
        loss = criterion(out, y)
        losses += loss.cpu().detach().numpy()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, pred = torch.max(out.data, 1)
        total += y.size(0)
        correct += (pred == y).squeeze().sum().cpu().numpy()

        if step % 100 == 0:
            logx.msg("epoch {} step {} training loss {}".format(
                epoch, step, loss.item()))
    logx.msg("epoch {} training loss {} training acc {}".format(
        epoch, losses / (step + 1), correct / total))
    logx.metric("train", {"loss": losses / (step + 1), 'acc': correct / total},
                epoch)
    return losses
Example #11
def callback_func(env):
    """
    Callback that records R^2 and RMSE on the dev set
    (the raw RMSE value is logged under the "MSE" key).
    """
    if (env.evaluation_result_list[0][0] == "dev-NegrSquare"
            and env.evaluation_result_list[1][0] == "dev-rmse"):
        eval_dict = {
            "R2": -env.evaluation_result_list[0][1],
            "MSE": env.evaluation_result_list[1][1],
        }
    elif (env.evaluation_result_list[0][0] == "dev-rmse"
          and env.evaluation_result_list[1][0] == "dev-NegrSquare"):
        eval_dict = {
            "MSE": env.evaluation_result_list[0][1],
            "R2": -env.evaluation_result_list[1][1],
        }
    else:
        eval_dict = {
            env.evaluation_result_list[0][0]: env.evaluation_result_list[0][1],
            env.evaluation_result_list[1][0]: env.evaluation_result_list[1][1],
        }
    logx.metric('val',
                eval_dict,
                env.iteration)
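
The env argument matches LightGBM's callback protocol (env.iteration, env.evaluation_result_list). A hypothetical wiring with stand-in data; the 'dev' valid_name must match the 'dev-...' prefixes checked above:

import numpy as np
import lightgbm as lgb

X, y = np.random.rand(200, 5), np.random.rand(200)
train_set = lgb.Dataset(X[:150], label=y[:150])
dev_set = lgb.Dataset(X[150:], label=y[150:], reference=train_set)
booster = lgb.train({'objective': 'regression', 'metric': 'rmse'},
                    train_set,
                    valid_sets=[dev_set],
                    valid_names=['dev'],
                    callbacks=[callback_func])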
Example #12
def test(args, model, device, test_loader, epoch, optimizer):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    accuracy = 100. * correct / len(test_loader.dataset)
    logx.msg(
        '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset), accuracy))

    # capture metrics
    metrics = {'loss': test_loss, 'accuracy': accuracy}
    logx.metric('val', metrics, epoch)

    # save model
    save_dict = {
        'epoch': epoch + 1,
        'arch': 'lenet',
        'state_dict': model.state_dict(),
        'accuracy': accuracy,
        'optimizer': optimizer.state_dict()
    }

    logx.save_model(save_dict,
                    metric=accuracy,
                    epoch=epoch,
                    higher_better=True)
Example #13
def __train_per_epoch(self, epoch_idx, steps_per_eval):
    # with tqdm(total=len(self.train_dataloader), desc=f"Epoch {epoch_idx}") as pbar:
    for batch_idx, batch in enumerate(self.train_dataloader):
        # assume that the whole input matrix fits the GPU memory
        global_step = epoch_idx * len(self.train_dataloader) + batch_idx
        training_set_loss, training_set_outputs, \
            training_set_output_similarity = self.__training_step(batch)
        if batch_idx + 1 == len(self.train_dataloader):
            # validate and save checkpoints
            developing_set_outputs, developing_set_metrics_scores, \
                developing_set_output_similarity = \
                self.validate(self.dev_dataloader)
            # TODO: this part can be optimized to batchwise computing
            if self.record_training_loss_per_epoch:
                training_set_metrics_scores, _ = self.get_scores(
                    self.train_decoder, training_set_outputs,
                    self.train_dataloader.dataset.anchor_idx)
            else:
                training_set_metrics_scores = dict()
            training_set_metrics_scores['loss'] = training_set_loss.item()
            if self.scheduler:
                training_set_metrics_scores['learning_rate'] = \
                    self.scheduler.get_last_lr()[0]
            logx.metric('train', training_set_metrics_scores, global_step)
            logx.metric('val', developing_set_metrics_scores, global_step)
            # save the unwrapped model when using DataParallel
            model_to_save = (self.model.module
                             if self.n_gpu > 1 else self.model)
            save_dict = {
                "model_construct_dict": model_to_save.config,
                "model_state_dict": model_to_save.state_dict(),
                "solver_construct_params_dict": self.construct_param_dict,
                "optimizer": self.optimizer.state_dict(),
                "train_scores": training_set_metrics_scores,
                "train_input_embedding": self.train_dataloader.dataset.x,
                "train_input_similarity":
                self.train_dataloader.dataset.input_similarity,
                "train_output_embedding": training_set_outputs,
                "train_output_similarity": training_set_output_similarity,
                "dev_scores": developing_set_metrics_scores,
                "dev_input_embeddings": self.dev_dataloader.dataset.x,
                "dev_input_similarity":
                self.dev_dataloader.dataset.input_similarity,
                "dev_output_embedding": developing_set_outputs,
                "dev_output_similarity": developing_set_output_similarity,
            }
            logx.save_model(
                save_dict,
                metric=developing_set_metrics_scores['Recall@1'],
                epoch=global_step,
                higher_better=True)
Example #14
                              batch_size=args.batch_size,
                              shuffle=True)
    valid_dataset = Train_Dataset('./data/new_valid.csv',
                                  './data/train/',
                                  transform=valid_transformer)
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=args.batch_size)
    best_accuracy = 0
    for epoch in range(args.epochs):
        print("epoch:" + str(epoch))
        train_acc, train_loss = train(my_model,
                                      train_loader,
                                      optimizer,
                                      scheduler=scheduler)
        metric_train = {'train_acc': train_acc, 'train_loss': train_loss}
        logx.metric('train', metric_train, epoch)
        # torch.save({'state_dict': my_model.state_dict()}, './weights/resnet50_last.pth')
        valid_acc, valid_loss = valid(my_model, valid_loader)
        metric_valid = {'valid_acc': valid_acc, 'valid_loss': valid_loss}
        logx.metric('val', metric_valid, epoch)
        if valid_acc > best_accuracy:
            best_accuracy = valid_acc
            torch.save({'state_dict': my_model.state_dict()},
                       './logs/exp9/highest_valid_acc.pth')
        logx.save_model({'state_dict': my_model.state_dict()},
                        valid_loss,
                        epoch,
                        higher_better=False,
                        delete_old=True)
        print("current_acc:{0}, best_acc:{1}".format(valid_acc, best_accuracy))
Example #15
def train(train_loader, net, optim, curr_epoch, scaler):
    """
    Runs the training loop for one epoch
    train_loader: data loader for training
    net: the network
    optim: optimizer
    curr_epoch: current epoch
    scaler: AMP gradient scaler
    return: average train loss, mean batch time, total wall time
    """
    full_bt = time.perf_counter()
    net.train()

    train_main_loss = AverageMeter()
    start_time = None
    warmup_iter = 10
    optim.last_batch = len(train_loader) - 1
    btimes = []
    batch_time = time.perf_counter()
    for i, data in enumerate(train_loader):
        lr_warmup(optim, curr_epoch, i, len(train_loader), max_lr=0.4)

        if i <= warmup_iter:
            start_time = time.time()
        # inputs = (bs,3,713,713)
        # gts    = (bs,713,713)
        images, gts, _img_name, scale_float = data
        batch_pixel_size = images.size(0) * images.size(2) * images.size(3)
        images, gts, scale_float = (images.cuda(), gts.cuda(),
                                    scale_float.cuda())
        inputs = {'images': images, 'gts': gts}
        optim.zero_grad()
        if args.amp:
            with amp.autocast():
                main_loss = net(inputs)
                log_main_loss = main_loss.clone().detach_()
                # torch.distributed.all_reduce(log_main_loss,
                #                              torch.distributed.ReduceOp.SUM)
                log_wait = optim.comm.Iallreduce(MPI.IN_PLACE, log_main_loss,
                                                 MPI.SUM)
                # log_main_loss = log_main_loss / args.world_size
            # train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            scaler.scale(main_loss).backward()
        else:
            main_loss = net(inputs)
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()
            log_wait = None
            #train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            main_loss.backward()

        # the scaler update is within the optim step
        optim.step()

        if i >= warmup_iter:
            curr_time = time.time()
            batches = i - warmup_iter + 1
            batchtime = (curr_time - start_time) / batches
        else:
            batchtime = 0

        if log_wait is not None:
            log_wait.Wait()
        log_main_loss = log_main_loss / args.world_size
        train_main_loss.update(log_main_loss.item(), batch_pixel_size)

        msg = ('[epoch {}], [iter {} / {}], [train main loss {:0.6f}],'
               ' [lr {:0.6f}] [batchtime {:0.3g}]')
        msg = msg.format(curr_epoch, i + 1, len(train_loader),
                         train_main_loss.avg,
                         optim.local_optimizer.param_groups[-1]['lr'],
                         batchtime)
        logx.msg(msg)

        metrics = {
            'loss': train_main_loss.avg,
            'lr': optim.local_optimizer.param_groups[-1]['lr']
        }
        curr_iter = curr_epoch * len(train_loader) + i
        logx.metric('train', metrics, curr_iter)

        if i >= 10 and args.test_mode:
            del data, inputs, gts
            return
        btimes.append(time.perf_counter() - batch_time)
        batch_time = time.perf_counter()

    if args.benchmarking:
        train_loss_tens = torch.tensor(train_main_loss.avg)
        optim.comm.Allreduce(MPI.IN_PLACE, train_loss_tens, MPI.SUM)
        train_loss_tens = train_loss_tens.to(torch.float)
        train_loss_tens /= float(optim.comm.size)
        train_main_loss.avg = train_loss_tens.item()

    return train_main_loss.avg, torch.mean(
        torch.tensor(btimes)), time.perf_counter() - full_bt
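
lr_warmup is an external helper; a minimal linear-warmup sketch consistent with the call above (the one-epoch warmup length is an assumption, and optim.local_optimizer mirrors the wrapped-optimizer access used in the loop):

def lr_warmup(optim, epoch, batch_idx, batches_per_epoch, max_lr):
    warmup_steps = batches_per_epoch  # warm up over the first epoch
    step = epoch * batches_per_epoch + batch_idx
    if step < warmup_steps:
        lr = max_lr * (step + 1) / warmup_steps
        for group in optim.local_optimizer.param_groups:
            group['lr'] = lr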
Example #16
def train(train_loader, net, optim, curr_epoch):
    """
    Runs the training loop per epoch
    train_loader: Data loader for train
    net: thet network
    optimizer: optimizer
    curr_epoch: current epoch
    return:
    """
    net.train()

    train_main_loss = AverageMeter()
    start_time = None
    warmup_iter = 10
    loss_metric = dict([('epoch', []), ('loss', []), ('lr', [])])
    for i, data in enumerate(train_loader):
        if i <= warmup_iter:
            start_time = time.time()
        # inputs = (bs,3,713,713)
        # gts    = (bs,713,713)
        images, gts, _img_name, scale_float = data
        batch_pixel_size = images.size(0) * images.size(2) * images.size(3)
        images, gts, scale_float = (images.cuda(), gts.cuda(),
                                    scale_float.cuda())
        inputs = {'images': images, 'gts': gts}

        optim.zero_grad()
        main_loss = net(inputs)

        if args.apex:
            log_main_loss = main_loss.clone().detach_()
            torch.distributed.all_reduce(log_main_loss,
                                         torch.distributed.ReduceOp.SUM)
            log_main_loss = log_main_loss / args.world_size
        else:
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()

        train_main_loss.update(log_main_loss.item(), batch_pixel_size)
        if args.fp16:
            with amp.scale_loss(main_loss, optim) as scaled_loss:
                scaled_loss.backward()
        else:
            main_loss.backward()

        optim.step()

        if i >= warmup_iter:
            curr_time = time.time()
            batches = i - warmup_iter + 1
            batchtime = (curr_time - start_time) / batches
        else:
            batchtime = 0

        msg = ('[epoch {}], [iter {} / {}], [train main loss {:0.6f}],'
               ' [lr {:0.6f}] [batchtime {:0.3g}]')
        msg = msg.format(curr_epoch, i + 1, len(train_loader),
                         train_main_loss.avg, optim.param_groups[-1]['lr'],
                         batchtime)
        logx.msg(msg)

        metrics = {
            'loss': train_main_loss.avg,
            'lr': optim.param_groups[-1]['lr']
        }
        curr_iter = curr_epoch * len(train_loader) + i
        logx.metric('train', metrics, curr_iter)
        loss_metric['epoch'].append(curr_epoch)
        loss_metric['loss'].append(train_main_loss.avg)
        loss_metric['lr'].append(optim.param_groups[-1]['lr'])

        if i >= 10 and args.test_mode:
            del data, inputs, gts
            return
        del data
Example #17
def train():
    for fold in range(5):
        logx.initialize(get_logdir("../runs"),
                        tensorboard=True,
                        coolname=False)

        model.load_state_dict(
            torch.load("../runs/exp10/last_checkpoint_ep0.pth")
            ['state_dict'])  # warmup

        dataset_train = TrainDataset(
            '../' + cfg.root_folder +
            '/five_fold/train_kfold_{}.csv'.format(fold),
            '../' + cfg.root_folder + '/train/', train_transform)
        train_loader = DataLoader(dataset_train,
                                  batch_size=cfg.bs,
                                  shuffle=True)
        test_data = TrainDataset(
            '../' + cfg.root_folder +
            '/five_fold/test_kfold_{}.csv'.format(fold),
            '../' + cfg.root_folder + '/train/',
        )
        test_load = DataLoader(test_data, batch_size=cfg.bs, shuffle=False)

        # train
        for epoch in range(cfg.epoch):
            model.train()  # reset train mode after each validation pass
            loss_epoch = 0
            total = 0
            correct = 0
            for i, (x, y) in enumerate(train_loader, 1):
                x, y = x.to(device), y.to(device)
                y_hat = model(x)
                # track accuracy
                total += x.size(0)
                _, predict = torch.max(y_hat.data, dim=1)
                correct += (predict == y).sum().item()

                # loss
                loss = criterion(y_hat, y)
                loss_epoch += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # print training progress
                if i % 30 == 0:
                    print(
                        'epoch:%d,  enumerate:%d,  loss_avg:%f,  now_acc:%f' %
                        (epoch, i, loss_epoch / i, correct / total))

            # log per-epoch train metrics
            train_loss = loss_epoch / i
            train_acc = (correct / total) * 100
            logx.metric('train', {'loss': train_loss, 'acc': train_acc}, epoch)

            # valid
            # accuracy on the dev set
            model.eval()
            correct = 0
            total = 0
            val_loss = 0
            with torch.no_grad():
                for i, (img, label) in enumerate(test_load, 1):
                    img, label = img.to(device), label.to(device)
                    output = model(img)
                    loss = criterion(output, label)
                    val_loss += loss.cpu().item()
                    _, predicted = torch.max(output.data, dim=1)  # (max value, index)
                    total += img.size(0)
                    correct += (predicted == label).sum().item()
            val_acc = (100 * correct / total)
            val_loss /= i
            logx.metric('val', {'loss': val_loss, 'acc': val_acc}, epoch)
            # per-epoch loss and other metrics
            print(
                'epoch over; train_loss:%f, val_loss:%f, train_acc=%f, val_acc:%f'
                % (train_loss, val_loss, train_acc, val_acc))
            logx.save_model({
                'state_dict': model.state_dict(),
                'epoch': epoch
            },
                            val_acc,
                            higher_better=True,
                            epoch=epoch,
                            delete_old=True)
            scheduler.step()
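
get_logdir is not shown; a hypothetical helper matching its use above, which hands logx.initialize a fresh experiment directory for each fold:

import os

def get_logdir(root):
    os.makedirs(root, exist_ok=True)
    run_id = len(os.listdir(root))  # exp0, exp1, ... per run
    return os.path.join(root, 'exp{}'.format(run_id))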
Example #18
def eval_metrics(iou_acc, args, net, optim, val_loss, epoch, mf_score=None):
    """
    Modified IOU mechanism for on-the-fly IOU calculations ( prevents memory
    overflow for large dataset) Only applies to eval/eval.py
    """
    was_best = False

    iou_per_scale = {}
    iou_per_scale[1.0] = iou_acc
    if args.amp or args.apex:
        iou_acc_tensor = torch.cuda.FloatTensor(iou_acc)
        torch.distributed.all_reduce(iou_acc_tensor,
                                     op=torch.distributed.ReduceOp.SUM)
        iou_per_scale[1.0] = iou_acc_tensor.cpu().numpy()

    scales = [1.0]

    # Only rank 0 should save models and calculate metrics
    if args.global_rank != 0:
        return None, 0

    hist = iou_per_scale[args.default_scale]
    iu, acc, acc_cls = calculate_iou(hist)
    iou_per_scale = {args.default_scale: iu}

    # calculate iou for other scales
    for scale in scales:
        if scale != args.default_scale:
            iou_per_scale[scale], _, _ = calculate_iou(iou_per_scale[scale])

    print_evaluate_results(hist,
                           iu,
                           epoch=epoch,
                           iou_per_scale=iou_per_scale,
                           log_multiscale_tb=args.log_msinf_to_tb)

    freq = hist.sum(axis=1) / hist.sum()
    mean_iu = np.nanmean(iu)
    fwavacc = (freq[freq > 0] * iu[freq > 0]).sum()

    metrics = {
        'loss': val_loss.avg,
        'mean_iu': mean_iu,
        'acc_cls': acc_cls,
        'acc': acc,
    }
    logx.metric('val', metrics, epoch)
    logx.msg('Mean: {:2.2f}'.format(mean_iu * 100))

    save_dict = {
        'epoch': epoch,
        'arch': args.arch,
        'num_classes': cfg.DATASET_INST.num_classes,
        'state_dict': net.state_dict(),
        'optimizer': (optim.lcl_optimizer.state_dict()
                      if args.heat else optim.state_dict()),
        'mean_iu': mean_iu,
        'command': ' '.join(sys.argv[1:])
    }
    logx.save_model(save_dict, metric=mean_iu, epoch=epoch)
    torch.cuda.synchronize()

    if mean_iu > args.best_record['mean_iu']:
        was_best = True

        args.best_record['val_loss'] = val_loss.avg
        if mf_score is not None:
            args.best_record['mask_f1_score'] = mf_score.avg
        args.best_record['acc'] = acc
        args.best_record['acc_cls'] = acc_cls
        args.best_record['fwavacc'] = fwavacc
        args.best_record['mean_iu'] = mean_iu
        args.best_record['epoch'] = epoch

    logx.msg('-' * 107)
    if mf_score is None:
        fmt_str = ('{:5}: [epoch {}], [val loss {:0.5f}], [acc {:0.5f}], '
                   '[acc_cls {:.5f}], [mean_iu {:.5f}], [fwavacc {:0.5f}]')
        current_scores = fmt_str.format('this', epoch, val_loss.avg, acc,
                                        acc_cls, mean_iu, fwavacc)
        logx.msg(current_scores)
        best_scores = fmt_str.format('best', args.best_record['epoch'],
                                     args.best_record['val_loss'],
                                     args.best_record['acc'],
                                     args.best_record['acc_cls'],
                                     args.best_record['mean_iu'],
                                     args.best_record['fwavacc'])
        logx.msg(best_scores)
    else:
        fmt_str = ('{:5}: [epoch {}], [val loss {:0.5f}], [mask f1 {:.5f} ] '
                   '[acc {:0.5f}], '
                   '[acc_cls {:.5f}], [mean_iu {:.5f}], [fwavacc {:0.5f}]')
        current_scores = fmt_str.format('this', epoch, val_loss.avg,
                                        mf_score.avg, acc, acc_cls, mean_iu,
                                        fwavacc)
        logx.msg(current_scores)
        best_scores = fmt_str.format(
            'best', args.best_record['epoch'], args.best_record['val_loss'],
            args.best_record['mask_f1_score'], args.best_record['acc'],
            args.best_record['acc_cls'], args.best_record['mean_iu'],
            args.best_record['fwavacc'])
        logx.msg(best_scores)
    logx.msg('-' * 107)

    return was_best, mean_iu
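
calculate_iou is assumed to be the usual confusion-matrix computation; a sketch matching the (iu, acc, acc_cls) returns used above:

import numpy as np

def calculate_iou(hist):
    # hist is a num_classes x num_classes confusion matrix
    acc = np.diag(hist).sum() / hist.sum()
    acc_cls = np.nanmean(np.diag(hist) / hist.sum(axis=1))
    iu = np.diag(hist) / (
        hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
    return iu, acc, acc_cls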
Example #19
def train_net():
    header = [
        'epoch', 'train_loss', 'val_loss', 'val_dice', 'val_iou', 'lr',
        'time(s)'
    ]
    start_epoch, global_step, best_score, total_list = -1, 1, 0.0, []
    if args.vis:
        viz = Visualizer(port=args.port,
                         env=f"EXP_{args.exp_id}_NET_{args.arch}")

    # Resume the training process
    if args.resume:
        start_epoch = resume(args=args)

    # automatic mixed-precision training
    if args.amp_available:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(start_epoch + 1, args.epochs):
        args.net.train()

        epoch_loss, epoch_start_time, rows = 0., time(), [epoch + 1]

        # get the current learning rate
        new_lr = get_lr(args=args, epoch=epoch)

        # Training process
        with tqdm(total=n_train,
                  desc=f'Epoch-{epoch + 1}/{args.epochs}',
                  unit='img') as p_bar:
            for batch in train_loader:
                # args.optimizer.zero_grad()
                image, label = batch['image'], batch['label']
                assert image.shape[1] == args.n_channels

                # Prepare the image and the corresponding label.
                image = image.to(device=args.device, dtype=torch.float32)
                mask_type = torch.float32 if args.n_classes == 1 else torch.long
                label = label.to(device=args.device, dtype=mask_type)

                # Forward propagation.
                if args.amp_available:
                    with torch.cuda.amp.autocast():
                        try:
                            output = args.net(image)
                        except RuntimeError as exception:
                            if "out of memory" in str(exception):
                                print("WARNING: out of memory")
                                if hasattr(torch.cuda, 'empty_cache'):
                                    torch.cuda.empty_cache()
                                exit(0)
                            else:
                                raise exception
                        loss = criterion(output, label)
                else:
                    output = args.net(image)
                    loss = criterion(output, label)

                # visualize the image.
                if args.vis:
                    try:
                        viz.img(name='ground_truth', img_=label[0])
                        tmp = output[0]
                        tmp[tmp > 0.5] = 1.0
                        tmp[tmp < 0.5] = 0.0
                        viz.img(name='prediction', img_=tmp)
                    except ConnectionError:
                        pass

                args.optimizer.zero_grad()
                # Back propagation.
                if args.amp_available:
                    scaler.scale(loss).backward()
                    scaler.step(args.optimizer)
                    scaler.update()
                else:
                    loss.backward()
                    args.optimizer.step()

                global_step += 1
                epoch_loss += loss.item()
                logx.add_scalar('Loss/train', loss.item(), global_step)
                p_bar.set_postfix(**{'loss (batch)': loss.item()})
                p_bar.update(image.shape[0])

        # Calculate the average train loss over the epoch
        train_loss = epoch_loss / (n_train // args.batch_size)
        metrics = {'train_loss': train_loss}
        logx.metric(phase='train', metrics=metrics, epoch=epoch)

        # Validate process
        val_score, val_loss = eval_net(criterion, logx, epoch, val_loader,
                                       n_val, args)

        # Update the current learning rate; with the ReduceLROnPlateau
        # scheduler you should pass the monitored metric to step() instead.
        if args.sche != "Poly":
            args.scheduler.step()

        # Calculating and logging the metrics
        metrics = {
            'val_loss': val_loss,
            'iou': val_score['iou'],
            'dc': val_score['dc'],
            'sp': val_score['sp'],
            'se': val_score['se'],
            'acc': val_score['acc'],
        }
        logx.metric(phase='val', metrics=metrics, epoch=epoch)

        # Print the metrics
        print(
            "\033[1;33;44m=============================Evaluation result=============================\033[0m"
        )
        logx.msg("[Train] Loss: %.4f | LR: %.6f" % (train_loss, new_lr))
        logx.msg("[Valid] Loss: %.4f | ACC: %.4f | IoU: %.4f | DC: %.4f" % (
            val_loss,
            metrics['acc'],
            metrics['iou'],
            metrics['dc'],
        ))
        rows += [train_loss, val_loss, metrics['dc'], metrics['iou'], new_lr]

        # Logging the image to tensorboard
        logx.add_image('image', torch.cat([i for i in image], 2), epoch)
        logx.add_image('label/gt', torch.cat([j for j in label], 2), epoch)
        logx.add_image('label/pd', torch.cat([k > 0.5 for k in output], 2),
                       epoch)

        # Update the best score
        best_score, tm = update_score(args, best_score, val_score, logx, epoch,
                                      epoch_start_time)
        rows.append(tm)
        total_list.append(rows)

        # Saving the model with relevant parameters
        save_model(args, epoch, new_lr, interval=10)

    data = pd.DataFrame(total_list)
    file_path = os.path.join(args.dir_log, 'metrics.csv')
    data.to_csv(file_path,
                header=header,
                index=False,
                mode='w',
                encoding='utf-8')
    plot_curve(file_path, args.dir_log, show=True)