Example #1
def train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, epoch, configs, logger, tb_writer):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')

    progress = ProgressMeter(len(train_dataloader), [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}/{}]".format(epoch, configs.num_epochs))

    criterion = Compute_Loss(device=configs.device)
    num_iters_per_epoch = len(train_dataloader)
    # switch to train mode
    model.train()
    start_time = time.time()
    for batch_idx, batch_data in enumerate(tqdm(train_dataloader)):
        data_time.update(time.time() - start_time)
        metadatas, imgs, targets = batch_data
        batch_size = imgs.size(0)
        global_step = num_iters_per_epoch * (epoch - 1) + batch_idx + 1
        for k in targets.keys():
            targets[k] = targets[k].to(configs.device, non_blocking=True)
        imgs = imgs.to(configs.device, non_blocking=True).float()
        outputs = model(imgs)
        total_loss, loss_stats = criterion(outputs, targets)
        # For torch.nn.DataParallel case
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)

        # compute gradient and perform backpropagation
        total_loss.backward()
        if global_step % configs.subdivisions == 0:
            optimizer.step()
            # zero the parameter gradients
            optimizer.zero_grad()

            # ######################### Sersy #########################################
            # Adjust learning rate
            # if configs.step_lr_in_epoch:
            #     lr_scheduler.step()
            #     if tb_writer is not None:
            #         tb_writer.add_scalar('LR', lr_scheduler.get_lr()[0], global_step)

        if configs.distributed:
            reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
        else:
            reduced_loss = total_loss.data
        losses.update(to_python_float(reduced_loss), batch_size)
        # measure elapsed time
        # torch.cuda.synchronize()
        batch_time.update(time.time() - start_time)

        if tb_writer is not None:
            if (global_step % configs.tensorboard_freq) == 0:
                loss_stats['avg_loss'] = losses.avg
                tb_writer.add_scalars('Train', loss_stats, global_step)
        # Log message
        if logger is not None:
            if (global_step % configs.print_freq) == 0:
                logger.info(progress.get_message(batch_idx))

        start_time = time.time()
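Every example in this listing assumes an AverageMeter helper (and several also assume a ProgressMeter) that the snippets themselves do not define. Below is a minimal sketch in the style of the official PyTorch ImageNet example, with a get_message method added because Examples #1 and #3 call it; the exact implementations in the source repositories may differ.

class AverageMeter(object):
    """Tracks the latest value, running sum, count, and average of a metric."""

    def __init__(self, name='', fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    """Formats a batch counter plus a list of AverageMeters into one log line."""

    def __init__(self, num_batches, meters, prefix=""):
        num_digits = len(str(num_batches))
        self.batch_fmtstr = '[{:' + str(num_digits) + 'd}/' + str(num_batches) + ']'
        self.meters = meters
        self.prefix = prefix

    def get_message(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        return '\t'.join(entries)

    def display(self, batch):
        print(self.get_message(batch))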
Example #2
def train(train_loader, model, criterion, optimizer, args):
    model.train()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    running_metric_text = runningScore(2)
    running_metric_kernel = runningScore(2)

    end = time.time()
    for batch_idx, (imgs, gt_texts, gt_kernels,
                    training_masks) in enumerate(train_loader):
        data_time.update(time.time() - end)

        imgs = Variable(imgs.cuda())
        gt_texts = Variable(gt_texts.cuda())
        gt_kernels = Variable(gt_kernels.cuda())
        training_masks = Variable(training_masks.cuda())

        outputs = model(imgs)
        texts = outputs[:, 0, :, :]
        kernels = outputs[:, 1:, :, :]

        loss = criterion(texts, gt_texts, kernels, gt_kernels, training_masks)
        losses.update(loss.item(), imgs.size(0))

        optimizer.zero_grad()
        loss.backward()

        if (args.sr_lr is not None):
            updateBN(model, args)

        optimizer.step()

        score_text = cal_text_score(texts, gt_texts, training_masks,
                                    running_metric_text)
        score_kernel = cal_kernel_score(kernels, gt_kernels, gt_texts,
                                        training_masks, running_metric_kernel)

        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % 20 == 0:
            output_log = '({batch}/{size}) Batch: {bt:.3f}s | TOTAL: {total:.0f}min | ETA: {eta:.0f}min | Loss: {loss:.4f} | Acc_t: {acc: .4f} | IOU_t: {iou_t: .4f} | IOU_k: {iou_k: .4f}'.format(
                batch=batch_idx + 1,
                size=len(train_loader),
                bt=batch_time.avg,
                total=batch_time.avg * batch_idx / 60.0,
                eta=batch_time.avg * (len(train_loader) - batch_idx) / 60.0,
                loss=losses.avg,
                acc=score_text['Mean Acc'],
                iou_t=score_text['Mean IoU'],
                iou_k=score_kernel['Mean IoU'])
            print(output_log)
            sys.stdout.flush()

    return (losses.avg, score_text['Mean Acc'], score_kernel['Mean Acc'],
            score_text['Mean IoU'], score_kernel['Mean IoU'])
Example #3
def train_one_epoch(train_loader, model, optimizer, epoch, configs, logger):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')

    progress = ProgressMeter(len(train_loader),
                             [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()
    start_time = time.time()
    for batch_idx, (origin_imgs, resized_imgs, org_ball_pos_xy,
                    global_ball_pos_xy, event_class,
                    target_seg) in enumerate(tqdm(train_loader)):
        data_time.update(time.time() - start_time)
        batch_size = resized_imgs.size(0)
        target_seg = target_seg.to(configs.device, non_blocking=True)
        resized_imgs = resized_imgs.to(configs.device,
                                       non_blocking=True).float()
        # Only move origin_imgs to cuda if the model has local stage for ball detection
        if not configs.no_local:
            origin_imgs = origin_imgs.to(configs.device,
                                         non_blocking=True).float()
            # compute output
            pred_ball_global, pred_ball_local, pred_events, pred_seg, local_ball_pos_xy, total_loss, _ = model(
                origin_imgs, resized_imgs, org_ball_pos_xy, global_ball_pos_xy,
                event_class, target_seg)
        else:
            pred_ball_global, pred_ball_local, pred_events, pred_seg, local_ball_pos_xy, total_loss, _ = model(
                None, resized_imgs, org_ball_pos_xy, global_ball_pos_xy,
                event_class, target_seg)
        # For torch.nn.DataParallel case
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)

        # zero the parameter gradients
        optimizer.zero_grad()
        # compute gradient and perform backpropagation
        total_loss.backward()
        optimizer.step()

        losses.update(total_loss.item(), batch_size)
        # measure elapsed time
        batch_time.update(time.time() - start_time)

        # Log message
        if logger is not None:
            if ((batch_idx + 1) % configs.print_freq) == 0:
                logger.info(progress.get_message(batch_idx))

        start_time = time.time()

    return losses.avg
Example #4
def evaluate_one_epoch(val_loader, model, epoch, configs, logger):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')

    conf_thresh = 0.5
    nms_thresh = 0.5
    iou_threshold = 0.5

    progress = ProgressMeter(len(val_loader), [batch_time, data_time],
                             prefix="Evaluate - Epoch: [{}/{}]".format(
                                 epoch, configs.num_epochs))
    labels = []
    sample_metrics = []  # List of tuples (TP, confs, pred)
    # switch to evaluate mode
    model.eval()
    with torch.no_grad():
        start_time = time.time()
        for batch_idx, batch_data in enumerate(tqdm(val_loader)):
            data_time.update(time.time() - start_time)
            _, imgs, targets = batch_data
            # Extract labels
            labels += targets[:, 1].tolist()
            # Rescale target
            targets[:, 2:] *= configs.img_size
            imgs = imgs.to(configs.device, non_blocking=True)

            outputs = model(imgs)
            outputs = post_processing(outputs,
                                      conf_thresh=conf_thresh,
                                      nms_thresh=nms_thresh)

            sample_metrics += get_batch_statistics_rotated_bbox(
                outputs, targets, iou_threshold=iou_threshold)

            # measure elapsed time
            # torch.cuda.synchronize()
            batch_time.update(time.time() - start_time)

            # Log message
            if logger is not None:
                if ((batch_idx + 1) % configs.print_freq) == 0:
                    logger.info(progress.get_message(batch_idx))

            start_time = time.time()

        # Concatenate sample statistics
        true_positives, pred_scores, pred_labels = [
            np.concatenate(x, 0) for x in list(zip(*sample_metrics))
        ]
        precision, recall, AP, f1, ap_class = ap_per_class(
            true_positives, pred_scores, pred_labels, labels)

    return precision, recall, AP, f1, ap_class
Example #5
def train(train_dataloader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, data in enumerate(train_dataloader):

        # measure data loading time
        data_time.update(time.time() - end)

        # get the inputs; data is a list of [inputs, labels]
        inputs, targets = data
        inputs = inputs.to(device)
        targets = targets.to(device)

        # compute output
        output = model(inputs)
        loss = criterion(output, targets)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, targets, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1, inputs.size(0))
        top5.update(prec5, inputs.size(0))

        # compute gradients in a backward pass
        optimizer.zero_grad()
        loss.backward()

        # Call step of optimizer to update model params
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 5 == 0:
            print(
                f"Epoch [{epoch + 1}] [{i}/{len(train_dataloader)}]\t"
                f"Time {data_time.val:.3f} ({data_time.avg:.3f})\t"
                f"Loss {loss.item():.4f}\t"
                f"Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                f"Prec@5 {top5.val:.3f} ({top5.avg:.3f})",
                end="\r")
    torch.save(model.state_dict(),
               f"./checkpoints/{opt.datasets}_epoch_{epoch + 1}.pth")
Example #6
def train_one_epoch(train_loader, model, optimizer, epoch, configs, logger):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')

    progress = ProgressMeter(len(train_loader),
                             [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}/{}]".format(
                                 epoch, configs.num_epochs))

    # switch to train mode
    model.train()
    start_time = time.time()
    for batch_idx, (resized_imgs, org_ball_pos_xy, global_ball_pos_xy,
                    target_events,
                    target_seg) in enumerate(tqdm(train_loader)):
        data_time.update(time.time() - start_time)
        batch_size = resized_imgs.size(0)
        target_seg = target_seg.to(configs.device, non_blocking=True)
        resized_imgs = resized_imgs.to(configs.device,
                                       non_blocking=True).float()
        pred_ball_global, pred_ball_local, pred_events, pred_seg, local_ball_pos_xy, total_loss, _ = model(
            resized_imgs, org_ball_pos_xy, global_ball_pos_xy, target_events,
            target_seg)
        # For torch.nn.DataParallel case
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)

        # zero the parameter gradients
        optimizer.zero_grad()
        # compute gradient and perform backpropagation
        total_loss.backward()
        optimizer.step()

        if configs.distributed:
            reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
        else:
            reduced_loss = total_loss.data
        losses.update(to_python_float(reduced_loss), batch_size)
        # measure elapsed time
        torch.cuda.synchronize()
        batch_time.update(time.time() - start_time)

        # Log message
        if logger is not None:
            if ((batch_idx + 1) % configs.print_freq) == 0:
                logger.info(progress.get_message(batch_idx))

        start_time = time.time()

    return losses.avg
Example #7
def train(train_loader, net, optim, curr_epoch, writer):
    """
    Runs the training loop per epoch
    train_loader: Data loader for train
    net: thet network
    optimizer: optimizer
    curr_epoch: current epoch
    writer: tensorboard writer
    return:
    """
    net.train()

    train_main_loss = AverageMeter()
    curr_iter = curr_epoch * len(train_loader)

    for i, data in enumerate(train_loader):
        inputs, gts, _img_name = data

        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)

        inputs, gts = inputs.cuda(), gts.cuda()

        optim.zero_grad()

        main_loss = net(inputs, gts=gts)

        if args.apex:
            log_main_loss = main_loss.clone().detach_()
            torch.distributed.all_reduce(log_main_loss, torch.distributed.ReduceOp.SUM)
            log_main_loss = log_main_loss / args.world_size
        else:
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()

        train_main_loss.update(log_main_loss.item(), batch_pixel_size)
        if args.fp16:
            with amp.scale_loss(main_loss, optim) as scaled_loss:
                scaled_loss.backward()
        else:
            main_loss.backward()

        optim.step()

        curr_iter += 1

        if args.local_rank == 0:
            msg = '[epoch {}], [iter {} / {}], [train main loss {:0.6f}], [lr {:0.6f}]'.format(
                curr_epoch, i + 1, len(train_loader), train_main_loss.avg,
                optim.param_groups[-1]['lr'])

            logging.info(msg)

            # Log tensorboard metrics for each iteration of the training phase
            writer.add_scalar('training/loss', (train_main_loss.val),
                              curr_iter)
            writer.add_scalar('training/lr', optim.param_groups[-1]['lr'],
                              curr_iter)

        if i > 5 and args.test_mode:
            return
Example #8
def validate(model, dataset, opt, ctx):
    """Test on validation dataset."""
    detector = PoseDetector(opt)
    detector.model = model

    results = {}
    num_iters = len(dataset)
    bar = Bar('{}'.format(opt.exp_id), max=num_iters)
    time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
    avg_time_stats = {t: AverageMeter() for t in time_stats}
    print("Reporting every 1000 images...")

    for ind in range(num_iters):
        img_id = dataset.images[ind]
        img_info = dataset.coco.loadImgs(ids=[img_id])[0]
        img_path = os.path.join(dataset.img_dir, img_info['file_name'])

        ret = detector.run(img_path)
        results[img_id] = ret['results']
        Bar.suffix = '[{0}/{1}]|Tot: {total:} |ETA: {eta:} '.format(
            ind, num_iters, total=bar.elapsed_td, eta=bar.eta_td)
        for t in avg_time_stats:
            avg_time_stats[t].update(ret[t])
            Bar.suffix = Bar.suffix + '|{} {:.3f} '.format(
                t, avg_time_stats[t].avg)
        if ind % 1000 == 0:
            bar.next()

    bar.finish()
    dataset.run_eval(results=results, save_dir='./output/')
Example #9
    def inference_classification(self):
        self.model.eval()
        self.model.module.mode = 0
        val_accuracy = AverageMeter()

        with torch.no_grad():
            final_itr = tqdm(self.test_loader, ncols=80, desc='Inference (instance) ...')

            for i, (input, labels) in enumerate(final_itr):
                input  = input.to(self.device)
                labels = labels.to(self.device)

                logits = self.model(input)[0]
                preds  = self.model.module.pooling.predictions(logits)

                accuracy = (preds == labels).sum().item() / labels.shape[0]
                val_accuracy.append(accuracy)

                final_itr.set_description('--- (test) | Accuracy: {:.3f}  :'.format(
                    val_accuracy.avg())
                )

        err = val_accuracy.avg()
        fp = open(os.path.join(self.logdir, 'meanscores.csv'), 'w')
        fp.write('Accuracy: {:.4f} \n'.format(err))
        fp.close()
Example #10
def train_single_epoch(model, criterion, optimizer, train_loader, epoch, is_cuda):
    model.train() # switch to train mode
    avg_loss = AverageMeter()
    end = time.time()
    running_loss = 0.0

    for i, (inputs, labels) in enumerate(train_loader, 0):
        # wrap them in Variable
        if is_cuda:
            labels = labels.cuda(non_blocking=True)
            inputs = inputs.cuda(non_blocking=True)

        input_var = torch.autograd.Variable( inputs )
        label_var = torch.autograd.Variable( labels )

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        output = model(input_var)
        loss = criterion(output, label_var)
        loss.backward()
        optimizer.step()

        # print statistics
        avg_loss.update(loss.item(), labels.size(0))

    return avg_loss.avg
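Below is a minimal, hypothetical driver for train_single_epoch above. The toy model, synthetic data, and optimizer settings are placeholders chosen only to make the call runnable; none of them come from the original repository.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

is_cuda = torch.cuda.is_available()
model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
if is_cuda:
    model = model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Synthetic stand-in data: 256 fake 1x28x28 images with 10 classes.
fake_imgs = torch.randn(256, 1, 28, 28)
fake_labels = torch.randint(0, 10, (256,))
train_loader = DataLoader(TensorDataset(fake_imgs, fake_labels), batch_size=32, shuffle=True)

for epoch in range(3):
    epoch_loss = train_single_epoch(model, criterion, optimizer, train_loader, epoch, is_cuda)
    print('epoch {}: avg loss {:.4f}'.format(epoch, epoch_loss))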
Example #11
def validate(val_dataloader, model, configs):
    losses = AverageMeter('Loss', ':.4e')
    criterion = Compute_Loss(device=configs.device)
    # switch to evaluate mode
    model.eval()
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(val_dataloader)):
            metadatas, imgs, targets = batch_data
            batch_size = imgs.size(0)
            for k in targets.keys():
                targets[k] = targets[k].to(configs.device, non_blocking=True)
            imgs = imgs.to(configs.device, non_blocking=True).float()
            outputs = model(imgs)
            total_loss, loss_stats = criterion(outputs, targets)
            # For torch.nn.DataParallel case
            if (not configs.distributed) and (configs.gpu_idx is None):
                total_loss = torch.mean(total_loss)

            if configs.distributed:
                reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
            else:
                reduced_loss = total_loss.data
            losses.update(to_python_float(reduced_loss), batch_size)

    return losses.avg
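Several of the distributed examples (e.g. #1, #11, and #17) also call reduce_tensor and to_python_float helpers that are not shown. The sketch below follows the common NVIDIA/apex-style versions; this is an assumption about those repositories, not a verbatim copy.

import torch
import torch.distributed as dist


def reduce_tensor(tensor, world_size):
    """Average a (typically scalar) tensor across all distributed workers."""
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= world_size
    return rt


def to_python_float(t):
    """Convert a zero-dim tensor (or plain number) to a Python float for logging."""
    if hasattr(t, 'item'):
        return t.item()
    return float(t)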
Example #12
def train(epoch):
    net.train()
    # track the average loss; reset to 0 at the start of each epoch
    train_loss = AverageMeter()
    # each iteration calls __getitem__, which applies the transforms
    curr_iter = (epoch - 1) * len(trainloader)
    for i, (inputs, labels) in enumerate(trainloader):
        if args.cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        N = inputs.size(0)
        # clear the gradients
        optimizer.zero_grad()
        outputs = net(inputs)
        # compute the per-sample loss
        loss = criterion(outputs, labels) / N
        # backpropagate and update the parameters
        loss.backward()
        optimizer.step()
        train_loss.update(loss.item(), N)
        curr_iter += 1

        #writer.add_scalar('train_loss', train_loss.avg, curr_iter)
        #if (i + 1) % args.trainInterval == 0:
        print('[epoch %d], [iter %d / %d], [train loss %.5f]' %
              (epoch, i + 1, len(trainloader), train_loss.avg))
Example #13
def train(train_loader, net, criterion, optim, curr_epoch, scheduler, max_iter):
    """
    Runs the training loop per epoch
    train_loader: Data loader for train
    net: thet network
    optimizer: optimizer
    curr_epoch: current epoch
    writer: tensorboard writer
    return:
    """
    net.train()
    train_total_loss = AverageMeter()
    time_meter = AverageMeter()

    curr_iter = curr_epoch * len(train_loader)

    for i, data in enumerate(train_loader):
        if curr_iter >= max_iter:
            break
        start_ts = time.time()
        inputs, gts = data
        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gts = inputs.cuda(), gts.cuda()
        optim.zero_grad()
        outputs = net(inputs)
        total_loss = criterion(outputs, gts)
        log_total_loss = total_loss.clone().detach_()
        train_total_loss.update(log_total_loss.item(), batch_pixel_size)

        total_loss.backward()
        optim.step()
        scheduler.step()

        time_meter.update(time.time() - start_ts)

        del total_loss

        curr_iter += 1
        if i % 50 == 49:

            msg = '[epoch {}], [iter {} / {} : {}], [loss {:0.6f}], [lr {:0.6f}], [time {:0.4f}]'.format(
                      curr_epoch, i + 1, len(train_loader), curr_iter, train_total_loss.avg,
                      optim.param_groups[-1]['lr'], time_meter.avg / args.batch_size)
            logging.info(msg)
            train_total_loss.reset()
            time_meter.reset()
    return curr_iter
Example #14
def validate(val_loader, net, criterion, optim, curr_epoch, writer):
    """
    Runs the validation loop after each training epoch
    val_loader: Data loader for validation
    net: thet network
    criterion: loss fn
    optimizer: optimizer
    curr_epoch: current epoch
    writer: tensorboard writer
    return: val_avg for step function if required
    """

    net.eval()
    val_loss = AverageMeter()
    iou_acc = 0
    dump_images = []

    for val_idx, data in enumerate(val_loader):
        inputs, gt_image, img_names = data
        assert len(inputs.size()) == 4 and len(gt_image.size()) == 3
        assert inputs.size()[2:] == gt_image.size()[1:]

        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gt_cuda = inputs.cuda(), gt_image.cuda()

        with torch.no_grad():
            output = net(inputs)  # output = (1, 19, 713, 713)

        assert output.size()[2:] == gt_image.size()[1:]
        assert output.size()[1] == args.dataset_cls.num_classes

        val_loss.update(criterion(output, gt_cuda).item(), batch_pixel_size)
        predictions = output.data.max(1)[1].cpu()

        # Logging
        if val_idx % 20 == 0:
            if args.local_rank == 0:
                logging.info("validating: %d / %d", val_idx + 1, len(val_loader))
        if val_idx > 10 and args.test_mode:
            break

        # Image Dumps
        if val_idx < 10:
            dump_images.append([gt_image, predictions, img_names])

        iou_acc += fast_hist(predictions.numpy().flatten(), gt_image.numpy().flatten(),
                             args.dataset_cls.num_classes)
        del output, val_idx, data

    if args.apex:
        iou_acc_tensor = torch.cuda.FloatTensor(iou_acc)
        torch.distributed.all_reduce(iou_acc_tensor, op=torch.distributed.ReduceOp.SUM)
        iou_acc = iou_acc_tensor.cpu().numpy()

    if args.local_rank == 0:
        evaluate_eval(args, net, optim, val_loss, iou_acc, dump_images,
                      writer, curr_epoch, args.dataset_cls)

    return val_loss.avg
Example #15
    def test(self, epoch):
        batch_time = AverageMeter('Time', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        top1 = AverageMeter('Acc@1', ':6.2f')
        top5 = AverageMeter('Acc@5', ':6.2f')
        progress = ProgressMeter(len(self.test_loader),
                                 [batch_time, losses, top1, top5],
                                 prefix='Test: ')

        # switch to test mode
        self.model.eval()

        with torch.no_grad():
            end = time.time()
            for i, (images, target) in enumerate(self.test_loader):
                images = images.cuda()
                target = target.cuda()

                # compute output
                output, _ = self.model(images)
                loss = self.criterion(output, target)

                # measure accuracy and record loss
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                losses.update(loss.item(), images.size(0))
                top1.update(acc1[0], images.size(0))
                top5.update(acc5[0], images.size(0))

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                if i % self.args.print_freq == 0 and self.args.local_rank == 0:
                    progress.display(i)

            if self.args.local_rank == 0:
                print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
                    top1=top1, top5=top5))
                self.writer.add_scalar('Test/Avg_Loss', losses.avg, epoch + 1)
                self.writer.add_scalar('Test/Avg_Top1', top1.avg, epoch + 1)
                self.writer.add_scalar('Test/Avg_Top5', top5.avg, epoch + 1)
                self.summary_graph_adj(self.writer, epoch + 1)
                self.summary_graph_histogram(self.writer, epoch + 1)

        return top1.avg
Example #16
def evaluate(val_loader, net):
    '''
    Runs the evaluation loop and prints F score
    val_loader: Data loader for validation
    net: the network
    return:
    '''
    net.eval()
    for thresh in args.eval_thresholds.split(','):
        mf_score1 = AverageMeter()
        mf_pc_score1 = AverageMeter()
        ap_score1 = AverageMeter()
        ap_pc_score1 = AverageMeter()
        Fpc = np.zeros((args.dataset_cls.num_classes))
        Fc = np.zeros((args.dataset_cls.num_classes))
        for vi, data in enumerate(val_loader):
            input, mask, edge, img_names = data
            assert len(input.size()) == 4 and len(mask.size()) == 3
            assert input.size()[2:] == mask.size()[1:]
            h, w = mask.size()[1:]

            batch_pixel_size = input.size(0) * input.size(2) * input.size(3)
            input, mask_cuda, edge_cuda = input.cuda(), mask.cuda(), edge.cuda(
            )

            with torch.no_grad():
                seg_out, edge_out = net(input)

            seg_predictions = seg_out.data.max(1)[1].cpu()
            edge_predictions = edge_out.max(1)[0].cpu()

            logging.info('evaluating: %d / %d' % (vi + 1, len(val_loader)))
            _Fpc, _Fc = eval_mask_boundary(seg_predictions.numpy(),
                                           mask.numpy(),
                                           args.dataset_cls.num_classes,
                                           bound_th=float(thresh))
            Fc += _Fc
            Fpc += _Fpc

            del seg_out, edge_out, vi, data

        logging.info('Threshold: ' + thresh)
        logging.info('F_Score: ' +
                     str(np.sum(Fpc / Fc) / args.dataset_cls.num_classes))
        logging.info('F_Score (Classwise): ' + str(Fpc / Fc))
Example #17
def validate(val_dataloader, model, configs):
    losses = AverageMeter('Loss', ':.4e')
    criterion = Compute_Loss(device=configs.device)
    # switch to evaluate mode
    model.eval()
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(val_dataloader)):
            metadatas, targets = batch_data
            batch_size = len(metadatas['img_path'])
            voxelinput = metadatas['voxels']
            coorinput = metadatas['coors']
            numinput = metadatas['num_points']

            for k in targets.keys():
                targets[k] = targets[k].to(configs.device, non_blocking=True)
            #imgs = imgs.to(configs.device, non_blocking=True).float()

            dtype = torch.float32
            voxelinputr = torch.tensor(voxelinput,
                                       dtype=torch.float32,
                                       device=configs.device).to(dtype)

            coorinputr = torch.tensor(coorinput,
                                      dtype=torch.int32,
                                      device=configs.device)

            numinputr = torch.tensor(numinput,
                                     dtype=torch.int32,
                                     device=configs.device)

            try:
                outputs = model(voxelinputr, coorinputr, numinputr)
            except RuntimeError as exception:
                if "out of memory" in str(exception):
                    print("WARNING: out of memory, skipping this batch")
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                    # outputs is undefined after an OOM, so skip this batch
                    continue
                else:
                    raise exception

            #outputs = model(voxelinputr, coorinputr, numinputr)

            total_loss, loss_stats = criterion(outputs, targets)
            # For torch.nn.DataParallel case
            if (not configs.distributed) and (configs.gpu_idx is None):
                total_loss = torch.mean(total_loss)

            if configs.distributed:
                reduced_loss = reduce_tensor(total_loss.data,
                                             configs.world_size)
            else:
                reduced_loss = total_loss.data
            losses.update(to_python_float(reduced_loss), batch_size)

    return losses.avg
Example #18
def train(model, train_dataset, criterion, optimizer, epoch, device, args):
    BATCH_SIZE = args.batch_size
    ITER_SIZE = args.iter_size
    TOTAL_TRAIN_DATA = train_dataset.len
    NUM_PTS = args.num_pts
    NUM_BATCH = int(np.ceil((TOTAL_TRAIN_DATA / (BATCH_SIZE * ITER_SIZE))))

    data_idx = 0
    model = model.train()
    losses = AverageMeter()

    tot_loss = []
    fastprint("Training... ")
    for batch_idx in range(NUM_BATCH):
        loss_sum = 0
        optimizer.zero_grad()
        for _iter in range(ITER_SIZE):
            data = train_dataset.getitem(data_idx)
            points, label, indptr, indices = data['data'], \
                    data['label'], \
                    data['indptr'], \
                    data['indices']
            points, label, indptr, indices = torch.from_numpy(points), \
                                            torch.from_numpy(label.reshape(-1)), \
                                            torch.from_numpy(indptr), \
                                            torch.from_numpy(indices)
            points, label, indptr, indices = points.view(NUM_PTS, -1), \
                                            label.view(-1), \
                                            indptr.view(-1), \
                                            indices.view(-1)
            points, label, indptr, indices = Variable(points).float(), \
                                            Variable(label).type(torch.LongTensor), \
                                            indptr, indices
            points, label, indptr, indices = points.to(device), \
                    label.to(device), \
                    indptr.to(device), \
                    indices.to(device)

            pred = model(points, indptr, indices)
            loss = criterion(pred, label) / ITER_SIZE
            loss.backward()

            loss_sum += loss.item()
            data_idx += 1
            losses.update(loss.item(), label.size(0))

        optimizer.step()

        tot_loss.append(loss_sum)
        fastprint('[%d: %d/%d] train loss: %f' %
                  (epoch, batch_idx, NUM_BATCH, loss_sum))

    torch.save(model.state_dict(), '%s/cls_model_%d.pth' % (args.outf, epoch))
    np.savez(os.path.join(args.outf, 'TrainLoss_epoch_{}.npz'.format(epoch)), loss=tot_loss)
Example #19
    def training(self, epoch, prefix='Train', evaluation=False):
        self.model.train()
        if evaluation:
            self.evaluator.reset()

        train_losses = AverageMeter()
        tbar = tqdm(self.train_dataloader, desc='\r', total=self.iters_per_epoch)  # cap the number of iterations per epoch; counting starts from 0

        if self.writer:
            self.writer.add_scalar(f'{prefix}/learning_rate', get_learning_rate(self.optimizer), epoch)

        for i, sample in enumerate(tbar):
            image, target = sample['img'], sample['target']
            image, target = image.to(self.device), target.to(self.device)
            if self.args.optimizer == 'SGD':
                self.lr_scheduler(self.optimizer, i, epoch)  # each iteration

            output = self.model(image)
            loss = self.criterion(output, target)  # multiple output loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            train_losses.update(loss.item())
            tbar.set_description('Epoch {}, Train loss: {:.3f}'.format(epoch, train_losses.avg))

            if evaluation:
                output = F.interpolate(output[-1], size=(target.size(1), target.size(2)), mode='bilinear', align_corners=True)
                pred = torch.argmax(output, dim=1)
                self.evaluator.add_batch(target.cpu().numpy(), pred.cpu().numpy())  # B,H,W

            # even though tqdm has a total, we still need to break out of the loop manually
            if i == self.iters_per_epoch - 1:
                break

        if self.writer:
            self.writer.add_scalar(f'{prefix}/loss', train_losses.val, epoch)
            if evaluation:
                Acc = self.evaluator.Pixel_Accuracy()
                mIoU = self.evaluator.Mean_Intersection_over_Union()
                print('Epoch: {}, Acc_pixel:{:.3f}, mIoU:{:.3f}'.format(epoch, Acc, mIoU))

                self.writer.add_scalars(f'{prefix}/IoU', {
                    'mIoU': mIoU,
                    # 'mDice': mDice,
                }, epoch)
                self.writer.add_scalars(f'{prefix}/Acc', {
                    'acc_pixel': Acc,
                    # 'acc_class': Acc_class
                }, epoch)
Example #20
def test(model, test_dataset, criterion, epoch, device, args):
    fastprint('Evaluation ... ')
    TOTAL_TEST_DATA = test_dataset.len
    NUM_PTS = args.num_pts

    test_loss = 0.0
    correct = 0.0
    losses = AverageMeter()

    model = model.eval()

    with torch.no_grad():
        for idx in range(TOTAL_TEST_DATA):
            data = test_dataset.getitem(idx)
            points, label, indptr, indices = data['data'], \
                    data['label'], \
                    data['indptr'], \
                    data['indices']
            points, label, indptr, indices = torch.from_numpy(points), \
                    torch.from_numpy(label.reshape(-1)), \
                    torch.from_numpy(indptr), \
                    torch.from_numpy(indices)
            points, label, indptr, indices = points.view(
                NUM_PTS, -1), label.view(-1), indptr.view(-1), indices.view(-1)
            points, label, indptr, indices = Variable(points).float(), Variable(
                label).type(torch.LongTensor), indptr, indices
            points, label, indptr, indices = points.to(device), label.to(
                device), indptr.to(device), indices.to(device)

            pred = model(points, indptr, indices)
            loss = criterion(pred, label)
            # get the index of the max log-probability
            pred = pred.argmax(dim=1, keepdim=True)

            test_loss += loss.item()
            losses.update(loss.item(), label.size(0))
            correct += pred.eq(label.view_as(pred)).sum().item()

    test_loss /= float(TOTAL_TEST_DATA)
    acc = 100. * correct / float(TOTAL_TEST_DATA)
    fastprint('Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, TOTAL_TEST_DATA, acc))

    return acc
Example #21
def validate(cfg, valid_loader, GENERATORS, cri_mse, device, epoch, save_path,
             is_visual):
    print("=" * 30)
    print("START VALIDATON")
    psnr_result = OrderedDict()
    # switch to evaluate mode
    for name, models in GENERATORS.items():
        if models is not None:
            models.eval()
            psnr_result['{}'.format(name)] = AverageMeter()

    for _, data in enumerate(valid_loader):
        gt_img, gt_filename, blur_img, blur_filename = data

        batch_size = gt_img.size(0)
        gt_img, blur_img = prepare([gt_img, blur_img], device)

        with torch.no_grad():
            outputs = GENERATORS['netG'](blur_img)

        mse = cri_mse(gt_img, outputs[-1])
        psnr = 10 * log10(1 / mse.item())
        psnr_result['{}'.format(name)].update(psnr, batch_size)

        if is_visual:
            gt_filename = gt_filename[0]
            blur_filename = blur_filename[0]

            save_out_path = os.path.join(save_path, 'output')
            make_dir(save_out_path)
            save_ep_path = os.path.join(save_out_path, 'ep_{}'.format(epoch))
            make_dir(save_ep_path)

            output_list = [o[0, :, :, :] for o in outputs]
            output_list = tensor2img_list(output_list)
            for i in range(len(output_list)):
                save_name = os.path.join(
                    save_ep_path, '{}_out{}.png'.format(blur_filename, i))
                cv2.imwrite(save_name, output_list[i])

    return psnr_result
Example #22
def validate(val_loader, net, criterion, optim, scheduler, curr_epoch, curr_iter):
    """
    Runs the validation loop after each training epoch
    val_loader: Data loader for validation
    net: thet network
    criterion: loss fn
    optimizer: optimizer
    curr_epoch: current epoch
    return: val_avg for step function if required
    """

    net.eval()
    val_loss = AverageMeter()
    iou_acc = 0
    error_acc = 0

    for val_idx, data in enumerate(val_loader):
        inputs, gts = data
        assert len(inputs.size()) == 4 and len(gts.size()) == 3
        assert inputs.size()[2:] == gts.size()[1:]

        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gts = inputs.cuda(), gts.cuda()

        with torch.no_grad():
            output = net(inputs)
        del inputs
        assert output.size()[2:] == gts.size()[1:]
        assert output.size()[1] == args.num_classes
        val_loss.update(criterion(output, gts).item(), batch_pixel_size)

        predictions = output.data.max(1)[1].cpu()
        # Logging
        if val_idx % 20 == 0:
            logging.info("validating: %d / %d", val_idx + 1, len(val_loader))
        iou_acc += fast_hist(predictions.numpy().flatten(), gts.cpu().numpy().flatten(),
                             args.num_classes)
        del gts, output, val_idx, data

    per_cls_iou = evaluate_eval(args, net, optim, scheduler, val_loss, iou_acc, curr_epoch, args.dataset, curr_iter)
    return val_loss.avg, per_cls_iou
Example #23
def validate(val_loader,
             net,
             criterion,
             optim,
             epoch,
             calc_metrics=True,
             dump_assets=False,
             dump_all_images=False):
    """
    Run validation for one epoch

    :val_loader: data loader for validation
    :net: the network
    :criterion: loss fn
    :optimizer: optimizer
    :epoch: current epoch
    :calc_metrics: calculate validation score
    :dump_assets: dump attention prediction(s) images
    :dump_all_images: dump all images, not just N
    """
    val_time = time.perf_counter()
    dumper = ImageDumper(val_len=len(val_loader),
                         dump_all_images=dump_all_images,
                         dump_assets=dump_assets,
                         dump_for_auto_labelling=args.dump_for_auto_labelling,
                         dump_for_submission=args.dump_for_submission,
                         rank=rank)

    net.eval()
    val_loss = AverageMeter()
    iou_acc = 0
    for val_idx, data in enumerate(val_loader):
        input_images, labels, img_names, _ = data
        if args.dump_for_auto_labelling or args.dump_for_submission:
            submit_fn = '{}.png'.format(img_names[0])
            if val_idx % 20 == 0:
                logx.msg(
                    f'validating[Iter: {val_idx + 1} / {len(val_loader)}]')
            if os.path.exists(os.path.join(dumper.save_dir, submit_fn)):
                continue

        # Run network
        assets, _iou_acc = \
            eval_minibatch(data, net, criterion, val_loss, calc_metrics,
                          args, val_idx)

        iou_acc += _iou_acc

        input_images, labels, img_names, _ = data

        if optim.comm.rank == 0:
            dumper.dump(
                {
                    'gt_images': labels,
                    'input_images': input_images,
                    'img_names': img_names,
                    'assets': assets
                }, val_idx)

        if val_idx > 5 and args.test_mode:
            break

        if val_idx % 2 == 0 and optim.comm.rank == 0:
            logx.msg(f'validating[Iter: {val_idx + 1} / {len(val_loader)}]')

    # average the loss value
    val_loss_tens = torch.tensor(val_loss.val)
    optim.comm.Allreduce(MPI.IN_PLACE, val_loss_tens, MPI.SUM)
    val_loss_tens = val_loss_tens.to(torch.float)
    val_loss_tens /= float(optim.comm.size)
    val_loss.val = val_loss_tens.item()
    # sum up the iou_acc
    optim.comm.Allreduce(MPI.IN_PLACE, iou_acc, MPI.SUM)

    # was_best = False
    if calc_metrics:
        # was_best = eval_metrics(iou_acc, args, net, optim, val_loss, epoch)
        _, mean_iu = eval_metrics(iou_acc, args, net, optim, val_loss, epoch)

    optim.comm.bcast(mean_iu, root=0)
    # was_best = optim.comm.bcast(was_best, root=0)
    #
    # # Write out a summary html page and tensorboard image table
    # if not args.dump_for_auto_labelling and not args.dump_for_submission and optim.comm.rank == 0:
    #     dumper.write_summaries(was_best)
    return val_loss.val, mean_iu, time.perf_counter() - val_time
Example #24
def train(args, train_loader, model, optimizer, criterion, epoch):
    logger = logging.getLogger('train')

    log_dir = os.path.join('log', args.env)
    if not os.path.isdir(log_dir):
        logger.info('log dir does not exist, create log dir.')
        os.makedirs(log_dir)
    fh = logging.FileHandler(os.path.join(log_dir, 'train.log'), mode='a+')
    fh.setLevel(logging.INFO)
    logger.addHandler(fh)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accu = AverageMeter()

    model = model.train()
    end = time.time()
    optimizer = optimizer

    for i, (image, target) in enumerate(train_loader):

        # measure data loading time
        data_time.update(time.time() - end)
        image = image.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        image_var, target_var = Variable(image), Variable(target)

        # compute output
        output = model(image_var)

        loss = criterion(output, target_var)

        # update the cls
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = 10
        losses.update(loss.item())
        accu.update(acc)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if ((i + 1) % args.print_freq) == 0:
            logger.info('Epoch: [{0}][{1}/{2}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'accuracy {accu.val:.4f} ({accu.avg:.4f})\t'.format(
                            epoch + 1,
                            i + 1,
                            len(train_loader),
                            batch_time=batch_time,
                            data_time=data_time,
                            loss=losses,
                            accu=accu))
            sys.stdout.flush()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'accuracy {accu.val:.4f} ({accu.avg:.4f})\t'.format(
                      epoch + 1,
                      i + 1,
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses,
                      accu=accu))

    logger.info(' * Loss: {losses.avg:.3f} accuracy:{accu.avg:.3f}'.format(
        losses=losses, accu=accu))
    print(' * Loss: {losses.avg:.3f} accuracy:{accu.avg:.3f}'.format(
        losses=losses, accu=accu))
    return losses.avg, accu.avg
Example #25
def validate(args, val_loader, model, criterion, criterion2):

    logger = logging.getLogger('val')
    log_dir = os.path.join('log', args.env)
    if not os.path.isdir(log_dir):
        logger.info('log dir does not exist, create log dir.')
        os.makedirs(log_dir)
    fh = logging.FileHandler(os.path.join(log_dir, 'val.log'), mode='a+')
    fh.setLevel(logging.INFO)
    logger.addHandler(fh)

    batch_time = AverageMeter()
    losses = AverageMeter()
    accu = AverageMeter()

    # switch to evaluate mode
    model = model.eval()

    end = time.time()
    for i, (image, target) in enumerate(val_loader):

        # measure data loading time
        image = image.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        image_var, target_var = Variable(image), Variable(target)

        # compute output
        output = model(image_var)
        loss = criterion(output, target_var)
        acc = 10
        losses.update(loss.item())
        accu.update(acc)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if ((i + 1) % args.print_freq) == 0:
            logger.info('Test: [{0}/{1}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'accuracy {accu.val:.4f} ({accu.avg:.4f})\t'.format(
                            i + 1,
                            len(val_loader),
                            batch_time=batch_time,
                            loss=losses,
                            accu=accu))
            sys.stdout.flush()
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'accuracy {accu.val:.4f} ({accu.avg:.4f})\t'.format(
                      i + 1,
                      len(val_loader),
                      batch_time=batch_time,
                      loss=losses,
                      accu=accu))

    logger.info(' * Loss: {losses.avg:.3f} accuracy:{accu.avg:.3f}'.format(
        losses=losses, accu=accu))
    print(' * Loss: {losses.avg:.3f} accuracy:{accu.avg:.3f}'.format(
        losses=losses, accu=accu))
    return losses.avg, accu.avg
Example #26
        x = self.fc1(x)
        x = self.fc2(x)
        return x


net = nn.DataParallel(Net())
net.to(device)
torchsummary.summary(net, (1, 28, 28))

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

# We are now ready to train our network, which in Keras is done via a call to the `fit` method of the network:
# we "fit" the model to its training data.
for epoch in range(5):  # loop over the dataset multiple times
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    net.train()
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        losses.update(loss.item(), inputs.size(0))
Example #27
    def train_epoch(self, epoch_num):
        batch_time = AverageMeter()
        losses = AverageMeter()
        acces = AverageMeter()

        self.model.train()

        end = time.time()
        for iter_i, batch_data in enumerate(self.train_loader):
            image_inputs = batch_data['image']
            mean_normal = batch_data['mean_normal']
            room_mask = batch_data['room_mask']
            if self.configs.mode == 'room_corner':
                corner_map = batch_data['corner_map']
            else:
                corner_map = torch.stack(
                    [batch_data['corners_map'], batch_data['edge_map']], 1)
            label = batch_data['label']

            if self.configs.use_cuda:
                image_inputs = image_inputs.cuda()
                mean_normal = mean_normal.cuda()
                room_mask = room_mask.cuda()
                corner_map = corner_map.cuda()
                label = label.cuda()

            if self.configs.mode == 'room_corner':
                corner_map = corner_map.unsqueeze(1)

            inputs = torch.cat([
                image_inputs.unsqueeze(1), mean_normal,
                room_mask.unsqueeze(1), corner_map
            ],
                               dim=1)
            logits, preds = self.model(inputs)

            loss = self.criterion(logits, label)
            losses.update(loss.data, image_inputs.size(0))

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.configs.mode == 'corner_corner':
                acc = binary_pred_accuracy(preds.detach().cpu().numpy(),
                                           label.cpu().numpy())
            else:
                acc = binary_pred_accuracy(preds.detach().cpu().numpy()[:, 0],
                                           label.cpu().numpy()[:, 0])
            acces.update(acc, image_inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Corner pred Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                      epoch_num,
                      iter_i,
                      len(self.train_loader),
                      batch_time=batch_time,
                      loss=losses,
                      acc=acces))

            if iter_i > self.configs.max_iter_per_epoch:
                break
Example #28
def train():
    try:
        os.makedirs(opt.checkpoints_dir)
    except OSError:
        pass
    if torch.cuda.device_count() > 1:
        model = torch.nn.parallel.DataParallel(
            AlexNet(num_classes=opt.num_classes))
    else:
        model = AlexNet(num_classes=opt.num_classes)
    if os.path.exists(MODEL_PATH):
        model.load_state_dict(
            torch.load(MODEL_PATH, map_location=lambda storage, loc: storage))
    model.to(device)
    ################################################
    # Set loss function and Adam optimizer
    ################################################
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)

    for epoch in range(opt.epochs):
        # train for one epoch
        print(f"\nBegin Training Epoch {epoch + 1}")
        # Calculate and return the top-k accuracy of the model
        # so that we can track the learning process.
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        for i, data in enumerate(train_dataloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, targets = data
            inputs = inputs.to(device)
            targets = targets.to(device)

            # compute output
            output = model(inputs)
            loss = criterion(output, targets)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output, targets, topk=(1, 2))
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1, inputs.size(0))
            top5.update(prec5, inputs.size(0))

            # compute gradients in a backward pass
            optimizer.zero_grad()
            loss.backward()

            # Call step of optimizer to update model params
            optimizer.step()

            print(
                f"Epoch [{epoch + 1}] [{i + 1}/{len(train_dataloader)}]\t"
                f"Loss {loss.item():.4f}\t"
                f"Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                f"Prec@5 {top5.val:.3f} ({top5.avg:.3f})",
                end="\r")

        # save model file
        torch.save(model.state_dict(), MODEL_PATH)
Example #29
def validate(epoch):
    net.eval()
    val_loss = AverageMeter()
    inputs_all, labels_all, predictions_all = [], [], []

    for i, (inputs, labels) in enumerate(valloader):
        if args.cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        N = inputs.size(0)
        outputs = net(inputs)
        # predictions holds the predicted class for every pixel at the input image resolution
        predictions = outputs.data.max(1)[1].squeeze_(1).squeeze_(
            0).cpu().numpy()

        loss = criterion(outputs, labels) / N
        val_loss.update(loss.item(), N)

        if random.random() > args.valImgSampleRate:
            inputs_all.append(None)
        else:
            inputs_all.append(inputs.data.squeeze_(0).cpu())

        labels_all.append(labels.data.squeeze_(0).cpu().numpy())

        predictions_all.append(predictions)

    # compute accuracy and other evaluation metrics on the validation set after this epoch
    acc, acc_cls, mean_iu, fwavacc = evaluate(predictions_all, labels_all,
                                              num_classes)

    if mean_iu > best_record['mean_iu']:
        best_record['val_loss'] = val_loss.avg
        best_record['epoch'] = epoch
        best_record['acc'] = acc
        best_record['acc_cls'] = acc_cls
        best_record['mean_iu'] = mean_iu
        best_record['fwavacc'] = fwavacc
        snapshot_name = 'epoch_%d_loss_%.5f_acc_%.5f_acc-cls_%.5f_mean-iu_%.5f_fwavacc_%.5f_lr_%.10f' % (
            epoch, val_loss.avg, acc, acc_cls, mean_iu, fwavacc, args.lr)
        torch.save(net.state_dict(),
                   os.path.join(ckpt_path, exp_name, snapshot_name + '.pth'))
        #torch.save(optimizer.state_dict(), os.path.join(ckpt_path, exp_name, 'opt_' + snapshot_name + '.pth'))

        if args.val_save_to_img_file:
            to_save_dir = os.path.join(ckpt_path, exp_name, str(epoch))
            check_mkdir(to_save_dir)

        #val_visual = []
        for idx, data in enumerate(zip(inputs_all, labels_all,
                                       predictions_all)):
            if data[0] is None:
                continue
            input_pil = restore_transform(data[0])
            labels_pil = colorize_mask(data[1])
            predictions_pil = colorize_mask(data[2])
            if args.val_save_to_img_file:
                input_pil.save(os.path.join(to_save_dir, '%d_input.png' % idx))
                predictions_pil.save(
                    os.path.join(to_save_dir, '%d_prediction.png' % idx))
                labels_pil.save(os.path.join(to_save_dir,
                                             '%d_label.png' % idx))
            # val_visual.extend([visualize(input_pil.convert('RGB')), visualize(labels_pil.convert('RGB')),
            #                    visualize(predictions_pil.convert('RGB'))])
        # val_visual = torch.stack(val_visual, 0)
        # val_visual = vutils.make_grid(val_visual, nrow=3, padding=5)
        # writer.add_image(snapshot_name, val_visual)

    print(
        '--------------------------------------------------------------------')
    print(
        '[epoch %d], [val loss %.5f], [acc %.5f], [acc_cls %.5f], [mean_iu %.5f], [fwavacc %.5f]'
        % (epoch, val_loss.avg, acc, acc_cls, mean_iu, fwavacc))

    print(
        'best record: [val loss %.5f], [acc %.5f], [acc_cls %.5f], [mean_iu %.5f], [fwavacc %.5f], [epoch %d]'
        %
        (best_record['val_loss'], best_record['acc'], best_record['acc_cls'],
         best_record['mean_iu'], best_record['fwavacc'], best_record['epoch']))

    print(
        '--------------------------------------------------------------------')

    # writer.add_scalar('val_loss', val_loss.avg, epoch)
    # writer.add_scalar('acc', acc, epoch)
    # writer.add_scalar('acc_cls', acc_cls, epoch)
    # writer.add_scalar('mean_iu', mean_iu, epoch)
    # writer.add_scalar('fwavacc', fwavacc, epoch)
    # writer.add_scalar('lr', optimizer.param_groups[1]['lr'], epoch)

    return val_loss.avg
Example #30
def train(train_loader, net, optim, curr_epoch, scaler):
    """
    Runs the training loop per epoch
    train_loader: Data loader for train
    net: thet network
    optimizer: optimizer
    curr_epoch: current epoch
    return:
    """
    full_bt = time.perf_counter()
    net.train()

    train_main_loss = AverageMeter()
    start_time = None
    warmup_iter = 10
    optim.last_batch = len(train_loader) - 1
    btimes = []
    batch_time = time.perf_counter()
    for i, data in enumerate(train_loader):
        lr_warmup(optim, curr_epoch, i, len(train_loader), max_lr=0.4)

        if i <= warmup_iter:
            start_time = time.time()
        # inputs = (bs,3,713,713)
        # gts    = (bs,713,713)
        images, gts, _img_name, scale_float = data
        batch_pixel_size = images.size(0) * images.size(2) * images.size(3)
        images, gts, scale_float = images.cuda(), gts.cuda(), scale_float.cuda(
        )
        inputs = {'images': images, 'gts': gts}
        optim.zero_grad()
        if args.amp:
            with amp.autocast():
                main_loss = net(inputs)
                log_main_loss = main_loss.clone().detach_()
                # torch.distributed.all_reduce(log_main_loss,
                #                              torch.distributed.ReduceOp.SUM)
                log_wait = optim.comm.Iallreduce(MPI.IN_PLACE, log_main_loss,
                                                 MPI.SUM)
                # log_main_loss = log_main_loss / args.world_size
            # train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            scaler.scale(main_loss).backward()
        else:
            main_loss = net(inputs)
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()
            log_wait = None
            #train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            main_loss.backward()

        # the scaler update is within the optim step
        optim.step()

        if i >= warmup_iter:
            curr_time = time.time()
            batches = i - warmup_iter + 1
            batchtime = (curr_time - start_time) / batches
        else:
            batchtime = 0

        if log_wait is not None:
            log_wait.Wait()
        log_main_loss = log_main_loss / args.world_size
        train_main_loss.update(log_main_loss.item(), batch_pixel_size)

        msg = ('[epoch {}], [iter {} / {}], [train main loss {:0.6f}],'
               ' [lr {:0.6f}] [batchtime {:0.3g}]')
        msg = msg.format(curr_epoch, i + 1, len(train_loader),
                         train_main_loss.avg,
                         optim.local_optimizer.param_groups[-1]['lr'],
                         batchtime)
        logx.msg(msg)

        metrics = {
            'loss': train_main_loss.avg,
            'lr': optim.local_optimizer.param_groups[-1]['lr']
        }
        curr_iter = curr_epoch * len(train_loader) + i
        logx.metric('train', metrics, curr_iter)

        if i >= 10 and args.test_mode:
            del data, inputs, gts
            return
        btimes.append(time.perf_counter() - batch_time)
        batch_time = time.perf_counter()

    if args.benchmarking:
        train_loss_tens = torch.tensor(train_main_loss.avg)
        optim.comm.Allreduce(MPI.IN_PLACE, train_loss_tens, MPI.SUM)
        train_loss_tens = train_loss_tens.to(torch.float)
        train_loss_tens /= float(optim.comm.size)
        train_main_loss.avg = train_loss_tens.item()

    return train_main_loss.avg, torch.mean(
        torch.tensor(btimes)), time.perf_counter() - full_bt