Example #1
0
def validate(val_dataloader, model, configs):
    """Compute the average loss of ``model`` over ``val_dataloader``.

    Args:
        val_dataloader: Iterable yielding ``(metadatas, imgs, targets)``
            batches. ``targets`` is indexed by key and every value is moved
            to ``configs.device``.
        model: Network invoked as ``model(imgs)``.
        configs: Settings object; ``device``, ``distributed``, ``gpu_idx``
            and ``world_size`` are read from it.

    Returns:
        The ``avg`` attribute of the ``AverageMeter`` that is updated with
        each batch's loss value and batch size.
    """
    loss_meter = AverageMeter('Loss', ':.4e')
    loss_fn = Compute_Loss(device=configs.device)
    # Put the model in evaluation mode; gradient tracking is disabled below.
    model.eval()
    with torch.no_grad():
        for _metadatas, imgs, targets in tqdm(val_dataloader):
            num_samples = imgs.size(0)
            # Move every target entry and the images to the configured device.
            for key in targets.keys():
                targets[key] = targets[key].to(configs.device, non_blocking=True)
            imgs = imgs.to(configs.device, non_blocking=True).float()

            batch_loss, _loss_stats = loss_fn(model(imgs), targets)

            if configs.distributed:
                loss_value = reduce_tensor(batch_loss.data, configs.world_size)
            else:
                # For torch.nn.DataParallel case: collapse the loss with a mean.
                if configs.gpu_idx is None:
                    batch_loss = torch.mean(batch_loss)
                loss_value = batch_loss.data
            loss_meter.update(to_python_float(loss_value), num_samples)

    return loss_meter.avg
Example #2
0
def train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, epoch, configs, logger, tb_writer):
    """Run one training epoch over ``train_dataloader``.

    Args:
        train_dataloader: Sized iterable yielding ``(metadatas, imgs, targets)``
            batches; ``targets`` is indexed by key.
        model: Network invoked as ``model(imgs)``.
        optimizer: Stepped and zeroed whenever the global step is a multiple
            of ``configs.subdivisions``.
        lr_scheduler: Accepted for interface compatibility; not used in this
            function (the per-step scheduling code is commented out).
        epoch: Epoch number used for the global step and the progress prefix.
            The global-step formula uses ``epoch - 1``.
        configs: Settings object; ``device``, ``distributed``, ``gpu_idx``,
            ``world_size``, ``subdivisions``, ``num_epochs``,
            ``tensorboard_freq`` and ``print_freq`` are read from it.
        logger: Optional logger; receives a progress message every
            ``configs.print_freq`` global steps.
        tb_writer: Optional TensorBoard writer; receives the loss statistics
            every ``configs.tensorboard_freq`` global steps.
    """
    batch_timer = AverageMeter('Time', ':6.3f')
    data_timer = AverageMeter('Data', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')

    steps_per_epoch = len(train_dataloader)
    progress = ProgressMeter(
        steps_per_epoch,
        [batch_timer, data_timer, loss_meter],
        prefix="Train - Epoch: [{}/{}]".format(epoch, configs.num_epochs),
    )
    loss_fn = Compute_Loss(device=configs.device)

    # Put the model in training mode.
    model.train()
    tick = time.time()
    for batch_idx, batch_data in enumerate(tqdm(train_dataloader)):
        # Time spent waiting for the data loader.
        data_timer.update(time.time() - tick)
        _metadatas, imgs, targets = batch_data
        num_samples = imgs.size(0)
        global_step = steps_per_epoch * (epoch - 1) + batch_idx + 1

        for key in targets.keys():
            targets[key] = targets[key].to(configs.device, non_blocking=True)
        imgs = imgs.to(configs.device, non_blocking=True).float()

        batch_loss, loss_stats = loss_fn(model(imgs), targets)
        # For torch.nn.DataParallel case: collapse the loss with a mean.
        if not configs.distributed and configs.gpu_idx is None:
            batch_loss = torch.mean(batch_loss)

        # Gradients accumulate across iterations; the optimizer only steps
        # (and clears them) every `configs.subdivisions` global steps.
        batch_loss.backward()
        if global_step % configs.subdivisions == 0:
            optimizer.step()
            optimizer.zero_grad()
            # Per-step LR scheduling (lr_scheduler.step() plus logging 'LR'
            # to TensorBoard when configs.step_lr_in_epoch is set) is
            # commented out in this variant.

        loss_value = (
            reduce_tensor(batch_loss.data, configs.world_size)
            if configs.distributed
            else batch_loss.data
        )
        loss_meter.update(to_python_float(loss_value), num_samples)
        # Elapsed time for the whole iteration (no torch.cuda.synchronize()).
        batch_timer.update(time.time() - tick)

        if tb_writer is not None and (global_step % configs.tensorboard_freq) == 0:
            loss_stats['avg_loss'] = loss_meter.avg
            tb_writer.add_scalars('Train', loss_stats, global_step)
        if logger is not None and (global_step % configs.print_freq) == 0:
            logger.info(progress.get_message(batch_idx))

        tick = time.time()
Example #3
0
def validate(val_dataloader, model, configs):
    """Compute the average loss of the voxel-input ``model`` on the validation set.

    Each batch is unpacked as ``(metadatas, targets)``. ``metadatas`` is
    indexed with the keys ``'img_path'`` (its length is used as the batch
    size), ``'voxels'``, ``'coors'`` and ``'num_points'``; ``targets`` is
    indexed by key and every value is moved to ``configs.device``.

    A batch whose forward pass raises an out-of-memory ``RuntimeError`` is
    skipped after the CUDA cache is released. It is not passed on to the loss
    computation, where ``outputs`` would be either undefined (first batch) or
    left over from the previous batch.

    Args:
        val_dataloader: Iterable yielding ``(metadatas, targets)`` batches.
        model: Network invoked as ``model(voxels, coors, num_points)``.
        configs: Settings object; ``device``, ``distributed``, ``gpu_idx``
            and ``world_size`` are read from it.

    Returns:
        The ``avg`` attribute of the ``AverageMeter`` that is updated with
        the loss value and batch size of every batch that was not skipped.

    Raises:
        RuntimeError: Re-raised from the forward pass when the error message
            does not contain "out of memory".
    """
    losses = AverageMeter('Loss', ':.4e')
    criterion = Compute_Loss(device=configs.device)
    # switch to evaluation mode
    model.eval()
    with torch.no_grad():
        for batch_data in tqdm(val_dataloader):
            metadatas, targets = batch_data
            batch_size = len(metadatas['img_path'])

            for k in targets.keys():
                targets[k] = targets[k].to(configs.device, non_blocking=True)

            # Build the three model inputs directly on the target device.
            voxels = torch.tensor(metadatas['voxels'],
                                  dtype=torch.float32,
                                  device=configs.device)
            coors = torch.tensor(metadatas['coors'],
                                 dtype=torch.int32,
                                 device=configs.device)
            num_points = torch.tensor(metadatas['num_points'],
                                      dtype=torch.int32,
                                      device=configs.device)

            try:
                outputs = model(voxels, coors, num_points)
            except RuntimeError as exception:
                if "out of memory" not in str(exception):
                    print('###############################3')
                    # Bare raise keeps the original traceback intact.
                    raise
                print("WARNING: out of memory")
                print('###############################3')
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
                # No valid `outputs` exist for this batch, so skip it.
                # NOTE(review): with configs.distributed, a rank that skips
                # does not call reduce_tensor for this batch — confirm that
                # this cannot desynchronise the ranks.
                continue

            total_loss, loss_stats = criterion(outputs, targets)
            # For torch.nn.DataParallel case
            if (not configs.distributed) and (configs.gpu_idx is None):
                total_loss = torch.mean(total_loss)

            if configs.distributed:
                reduced_loss = reduce_tensor(total_loss.data,
                                             configs.world_size)
            else:
                reduced_loss = total_loss.data
            losses.update(to_python_float(reduced_loss), batch_size)

    return losses.avg
Example #4
0
def train_epoch(model, dataloader, solver, rtm3d_loss, configs, tb_writer, epoch):
    """Train ``model`` for one epoch and report progress on the master node.

    Args:
        model: Network invoked as ``model(imgs)``.
        dataloader: ``(train_dataloader, train_sampler)`` pair. The sampler's
            ``set_epoch`` is called only when ``configs.distributed`` is set.
        solver: Object providing ``step(loss)`` and a ``learn_rate`` attribute.
        rtm3d_loss: Callable returning ``(loss, loss_items)`` for
            ``(pred, targets)``. ``loss_items`` is unpacked into five columns
            of the progress line.
        configs: Settings object; ``SOLVER.MAX_EPOCH``, ``distributed``,
            ``is_master_node``, ``DEVICE`` and ``world_size`` are read from it.
        tb_writer: Optional TensorBoard writer (used on the master node only).
        epoch: Index of the current epoch.

    Returns:
        None. Returns early, after printing a warning, as soon as a
        non-finite loss is encountered.
    """
    train_dataloader, train_sampler = dataloader
    nb = len(train_dataloader)
    epochs = configs.SOLVER.MAX_EPOCH
    model.train()
    if configs.distributed:
        train_sampler.set_epoch(epoch)
    if configs.is_master_node:
        print(('\n' + '%10s' * 10) % (
            'Epoch', 'gpu_mem', 'MKF', 'VFM', 'M_OFF', 'V_OFF', 'total', 'targets', 'lr', 'time'))
        pbar = tqdm.tqdm(enumerate(train_dataloader), total=nb)  # progress bar
    else:
        pbar = enumerate(train_dataloader)

    # torch.cuda.memory_cached() is a deprecated alias of memory_reserved();
    # prefer the new name and fall back only on PyTorch builds that lack it.
    cuda_reserved = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached

    mloss = torch.zeros((5,), dtype=torch.float32, device=configs.DEVICE)
    time1 = time.time()
    for i, (imgs, targets, paths, _) in pbar:
        imgs = imgs.to(configs.DEVICE)
        targets = targets.to(configs.DEVICE)
        pred = model(imgs)
        time2 = time.time()
        loss, loss_items = rtm3d_loss(pred, targets)
        time3 = time.time()
        if not torch.isfinite(loss):
            print('WARNING: non-finite loss, ending training ', loss_items)
            return

        # Smoothed loss for display: a 50/50 blend of the previous value and
        # the current batch (not an arithmetic mean over the epoch).
        if i:
            mloss = (mloss + loss_items) / 2
        else:
            mloss = loss_items
        solver.step(loss)
        if configs.distributed:
            # The reduced value is not used below. The call is kept because
            # it is presumably a collective operation that every rank has to
            # enter — TODO confirm against torch_utils.reduce_tensor.
            torch_utils.reduce_tensor(loss.data, configs.world_size)
        if configs.is_master_node:
            mem = '%.3gG' % (cuda_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
            mask = targets.get_field('mask')
            # Columns: epoch, memory, five loss terms, target count, learning
            # rate, forward time / loss time.
            s = ('%10s' * 2 + '%10.4g' * 7 + '%10s') % (
                '%g/%g' % (epoch, epochs - 1), mem, *mloss, mask.shape[0], solver.learn_rate,
                '%.1g/%.3g' % (time2 - time1, time3 - time2))
            pbar.set_description(s)

            # write tensorboard
            if tb_writer is not None:
                tags = ['MKF', 'VFM', 'M_OFF', 'V_OFF', 'total']
                for x, tag in zip(list(mloss), tags):
                    tb_writer.add_scalar('loss/' + tag, x, epoch * nb + i)
        time1 = time.time()
Example #5
0
def train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, epoch,
                    configs, logger, tb_writer):
    """Run one training epoch of the voxel-input model.

    Each batch is unpacked as ``(metadatas, targets)``. ``metadatas`` is
    indexed with the keys ``'img_path'`` (its length is used as the batch
    size), ``'voxels'``, ``'coors'`` and ``'num_points'``; ``targets`` is
    indexed by key and every value is moved to ``configs.device``.

    Args:
        train_dataloader: Sized iterable yielding ``(metadatas, targets)``.
        model: Network invoked as ``model(voxels, coors, num_points)``.
        optimizer: Stepped and zeroed whenever the global step is a multiple
            of ``configs.subdivisions``.
        lr_scheduler: Stepped right after the optimizer when
            ``configs.step_lr_in_epoch`` is set.
        epoch: Epoch number used for the global step and the progress prefix.
            The global-step formula uses ``epoch - 1``.
        configs: Settings object; ``device``, ``distributed``, ``gpu_idx``,
            ``world_size``, ``subdivisions``, ``step_lr_in_epoch``,
            ``num_epochs``, ``tensorboard_freq`` and ``print_freq`` are read.
        logger: Optional logger; receives a progress message every
            ``configs.print_freq`` global steps.
        tb_writer: Optional TensorBoard writer; receives the learning rate
            and the loss statistics.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')

    progress = ProgressMeter(len(train_dataloader),
                             [batch_time, data_time, losses],
                             prefix="Train - Epoch: [{}/{}]".format(
                                 epoch, configs.num_epochs))

    criterion = Compute_Loss(device=configs.device)
    num_iters_per_epoch = len(train_dataloader)
    # switch to train mode
    model.train()
    start_time = time.time()
    for batch_idx, batch_data in enumerate(tqdm(train_dataloader)):
        # Time spent waiting for the data loader.
        data_time.update(time.time() - start_time)
        metadatas, targets = batch_data
        batch_size = len(metadatas['img_path'])

        global_step = num_iters_per_epoch * (epoch - 1) + batch_idx + 1
        for k in targets.keys():
            targets[k] = targets[k].to(configs.device, non_blocking=True)

        # Build the three model inputs directly on the target device.
        voxels = torch.tensor(metadatas['voxels'],
                              dtype=torch.float32,
                              device=configs.device)
        coors = torch.tensor(metadatas['coors'],
                             dtype=torch.int32,
                             device=configs.device)
        num_points = torch.tensor(metadatas['num_points'],
                                  dtype=torch.int32,
                                  device=configs.device)

        outputs = model(voxels, coors, num_points)

        total_loss, loss_stats = criterion(outputs, targets)
        # For torch.nn.DataParallel case
        if (not configs.distributed) and (configs.gpu_idx is None):
            total_loss = torch.mean(total_loss)

        # Gradients accumulate across iterations; the optimizer only steps
        # (and clears them) every `configs.subdivisions` global steps.
        total_loss.backward()
        if global_step % configs.subdivisions == 0:
            optimizer.step()
            # zero the parameter gradients
            optimizer.zero_grad()
            # Adjust learning rate
            if configs.step_lr_in_epoch:
                lr_scheduler.step()
                if tb_writer is not None:
                    # get_last_lr() is the supported way to read the rate
                    # after step(); get_lr() is kept only as a fallback for
                    # schedulers that do not provide it.
                    if hasattr(lr_scheduler, 'get_last_lr'):
                        current_lr = lr_scheduler.get_last_lr()[0]
                    else:
                        current_lr = lr_scheduler.get_lr()[0]
                    tb_writer.add_scalar('LR', current_lr, global_step)

        if configs.distributed:
            reduced_loss = reduce_tensor(total_loss.data, configs.world_size)
        else:
            reduced_loss = total_loss.data
        losses.update(to_python_float(reduced_loss), batch_size)
        # Elapsed time for the whole iteration (no torch.cuda.synchronize()).
        batch_time.update(time.time() - start_time)

        if tb_writer is not None:
            if (global_step % configs.tensorboard_freq) == 0:
                loss_stats['avg_loss'] = losses.avg
                tb_writer.add_scalars('Train', loss_stats, global_step)
        # Log message
        if logger is not None:
            if (global_step % configs.print_freq) == 0:
                logger.info(progress.get_message(batch_idx))

        start_time = time.time()