コード例 #1
0
def main():
    """Train a point-cloud semantic-segmentation model on the S3DIS HDF5 data.

    Command-line flags select the learning rate, epoch count, batch size,
    held-out test area and points-per-sample.  The function loads every HDF5
    chunk, splits rooms into train/test by area name, runs the
    train/validate loop, and saves a checkpoint every 5 epochs.
    """
    parser = argparse.ArgumentParser(description='Voxelnet for semantic')
    parser.add_argument('--lr',
                        default=0.001,
                        type=float,
                        help='Initial learning rate')
    # type=int so values supplied on the command line are not left as
    # strings (range(start_epoch, epochs) and batch slicing would fail).
    parser.add_argument('--epochs', type=int, default=100, help='epochs')
    parser.add_argument('--batchsize', type=int, default=4, help='batch size')
    parser.add_argument('--weight_file', default='', help='weights to load')
    parser.add_argument(
        '--test_area',
        type=int,
        default=5,
        help='Which area to use for test, option: 1-6 [default: 6]')
    parser.add_argument('--num_point',
                        type=int,
                        default=4096,
                        help='Point number [default: 4096]')

    args = parser.parse_args()
    NUM_POINT = args.num_point
    BATCH_SIZE = args.batchsize
    lr = args.lr
    ALL_FILES = getDataFiles('indoor3d_sem_seg_hdf5_data/all_files.txt')
    room_filelist = [
        line.rstrip()
        for line in open('indoor3d_sem_seg_hdf5_data/room_filelist.txt')
    ]

    # Load ALL data: concatenate every HDF5 chunk into one big array.
    data_batch_list = []
    label_batch_list = []
    for h5_filename in ALL_FILES:
        data_batch, label_batch = loadDataFile(h5_filename)
        data_batch_list.append(data_batch)
        label_batch_list.append(label_batch)
    data_batches = np.concatenate(data_batch_list, 0)
    label_batches = np.concatenate(label_batch_list, 0)
    print(data_batches.shape)
    print(label_batches.shape)

    # Rooms whose name contains the held-out area form the test split.
    test_area = 'Area_' + str(args.test_area)
    train_idxs = []
    test_idxs = []
    for i, room_name in enumerate(room_filelist):
        if test_area in room_name:
            test_idxs.append(i)
        else:
            train_idxs.append(i)

    train_data = data_batches[
        train_idxs, ...]  # ... means ellipsis, the same as [train_idxs, :, :]
    train_label = label_batches[train_idxs].astype(np.int64)
    test_data = data_batches[test_idxs, ...]
    test_label = label_batches[test_idxs].astype(np.int64)
    print(train_data.shape, train_label.shape)
    print(test_data.shape, test_label.shape)

    # Per-run log/checkpoint directories stamped with area + start time.
    time_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    log_dir = os.path.join('log_ptn/train', test_area + '_' + time_string)
    os.makedirs(log_dir, exist_ok=True)

    checkpoint_dir = os.path.join(log_dir, 'checkpoint')
    os.makedirs(checkpoint_dir, exist_ok=True)

    start_epoch = 0
    epochs = args.epochs

    model = get_model()
    model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr)
    criterion = nn.CrossEntropyLoss().cuda()

    # Optionally warm-start from a saved checkpoint (state dict merged over
    # the freshly-initialized one so missing keys keep their init values).
    if args.weight_file != '':
        pre_trained_model = torch.load(args.weight_file)
        start_epoch = pre_trained_model['epoch']
        model_state = model.state_dict()
        model_state.update(pre_trained_model['state_dict'])
        model.load_state_dict(model_state)

    global_counter = 0
    for epoch in range(start_epoch, epochs):
        learn_rate_now = adjust_learning_rate(optimizer, global_counter,
                                              BATCH_SIZE, lr)

        losses = AverageMeter()
        top1 = AverageMeter()
        model.train()

        # Re-shuffle the training samples every epoch.
        train_data_shuffled, train_label_shuffled, _ = shuffle_data(
            train_data[:, 0:NUM_POINT, :], train_label)
        file_size = train_data_shuffled.shape[0]
        num_batches = file_size // BATCH_SIZE

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            feature = train_data_shuffled[start_idx:end_idx, :, :]
            label = train_label_shuffled[start_idx:end_idx]

            # (B, P, C) -> (B, 1, P, C), then transposed so the channel
            # dimension comes second as the network expects.
            feature = np.expand_dims(feature, axis=1)
            input = Variable(torch.from_numpy(feature).cuda(),
                             requires_grad=True)
            input = torch.transpose(input, 3, 1)
            target = Variable(torch.from_numpy(label).cuda(),
                              requires_grad=False)
            target = target.view(-1, )
            output = model(input)
            # Flatten per-point logits to (B*P, 13) to match the flat target.
            output_reshaped = output.permute(0, 3, 2,
                                             1).contiguous().view(-1, 13)

            loss = criterion(output_reshaped, target)
            prec1 = accuracy(output_reshaped.data, target.data, topk=(1, ))
            # .item() extracts plain Python floats; storing tensors in the
            # meters breaks the {:.4f} formatting below on newer PyTorch.
            losses.update(loss.item(), BATCH_SIZE)
            top1.update(prec1[0].item(), BATCH_SIZE)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print('Epoch: [{0}][{1}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(epoch,
                                                                  batch_idx,
                                                                  loss=losses,
                                                                  top1=top1))

            with open(os.path.join(log_dir, 'train_log.txt'), 'a') as f:
                f.write('Epoch: [{0}][{1}]\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec@1 {top1.val:.3f} ({top1.avg:.3f}) \n'.format(
                            epoch, batch_idx, loss=losses, top1=top1))

            global_counter += 1

        # ---- validation on the held-out area ----
        losses = AverageMeter()
        top1 = AverageMeter()
        model.eval()

        file_size = test_data.shape[0]
        num_batches = file_size // BATCH_SIZE

        # No gradients are needed for evaluation; saves memory and compute.
        with torch.no_grad():
            for batch_idx in range(num_batches):
                start_idx = batch_idx * BATCH_SIZE
                end_idx = (batch_idx + 1) * BATCH_SIZE
                feature = test_data[start_idx:end_idx, :, :]
                label = test_label[start_idx:end_idx]

                feature = np.expand_dims(feature, axis=1)
                input = Variable(torch.from_numpy(feature).cuda())
                input = torch.transpose(input, 3, 1)
                target = Variable(torch.from_numpy(label).cuda())
                target = target.view(-1, )
                output = model(input)
                output_reshaped = output.permute(0, 3, 2,
                                                 1).contiguous().view(-1, 13)

                loss = criterion(output_reshaped, target)
                prec1 = accuracy(output_reshaped.data, target.data,
                                 topk=(1, ))
                losses.update(loss.item(), BATCH_SIZE)
                top1.update(prec1[0].item(), BATCH_SIZE)

        print('Epoch {} Val Loss {:.3f} Val Acc {:.3f}  \t'.format(
            epoch, losses.avg, top1.avg))

        with open(os.path.join(log_dir, 'test_log.txt'), 'a') as f:
            f.write('Epoch: [{0}]\t'
                    'Loss {loss.avg:.4f} \t'
                    'Prec@1 {top1.avg:.3f} \n'.format(epoch,
                                                      loss=losses,
                                                      top1=top1))

        # Periodic checkpoint (model + optimizer) every 5 epochs.
        if (epoch % 5 == 0):
            torch.save(
                {
                    'epoch': epoch + 1,
                    'args': args,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                os.path.join(checkpoint_dir,
                             'checkpoint_' + str(epoch) + '.pth.tar'))
コード例 #2
0
def main():
    """Train a 6-class point-cloud semantic-segmentation model.

    Experimental variant (ZZC): the test split is deliberately aliased to
    the training split, a class-weighted cross-entropy loss is used, and the
    best checkpoint (by test mIoU) plus the final epoch are saved.
    """
    parser = argparse.ArgumentParser(description='Voxelnet for semantic')
    parser.add_argument('--lr',
                        default=0.001,
                        type=float,
                        help='Initial learning rate')  # default=0.001(good)
    # type=int so command-line values work in range()/arithmetic below
    # (argparse would otherwise hand back strings).
    parser.add_argument('--epochs', type=int, default=2,
                        help='epochs')  # default=100, 50, 30
    parser.add_argument('--batchsize', type=int, default=4,
                        help='epochs')  # default=32
    parser.add_argument('--weight_file', default='', help='weights to load')
    # log_ptn/train/Area_2_2019-09-11-11-43-48/checkpoint/checkpoint_0_max_mIoU_test_25.17065278824228.pth.tar
    parser.add_argument(
        '--test_area',
        type=int,
        default=2,
        help='Which area to use for test, option: 1-2 [default: 2]')
    parser.add_argument('--num_point',
                        type=int,
                        default=4096,
                        help='Point number [default: 4096]')

    args = parser.parse_args()
    NUM_POINT = args.num_point
    BATCH_SIZE = args.batchsize
    lr = args.lr
    ALL_FILES = getDataFiles(
        'indoor3d_sem_seg_hdf5_data/all_files.txt')  # .h5 file routes
    room_filelist = [
        line.rstrip()
        for line in open('indoor3d_sem_seg_hdf5_data/room_filelist.txt')
    ]

    # Load ALL data into a big data_batch & a big label_batch
    data_batch_list = []
    label_batch_list = []
    print(ALL_FILES)
    for h5_filename in ALL_FILES:
        h5_dir = os.path.join(
            '/home/chenkun/pointnet_pytorch-master/indoor3d_sem_seg_hdf5_data',
            h5_filename)
        # Open read-only and close promptly: h5py.File() without a mode is
        # deprecated, and the original never closed the handle.
        with h5py.File(h5_dir, 'r') as f:
            data_batch = f['data'][:]
            label_batch = f['label'][:]
        data_batch_list.append(data_batch)
        label_batch_list.append(label_batch)
    data_batches = np.concatenate(data_batch_list, 0)
    label_batches = np.concatenate(label_batch_list, 0)
    print(data_batches.shape)
    print(label_batches.shape)

    test_area = 'Area_' + str(args.test_area)
    train_idxs = []
    test_idxs = []
    for i, room_name in enumerate(room_filelist):
        if test_area in room_name:
            test_idxs.append(i)
        else:
            train_idxs.append(i)

    train_data = data_batches[train_idxs, ...]
    train_label = label_batches[train_idxs].astype(np.int64)

    # ZZC: evaluation deliberately runs on the TRAINING data
    # (the real test split above is intentionally unused).
    test_data = train_data  # ZZC
    test_label = train_label  # ZZC

    print(train_data.shape, train_label.shape)
    print(test_data.shape, test_label.shape)

    # Per-run log/checkpoint directories stamped with area + start time.
    time_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    log_dir = os.path.join('log_ptn/train', test_area + '_' + time_string)
    os.makedirs(log_dir, exist_ok=True)

    checkpoint_dir = os.path.join(log_dir, 'checkpoint')
    os.makedirs(checkpoint_dir, exist_ok=True)

    start_epoch = 0
    epochs = args.epochs

    model = get_model()
    model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr)

    # class_names = ["ground", "vegetation", "building", "clutter"]    # ZZC
    class_names = ["T2T", "B2B", "BH", "BL", "V2V", "OT"]

    # Class-weighted losses to counter class imbalance.
    # reduction='mean' replaces the deprecated size_average=True
    # (identical behavior: loss averaged over batch elements).
    weightsTrain = [0.2, 0.4, 0.6, 1.00, 1.00, 1.00]
    class_weights_Train = torch.FloatTensor(weightsTrain).cuda()
    criterionTrain = nn.CrossEntropyLoss(weight=class_weights_Train,
                                         reduction='mean').cuda()
    weightsVal = [0.2, 0.4, 0.6, 1.00, 1.00,
                  1.00]  # default  [0.08, 0.37, 0.15, 0.40]
    class_weights_Val = torch.FloatTensor(weightsVal).cuda()
    criterionVal = nn.CrossEntropyLoss(weight=class_weights_Val,
                                       reduction='mean').cuda()

    # Optionally warm-start from a saved checkpoint.
    if args.weight_file != '':
        pre_trained_model = torch.load(args.weight_file)
        start_epoch = pre_trained_model['epoch']
        model_state = model.state_dict()
        model_state.update(pre_trained_model['state_dict'])
        model.load_state_dict(model_state)

    #  #####################################################
    #    Start training
    #  #####################################################
    global_counter = 0
    max_mIoU_test = 0.0

    for epoch in range(start_epoch, epochs):
        learn_rate_now = adjust_learning_rate(optimizer, global_counter,
                                              BATCH_SIZE,
                                              lr)  # Seems not changing, ZZC

        iter_loss = 0.0  # accumulated loss for this epoch
        iterations = 0

        cm = ConfusionMatrix(6, class_names=class_names)
        cm.clear()

        model.train()

        train_data_shuffled, train_label_shuffled, _ = shuffle_data(
            train_data[:, 0:NUM_POINT, :], train_label)
        file_size = train_data_shuffled.shape[
            0]  # total number of training batches
        num_batches = file_size // BATCH_SIZE  # iterations in one epoch
        print('\nnum_batches(training):\t', num_batches)

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE

            feature = train_data_shuffled[start_idx:end_idx, :, :]
            label = train_label_shuffled[start_idx:end_idx]

            # (B, P, C) -> (B, 1, P, C), then transposed so the channel
            # dimension comes second as the network expects.
            feature = np.expand_dims(feature, axis=1)
            input = Variable(torch.from_numpy(feature).cuda(),
                             requires_grad=True)
            input = torch.transpose(input, 3, 1)
            target = Variable(torch.from_numpy(label).cuda(),
                              requires_grad=False)
            target = target.view(-1, )

            output = model(input)
            # Flatten per-point logits to (B*P, 6) to match the flat target.
            output_reshaped = output.permute(0, 3, 2,
                                             1).contiguous().view(-1, 6)

            # NOTE(review): argmax is taken over dim 1 of the RAW
            # (pre-permute) output — confirm dim 1 is the class dimension.
            _, pred = torch.max(output.data, 1)
            pred = pred.view(-1, )
            cm.add_batch(target.cpu().numpy(), pred.cpu().numpy())
            loss = criterionTrain(output_reshaped, target)
            iter_loss += loss.item()  # Accumulate the loss
            iterations += 1

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_counter += 1

            if batch_idx % 10 == 0:
                # .item() so %-formatting gets a plain float, not a tensor.
                print('Epoch: [%3d][%3d]\t Loss: %.4f' %
                      (epoch, batch_idx, loss.item()))

        # Print training results for 1 epoch
        iou0, iou1, iou2, iou3, iou4, iou5, mIoU = cm.class_IoU()
        print(
            'Epoch: [%3d]\t Train Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%' %
            (epoch, iter_loss / iterations, cm.overall_accuracy(),
             mIoU))  # Print loss for the epoch
        print(
            'T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%'
            % (iou0, iou1, iou2, iou3, iou4, iou5))

        with open(os.path.join(log_dir, 'train_log.txt'), 'a') as f:
            f.write(
                'Epoch: [%3d]\t Train Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%\n'
                % (epoch, iter_loss / iterations, cm.overall_accuracy(), mIoU))
            f.write(
                'T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%\n\n'
                % (iou0, iou1, iou2, iou3, iou4, iou5))

        #  #####################################################
        #    Start validation
        #  #####################################################
        model.eval()
        iter_loss = 0.0  # accumulated loss for this epoch
        iterations = 0
        cm = ConfusionMatrix(6, class_names=class_names)  # ZZC
        cm.clear()

        file_size = test_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        print('num_batches(testing):\t', num_batches)

        # No gradients needed for evaluation; saves memory and compute.
        with torch.no_grad():
            for batch_idx in range(num_batches):
                start_idx = batch_idx * BATCH_SIZE
                end_idx = (batch_idx + 1) * BATCH_SIZE
                feature = test_data[start_idx:end_idx, :, :]
                label = test_label[start_idx:end_idx]

                feature = np.expand_dims(feature, axis=1)
                input = Variable(torch.from_numpy(feature).cuda())
                input = torch.transpose(input, 3, 1)
                target = Variable(torch.from_numpy(label).cuda())
                target = target.view(-1, )
                output = model(input)
                output_reshaped = output.permute(0, 3, 2,
                                                 1).contiguous().view(-1, 6)

                _, pred = torch.max(output.data, 1)
                pred = pred.view(-1, )
                cm.add_batch(target.cpu().numpy(), pred.cpu().numpy())

                loss = criterionVal(output_reshaped, target)
                iter_loss += loss.item()  # Accumulate the loss
                iterations += 1

        # Print validation results after 1 epoch
        iou0, iou1, iou2, iou3, iou4, iou5, mIoU = cm.class_IoU()
        print('Epoch: [%3d]\t Test Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%' %
              (epoch, iter_loss / iterations, cm.overall_accuracy(),
               mIoU))  # Print loss for the epoch
        print(
            'T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%'
            % (iou0, iou1, iou2, iou3, iou4, iou5))

        with open(os.path.join(log_dir, 'test_log.txt'), 'a') as f:
            f.write(
                'Epoch: [%3d]\t Test Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%\n'
                % (epoch, iter_loss / iterations, cm.overall_accuracy(), mIoU))
            f.write(
                'T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%\n\n'
                % (iou0, iou1, iou2, iou3, iou4, iou5))

        # Check whether best model, -> Save model
        if (mIoU > max_mIoU_test or epoch == epochs - 1):
            max_mIoU_test = mIoU
            print(
                '-> Best performance (test mIoU) achieved or This is final epoch.'
            )
            print('Max_mIoU in testing: %3.2f%%\n' % (max_mIoU_test))
            torch.save(
                {
                    'epoch': epoch + 1,
                    'args': args,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                os.path.join(
                    checkpoint_dir, 'checkpoint_' + str(epoch) +
                    '_max_mIoU_test_' + str(mIoU) + '.pth.tar'))
コード例 #3
0
ファイル: condensenet.py プロジェクト: skvictoria/ERFNet
    def train_one_epoch(self):
        """Run a single training epoch over the train loader.

        Iterates all training batches, computing progress-conditioned
        predictions, the loss, and top-1/top-5 accuracy; updates the
        optimizer each step and streams running meters to the summary
        writer, then logs a summary line for the epoch.
        """
        # Progress bar over the training loader.
        progress_bar = tqdm(self.data_loader.train_loader,
                            total=self.data_loader.train_iterations,
                            desc="Epoch-{}-".format(self.current_epoch))

        # Put the model into training mode.
        self.model.train()

        # Running meters for loss and accuracy.
        loss_meter = AverageMeter()
        acc1_meter = AverageMeter()
        acc5_meter = AverageMeter()

        batch_index = 0
        for x, y in progress_bar:
            if self.cuda:
                x, y = x.cuda(self.config.async_loading), y.cuda(
                    self.config.async_loading)

            # Fraction of total training completed so far.
            done_iters = (self.current_epoch *
                          self.data_loader.train_iterations + batch_index)
            total_iters = (self.config.max_epoch *
                           self.data_loader.train_iterations)
            progress = float(done_iters) / total_iters

            x, y = Variable(x), Variable(y)
            lr = adjust_learning_rate(self.optimizer,
                                      self.current_epoch,
                                      self.config,
                                      batch=batch_index,
                                      nBatch=self.data_loader.train_iterations)

            # Forward pass (the model also consumes the progress scalar).
            pred = self.model(x, progress)
            cur_loss = self.loss(pred, y)
            if np.isnan(float(cur_loss.item())):
                raise ValueError('Loss is nan during training...')

            # Backward pass and parameter update.
            self.optimizer.zero_grad()
            cur_loss.backward()
            self.optimizer.step()

            top1, top5 = cls_accuracy(pred.data, y.data, topk=(1, 5))

            loss_meter.update(cur_loss.item())
            acc1_meter.update(top1.item(), x.size(0))
            acc5_meter.update(top5.item(), x.size(0))

            self.current_iteration += 1
            batch_index += 1

            self.summary_writer.add_scalar("epoch/loss", loss_meter.val,
                                           self.current_iteration)
            self.summary_writer.add_scalar("epoch/accuracy", acc1_meter.val,
                                           self.current_iteration)
        progress_bar.close()

        self.logger.info("Training at epoch-" + str(self.current_epoch) +
                         " | " + "loss: " + str(loss_meter.val) +
                         "- Top1 Acc: " + str(acc1_meter.val) +
                         "- Top5 Acc: " + str(acc5_meter.val))
コード例 #4
0
ファイル: train.py プロジェクト: liuguoyou/VideoMoCo-plus
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for (optionally distributed) video MoCo training.

    Builds a mask generator (netG) and a MoCo encoder (netD), wraps them for
    distributed or single-GPU execution, optionally resumes both from
    checkpoints, builds the Kinetics-400 clip loader, then runs the training
    loop and saves checkpoints for both networks every 10 epochs.

    Args:
        gpu: GPU index assigned to this worker process (may be None).
        ngpus_per_node: GPU count on this node; used to compute the global
            rank and to split batch size / workers across processes.
        args: parsed command-line namespace (distributed flags, model and
            optimizer hyper-parameters, data/log/checkpoint paths).
    """
    args.gpu = gpu
    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    netG = moco.builder.MaskGenerator()
    netD = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                             args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(netG)
    print(netD)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            netG.cuda(args.gpu)
            netD.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            netG = torch.nn.parallel.DistributedDataParallel(
                netG, device_ids=[args.gpu], find_unused_parameters=True)
            netD = torch.nn.parallel.DistributedDataParallel(
                netD, device_ids=[args.gpu], find_unused_parameters=True)
        else:
            netG.cuda()
            netD.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            netG = torch.nn.parallel.DistributedDataParallel(netG)
            netD = torch.nn.parallel.DistributedDataParallel(netD)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        netG = netG.cuda(args.gpu)
        netD = netD.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        pass  # raise NotImplementedError("Only DistributedDataParallel is supported.") for debug on cpu
    # torch.cuda.synchronize()
    # Separate SGD optimizers for the generator and the MoCo encoder.
    optimizer_g = torch.optim.SGD(netG.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    optimizer_d = torch.optim.SGD(netD.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    # Contrastive loss for the encoder; L1 reconstruction loss for the generator.
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    G_criterion = nn.L1Loss().cuda(args.gpu)

    # optionally resume from a checkpoint (netD from args.resume,
    # netG from args.resumeG; optimizer states are deliberately not restored)
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netD.load_state_dict(checkpoint['state_dict'])
            #optimizer_d.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

        if os.path.isfile(args.resumeG):
            print("=> loading checkpoint '{}'".format(args.resumeG))
            if args.gpu is None:
                checkpoint = torch.load(args.resumeG)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resumeG, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netG.load_state_dict(checkpoint['state_dict'])
            #optimizer_g.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resumeG, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resumeG))

    cudnn.benchmark = True

    # Data loading code: CPU-side crop/tensor transform here; the heavier
    # MoCo augmentation runs on GPU inside the train loop.
    traindir = os.path.join(args.data, 'train')
    video_augmentation = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size, (0.2, 1)),
    ])
    audio_augmentation = moco.loader.DummyAudioTransform()
    augmentation = {'video': video_augmentation, 'audio': audio_augmentation}

    augmentation_gpu = moco.loader.MoCoAugmentV2(
        args.crop_size) if args.aug_plus else moco.loader.MoCoAugment(
            args.crop_size)

    train_dataset = Kinetics400(traindir,
                                args.frame_per_clip,
                                args.step_between_clips,
                                extensions='mp4',
                                transform=augmentation,
                                num_workers=4)

    # One random clip per video per epoch.
    train_sampler = RandomClipSampler(train_dataset.video_clips, 1)

    if args.distributed:
        # train_sampler = torch.utils.data.distributed.DistributedSampler(train_sampler)
        train_sampler = DistributedSampler(train_sampler)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               multiprocessing_context="fork")
    # Only the master process (gpu 0) writes tensorboard summaries.
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr, args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer_d, epoch, args)
        adjust_learning_rate(optimizer_g, epoch, args)

        # train for one epoch
        train(train_loader, augmentation_gpu, criterion, G_criterion, netG,
              netD, optimizer_g, optimizer_d, epoch, args, writer)

        # Checkpoint both networks every 10 epochs (rank-0 only when
        # multiprocessing-distributed).
        if (epoch + 1) % 10 == 0 and (not args.multiprocessing_distributed or
                                      (args.multiprocessing_distributed
                                       and args.rank % ngpus_per_node == 0)):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netG.state_dict(),
            },
                            ckp_dir + '/netG',
                            max_save=20,
                            is_best=False)

            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netD.state_dict(),
            },
                            ckp_dir + '/netD',
                            max_save=20,
                            is_best=False)
コード例 #5
0
def main():
    """Train or evaluate a CIFAR-10 classifier described by a named config.

    Loads ``configs.cifar10.<args.config>``, fixes RNG seeds, builds the
    model/criterion/optimizer, optionally resumes from a checkpoint, then
    either runs a single validation pass (``--eval``) or the full
    train/validate loop with best-accuracy and periodic checkpointing.
    """
    # load config module dynamically, e.g. configs/cifar10/<name>.py
    cfg = importlib.import_module('configs.cifar10.{}'.format(args.config)).config

    # best-accuracy bookkeeping (may be overwritten by a resumed checkpoint)
    best_acc = 0
    best_epoch = 0
    start_epoch = 0

    # fix random seed; the CLI flag overrides the config's seed
    if args.rng_seed is not None:
        rng_seed = args.rng_seed
    else:
        rng_seed = cfg.TRAIN.rng_seed
    random.seed(rng_seed)
    np.random.seed(rng_seed)
    torch.manual_seed(rng_seed)

    # setup output directory (one per config+seed) and logger
    output_dir = mkdir(osp.join(OUTPUT_ROOT_DIR, args.config, 'rnd_%d' % rng_seed))
    logger = create_logger(output_dir)
    logger.info('config:\n' + pprint.pformat(cfg))
    logger.info('arguments:\n' + pprint.pformat(args))
    logger.info('gpu(s): ' + str(os.environ.get('CUDA_VISIBLE_DEVICES')))

    print("=> Creating model '{}'".format(cfg.model))
    model = models.cifar10.__dict__[cfg.model]()
    module_dict = dict(model.named_modules())
    logger.info('module:\n' + pprint.pformat(module_dict))
    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss()

    # gpu support
    assert torch.cuda.is_available(), 'Training requires cuda'
    # input size is fixed (32x32), so cudnn autotuning is beneficial
    import torch.backends.cudnn as cudnn
    cudnn.benchmark = True
    # cudnn.deterministic = True
    # enable DataParallel, default use all cuda devices
    model = nn.DataParallel(model).cuda()
    criterion = criterion.cuda()

    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(), cfg.TRAIN.lr,
                                momentum=cfg.TRAIN.momentum, weight_decay=cfg.TRAIN.wd)

    # optionally resume from a checkpoint
    if args.resume:
        if osp.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            best_acc = checkpoint['best_acc']
            best_epoch = checkpoint['best_epoch']
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            if not args.eval:
                # optimizer state only matters when training continues
                optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            # BUGFIX: logger.warn is a deprecated alias of logger.warning
            logger.warning("=> no checkpoint found at '{}'".format(args.resume))

    # load data
    print('=> Preparing data')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), # transform into [0.0, 1.0]
        transforms.Normalize(PIXEL_MEANS, PIXEL_STDS),
    ])

    # no augmentation at test time, only normalization
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(PIXEL_MEANS, PIXEL_STDS),
    ])

    train_set = torchvision.datasets.CIFAR10(root=DATA_ROOT_DIR, train=True, download=True, transform=transform_train)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=cfg.TRAIN.batch_size,
                                               shuffle=True, num_workers=args.num_worker)

    val_set = torchvision.datasets.CIFAR10(root=DATA_ROOT_DIR, train=False, download=True, transform=transform_test)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=cfg.TEST.batch_size,
                                             shuffle=False, num_workers=args.num_worker)

    if args.eval:
        # evaluation-only mode: one validation pass, then exit
        logger.info('evaluating trained model')
        acc = validate(val_loader, model, criterion)
        logger.info(
            'Val-Epoch: [{0}]\t'
            # BUGFIX: removed stray ')' from the format string
            'Prec@1: {acc:.3f}'.format(start_epoch, acc=acc)
        )
        return

    def do_checkpoint(epoch, path):
        """Serialize model/optimizer state plus best-accuracy bookkeeping."""
        torch.save({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'model': cfg.model,
            'best_acc': best_acc,
            'best_epoch': best_epoch,
        }, path)

    # save initialization state
    do_checkpoint(0, osp.join(output_dir, 'init.ckpt'))

    for epoch in range(start_epoch, cfg.TRAIN.end_epoch):
        adjust_learning_rate(optimizer, epoch, cfg.TRAIN.lr, cfg.TRAIN.lr_step)

        # train for one epoch
        epoch_result = train(train_loader, model, criterion, optimizer, epoch)
        logger.info(
            'Train-Epoch: [{0}]\t'
            'Loss: {loss:.4f}\t'
            'Prec@1: {acc:.3f}'.format(
                epoch + 1, **epoch_result)
        )

        # evaluate on validation set
        acc = validate(val_loader, model, criterion)
        logger.info(
            'Val-Epoch: [{0}]\t'
            'Prec@1: {acc:.3f}'.format(
                epoch + 1, acc=acc)
        )

        # remember best acc and save checkpoint
        is_best = acc > best_acc
        epoch_t = epoch + 1
        if is_best:
            best_epoch = epoch_t
            best_acc = acc
            do_checkpoint(best_epoch, osp.join(output_dir, 'best.ckpt'))
        if (args.ckpt_interval is not None) and (epoch_t % args.ckpt_interval == 0):
            do_checkpoint(epoch_t, osp.join(output_dir, '%03d.ckpt' % epoch_t))

    logger.info(
        '=> Best-Epoch: [{0}]\t'
        'Prec@1: {acc:.3f}'.format(
            best_epoch, acc=best_acc)
    )
コード例 #6
0
# Column headers for the per-epoch log row; order matches `values` below.
columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_nll', 'te_acc', 'time']

# Save the initial (or freshly-resumed) state; the index `start_epoch - 1`
# keeps checkpoint numbering equal to "last completed epoch".
train_utils.save_checkpoint(
    args.dir,
    start_epoch - 1,
    model_state=model.state_dict(),
    optimizer_state=optimizer.state_dict()
)

# Initialised up front so `test_res` exists with the expected keys even
# before the first evaluation inside the loop.
test_res = {'loss': None, 'accuracy': None, 'nll': None}
for epoch in range(start_epoch, args.epochs + 1):
    time_ep = time.time()

    # Manually scheduled learning rate, applied directly to the optimizer.
    lr = learning_rate_schedule(args.lr, epoch, args.epochs)
    train_utils.adjust_learning_rate(optimizer, lr)

    # One pass over the training set, then evaluate on the test set.
    train_res = train_utils.train(loaders['train'], model, optimizer, criterion, regularizer, cuda=args.cuda)
    test_res = train_utils.test(loaders['test'], model, criterion, regularizer, cuda=args.cuda)

    # Periodic checkpointing every `save_freq` epochs.
    if epoch % args.save_freq == 0:
        train_utils.save_checkpoint(
            args.dir,
            epoch,
            model_state=model.state_dict(),
            optimizer_state=optimizer.state_dict()
        )

    # Wall-clock duration of this epoch; `values` follows `columns` above.
    time_ep = time.time() - time_ep
    values = [epoch, lr, train_res['loss'], train_res['accuracy'], test_res['nll'],
              test_res['accuracy'], time_ep]
コード例 #7
0
def main():
    """Train a VoxelNet-style semantic segmentation network on indoor3d HDF5 data.

    Parses CLI arguments, loads every HDF5 point-cloud file into memory,
    splits the rooms into train/test by area, then runs the train/validate
    loop with per-epoch text + TensorBoard logging and a checkpoint every
    5 epochs.
    """
    parser = argparse.ArgumentParser(description='Voxelnet for semantic')
    parser.add_argument('--lr', default=0.001, type=float, help='Initial learning rate')
    parser.add_argument('--epochs', default=50, help='epochs')  # default=100
    parser.add_argument('--batchsize', default=4, help='epochs')   # default=32
    parser.add_argument('--weight_file', default='', help='weights to load')
    parser.add_argument('--test_area', type=int, default=2, help='Which area to use for test, option: 1-2 [default: 2]')
    parser.add_argument('--num_point', type=int, default=4096, help='Point number [default: 4096]')

    args = parser.parse_args()
    NUM_POINT = args.num_point
    BATCH_SIZE = args.batchsize
    lr = args.lr
    NUM_CLASSES = 6  # this model predicts 6 semantic classes (see class_names)
    ALL_FILES = getDataFiles('indoor3d_sem_seg_hdf5_data/all_files.txt')
    room_filelist = [line.rstrip() for line in open('indoor3d_sem_seg_hdf5_data/room_filelist.txt')]

    # Load ALL data into a big data_batch & a big label_batch
    data_batch_list = []
    label_batch_list = []
    print(ALL_FILES)
    for h5_filename in ALL_FILES:
        # NOTE(review): hard-coded absolute data root; adjust for your machine.
        h5_dir = os.path.join('/home/chenkun/pointnet_pytorch-master/indoor3d_sem_seg_hdf5_data', h5_filename)
        # Open read-only and close the file deterministically.
        with h5py.File(h5_dir, 'r') as f:
            data_batch = f['data'][:]
            label_batch = f['label'][:]
        data_batch_list.append(data_batch)
        label_batch_list.append(label_batch)

    data_batches = np.concatenate(data_batch_list, 0)
    label_batches = np.concatenate(label_batch_list, 0)
    print(data_batches.shape)
    print(label_batches.shape)

    # Rooms whose name contains the chosen area form the test split.
    test_area = 'Area_' + str(args.test_area)
    train_idxs = []
    test_idxs = []
    for i, room_name in enumerate(room_filelist):
        if test_area in room_name:
            test_idxs.append(i)
        else:
            train_idxs.append(i)

    train_data = data_batches[train_idxs, ...]
    train_label = label_batches[train_idxs].astype(np.int64)
    test_data = data_batches[test_idxs, ...]
    test_label = label_batches[test_idxs].astype(np.int64)
    print(train_data.shape, train_label.shape)
    print(test_data.shape, test_label.shape)

    # One timestamped log directory per run.
    time_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    log_dir = os.path.join('log_ptn/train', test_area + '_' + time_string)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    checkpoint_dir = os.path.join(log_dir, 'checkpoint')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    writer = SummaryWriter(log_dir=os.path.join(log_dir, 'tensorboard'))

    start_epoch = 0
    epochs = args.epochs
    model = get_model()
    model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr)

    class_names = ["T2T", "B2B", "BH", "BL", "V2V", "OT"]

    # Class-weighted cross-entropy, mean reduction.
    # Note: the deprecated `size_average=True` is equivalent to the default
    # reduction='mean', so it is simply dropped here.
    weightsTrain = [0.2, 0.4, 0.6, 1.00, 1.00, 1.00]
    class_weights_Train = torch.FloatTensor(weightsTrain).cuda()
    criterionTrain = nn.CrossEntropyLoss(weight=class_weights_Train).cuda()
    weightsVal = [0.2, 0.4, 0.6, 1.00, 1.00, 1.00]  # default  [0.08, 0.37, 0.15, 0.40]
    class_weights_Val = torch.FloatTensor(weightsVal).cuda()
    criterionVal = nn.CrossEntropyLoss(weight=class_weights_Val).cuda()

    # Optionally warm-start from a saved checkpoint.
    if args.weight_file != '':
        pre_trained_model = torch.load(args.weight_file)
        start_epoch = pre_trained_model['epoch']
        model_state = model.state_dict()
        model_state.update(pre_trained_model['state_dict'])
        model.load_state_dict(model_state)

    global_counter = 0

    for epoch in range(start_epoch, epochs):
        learn_rate_now = adjust_learning_rate(optimizer, global_counter, BATCH_SIZE, lr)
        writer.add_scalar('train/learning_rate', learn_rate_now, global_counter)

        # ---------------- training phase ----------------
        iter_loss = 0.0  # accumulated loss over this epoch
        iterations = 0

        # BUGFIX: `losses` and `top1` were used below but their construction
        # had been commented out, causing a NameError at runtime.
        losses = AverageMeter()
        top1 = AverageMeter()

        cm = ConfusionMatrix(NUM_CLASSES, class_names=class_names)
        cm.clear()

        model.train()

        train_data_shuffled, train_label_shuffled, _ = shuffle_data(train_data[:, 0:NUM_POINT, :], train_label)
        file_size = train_data_shuffled.shape[0]  # total number of training samples
        num_batches = file_size // BATCH_SIZE  # number of iterations in one epoch
        print('\nnum_batches(training):\t', num_batches)

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE

            feature = train_data_shuffled[start_idx:end_idx, :, :]
            label = train_label_shuffled[start_idx:end_idx]

            # (B, N, C) -> (B, 1, N, C), then move channels first for the net.
            feature = np.expand_dims(feature, axis=1)
            input = Variable(torch.from_numpy(feature).cuda(), requires_grad=True)
            input = torch.transpose(input, 3, 1)

            target = Variable(torch.from_numpy(label).cuda(), requires_grad=False)
            target = target.view(-1,)

            output = model(input)
            output_reshaped = output.permute(0, 3, 2, 1).contiguous().view(-1, NUM_CLASSES)

            _, pred = torch.max(output.data, 1)
            pred = pred.view(-1, )
            cm.add_batch(target.cpu().numpy(), pred.cpu().numpy())

            # BUGFIX: the loss was previously computed a second time via an
            # undefined `criterion`; compute it once with criterionTrain.
            loss = criterionTrain(output_reshaped, target)
            iter_loss += loss.item()  # accumulate the epoch loss
            iterations += 1

            prec1 = accuracy(output_reshaped.data, target.data, topk=(1,))
            prec1[0] = prec1[0].cpu().numpy()
            losses.update(loss.item(), BATCH_SIZE)
            top1.update(prec1[0], BATCH_SIZE)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_counter += 1

            if batch_idx % 10 == 0:
                print('Epoch: [%3d][%3d]\t Loss: %.4f'%(epoch,batch_idx,loss))   # Print loss for one bath

        # Print training results for 1 epoch
        iou0, iou1, iou2, iou3, iou4, iou5, mIoU = cm.class_IoU()
        print('Epoch: [%3d]\t Train Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%' % (epoch, iter_loss / iterations, cm.overall_accuracy(), mIoU))  # Print loss for the epoch
        print('T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%' % (iou0, iou1, iou2, iou3, iou4, iou5))

        with open(os.path.join(log_dir, 'train_log.txt'), 'a') as f:
            f.write('Epoch: [%3d]\t Train Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%\n' % (epoch, iter_loss / iterations, cm.overall_accuracy(), mIoU))
            f.write('T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%\n\n' % (iou0, iou1, iou2, iou3, iou4, iou5))

        writer.add_scalar('train/loss', losses.avg, global_counter)
        writer.add_scalar('train/accuracy', top1.avg, global_counter)

        # ---------------- validation phase ----------------
        # Fresh meters so validation stats are not mixed with training stats.
        losses = AverageMeter()
        top1 = AverageMeter()
        model.eval()

        file_size = test_data.shape[0]
        num_batches = file_size // BATCH_SIZE

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            feature = test_data[start_idx:end_idx, :, :]
            label = test_label[start_idx:end_idx]

            feature = np.expand_dims(feature, axis=1)
            # No gradients needed for the input during evaluation.
            input = Variable(torch.from_numpy(feature).cuda(), requires_grad=False)
            input = torch.transpose(input, 3, 1)
            target = Variable(torch.from_numpy(label).cuda(), requires_grad=False)
            target = target.view(-1,)
            output = model(input)
            # BUGFIX: was .view(-1, 13) -- this model has 6 classes.
            output_reshaped = output.permute(0, 3, 2, 1).contiguous().view(-1, NUM_CLASSES)

            # BUGFIX: `criterion` was undefined; use the validation criterion.
            loss = criterionVal(output_reshaped, target)
            prec1 = accuracy(output_reshaped.data, target.data, topk=(1,))
            prec1[0] = prec1[0].cpu().numpy()
            losses.update(loss.item(), BATCH_SIZE)
            top1.update(prec1[0], BATCH_SIZE)

        writer.add_scalar('val/loss', losses.avg, global_counter)
        writer.add_scalar('val/accuracy', top1.avg, global_counter)

        print('Epoch {} Val Loss {:.3f} Val Acc {:.3f}  \t'
              .format(epoch, losses.avg, top1.avg))

        with open(os.path.join(log_dir, 'test_log.txt'), 'a') as f:
            f.write('Epoch: [{0}]\t'
                    'Loss {loss.avg:.4f} \t'
                    'Prec@1 {top1.avg:.3f} \n'.format(
                epoch, loss=losses, top1=top1))

        # Checkpoint every 5 epochs.
        if epoch % 5 == 0:
            torch.save(
                {'epoch': epoch + 1, 'args': args, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()},
                os.path.join(checkpoint_dir, 'checkpoint_' + str(epoch) + '.pth.tar'))

    writer.close()
コード例 #8
0
    def train_one_epoch(self):
        """Run one training epoch over ``self.data_loader.train_loader``.

        The loss computation is dispatched on ``self.config.data_output_type``
        (absolute/relative joints, absolute/relative pose, quaternion +
        translation). Per batch this steps the optimizer, updates error
        meters, and records timing into a benchmark table that is dumped to
        CSV at the end.

        Raises:
            ValueError: if the loss becomes NaN during training.
            Exception: if ``data_output_type`` is not a known mode.
        """
        # Set the model to be in training mode
        self.model.train()
        # Running-average meters for this epoch
        train_loss = AverageMeter()
        train_err_joints = AverageMeter()
        train_err_rotation = AverageMeter()
        train_err_translation = AverageMeter()

        total_batch = len(self.data_loader.train_loader)

        print("Starting Epoch Training with batch size: {:d}".format(
            total_batch))
        print("Batch Size: {:d}".format(self.config.batch_size))
        current_batch = 1

        # Per-batch error samples; their means are logged after the epoch.
        q_dist_list = []
        trans_list = []
        joint_list = []

        tic = time.time()

        mean_speed = 0
        total_loss_sum = 0

        # Benchmark table: one row per batch ->
        # [batch index, total batches, batch duration, running avg speed].
        samples = 300
        np_batch_bench = np.zeros([samples, 4])

        for x, y in self.data_loader.train_loader:
            batch_start = time.time()

            if self.cuda:
                x = x.to(device=self.device, dtype=torch.long)
                y = y.to(device=self.device, dtype=torch.long)

            # adjust learning rate (per-batch schedule)
            lr = adjust_learning_rate(self.optimizer,
                                      self.current_epoch,
                                      self.config,
                                      batch=current_batch,
                                      nBatch=self.data_loader.train_iterations)

            # forward pass
            pred = self.model(x)

            if self.config.data_output_type == "joints_absolute":
                loss_joints = self.loss(pred, y)
                total_loss = loss_joints
                # BUGFIX: train_loss was updated here AND again after the
                # branch, double-counting this mode; the shared update below
                # is sufficient.
                train_err_joints.update(total_loss.item())
            elif self.config.data_output_type == "q_trans_simple":
                loss_q_trans_simple = self.loss(pred, y)
                total_loss = loss_q_trans_simple
            elif self.config.data_output_type == "pose_relative":
                # Split prediction/label into rotation (quaternion, idx 3..6)
                # and translation (idx 0..2) components.
                indices = torch.tensor([3, 4, 5, 6])
                indices = indices.to(self.device)
                rotation = torch.index_select(pred, 1, indices)
                y_rot = torch.index_select(y, 1, indices)

                indices = torch.tensor([0, 1, 2])
                indices = indices.to(self.device)
                translation = torch.index_select(pred, 1, indices)
                y_trans = torch.index_select(y, 1, indices)

                # MSE on the translation part (logged only, not optimized
                # separately).
                loss_translation = self.loss(translation, y_trans)
                trans_list.append(loss_translation.item())

                # Optimize the plain loss over the full pose vector.
                total_loss = self.loss(pred.double(), y.double())

                # Rotation error in degrees via quaternion distance.
                # BUGFIX: this was computed and appended to q_dist_list twice
                # per batch; now computed once. Uses the batch's first sample.
                q_pred = pq.Quaternion(rotation[0].cpu().detach().numpy())
                q_rot = pq.Quaternion(y_rot[0].cpu().detach().numpy())
                q_dist = math.degrees(pq.Quaternion.distance(q_pred, q_rot))
                q_dist_list.append(q_dist)

                # Translation MSE for the batch's first sample.
                trans_pred = translation[0].cpu().detach().numpy()
                trans_label = y_trans[0].cpu().detach().numpy()
                mse_trans = (np.square(trans_pred - trans_label)).mean()
                train_err_translation.update(mse_trans)
                train_err_rotation.update(q_dist)

            elif self.config.data_output_type == "pose_absolute":
                # Rotation component (quaternion, idx 3..6).
                indices = torch.tensor([3, 4, 5, 6])
                indices = indices.to(self.device)
                rotation = torch.index_select(pred, 1, indices)
                y_rot = torch.index_select(y, 1, indices)

                # Rotation error in degrees (first sample of the batch).
                q_pred = pq.Quaternion(rotation[0].cpu().detach().numpy())
                q_rot = pq.Quaternion(y_rot[0].cpu().detach().numpy())
                q_dist = math.degrees(pq.Quaternion.distance(q_pred, q_rot))
                q_dist_list.append(q_dist)

                # Translation component (idx 0..2).
                indices = torch.tensor([0, 1, 2])
                indices = indices.to(self.device)
                translation = torch.index_select(pred, 1, indices)
                y_trans = torch.index_select(y, 1, indices)

                trans_pred = translation[0].cpu().detach().numpy()
                trans_label = y_trans[0].cpu().detach().numpy()

                # MSE loss on translation (logged only).
                loss_translation = self.loss(translation, y_trans)
                trans_list.append(loss_translation.item())

                # Optimize the plain loss over the full pose vector.
                total_loss = self.loss(pred, y)

                # Translation MSE for the first sample of the batch.
                mse_trans = (np.square(trans_pred - trans_label)).mean()
                train_err_translation.update(mse_trans)
                train_err_rotation.update(q_dist)

            elif self.config.data_output_type == "joints_relative":
                total_loss = self.loss(pred, y)
                train_err_joints.update(total_loss.item())
                joint_list.append(total_loss.item())
            else:
                raise Exception("Wrong data output type chosen.")

            if np.isnan(float(total_loss.item())):
                raise ValueError('Loss is nan during training...')

            # optimizer step
            self.optimizer.zero_grad()
            total_loss.backward()
            self.optimizer.step()

            # Single shared loss-meter update for every output mode.
            train_loss.update(total_loss.item())

            self.current_iteration += 1

            # Timing / progress bookkeeping.
            batch_duration = time.time() - batch_start
            mean_speed += batch_duration
            speed = float(mean_speed / current_batch)
            remaining_sec = speed * (total_batch - current_batch) * (
                self.config.max_epoch - self.current_epoch)
            batch_progress = float(current_batch / total_batch) * 100

            total_loss_sum += total_loss.item()
            avg_total_loss = float(total_loss_sum / current_batch)

            if self.config.DEBUG_TRAINING_DURATION:
                print(
                    "Current Batch {:d} {:d} {:2.1%} {:.2f} s Avg {:.2f} s/batch Loss {:.3e} Remaining {:s}"
                    .format(
                        current_batch, total_batch,
                        float(current_batch / total_batch), batch_duration,
                        speed, avg_total_loss,
                        time.strftime('Days %d Time %H:%M:%S',
                                      time.gmtime(remaining_sec))))

            # The benchmark table holds at most `samples` rows.
            if current_batch > samples:
                break

            print(np_batch_bench.shape)
            print(current_batch)
            np_batch_bench[current_batch - 1][0] = current_batch
            np_batch_bench[current_batch - 1][1] = total_batch
            np_batch_bench[current_batch - 1][2] = batch_duration
            np_batch_bench[current_batch - 1][3] = speed

            current_batch += 1

        # Epoch-level means of the per-batch error samples.
        mean = np.mean(np.asarray(q_dist_list))
        mean_t = np.mean(np.asarray(trans_list))
        mean_joints = np.mean(np.asarray(joint_list))

        self.trans_mean.append([self.iter, mean_t])
        self.q_dist_mean.append([self.iter, mean])
        self.joints_mean.append([self.iter, mean_joints])
        self.iter += 1

        # update logging dict
        self.logging_dict["learning_rate"].append(lr)
        self.logging_dict["train_loss"].append(train_loss.val)
        self.logging_dict["train_err_rotation"].append(train_err_rotation.val)
        self.logging_dict["train_err_translation"].append(
            train_err_translation.val)
        self.logging_dict["train_err_joints"].append(train_err_joints.val)

        # print progress summary for this epoch
        progress = float((self.current_epoch + 1) / self.config.max_epoch)
        duration_epoch = time.time() - tic
        if self.current_epoch % self.config.display_step == 0 or self.current_epoch % 1 == 0:
            self.duration = time.time() - self.start_time
            self.logger.info(
                "Train Epoch: {:>4d} | Total: {:>4d} | Progress: {:>3.2%} | Loss: {:>3.2e} | Translation [mm]: {:>3.2e} |"
                " Rotation [deg] {:>3.2e} | Joints [deg] {:>3.2e} | ({:02d}:{:02d}:{:02d}) "
                .format(self.current_epoch + 1, self.config.max_epoch,
                        progress, train_loss.val, train_err_translation.val,
                        train_err_rotation.val, train_err_joints.val,
                        int(self.duration /
                            3600), int(np.mod(self.duration, 3600) / 60),
                        int(np.mod(np.mod(self.duration, 3600), 60))) +
                time.strftime("%d.%m.%y %H:%M:%S", time.localtime()))

        # Dump the per-batch timing benchmark.
        # NOTE(review): hard-coded output path; make configurable if reused.
        ds = pd.DataFrame(np_batch_bench)
        print(ds)
        path = "/home/speerponar/pytorch_models/evaluation/"
        ds.to_csv(path + "test_" + str(self.config.batch_size) + "w_" +
                  str(self.config.data_loader_workers) + ".csv")
        print("Save csv file")
コード例 #9
0
            losses.update(loss.data.tolist(), inputs.size(0))
            top1.update(prec1[0], inputs.size(0))
            arc.update(auroc, inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    print('{batch}/{size} | Loss:{loss:.4f} | top1:{tp1:.4f} | AUROC:{ac:.4f}'.format(
         batch=batch_idx+1, size=len(val_loader), loss=losses.avg, tp1=top1.avg, ac=arc.avg))
    return (losses.avg, top1.avg, arc.avg)


for epoch in range(opt.start_epoch, opt.epochs):
    opt.lr = optimizer.state_dict()['param_groups'][0]['lr']
    adjust_learning_rate(optimizer, epoch, opt)
    print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, opt.epochs, opt.lr))
    
    train_loss, train_acc, train_auroc = train(opt, train_loader, model, criterion, optimizer, epoch, use_cuda)
    test_loss, test_acc, test_auroc = test(opt, val_loader, model, criterion, epoch, use_cuda)
    
    logger.append([opt.lr, train_loss, test_loss, train_acc, test_acc, train_auroc, test_auroc])
    scheduler_warmup.step()

    is_best = test_acc > best_acc
    best_acc = max(test_acc, best_acc)
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict' : model.state_dict(),
        'acc': test_acc,
        'best_acc': best_acc,
コード例 #10
0
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for (optionally distributed) fine-tuning.

    Builds the backbone, loads MoCo pre-trained encoder weights into it,
    re-initializes the final fc classification layer, wraps the model for
    the selected parallelism mode, builds UCF101 train/val loaders, and
    runs the train / validate / checkpoint loop.

    Args:
        gpu: GPU index assigned to this process, or None to use all
            available GPUs (DataParallel fallback).
        ngpus_per_node: number of GPUs on this node; used to compute the
            global rank and to split batch size / workers per process.
        args: parsed argparse namespace; read for distributed settings,
            model arch, optimizer hyperparameters, data paths, etc.

    Side effects: mutates ``args`` (gpu, rank, batch_size, workers,
    start_epoch), may replace ``builtins.print``, updates the module-level
    ``best_acc1``, and writes TensorBoard logs / checkpoints to disk.
    """
    global best_acc1
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=============> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
    print(model)
    # freeze all layers but the last fc
    #     for name, param in model.named_parameters():
    #         if name not in ['fc.weight', 'fc.bias']:
    #             param.requires_grad = False
    # init the fc layer
    # NOTE(review): in_features is hard-coded to 512 — assumes the backbone's
    # penultimate feature dim is 512; confirm against args.arch.
    model.fc = nn.Linear(512, args.num_class, bias=True)
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q'
                                ) and not k.startswith('module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            # After the renaming above, the only uninitialized parameters
            # should be the freshly created fc layer.
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}

            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)  #.cuda() for debug on cpu
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    # NOTE(review): since the freezing loop above is commented out, ALL
    # parameters require grad here, so the full network is optimized — the
    # comment and the disabled assert below only hold if freezing is re-enabled.
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    # assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Enable cuDNN autotuner; beneficial because clip dimensions are fixed.
    cudnn.benchmark = True

    # Data loading code
    normalize_video = transforms_video.NormalizeVideo(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    video_augmentation_train = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size),
        transforms_video.RandomHorizontalFlipVideo(),
        normalize_video,
    ])
    video_augmentation_val = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.CenterCropVideo(args.crop_size),
        normalize_video,
    ])
    data_dir = os.path.join(args.data, 'data')
    anno_dir = os.path.join(args.data, 'anno')
    # Audio branch is a no-op placeholder transform.
    audio_augmentation = moco.loader.DummyAudioTransform()
    train_augmentation = {
        'video': video_augmentation_train,
        'audio': audio_augmentation
    }
    val_augmentation = {
        'video': video_augmentation_val,
        'audio': audio_augmentation
    }

    train_dataset = UCF101(data_dir,
                           anno_dir,
                           args.frame_per_clip,
                           args.step_between_clips,
                           fold=1,
                           train=True,
                           transform=train_augmentation,
                           num_workers=16)
    # NOTE(review): at most 10 random clips per video per epoch — hard-coded;
    # confirm this matches the intended sampling budget.
    train_sampler = RandomClipSampler(train_dataset.video_clips, 10)
    if args.distributed:
        # Shard the clip sampler across processes for DDP training.
        train_sampler = DistributedSampler(train_sampler)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               multiprocessing_context="fork")

    val_dataset = UCF101(data_dir,
                         anno_dir,
                         args.frame_per_clip,
                         args.step_between_clips,
                         fold=1,
                         train=False,
                         transform=val_augmentation,
                         num_workers=16)
    # Do not use DistributedSampler since it will destroy the testing iteration process
    val_sampler = UniformClipSampler(val_dataset.video_clips,
                                     args.clip_per_video)
    # batch_size == clip_per_video so each batch holds all clips of one video,
    # presumably for per-video score averaging in validate() — TODO confirm.
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.clip_per_video,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             multiprocessing_context="fork")

    if args.evaluate:
        # Evaluation-only mode: validate once and exit.
        validate(val_loader, model, criterion, args)
        return
    # Only the master process (gpu 0) writes TensorBoard logs.
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr, args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle shards deterministically per epoch.
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        val_loss, acc1, acc5 = validate(val_loader, model, criterion, args)
        if writer is not None:
            writer.add_scalar('lincls_val/loss', val_loss, epoch)
            writer.add_scalar('lincls_val/acc1', acc1, epoch)
            writer.add_scalar('lincls_val/acc5', acc5, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Only rank 0 per node persists checkpoints (avoids redundant writes).
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            },
                            ckp_dir,
                            max_save=1,
                            is_best=is_best)