def main():
    """Train a PointNet-style semantic segmentation net on S3DIS HDF5 data.

    Loads every room block from indoor3d_sem_seg_hdf5_data, holds out one
    area for testing, then alternates a training pass and an evaluation
    pass per epoch, checkpointing every 5 epochs under log_ptn/train/.
    """
    parser = argparse.ArgumentParser(description='Voxelnet for semantic')
    parser.add_argument('--lr', default=0.001, type=float,
                        help='Initial learning rate')
    # BUGFIX: --epochs/--batchsize previously had no type=, so any value
    # passed on the command line arrived as a str and broke range() and
    # the integer arithmetic below.
    parser.add_argument('--epochs', default=100, type=int, help='epochs')
    parser.add_argument('--batchsize', default=4, type=int, help='batch size')
    parser.add_argument('--weight_file', default='', help='weights to load')
    parser.add_argument(
        '--test_area',
        type=int,
        default=5,
        help='Which area to use for test, option: 1-6 [default: 6]')
    parser.add_argument('--num_point', type=int, default=4096,
                        help='Point number [default: 4096]')
    args = parser.parse_args()
    NUM_POINT = args.num_point
    BATCH_SIZE = args.batchsize
    lr = args.lr

    ALL_FILES = getDataFiles('indoor3d_sem_seg_hdf5_data/all_files.txt')
    # BUGFIX: close the filelist handle instead of leaking it.
    with open('indoor3d_sem_seg_hdf5_data/room_filelist.txt') as filelist:
        room_filelist = [line.rstrip() for line in filelist]

    # Load ALL data into one big (blocks, points, channels) array pair.
    data_batch_list = []
    label_batch_list = []
    for h5_filename in ALL_FILES:
        data_batch, label_batch = loadDataFile(h5_filename)
        data_batch_list.append(data_batch)
        label_batch_list.append(label_batch)
    data_batches = np.concatenate(data_batch_list, 0)
    label_batches = np.concatenate(label_batch_list, 0)
    print(data_batches.shape)
    print(label_batches.shape)

    # Blocks from the held-out area form the test set; the rest train.
    test_area = 'Area_' + str(args.test_area)
    train_idxs = []
    test_idxs = []
    for i, room_name in enumerate(room_filelist):
        if test_area in room_name:
            test_idxs.append(i)
        else:
            train_idxs.append(i)
    train_data = data_batches[
        train_idxs, ...]  # ... means ellipsis, the same as [train_idxs, :, :]
    train_label = label_batches[train_idxs].astype(np.int64)
    test_data = data_batches[test_idxs, ...]
    test_label = label_batches[test_idxs].astype(np.int64)
    print(train_data.shape, train_label.shape)
    print(test_data.shape, test_label.shape)

    # Per-run log/checkpoint directories keyed by area and timestamp.
    time_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    log_dir = os.path.join('log_ptn/train', test_area + '_' + time_string)
    os.makedirs(log_dir, exist_ok=True)
    checkpoint_dir = os.path.join(log_dir, 'checkpoint')
    os.makedirs(checkpoint_dir, exist_ok=True)

    start_epoch = 0
    epochs = args.epochs
    model = get_model()
    model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr)
    criterion = nn.CrossEntropyLoss().cuda()

    # Optionally warm-start model weights (and epoch) from a checkpoint.
    if args.weight_file != '':
        pre_trained_model = torch.load(args.weight_file)
        start_epoch = pre_trained_model['epoch']
        model_state = model.state_dict()
        model_state.update(pre_trained_model['state_dict'])
        model.load_state_dict(model_state)

    global_counter = 0
    for epoch in range(start_epoch, epochs):
        learn_rate_now = adjust_learning_rate(optimizer, global_counter,
                                              BATCH_SIZE, lr)
        losses = AverageMeter()
        top1 = AverageMeter()
        model.train()
        train_data_shuffled, train_label_shuffled, _ = shuffle_data(
            train_data[:, 0:NUM_POINT, :], train_label)
        file_size = train_data_shuffled.shape[0]
        num_batches = file_size // BATCH_SIZE
        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            feature = train_data_shuffled[start_idx:end_idx, :, :]
            label = train_label_shuffled[start_idx:end_idx]
            # (B, N, C) -> (B, 1, N, C); transpose makes it (B, C, N, 1)
            # for the conv model.
            feature = np.expand_dims(feature, axis=1)
            # Renamed from `input`, which shadowed the builtin.
            inputs = Variable(torch.from_numpy(feature).cuda(),
                              requires_grad=True)
            inputs = torch.transpose(inputs, 3, 1)
            target = Variable(torch.from_numpy(label).cuda(),
                              requires_grad=False)
            target = target.view(-1, )
            output = model(inputs)
            # Flatten predictions to (B*N, 13) to match the flat targets.
            output_reshaped = output.permute(0, 3, 2,
                                             1).contiguous().view(-1, 13)
            loss = criterion(output_reshaped, target)
            prec1 = accuracy(output_reshaped.data, target.data, topk=(1, ))
            prec1 = prec1[0].cpu().numpy()
            # BUGFIX: use loss.item() — accumulating loss.data tensors in
            # the meter kept CUDA tensors alive and printed tensor reprs.
            losses.update(loss.item(), BATCH_SIZE)
            top1.update(prec1, BATCH_SIZE)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print('Epoch: [{0}][{1}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, batch_idx, loss=losses, top1=top1))
            with open(os.path.join(log_dir, 'train_log.txt'), 'a') as f:
                f.write('Epoch: [{0}][{1}]\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec@1 {top1.val:.3f} ({top1.avg:.3f}) \n'.format(
                            epoch, batch_idx, loss=losses, top1=top1))
            global_counter += 1

        # Evaluation pass over the held-out area.
        losses = AverageMeter()
        top1 = AverageMeter()
        model.eval()
        file_size = test_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        # BUGFIX: disable gradient tracking during evaluation (the original
        # built inputs with requires_grad=True, wasting memory/compute).
        with torch.no_grad():
            for batch_idx in range(num_batches):
                start_idx = batch_idx * BATCH_SIZE
                end_idx = (batch_idx + 1) * BATCH_SIZE
                feature = test_data[start_idx:end_idx, :, :]
                label = test_label[start_idx:end_idx]
                feature = np.expand_dims(feature, axis=1)
                inputs = torch.from_numpy(feature).cuda()
                inputs = torch.transpose(inputs, 3, 1)
                target = torch.from_numpy(label).cuda()
                target = target.view(-1, )
                output = model(inputs)
                output_reshaped = output.permute(
                    0, 3, 2, 1).contiguous().view(-1, 13)
                loss = criterion(output_reshaped, target)
                prec1 = accuracy(output_reshaped.data, target.data,
                                 topk=(1, ))
                prec1 = prec1[0].cpu().numpy()
                losses.update(loss.item(), BATCH_SIZE)
                top1.update(prec1, BATCH_SIZE)
        print('Epoch {} Val Loss {:.3f} Val Acc {:.3f} \t'.format(
            epoch, losses.avg, top1.avg))
        with open(os.path.join(log_dir, 'test_log.txt'), 'a') as f:
            f.write('Epoch: [{0}]\t'
                    'Loss {loss.avg:.4f} \t'
                    'Prec@1 {top1.avg:.3f} \n'.format(epoch, loss=losses,
                                                      top1=top1))

        # Periodic checkpoint every 5 epochs.
        if (epoch % 5 == 0):
            torch.save(
                {
                    'epoch': epoch + 1,
                    'args': args,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                os.path.join(checkpoint_dir,
                             'checkpoint_' + str(epoch) + '.pth.tar'))
def main():
    """Train the 6-class (T2T/B2B/BH/BL/V2V/OT) segmentation variant.

    Same pipeline as the S3DIS trainer but with class-weighted losses,
    confusion-matrix based mIoU reporting, and best-mIoU checkpointing.
    NOTE(review): the test split is deliberately aliased to the training
    split below (ZZC change) — the reported "test" metrics are training
    metrics; confirm this is still intended.
    """
    parser = argparse.ArgumentParser(description='Voxelnet for semantic')
    parser.add_argument('--lr', default=0.001, type=float,
                        help='Initial learning rate')  # default=0.001(good)
    # BUGFIX: --epochs/--batchsize previously had no type=, so command-line
    # values arrived as str and broke range() and the arithmetic below.
    parser.add_argument('--epochs', default=2, type=int,
                        help='epochs')  # default=100, 50, 30
    parser.add_argument('--batchsize', default=4, type=int,
                        help='batch size')  # default=32
    parser.add_argument('--weight_file', default='', help='weights to load')
    parser.add_argument(
        '--test_area',
        type=int,
        default=2,
        help='Which area to use for test, option: 1-2 [default: 2]')
    parser.add_argument('--num_point', type=int, default=4096,
                        help='Point number [default: 4096]')
    args = parser.parse_args()
    NUM_POINT = args.num_point
    BATCH_SIZE = args.batchsize
    lr = args.lr

    ALL_FILES = getDataFiles(
        'indoor3d_sem_seg_hdf5_data/all_files.txt')  # .h5 file routes
    with open('indoor3d_sem_seg_hdf5_data/room_filelist.txt') as filelist:
        room_filelist = [line.rstrip() for line in filelist]

    # Load ALL data into a big data_batch & a big label_batch.
    data_batch_list = []
    label_batch_list = []
    print(ALL_FILES)
    for h5_filename in ALL_FILES:
        h5_dir = os.path.join(
            '/home/chenkun/pointnet_pytorch-master/indoor3d_sem_seg_hdf5_data',
            h5_filename)
        # BUGFIX: open read-only and close the handle — the original leaked
        # every h5py.File it opened (and relied on the default mode).
        with h5py.File(h5_dir, 'r') as f:
            data_batch = f['data'][:]
            label_batch = f['label'][:]
        data_batch_list.append(data_batch)
        label_batch_list.append(label_batch)
    data_batches = np.concatenate(data_batch_list, 0)
    label_batches = np.concatenate(label_batch_list, 0)
    print(data_batches.shape)
    print(label_batches.shape)

    test_area = 'Area_' + str(args.test_area)
    train_idxs = []
    test_idxs = []
    for i, room_name in enumerate(room_filelist):
        if test_area in room_name:
            test_idxs.append(i)
        else:
            train_idxs.append(i)
    train_data = data_batches[train_idxs, ...]
    train_label = label_batches[train_idxs].astype(np.int64)
    # ZZC: validation is intentionally run on the training split.
    test_data = train_data  # ZZC
    test_label = train_label  # ZZC
    print(train_data.shape, train_label.shape)
    print(test_data.shape, test_label.shape)

    time_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    log_dir = os.path.join('log_ptn/train', test_area + '_' + time_string)
    os.makedirs(log_dir, exist_ok=True)
    checkpoint_dir = os.path.join(log_dir, 'checkpoint')
    os.makedirs(checkpoint_dir, exist_ok=True)

    start_epoch = 0
    epochs = args.epochs
    model = get_model()
    model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr)

    class_names = ["T2T", "B2B", "BH", "BL", "V2V", "OT"]
    # Class weights down-weight the first three (dominant) classes.
    weightsTrain = [0.2, 0.4, 0.6, 1.00, 1.00, 1.00]
    class_weights_Train = torch.FloatTensor(weightsTrain).cuda()
    # BUGFIX: size_average= is deprecated; reduction='mean' is the exact
    # equivalent of size_average=True (loss averaged over elements).
    criterionTrain = nn.CrossEntropyLoss(weight=class_weights_Train,
                                         reduction='mean').cuda()
    weightsVal = [0.2, 0.4, 0.6, 1.00, 1.00,
                  1.00]  # default [0.08, 0.37, 0.15, 0.40]
    class_weights_Val = torch.FloatTensor(weightsVal).cuda()
    criterionVal = nn.CrossEntropyLoss(weight=class_weights_Val,
                                       reduction='mean').cuda()

    if args.weight_file != '':
        pre_trained_model = torch.load(args.weight_file)
        start_epoch = pre_trained_model['epoch']
        model_state = model.state_dict()
        model_state.update(pre_trained_model['state_dict'])
        model.load_state_dict(model_state)

    # #####################################################
    # Start training
    # #####################################################
    global_counter = 0
    max_mIoU_test = 0.0
    for epoch in range(start_epoch, epochs):
        learn_rate_now = adjust_learning_rate(
            optimizer, global_counter, BATCH_SIZE,
            lr)  # Seems not changing, ZZC
        iter_loss = 0.0  # Initialisation: loss for one epoch
        iterations = 0
        cm = ConfusionMatrix(6, class_names=class_names)
        cm.clear()
        model.train()
        train_data_shuffled, train_label_shuffled, _ = shuffle_data(
            train_data[:, 0:NUM_POINT, :], train_label)
        file_size = train_data_shuffled.shape[
            0]  # total number of training batches
        num_batches = file_size // BATCH_SIZE  # iterations in one epoch
        print('\nnum_batches(training):\t', num_batches)
        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            feature = train_data_shuffled[start_idx:end_idx, :, :]
            label = train_label_shuffled[start_idx:end_idx]
            # BUGFIX: restore the expand_dims that the validation loop (and
            # the sibling trainer) use; without it `feature` is (B, N, C)
            # and torch.transpose(..., 3, 1) has no dim 3 to swap.
            feature = np.expand_dims(feature, axis=1)
            inputs = Variable(torch.from_numpy(feature).cuda(),
                              requires_grad=True)
            inputs = torch.transpose(inputs, 3, 1)
            target = Variable(torch.from_numpy(label).cuda(),
                              requires_grad=False)
            target = target.view(-1, )
            output = model(inputs)
            output_reshaped = output.permute(0, 3, 2,
                                             1).contiguous().view(-1, 6)
            # Hard predictions feed the confusion matrix for OA/mIoU.
            _, pred = torch.max(output.data, 1)
            pred = pred.view(-1, )
            cm.add_batch(target.cpu().numpy(), pred.cpu().numpy())
            loss = criterionTrain(output_reshaped, target)
            iter_loss += loss.item()  # Accumulate the loss
            iterations += 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_counter += 1
            if batch_idx % 10 == 0:
                print('Epoch: [%3d][%3d]\t Loss: %.4f' %
                      (epoch, batch_idx, loss))

        # Print training results for 1 epoch.
        iou0, iou1, iou2, iou3, iou4, iou5, mIoU = cm.class_IoU()
        print(
            'Epoch: [%3d]\t Train Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%'
            % (epoch, iter_loss / iterations, cm.overall_accuracy(), mIoU))
        print(
            'T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%'
            % (iou0, iou1, iou2, iou3, iou4, iou5))
        with open(os.path.join(log_dir, 'train_log.txt'), 'a') as f:
            f.write(
                'Epoch: [%3d]\t Train Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%\n'
                % (epoch, iter_loss / iterations, cm.overall_accuracy(),
                   mIoU))
            f.write(
                'T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%\n\n'
                % (iou0, iou1, iou2, iou3, iou4, iou5))

        # #####################################################
        # Start validation
        # #####################################################
        model.eval()
        iter_loss = 0.0  # Initialisation: loss for one epoch
        iterations = 0
        cm = ConfusionMatrix(6, class_names=class_names)  # ZZC
        cm.clear()
        file_size = test_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        print('num_batches(testing):\t', num_batches)
        # BUGFIX: no gradient tracking during evaluation.
        with torch.no_grad():
            for batch_idx in range(num_batches):
                start_idx = batch_idx * BATCH_SIZE
                end_idx = (batch_idx + 1) * BATCH_SIZE
                feature = test_data[start_idx:end_idx, :, :]
                label = test_label[start_idx:end_idx]
                feature = np.expand_dims(feature, axis=1)
                inputs = torch.from_numpy(feature).cuda()
                inputs = torch.transpose(inputs, 3, 1)
                target = torch.from_numpy(label).cuda()
                target = target.view(-1, )
                output = model(inputs)
                output_reshaped = output.permute(
                    0, 3, 2, 1).contiguous().view(-1, 6)
                _, pred = torch.max(output.data, 1)
                pred = pred.view(-1, )
                cm.add_batch(target.cpu().numpy(), pred.cpu().numpy())
                loss = criterionVal(output_reshaped, target)
                iter_loss += loss.item()  # Accumulate the loss
                iterations += 1

        # Print validation results after 1 epoch.
        iou0, iou1, iou2, iou3, iou4, iou5, mIoU = cm.class_IoU()
        print('Epoch: [%3d]\t Test Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%'
              % (epoch, iter_loss / iterations, cm.overall_accuracy(), mIoU))
        print(
            'T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%'
            % (iou0, iou1, iou2, iou3, iou4, iou5))
        with open(os.path.join(log_dir, 'test_log.txt'), 'a') as f:
            f.write(
                'Epoch: [%3d]\t Test Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%\n'
                % (epoch, iter_loss / iterations, cm.overall_accuracy(),
                   mIoU))
            f.write(
                'T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%\n\n'
                % (iou0, iou1, iou2, iou3, iou4, iou5))

        # Check whether best model so far (or final epoch) -> save.
        if (mIoU > max_mIoU_test or epoch == epochs - 1):
            max_mIoU_test = mIoU
            print(
                '-> Best performance (test mIoU) achieved or This is final epoch.'
            )
            print('Max_mIoU in testing: %3.2f%%\n' % (max_mIoU_test))
            torch.save(
                {
                    'epoch': epoch + 1,
                    'args': args,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                os.path.join(
                    checkpoint_dir, 'checkpoint_' + str(epoch) +
                    '_max_mIoU_test_' + str(mIoU) + '.pth.tar'))
def train_one_epoch(self):
    """Run one training epoch over ``self.data_loader.train_loader``.

    Adjusts the learning rate every batch, feeds the model the overall
    training-progress fraction, updates loss/top-1/top-5 meters, and logs
    per-iteration values to the summary writer.

    Raises:
        ValueError: if the loss becomes NaN during training.
    """
    # Initialize tqdm progress bar over the training loader.
    tqdm_batch = tqdm(self.data_loader.train_loader,
                      total=self.data_loader.train_iterations,
                      desc="Epoch-{}-".format(self.current_epoch))

    # Set the model to be in training mode
    self.model.train()

    # Initialize your average meters
    epoch_loss = AverageMeter()
    top1_acc = AverageMeter()
    top5_acc = AverageMeter()

    current_batch = 0
    for x, y in tqdm_batch:
        if self.cuda:
            x, y = x.cuda(self.config.async_loading), y.cuda(
                self.config.async_loading)

        # Current iteration over total iterations — passed to the model
        # as a progress signal in [0, 1).
        progress = float(
            self.current_epoch * self.data_loader.train_iterations +
            current_batch) / (self.config.max_epoch *
                              self.data_loader.train_iterations)

        x, y = Variable(x), Variable(y)
        lr = adjust_learning_rate(self.optimizer,
                                  self.current_epoch,
                                  self.config,
                                  batch=current_batch,
                                  nBatch=self.data_loader.train_iterations)

        # model
        pred = self.model(x, progress)
        # loss
        cur_loss = self.loss(pred, y)
        if np.isnan(float(cur_loss.item())):
            raise ValueError('Loss is nan during training...')

        # optimizer
        self.optimizer.zero_grad()
        cur_loss.backward()
        self.optimizer.step()

        top1, top5 = cls_accuracy(pred.data, y.data, topk=(1, 5))
        # BUGFIX: weight the loss meter by the batch size, consistent with
        # the accuracy meters below — otherwise a smaller final batch
        # skews the epoch-average loss.
        epoch_loss.update(cur_loss.item(), x.size(0))
        top1_acc.update(top1.item(), x.size(0))
        top5_acc.update(top5.item(), x.size(0))

        self.current_iteration += 1
        current_batch += 1

        self.summary_writer.add_scalar("epoch/loss", epoch_loss.val,
                                       self.current_iteration)
        self.summary_writer.add_scalar("epoch/accuracy", top1_acc.val,
                                       self.current_iteration)

    tqdm_batch.close()
    self.logger.info("Training at epoch-" + str(self.current_epoch) + " | " +
                     "loss: " + str(epoch_loss.val) + "- Top1 Acc: " +
                     str(top1_acc.val) + "- Top5 Acc: " + str(top5_acc.val))
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for (optionally distributed) MoCo training.

    Initializes the process group, builds the mask generator (netG) and the
    MoCo encoder (netD), wraps them for DistributedDataParallel, optionally
    resumes both from checkpoints, builds the Kinetics-400 loader, and runs
    the epoch loop with periodic checkpointing.

    Args:
        gpu: local GPU index for this process (None for the CPU/debug path).
        ngpus_per_node: number of GPUs on this node.
        args: parsed command-line namespace; mutated in place (gpu, rank,
            batch_size, workers, start_epoch).
    """
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model: netG produces masks, netD is the MoCo encoder pair.
    print("=> creating model '{}'".format(args.arch))
    netG = moco.builder.MaskGenerator()
    netD = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                             args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(netG)
    print(netD)
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            netG.cuda(args.gpu)
            netD.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            netG = torch.nn.parallel.DistributedDataParallel(
                netG, device_ids=[args.gpu], find_unused_parameters=True)
            netD = torch.nn.parallel.DistributedDataParallel(
                netD, device_ids=[args.gpu], find_unused_parameters=True)
        else:
            netG.cuda()
            netD.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            netG = torch.nn.parallel.DistributedDataParallel(netG)
            netD = torch.nn.parallel.DistributedDataParallel(netD)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        netG = netG.cuda(args.gpu)
        netD = netD.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        pass
        # raise NotImplementedError("Only DistributedDataParallel is supported.") for debug on cpu

    # torch.cuda.synchronize()
    # Separate SGD optimizers for generator and discriminator.
    optimizer_g = torch.optim.SGD(netG.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    optimizer_d = torch.optim.SGD(netD.parameters(),
                                  args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    G_criterion = nn.L1Loss().cuda(args.gpu)

    # optionally resume from a checkpoint.
    # NOTE(review): both resumes assign args.start_epoch — if netG's
    # checkpoint has a different epoch it silently wins; confirm intended.
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netD.load_state_dict(checkpoint['state_dict'])
            #optimizer_d.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
        if os.path.isfile(args.resumeG):
            print("=> loading checkpoint '{}'".format(args.resumeG))
            if args.gpu is None:
                checkpoint = torch.load(args.resumeG)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resumeG, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netG.load_state_dict(checkpoint['state_dict'])
            #optimizer_g.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resumeG, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resumeG))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    video_augmentation = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size, (0.2, 1)),
    ])
    audio_augmentation = moco.loader.DummyAudioTransform()
    augmentation = {'video': video_augmentation, 'audio': audio_augmentation}
    # GPU-side augmentation pipeline; MoCo v2 variant when --aug-plus is set.
    augmentation_gpu = moco.loader.MoCoAugmentV2(
        args.crop_size) if args.aug_plus else moco.loader.MoCoAugment(
            args.crop_size)
    train_dataset = Kinetics400(traindir,
                                args.frame_per_clip,
                                args.step_between_clips,
                                extensions='mp4',
                                transform=augmentation,
                                num_workers=4)
    # One random clip per video, optionally sharded across processes.
    train_sampler = RandomClipSampler(train_dataset.video_clips, 1)
    if args.distributed:
        # train_sampler = torch.utils.data.distributed.DistributedSampler(train_sampler)
        train_sampler = DistributedSampler(train_sampler)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               multiprocessing_context="fork")
    # Only the master process (gpu 0) writes tensorboard summaries.
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr,
                                                       args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer_d, epoch, args)
        adjust_learning_rate(optimizer_g, epoch, args)

        # train for one epoch
        train(train_loader, augmentation_gpu, criterion, G_criterion, netG,
              netD, optimizer_g, optimizer_d, epoch, args, writer)

        # Checkpoint every 10 epochs, only on the (per-node) master rank.
        if (epoch + 1) % 10 == 0 and (not args.multiprocessing_distributed or
                                      (args.multiprocessing_distributed
                                       and args.rank % ngpus_per_node == 0)):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netG.state_dict(),
            },
                            ckp_dir + '/netG',
                            max_save=20,
                            is_best=False)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netD.state_dict(),
            },
                            ckp_dir + '/netD',
                            max_save=20,
                            is_best=False)
def main():
    """Train a CIFAR-10 classifier defined by a config module.

    Loads ``configs.cifar10.<args.config>``, seeds all RNGs, builds the
    model and SGD optimizer, optionally resumes or evaluates a checkpoint,
    then runs the train/validate loop with best-model checkpointing.
    """
    # load config
    cfg = importlib.import_module('configs.cifar10.{}'.format(args.config)).config

    # accuracy bookkeeping
    best_acc = 0
    best_epoch = 0
    start_epoch = 0

    # fix random seed (CLI flag overrides the config value)
    if args.rng_seed is not None:
        rng_seed = args.rng_seed
    else:
        rng_seed = cfg.TRAIN.rng_seed
    random.seed(rng_seed)
    np.random.seed(rng_seed)
    torch.manual_seed(rng_seed)

    # setup output and logger
    output_dir = mkdir(osp.join(OUTPUT_ROOT_DIR, args.config, 'rnd_%d' % rng_seed))
    logger = create_logger(output_dir)
    logger.info('config:\n' + pprint.pformat(cfg))
    logger.info('arguments:\n' + pprint.pformat(args))
    logger.info('gpu(s): ' + str(os.environ.get('CUDA_VISIBLE_DEVICES')))

    print("=> Creating model '{}'".format(cfg.model))
    model = models.cifar10.__dict__[cfg.model]()
    module_dict = dict(model.named_modules())
    logger.info('module:\n' + pprint.pformat(module_dict))

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss()

    # gpu support
    # BUGFIX: raise an explicit error instead of assert — asserts are
    # stripped under `python -O`, which would let a CPU run limp onward.
    if not torch.cuda.is_available():
        raise RuntimeError('Training requires cuda')
    # if the input size is fixed, enable it
    import torch.backends.cudnn as cudnn
    cudnn.benchmark = True

    # enable DataParallel, default use all cuda devices
    model = nn.DataParallel(model).cuda()
    criterion = criterion.cuda()

    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(),
                                cfg.TRAIN.lr,
                                momentum=cfg.TRAIN.momentum,
                                weight_decay=cfg.TRAIN.wd)

    # optionally resume from a checkpoint
    if args.resume:
        if osp.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            best_acc = checkpoint['best_acc']
            best_epoch = checkpoint['best_epoch']
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            # Keep the fresh optimizer when only evaluating.
            if not args.eval:
                optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            # BUGFIX: logger.warn is a deprecated alias of logger.warning.
            logger.warning("=> no checkpoint found at '{}'".format(args.resume))

    # load data
    print('=> Preparing data')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),  # transform into [0.0, 1.0]
        transforms.Normalize(PIXEL_MEANS, PIXEL_STDS),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(PIXEL_MEANS, PIXEL_STDS),
    ])
    train_set = torchvision.datasets.CIFAR10(root=DATA_ROOT_DIR, train=True,
                                             download=True, transform=transform_train)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=cfg.TRAIN.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_worker)
    val_set = torchvision.datasets.CIFAR10(root=DATA_ROOT_DIR, train=False,
                                           download=True, transform=transform_test)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=cfg.TEST.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_worker)

    # eval-only mode: validate the loaded checkpoint and return
    if args.eval:
        logger.info('evaluating trained model')
        acc = validate(val_loader, model, criterion)
        logger.info(
            'Val-Epoch: [{0}]\t'
            'Prec@1: {acc:.3f})'.format(start_epoch, acc=acc)
        )
        return

    def do_checkpoint(epoch, path):
        # Snapshot model/optimizer plus best-accuracy bookkeeping.
        torch.save({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'model': cfg.model,
            'best_acc': best_acc,
            'best_epoch': best_epoch,
        }, path)

    # save initialization state
    do_checkpoint(0, osp.join(output_dir, 'init.ckpt'))

    for epoch in range(start_epoch, cfg.TRAIN.end_epoch):
        adjust_learning_rate(optimizer, epoch, cfg.TRAIN.lr, cfg.TRAIN.lr_step)

        # train for one epoch
        epoch_result = train(train_loader, model, criterion, optimizer, epoch)
        logger.info(
            'Train-Epoch: [{0}]\t'
            'Loss: {loss:.4f}\t'
            'Prec@1: {acc:.3f}'.format(
                epoch + 1, **epoch_result)
        )

        # evaluate on validation set
        acc = validate(val_loader, model, criterion)
        logger.info(
            'Val-Epoch: [{0}]\t'
            'Prec@1: {acc:.3f}'.format(
                epoch + 1, acc=acc)
        )

        # remember best acc and save checkpoint
        is_best = acc > best_acc
        epoch_t = epoch + 1
        if is_best:
            best_epoch = epoch_t
            best_acc = acc
            do_checkpoint(best_epoch, osp.join(output_dir, 'best.ckpt'))
        if (args.ckpt_interval is not None) and (epoch_t % args.ckpt_interval == 0):
            do_checkpoint(epoch_t, osp.join(output_dir, '%03d.ckpt' % epoch_t))

    logger.info(
        '=> Best-Epoch: [{0}]\t'
        'Prec@1: {acc:.3f}'.format(
            best_epoch, acc=best_acc)
    )
# Column header for the per-epoch summary row built in `values` below.
columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_nll', 'te_acc', 'time']

# Snapshot the initial (pre-training) state as epoch start_epoch - 1.
train_utils.save_checkpoint(
    args.dir,
    start_epoch - 1,
    model_state=model.state_dict(),
    optimizer_state=optimizer.state_dict()
)

test_res = {'loss': None, 'accuracy': None, 'nll': None}
for epoch in range(start_epoch, args.epochs + 1):
    time_ep = time.time()  # wall-clock timing for this epoch

    # Compute and apply this epoch's learning rate.
    lr = learning_rate_schedule(args.lr, epoch, args.epochs)
    train_utils.adjust_learning_rate(optimizer, lr)

    # One training pass, then evaluation on the test split.
    train_res = train_utils.train(loaders['train'], model, optimizer, criterion, regularizer, cuda=args.cuda)
    test_res = train_utils.test(loaders['test'], model, criterion, regularizer, cuda=args.cuda)

    # Periodic checkpointing.
    if epoch % args.save_freq == 0:
        train_utils.save_checkpoint(
            args.dir,
            epoch,
            model_state=model.state_dict(),
            optimizer_state=optimizer.state_dict()
        )

    time_ep = time.time() - time_ep
    # One summary row, ordered to match `columns` above.
    values = [epoch, lr, train_res['loss'], train_res['accuracy'], test_res['nll'], test_res['accuracy'], time_ep]
def main():
    """Train a Voxelnet-style point-cloud semantic segmentation network.

    Parses CLI arguments, loads all indoor3d HDF5 batches, splits them into
    train/test by area name, then runs the epoch loop: weighted-CE training
    with a confusion matrix / IoU report, followed by validation, TensorBoard
    logging, and a checkpoint every 5 epochs.
    """
    parser = argparse.ArgumentParser(description='Voxelnet for semantic')
    parser.add_argument('--lr', default=0.001, type=float, help='Initial learning rate')
    # fix: --epochs/--batchsize had no type=int, so values given on the command
    # line arrived as str and broke range()/slicing arithmetic below.
    parser.add_argument('--epochs', default=50, type=int, help='epochs')  # default=100
    parser.add_argument('--batchsize', default=4, type=int, help='batch size')  # default=32
    parser.add_argument('--weight_file', default='', help='weights to load')
    parser.add_argument('--test_area', type=int, default=2,
                        help='Which area to use for test, option: 1-2 [default: 2]')
    parser.add_argument('--num_point', type=int, default=4096, help='Point number [default: 4096]')
    args = parser.parse_args()

    NUM_POINT = args.num_point
    BATCH_SIZE = args.batchsize
    lr = args.lr

    ALL_FILES = getDataFiles('indoor3d_sem_seg_hdf5_data/all_files.txt')
    room_filelist = [line.rstrip()
                     for line in open('indoor3d_sem_seg_hdf5_data/room_filelist.txt')]

    # Load ALL data into one big data array and one big label array.
    data_batch_list = []
    label_batch_list = []
    print(ALL_FILES)
    for h5_filename in ALL_FILES:
        # NOTE(review): hard-coded absolute data directory; consider making
        # this a CLI argument.
        h5_dir = os.path.join('/home/chenkun/pointnet_pytorch-master/indoor3d_sem_seg_hdf5_data',
                              h5_filename)
        # fix: open with explicit read mode and close the file afterwards
        # (the original leaked the h5py handle).
        with h5py.File(h5_dir, 'r') as f:
            data_batch = f['data'][:]
            label_batch = f['label'][:]
        data_batch_list.append(data_batch)
        label_batch_list.append(label_batch)
    data_batches = np.concatenate(data_batch_list, 0)
    label_batches = np.concatenate(label_batch_list, 0)
    print(data_batches.shape)
    print(label_batches.shape)

    # Hold out every room belonging to the chosen test area.
    test_area = 'Area_' + str(args.test_area)
    train_idxs = []
    test_idxs = []
    for i, room_name in enumerate(room_filelist):
        if test_area in room_name:
            test_idxs.append(i)
        else:
            train_idxs.append(i)
    train_data = data_batches[train_idxs, ...]
    train_label = label_batches[train_idxs].astype(np.int64)
    test_data = data_batches[test_idxs, ...]
    test_label = label_batches[test_idxs].astype(np.int64)
    print(train_data.shape, train_label.shape)
    print(test_data.shape, test_label.shape)

    # One timestamped log/checkpoint directory per run.
    time_string = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    log_dir = os.path.join('log_ptn/train', test_area + '_' + time_string)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    checkpoint_dir = os.path.join(log_dir, 'checkpoint')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    writer = SummaryWriter(log_dir=os.path.join(log_dir, 'tensorboard'))

    start_epoch = 0
    epochs = args.epochs
    model = get_model()
    model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr)

    # Keep the class count in one place; the original hard-coded 6 in the
    # train path but 13 in the eval path (a bug).
    NUM_CLASSES = 6
    class_names = ["T2T", "B2B", "BH", "BL", "V2V", "OT"]
    # Per-class loss weights (down-weight the frequent classes).
    # fix: reduction='mean' replaces the deprecated size_average=True.
    weightsTrain = [0.2, 0.4, 0.6, 1.00, 1.00, 1.00]
    class_weights_Train = torch.FloatTensor(weightsTrain).cuda()
    criterionTrain = nn.CrossEntropyLoss(weight=class_weights_Train, reduction='mean').cuda()
    weightsVal = [0.2, 0.4, 0.6, 1.00, 1.00, 1.00]
    class_weights_Val = torch.FloatTensor(weightsVal).cuda()
    criterionVal = nn.CrossEntropyLoss(weight=class_weights_Val, reduction='mean').cuda()

    # Optionally warm-start from a checkpoint.
    if args.weight_file != '':
        pre_trained_model = torch.load(args.weight_file)
        start_epoch = pre_trained_model['epoch']
        model_state = model.state_dict()
        model_state.update(pre_trained_model['state_dict'])
        model.load_state_dict(model_state)

    global_counter = 0
    for epoch in range(start_epoch, epochs):
        learn_rate_now = adjust_learning_rate(optimizer, global_counter, BATCH_SIZE, lr)
        writer.add_scalar('train/learning_rate', learn_rate_now, global_counter)

        # ---------------- training ----------------
        # fix: these meters were used but their construction was commented out
        # in the original (NameError at runtime).
        losses = AverageMeter()
        top1 = AverageMeter()
        iter_loss = 0.0  # accumulated loss over the epoch
        iterations = 0
        cm = ConfusionMatrix(NUM_CLASSES, class_names=class_names)
        cm.clear()
        model.train()
        train_data_shuffled, train_label_shuffled, _ = shuffle_data(
            train_data[:, 0:NUM_POINT, :], train_label)
        file_size = train_data_shuffled.shape[0]
        num_batches = file_size // BATCH_SIZE  # iterations in one epoch
        print('\nnum_batches(training):\t', num_batches)
        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            feature = train_data_shuffled[start_idx:end_idx, :, :]
            label = train_label_shuffled[start_idx:end_idx]
            feature = np.expand_dims(feature, axis=1)
            input = Variable(torch.from_numpy(feature).cuda(), requires_grad=True)
            input = torch.transpose(input, 3, 1)
            target = Variable(torch.from_numpy(label).cuda(), requires_grad=False)
            target = target.view(-1, )
            output = model(input)
            # (B, C, N, 1) -> (B*N, C) logits for the per-point CE loss.
            output_reshaped = output.permute(0, 3, 2, 1).contiguous().view(-1, NUM_CLASSES)
            _, pred = torch.max(output.data, 1)
            pred = pred.view(-1, )
            cm.add_batch(target.cpu().numpy(), pred.cpu().numpy())
            # fix: the original computed the loss a second time through the
            # undefined name `criterion` (its definition was commented out),
            # which raised NameError; compute it once with the weighted
            # training criterion.
            loss = criterionTrain(output_reshaped, target)
            iter_loss += loss.item()
            iterations += 1
            prec1 = accuracy(output_reshaped.data, target.data, topk=(1,))
            prec1[0] = prec1[0].cpu().numpy()
            losses.update(loss.item(), BATCH_SIZE)
            top1.update(prec1[0], BATCH_SIZE)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_counter += 1
            if batch_idx % 10 == 0:
                print('Epoch: [%3d][%3d]\t Loss: %.4f' % (epoch, batch_idx, loss))

        # Per-epoch training report: loss, overall accuracy and per-class IoU.
        iou0, iou1, iou2, iou3, iou4, iou5, mIoU = cm.class_IoU()
        # max(iterations, 1) guards against an empty training split.
        mean_train_loss = iter_loss / max(iterations, 1)
        print('Epoch: [%3d]\t Train Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%'
              % (epoch, mean_train_loss, cm.overall_accuracy(), mIoU))
        print('T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%'
              % (iou0, iou1, iou2, iou3, iou4, iou5))
        with open(os.path.join(log_dir, 'train_log.txt'), 'a') as f:
            f.write('Epoch: [%3d]\t Train Loss: %.4f\t OA: %3.2f%%\t mIoU : %3.2f%%\n'
                    % (epoch, mean_train_loss, cm.overall_accuracy(), mIoU))
            f.write('T2T: %3.2f%%, B2B: %3.2f%%, BH: %3.2f%%, BL: %3.2f%%, V2V: %3.2f%%, OT: %3.2f%%\n\n'
                    % (iou0, iou1, iou2, iou3, iou4, iou5))
        writer.add_scalar('train/loss', losses.avg, global_counter)
        writer.add_scalar('train/accuracy', top1.avg, global_counter)

        # ---------------- validation ----------------
        # fix: fresh meters for validation; the original reused the training
        # meters (their reset was commented out), mixing train and val stats.
        val_losses = AverageMeter()
        val_top1 = AverageMeter()
        model.eval()
        file_size = test_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        with torch.no_grad():  # no gradients needed for evaluation
            for batch_idx in range(num_batches):
                start_idx = batch_idx * BATCH_SIZE
                end_idx = (batch_idx + 1) * BATCH_SIZE
                feature = test_data[start_idx:end_idx, :, :]
                label = test_label[start_idx:end_idx]
                feature = np.expand_dims(feature, axis=1)
                # fix: eval inputs do not need requires_grad=True.
                input = Variable(torch.from_numpy(feature).cuda(), requires_grad=False)
                input = torch.transpose(input, 3, 1)
                target = Variable(torch.from_numpy(label).cuda(), requires_grad=False)
                target = target.view(-1, )
                output = model(input)
                # fix: was view(-1, 13) although the model outputs 6 classes.
                output_reshaped = output.permute(0, 3, 2, 1).contiguous().view(-1, NUM_CLASSES)
                # fix: use the validation criterion instead of the undefined
                # `criterion` (NameError in the original).
                loss = criterionVal(output_reshaped, target)
                prec1 = accuracy(output_reshaped.data, target.data, topk=(1,))
                prec1[0] = prec1[0].cpu().numpy()
                val_losses.update(loss.item(), BATCH_SIZE)
                val_top1.update(prec1[0], BATCH_SIZE)
        writer.add_scalar('val/loss', val_losses.avg, global_counter)
        writer.add_scalar('val/accuracy', val_top1.avg, global_counter)
        print('Epoch {} Val Loss {:.3f} Val Acc {:.3f} \t'
              .format(epoch, val_losses.avg, val_top1.avg))
        with open(os.path.join(log_dir, 'test_log.txt'), 'a') as f:
            f.write('Epoch: [{0}]\t'
                    'Loss {loss.avg:.4f} \t'
                    'Prec@1 {top1.avg:.3f} \n'.format(epoch, loss=val_losses, top1=val_top1))

        # Checkpoint every 5 epochs.
        if epoch % 5 == 0:
            torch.save(
                {'epoch': epoch + 1,
                 'args': args,
                 'state_dict': model.state_dict(),
                 'optimizer': optimizer.state_dict()},
                os.path.join(checkpoint_dir, 'checkpoint_' + str(epoch) + '.pth.tar')
            )
    writer.close()
def train_one_epoch(self):
    """Run one training epoch over self.data_loader.train_loader.

    Computes a loss according to self.config.data_output_type, steps the
    optimizer per batch, tracks per-batch timing into np_batch_bench, and
    appends epoch means (quaternion distance, translation MSE, joint loss)
    to the instance-level history lists. At the end the timing benchmark is
    dumped to a CSV file.

    NOTE(review): the epoch is capped at `samples` (300) batches by the
    early `break` below — this looks like data-loader benchmarking
    instrumentation rather than normal training behavior; confirm.
    """
    # Set the model to be in training mode.
    self.model.train()
    # Running averages for loss and the per-modality error metrics.
    train_loss = AverageMeter()
    train_err_joints = AverageMeter()
    train_err_rotation = AverageMeter()
    train_err_translation = AverageMeter()
    total_batch = len(self.data_loader.train_loader)
    # NOTE(review): the message says "batch size" but prints the number of
    # batches; the actual batch size is printed on the next line.
    print("Starting Epoch Training with batch size: {:d}".format(
        total_batch))
    print("Batch Size: {:d}".format(self.config.batch_size))
    current_batch = 1
    # Per-epoch metric collections (means are taken after the loop).
    q_dist_list = []
    trans_list = []
    joint_list = []
    tic = time.time()
    mean_speed = 0
    total_loss_sum = 0
    # Benchmark buffer: one row per batch -> [index, total, duration, speed].
    samples = 300
    np_batch_bench = np.zeros([samples, 4])
    for x, y in self.data_loader.train_loader:
        batch_start = time.time()
        if self.cuda:
            # NOTE(review): casting the *input* x to torch.long truncates any
            # float features — verify this is intended for this data set.
            x = x.to(device=self.device, dtype=torch.long)
            y = y.to(device=self.device, dtype=torch.long)
        # Overall training progress in [0, 1] across all epochs.
        progress = float(
            self.current_epoch * self.data_loader.train_iterations +
            current_batch) / (self.config.max_epoch *
                              self.data_loader.train_iterations)
        # Adjust learning rate for this batch (schedule lives in the helper).
        # NOTE(review): `lr` is referenced after the loop; it is unbound if
        # the loader yields no batches.
        lr = adjust_learning_rate(self.optimizer,
                                  self.current_epoch,
                                  self.config,
                                  batch=current_batch,
                                  nBatch=self.data_loader.train_iterations)
        # Forward pass.
        pred = self.model((x))
        # Branch on the configured output head to build the loss and metrics.
        if self.config.data_output_type == "joints_absolute":
            loss_joints = self.loss(pred, y)
            total_loss = loss_joints
            # NOTE(review): train_loss is updated again after optimizer.step()
            # below, so this branch double-counts the value in the meter.
            train_loss.update(total_loss.item())
            train_err_joints.update(total_loss.item())
        elif self.config.data_output_type == "q_trans_simple":
            loss_q_trans_simple = self.loss(pred, y)
            total_loss = loss_q_trans_simple
        elif self.config.data_output_type == "pose_relative":
            # Rotation part: columns 3..6 are quaternion components.
            indices = torch.tensor([3, 4, 5, 6])
            indices = indices.to(self.device)
            rotation = torch.index_select(pred, 1, indices)
            # Matching quaternion columns from the label tensor.
            y_rot = torch.index_select(y, 1, indices)
            # Rotation MSE loss and PoseNet-style quaternion-norm penalty are
            # disabled; only the simple full-vector loss below is used.
            # penalty_loss = self.config.rot_reg * torch.mean((torch.sum(quater ** 2, dim=1) - 1) ** 2)
            penalty_loss = 0
            # Quaternion angular distance (degrees) for the first sample of
            # the batch, used as a human-readable rotation error metric.
            q_pred = pq.Quaternion(rotation[0].cpu().detach().numpy())
            q_rot = pq.Quaternion(y_rot[0].cpu().detach().numpy())
            q_dist = math.degrees(pq.Quaternion.distance(q_pred, q_rot))
            q_dist_list.append(q_dist)
            # Translation part: columns 0..2.
            indices = torch.tensor([0, 1, 2])
            indices = indices.to(self.device)
            translation = torch.index_select(pred, 1, indices)
            y_trans = torch.index_select(y, 1, indices)
            # MSE loss for translation (metric only; not part of total_loss).
            loss_translation = self.loss(translation, y_trans)
            trans_list.append(loss_translation.item())
            # Use the simple whole-vector loss for optimization.
            total_loss = self.loss(pred.double(), y.double())
            # NOTE(review): the quaternion distance block below repeats the
            # computation already done above, so q_dist_list gets two entries
            # per batch in this branch — likely unintended duplication.
            q_pred = pq.Quaternion(rotation[0].cpu().detach().numpy())
            q_rot = pq.Quaternion(y_rot[0].cpu().detach().numpy())
            q_dist = math.degrees(pq.Quaternion.distance(q_pred, q_rot))
            q_dist_list.append(q_dist)
            # Translation MSE for the first sample of the batch.
            trans_pred = translation[0].cpu().detach().numpy()
            trans_label = y_trans[0].cpu().detach().numpy()
            mse_trans = (np.square(trans_pred - trans_label)).mean()
            train_err_translation.update(mse_trans)
            train_err_rotation.update(q_dist)
        elif self.config.data_output_type == "pose_absolute":
            # Rotation part: quaternion columns 3..6.
            indices = torch.tensor([3, 4, 5, 6])
            indices = indices.to(self.device)
            rotation = torch.index_select(pred, 1, indices)
            y_rot = torch.index_select(y, 1, indices)
            # Quaternion angular distance (degrees) for the first sample.
            q_pred = pq.Quaternion(rotation[0].cpu().detach().numpy())
            q_rot = pq.Quaternion(y_rot[0].cpu().detach().numpy())
            q_dist = math.degrees(pq.Quaternion.distance(q_pred, q_rot))
            q_dist_list.append(q_dist)
            # Translation part: columns 0..2.
            indices = torch.tensor([0, 1, 2])
            indices = indices.to(self.device)
            translation = torch.index_select(pred, 1, indices)
            y_trans = torch.index_select(y, 1, indices)
            trans_pred = translation[0].cpu().detach().numpy()
            trans_label = y_trans[0].cpu().detach().numpy()
            # MSE loss for translation (metric only; not part of total_loss).
            loss_translation = self.loss(translation, y_trans)
            trans_list.append(loss_translation.item())
            # Use the simple whole-vector loss for optimization.
            total_loss = self.loss(pred, y)
            # Translation MSE for the first sample of the batch.
            mse_trans = (np.square(trans_pred - trans_label)).mean()
            train_err_translation.update(mse_trans)
            train_err_rotation.update(q_dist)
        elif self.config.data_output_type == "joints_relative":
            total_loss = self.loss(pred, y)
            train_err_joints.update(total_loss.item())
            joint_list.append(total_loss.item())
        else:
            raise Exception("Wrong data output type chosen.")
        # Abort early on divergence.
        if np.isnan(float(total_loss.item())):
            raise ValueError('Loss is nan during training...')
        # Backward pass and parameter update.
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        train_loss.update(total_loss.item())
        self.current_iteration += 1
        # Timing / ETA bookkeeping for this batch.
        batch_duration = time.time() - batch_start
        mean_speed += batch_duration
        speed = float(mean_speed / current_batch)  # mean seconds per batch
        remaining_sec = speed * (total_batch - current_batch) * (
            self.config.max_epoch - self.current_epoch)
        batch_progress = float(current_batch / total_batch) * 100
        total_loss_sum += total_loss.item()
        avg_total_loss = float(total_loss_sum / current_batch)
        if self.config.DEBUG_TRAINING_DURATION:
            print(
                "Current Batch {:d} {:d} {:2.1%} {:.2f} s Avg {:.2f} s/batch Loss {:.3e} Remaining {:s}"
                .format(
                    current_batch, total_batch,
                    float(current_batch / total_batch),
                    batch_duration, speed, avg_total_loss,
                    time.strftime('Days %d Time %H:%M:%S',
                                  time.gmtime(remaining_sec))))
        # Stop once the benchmark buffer is full (caps the epoch at
        # `samples` batches — see the docstring note).
        if current_batch > samples:
            break
        print(np_batch_bench.shape)
        print(current_batch)
        # Record this batch's timing row.
        np_batch_bench[current_batch - 1][0] = current_batch
        np_batch_bench[current_batch - 1][1] = total_batch
        np_batch_bench[current_batch - 1][2] = batch_duration
        np_batch_bench[current_batch - 1][3] = speed
        current_batch += 1
    # Epoch means of the collected metrics (NaN for branches that did not
    # fill their list, since np.mean of an empty array is NaN).
    mean = np.mean(np.asarray(q_dist_list))
    mean_t = np.mean(np.asarray(trans_list))
    mean_joints = np.mean(np.asarray(joint_list))
    self.trans_mean.append([self.iter, mean_t])
    self.q_dist_mean.append([self.iter, mean])
    self.joints_mean.append([self.iter, mean_joints])
    self.iter += 1
    # Update logging dict with the last batch's lr and metric values.
    self.logging_dict["learning_rate"].append(lr)
    self.logging_dict["train_loss"].append(train_loss.val)
    self.logging_dict["train_err_rotation"].append(train_err_rotation.val)
    self.logging_dict["train_err_translation"].append(
        train_err_translation.val)
    self.logging_dict["train_err_joints"].append(train_err_joints.val)
    # Print epoch-level progress.
    progress = float((self.current_epoch + 1) / self.config.max_epoch)
    duration_epoch = time.time() - tic
    # NOTE(review): `or self.current_epoch % 1 == 0` is always true, so this
    # logs every epoch regardless of display_step.
    if self.current_epoch % self.config.display_step == 0 or self.current_epoch % 1 == 0:
        self.duration = time.time() - self.start_time
        self.logger.info(
            "Train Epoch: {:>4d} | Total: {:>4d} | Progress: {:>3.2%} | Loss: {:>3.2e} | Translation [mm]: {:>3.2e} |"
            " Rotation [deg] {:>3.2e} | Joints [deg] {:>3.2e} | ({:02d}:{:02d}:{:02d}) "
            .format(self.current_epoch + 1, self.config.max_epoch, progress,
                    train_loss.val, train_err_translation.val,
                    train_err_rotation.val, train_err_joints.val,
                    int(self.duration / 3600),
                    int(np.mod(self.duration, 3600) / 60),
                    int(np.mod(np.mod(self.duration, 3600), 60))) +
            time.strftime("%d.%m.%y %H:%M:%S", time.localtime()))
    # Dump the per-batch timing benchmark to CSV.
    ds = pd.DataFrame(np_batch_bench)
    print(ds)
    # NOTE(review): hard-coded user-specific output path; consider moving it
    # into the config.
    path = "/home/speerponar/pytorch_models/evaluation/"
    ds.to_csv(path + "test_" + str(self.config.batch_size) + "w_" +
              str(self.config.data_loader_workers) + ".csv")
    print("Save csv file")
losses.update(loss.data.tolist(), inputs.size(0)) top1.update(prec1[0], inputs.size(0)) arc.update(auroc, inputs.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() print('{batch}/{size} | Loss:{loss:.4f} | top1:{tp1:.4f} | AUROC:{ac:.4f}'.format( batch=batch_idx+1, size=len(val_loader), loss=losses.avg, tp1=top1.avg, ac=arc.avg)) return (losses.avg, top1.avg, arc.avg) for epoch in range(opt.start_epoch, opt.epochs): opt.lr = optimizer.state_dict()['param_groups'][0]['lr'] adjust_learning_rate(optimizer, epoch, opt) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, opt.epochs, opt.lr)) train_loss, train_acc, train_auroc = train(opt, train_loader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc, test_auroc = test(opt, val_loader, model, criterion, epoch, use_cuda) logger.append([opt.lr, train_loss, test_loss, train_acc, test_acc, train_auroc, test_auroc]) scheduler_warmup.step() is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) save_checkpoint({ 'epoch': epoch + 1, 'state_dict' : model.state_dict(), 'acc': test_acc, 'best_acc': best_acc,
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point for linear-classification training on UCF101.

    Builds the backbone from args.arch, replaces its fc head with a fresh
    num_class linear layer, optionally loads MoCo-pretrained encoder_q
    weights, wraps the model for (distributed) GPU execution, then runs the
    train/validate loop with checkpointing of the best top-1 accuracy.

    Args:
        gpu: GPU index for this worker (or None for CPU/DataParallel paths).
        ngpus_per_node: number of GPUs on this node.
        args: parsed command-line namespace (mutated in place: gpu, rank,
            batch_size, workers, start_epoch).
    """
    global best_acc1
    args.gpu = gpu

    # Suppress printing if this is not the master process.
    if args.multiprocessing_distributed and args.gpu != 0:

        # NOTE(review): *args here shadows the outer `args` namespace inside
        # the stub — harmless, but easy to misread.
        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes.
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # Create the backbone model by architecture name.
    print("=============> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
    print(model)

    # Re-initialize the classification head for args.num_class classes.
    # NOTE(review): the in-features are hard-coded to 512, which matches
    # ResNet-18/34-style backbones only — confirm against the archs used.
    model.fc = nn.Linear(512, args.num_class, bias=True)
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # Load MoCo pre-trained weights, before the DistributedDataParallel
    # constructor (DDP must wrap the final parameter set).
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")
            # Rename MoCo pre-trained keys: keep only the query encoder's
            # layers (minus its fc) and strip the "module.encoder_q." prefix.
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # Retain only encoder_q up to before the embedding layer.
                if k.startswith('module.encoder_q'
                                ) and not k.startswith('module.encoder_q.fc'):
                    # Remove prefix.
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # Delete renamed or unused k.
                del state_dict[k]
            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            # Only the freshly initialized head may be missing.
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}
            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to
            # all available GPUs if device_ids are not set.
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available
        # GPUs.
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)  # .cuda() for debug on cpu

    # Define loss function (criterion) and optimizer.
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # Optimize only parameters that still require gradients.
    # NOTE(review): the layer-freezing block above is commented out, so this
    # currently collects *all* parameters, i.e. full fine-tuning rather than
    # linear classification — confirm that is intended.
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    # assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters,
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Optionally resume from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU.
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading: video clip transforms for train (random crop/flip) and
    # val (center crop), plus a placeholder audio transform.
    normalize_video = transforms_video.NormalizeVideo(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    video_augmentation_train = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size),
        transforms_video.RandomHorizontalFlipVideo(),
        normalize_video,
    ])
    video_augmentation_val = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.CenterCropVideo(args.crop_size),
        normalize_video,
    ])
    data_dir = os.path.join(args.data, 'data')
    anno_dir = os.path.join(args.data, 'anno')
    audio_augmentation = moco.loader.DummyAudioTransform()
    train_augmentation = {
        'video': video_augmentation_train,
        'audio': audio_augmentation
    }
    val_augmentation = {
        'video': video_augmentation_val,
        'audio': audio_augmentation
    }
    train_dataset = UCF101(data_dir,
                           anno_dir,
                           args.frame_per_clip,
                           args.step_between_clips,
                           fold=1,
                           train=True,
                           transform=train_augmentation,
                           num_workers=16)
    # Sample up to 10 clips per video for training.
    train_sampler = RandomClipSampler(train_dataset.video_clips, 10)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               multiprocessing_context="fork")
    val_dataset = UCF101(data_dir,
                         anno_dir,
                         args.frame_per_clip,
                         args.step_between_clips,
                         fold=1,
                         train=False,
                         transform=val_augmentation,
                         num_workers=16)
    # Do not use DistributedSampler since it will destroy the testing
    # iteration process (validation needs all clips of a video together).
    val_sampler = UniformClipSampler(val_dataset.video_clips,
                                     args.clip_per_video)
    # batch_size == clip_per_video so each batch holds one video's clips.
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.clip_per_video,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             multiprocessing_context="fork")

    # Evaluation-only mode: validate once and return.
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    # TensorBoard writer only on the master process.
    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir,
                                                       args.batch_size,
                                                       args.lr,
                                                       args.crop_size,
                                                       args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle with a per-epoch seed so workers see different orders.
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # Train for one epoch.
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # Evaluate on validation set.
        val_loss, acc1, acc5 = validate(val_loader, model, criterion, args)
        if writer is not None:
            writer.add_scalar('lincls_val/loss', val_loss, epoch)
            writer.add_scalar('lincls_val/acc1', acc1, epoch)
            writer.add_scalar('lincls_val/acc5', acc5, epoch)

        # Remember best acc@1 and save checkpoint (rank 0 of each node only).
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(
                args.ckp_dir, args.batch_size, args.lr, args.crop_size,
                args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, ckp_dir, max_save=1, is_best=is_best)