def __init__(self, numcls=21, temperature=1.0, minlossT=(0.3, 1.3),
             maxlossT=(0.7, 1.7), device='cuda:0',
             reduction_kl='batchmean', reduction_l1='mean'):
    super(KDloss, self).__init__()
    self.numclass = numcls
    self.kldiv = nn.KLDivLoss(reduction=reduction_kl)
    self.smoothl1 = nn.SmoothL1Loss(reduction=reduction_l1)
    self.softmax = nn.Softmax(dim=-1)
    self.logsoftmax = nn.LogSoftmax(dim=-1)
    self.tau = temperature
    self.minlossT = minlossT
    self.maxlossT = maxlossT
    # use the configured class count rather than a hard-coded 21
    self.finetuneloss = MultiBoxLoss(numcls, 0.5, True, 0, True, 3, 0.5,
                                     False, True)
    self.device = device
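# The forward pass of KDloss is not shown above. Below is a minimal sketch of
# how these pieces typically combine for knowledge distillation; the argument
# names (student_conf, teacher_conf, student_loc, teacher_loc) are
# illustrative assumptions, not the repository's actual signature.
def forward_sketch(self, student_conf, teacher_conf, student_loc, teacher_loc):
    # KL divergence between temperature-softened class distributions;
    # nn.KLDivLoss expects log-probabilities as input, probabilities as target.
    # The tau**2 factor keeps gradient magnitudes comparable across temperatures.
    soft_loss = self.kldiv(self.logsoftmax(student_conf / self.tau),
                           self.softmax(teacher_conf / self.tau)) * (self.tau ** 2)
    # Smooth L1 between the regression (localization) heads.
    reg_loss = self.smoothl1(student_loc, teacher_loc)
    return soft_loss + reg_loss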
if args.resume:
    print('Resuming training, loading {}...'.format(args.resume))
    ssd_net.load_weights(args.resume)
else:
    print('Initializing weights...')
    # initialize newly added layers' weights with the xavier method
    ssd_net.apply(weights_init)
    print('Loading base network...')
    ssd_net.load_weights_for_rosd(args.basenet)

if args.cuda:
    net = net.cuda()

optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
criterion = MultiBoxLoss(num_classes, 0.3, True, 0, True, 3, 0.5, False,
                         args.cuda)


def train():
    net.train()
    # loss counters (per epoch)
    loc_loss = 0
    conf_loss = 0
    epoch = 0
    print('Loading Dataset...')
    dataset = ROSDDetection(args.rosd_root, train_sets,
                            SSDAugmentation(ssd_dim, means),
                            AnnotationTransform_ROSD())
    epoch_size = len(dataset) // args.batch_size
    print('Training SSD on', dataset.name)
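# weights_init is referenced but not defined in this snippet. A typical
# xavier initializer for the newly added layers (an assumption consistent
# with the comment above, not necessarily this repo's exact helper):
import torch.nn as nn
import torch.nn.init as init

def weights_init(m):
    # apply xavier initialization to convolutional layers only
    if isinstance(m, nn.Conv2d):
        init.xavier_uniform_(m.weight.data)
        if m.bias is not None:
            m.bias.data.zero_()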
def train():
    NUMB_EPOCHS = 50
    EPOCH_WHERE_WHOLE_NETWORK_TRAINED = 15
    BEST_WEIGHTS_FILE = 'model_best_weights_10.pth'
    MAX_LEARNING_RATE = 1e-3
    MIN_LEARNING_RATE = 1e-4
    # STEP_SIZE = [5, 5, 5, 5, 10, 10, 10]
    STEP_SIZE = [1]
    numb_epochs = sum(STEP_SIZE) * 2

    cfg = voc
    dataset = VOCDetection(root=args.dataset_root,
                           image_sets=[('2007', 'trainval')],
                           transform=SSDAugmentation(cfg['min_dim'], MEANS))

    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
    # ssd_net = torch.nn.DataParallel(ssd_net)

    print('Initializing weights...')
    ssd_net.extras.apply(weights_init)
    ssd_net.loc.apply(weights_init)
    ssd_net.conf.apply(weights_init)

    # show the layer sizes, given an input of 3 channels by 300x300
    # summary(ssd_net, (3, 300, 300))
    # print(ssd_net.vgg)
    print(ssd_net)

    vgg_weights = torch.load(args.save_folder + args.basenet)
    print('Loading base network...')
    ssd_net.vgg.load_state_dict(vgg_weights)

    if args.cuda:
        ssd_net = ssd_net.cuda()

    # freeze the base network for now; unfreeze after a few epochs of training
    utilsKP.do_requires_grad(ssd_net.vgg, requires_grad=False)

    # the pytorch optimizer ONLY accepts parameters that require grad, so the
    # first argument filters for just the layers that require a gradient
    # see https://github.com/pytorch/pytorch/issues/679
    # optimizer = optim.SGD(filter(lambda p: p.requires_grad, ssd_net.parameters()),
    #                       lr=args.lr, momentum=args.momentum,
    #                       weight_decay=args.weight_decay)
    optimizer = optim.Adagrad(filter(lambda p: p.requires_grad,
                                     ssd_net.parameters()),
                              lr=args.lr, weight_decay=args.weight_decay)
    # # this raises ValueError: optimizing a parameter that doesn't require gradients
    # optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
    #                       weight_decay=args.weight_decay)

    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, args.cuda)

    # learning rate = lr for epochs 1-4, then lr/10, then lr/100
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3, 6], gamma=0.1)
    # let's try this new fancy cyclic learning rate
    # scheduler = utilsKP.CyclicLR(optimizer, base_lr=1e-4, max_lr=5e-3, step_size=150)
    # or how about triangular or cosine annealing with warm restarts
    # lr = utilsKP.TriangularLR()
    lr = utils_LR.TriangularLR_LRFinder()
    # lra = utilsKP.LR_anneal_linear()
    lra = None
    scheduler = utils_LR.CyclicLR_Scheduler(optimizer,
                                            min_lr=MIN_LEARNING_RATE,
                                            max_lr=MAX_LEARNING_RATE,
                                            LR=lr, LR_anneal=lra,
                                            batch_size=64,
                                            numb_images=utils_LR.NUMB_IMAGES,
                                            step_size=STEP_SIZE)

    # we are going to train, so set up gradients
    ssd_net.train()

    # loss counters
    print('Loading the dataset...')
    epoch_size = len(dataset) // args.batch_size
    print(f"len(dataset)={len(dataset)}, batch_size={args.batch_size}")
    print('Training SSD on:', dataset.name)
    print('Using the specified args:')
    print(args)

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate,
                                  pin_memory=True)

    # logging for tensorboard
    writer = utils_tb_writer.Writer('./runs')
    # # use the same writer for LR
    # scheduler.setWriter(writer)

    all_batch_cntr = 0
    loss_lowest = 10000
    for epoch in range(numb_epochs):
        print(f"Starting epoch {epoch}")
        # the model and the images use a lot of memory; report what is being
        # used on the GPU (torch.cuda.memory_cached() was deprecated in favor
        # of torch.cuda.memory_reserved())
        allocated = torch.cuda.memory_allocated() / 1000000
        reserved = torch.cuda.memory_reserved() / 1000000
        print(f"torch.cuda.memory_allocated()= {allocated} megabytes")
        print(f"torch.cuda.memory_reserved()= {reserved} megabytes")
        print(f"total cuda memory= {allocated + reserved} megabytes")

        # create a new iterator over the training data
        batch_iterator = iter(data_loader)

        # reset epoch loss counters
        loc_loss = 0
        conf_loss = 0

        # step the learning rate (for non-cyclic schedules)
        # scheduler.step()

        # for the first few epochs do not backprop through vgg:
        # train the custom head first
        if epoch == EPOCH_WHERE_WHOLE_NETWORK_TRAINED:
            utilsKP.do_requires_grad(ssd_net.vgg, requires_grad=True,
                                     apply_to_this_layer_on=24)
            optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                         ssd_net.parameters()),
                                  lr=args.lr, momentum=args.momentum,
                                  weight_decay=args.weight_decay)

        # iterate until the epoch is finished
        for batch_cntr, (images, targets) in enumerate(batch_iterator):
            # always want them on cuda (Variable is deprecated; tensors suffice)
            if args.cuda:
                images = images.cuda()
                targets = [ann.cuda() for ann in targets]

            # forward
            out = ssd_net(images)

            # backprop
            optimizer.zero_grad()
            loss_l, loss_c = criterion(out, targets)
            loss = loss_l + loss_c

            # save the best model
            if loss.item() < loss_lowest:
                # print(f"New lowest loss! Was {loss_lowest} is now {loss}")
                loss_lowest = loss.item()
                torch.save(ssd_net.state_dict(),
                           args.save_folder + BEST_WEIGHTS_FILE)

            loss.backward()
            optimizer.step()
            scheduler.batch_step()  # for the cyclic learning rate ONLY

            # keep track of these
            loc_loss += loss_l.item()
            conf_loss += loss_c.item()

            all_batch_cntr += 1  # global step (was += batch_cntr, a bug)
            if batch_cntr % 1 == 0:  # % 1 logs every batch; raise to log less often
                print(f'batch_cntr ={batch_cntr} || Loss: {loss.item()}')
                writer('loss_L', loss_l.item(), all_batch_cntr)
                writer('loss_C', loss_c.item(), all_batch_cntr)
                writer('loss_Total', loss.item(), all_batch_cntr)
                # log against the global step, not the loss value (fixes a bug)
                writer('learning_rate', scheduler.cur_lr, all_batch_cntr)
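# utils_LR is project-specific and not shown. For reference, a minimal
# triangular cyclic learning rate (the idea behind TriangularLR /
# CyclicLR_Scheduler above) fits in a few lines; this is an illustrative
# sketch, not the repo's implementation.
def triangular_lr(batch_idx, step_size, min_lr, max_lr):
    """Linearly ramp the LR from min_lr up to max_lr and back, over
    2 * step_size batches per cycle (Smith, 'Cyclical Learning Rates')."""
    cycle_pos = batch_idx % (2 * step_size)
    # distance from the cycle midpoint, folded so scale peaks at 1.0
    scale = 1.0 - abs(cycle_pos / step_size - 1.0)
    return min_lr + (max_lr - min_lr) * scale

# applying it each batch would look like:
# for group in optimizer.param_groups:
#     group['lr'] = triangular_lr(all_batch_cntr, 150,
#                                 MIN_LEARNING_RATE, MAX_LEARNING_RATE)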
    name = k  # (continues a loop over a resumed checkpoint's state_dict items)
    new_state_dict[name] = v
net.load_state_dict(new_state_dict)

if num_gpu > 1 and gpu_train:
    net = torch.nn.DataParallel(net, device_ids=list(range(num_gpu)))

device = torch.device('cuda:0' if gpu_train else 'cpu')
cudnn.benchmark = True
net = net.to(device)

optimizer = optim.SGD(net.parameters(), lr=initial_lr, momentum=momentum,
                      weight_decay=weight_decay)
criterion = MultiBoxLoss(num_classes, args.overlap, True, 0, True,
                         args.negposratio, args.overlap, False)

priorbox = PriorBox(cfg, image_size=(img_dim, img_dim))
with torch.no_grad():
    priors = priorbox.forward()
    priors = priors.to(device)


def train():
    net.train()
    epoch = 0 + args.resume_epoch
    print('Loading Dataset...')
    dataset = VOCDetection(training_dataset, preproc(img_dim, rgb_mean),
                           AnnotationTransform())
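# The start of the state_dict copying loop is truncated above. When a
# checkpoint was saved from a DataParallel-wrapped model, its keys carry a
# 'module.' prefix that must be stripped before loading into a bare model.
# A typical version of the full loop (an assumption; the snippet only shows
# its tail) looks like:
from collections import OrderedDict

def strip_module_prefix(state_dict):
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        # drop the 'module.' prefix left by DataParallel, if present
        name = k[7:] if k.startswith('module.') else k
        new_state_dict[name] = v
    return new_state_dict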
def main():
    global args
    global minmum_loss
    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    model = FaceBoxes('train', args.num_classes)
    print("Printing net...")

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model)
    model = model.cuda()

    # optimizer and loss function
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(num_classes=args.num_classes,
                             overlap_thresh=0.35,
                             prior_for_matching=True,
                             bkg_label=0,
                             neg_mining=True,
                             neg_pos=7,
                             neg_overlap=0.35,
                             encode_target=False)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            minmum_loss = checkpoint['minmum_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    print('Loading Dataset...')
    dataset = VOCDetection(args.training_dataset,
                           preproc(args.img_dim, args.rgb_mean),
                           AnnotationTransform())
    train_loader = data.DataLoader(dataset, args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=detection_collate,
                                   pin_memory=True)

    priorbox = PriorBox(cfg, image_size=(args.img_dim, args.img_dim))
    with torch.no_grad():
        priors = priorbox.forward()
        priors = priors.cuda()

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        end = time.time()
        loss = train(train_loader, model, priors, criterion, optimizer, epoch)

        if args.local_rank == 0:
            is_best = loss < minmum_loss
            minmum_loss = min(loss, minmum_loss)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    # save under the same key the resume path reads above
                    # (was 'best_prec1', which broke resuming with a KeyError)
                    'minmum_loss': minmum_loss,
                    'optimizer': optimizer.state_dict(),
                }, is_best, epoch)
        epoch_time = time.time() - end
        print('Epoch %s time cost %f' % (epoch, epoch_time))
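# save_checkpoint is not defined in this snippet. A common pattern (an
# illustrative sketch, not necessarily this repo's helper) saves every epoch
# and copies the file aside when the loss is the best so far:
import os
import shutil
import torch

def save_checkpoint(state, is_best, epoch, folder='weights/'):
    filename = os.path.join(folder, 'checkpoint_{}.pth'.format(epoch))
    torch.save(state, filename)
    if is_best:
        # keep a stable path to the best-so-far weights
        shutil.copyfile(filename, os.path.join(folder, 'model_best.pth'))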
option.setup_config()
args = option.opt

# dataset
dataset = create_dataset(args)
data_loader = data.DataLoader(dataset, args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=True, collate_fn=detection_collate,
                              pin_memory=True)

# model
ssd_net, (args.start_epoch, args.start_iter) = build_ssd(args,
                                                         dataset.num_classes)
# init log file
show_jot_opt(args)

# optim, loss
optimizer = set_optimizer(ssd_net, args)
criterion = MultiBoxLoss(dataset.num_classes, 0.5, True, 0, True, 3, 0.5,
                         False, args.use_cuda)

# init visualizer
visual = Visualizer(args, dataset)

epoch_size = len(dataset) // args.batch_size
for epoch in range(args.start_epoch, args.max_epoch):
    batch_iterator = iter(data_loader)

    old_lr = optimizer.param_groups[0]['lr']
    adjust_learning_rate(optimizer, epoch, args)
    new_lr = optimizer.param_groups[0]['lr']

    if epoch == args.start_epoch:
        # only execute once
        if args.resume is not None:
            old_lr, prefix = new_lr, 'resume'
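# detection_collate is used by every DataLoader in this collection because
# each image can carry a different number of ground-truth boxes, so targets
# cannot be stacked into one tensor. A typical implementation (a sketch
# consistent with how it is used above) stacks the images and keeps the
# targets as a plain list:
import torch

def detection_collate(batch):
    imgs, targets = [], []
    for img, boxes in batch:
        imgs.append(img)
        targets.append(torch.as_tensor(boxes, dtype=torch.float32))
    # images: (batch_size, 3, h, w); targets: list of (num_boxes, 5) tensors
    return torch.stack(imgs, 0), targets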
def train():
    cfg, dataset = None, None
    if args.dataset == 'VOC':
        cfg = voc
        dataset = VOCDetection(root=args.dataset_root,
                               transform=SSDAugmentation(cfg['min_dim'],
                                                         MEANS))
    else:
        raise Exception('No such dataset {} supported yet!'.format(
            args.dataset))

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True,
                                  collate_fn=detection_collate,
                                  pin_memory=True)

    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
    net = ssd_net
    if args.cuda:
        net = torch.nn.DataParallel(ssd_net).cuda()
        cudnn.benchmark = True

    if args.resume:
        print('Resuming training, loading {}...'.format(args.resume))
        ssd_net.load_weights(args.resume)
    else:
        vgg_weights = torch.load(args.saved + args.basenet)
        print('Loading base network...')
        ssd_net.vgg.load_state_dict(vgg_weights)

    if not args.resume:
        print('Initializing weights...')
        # Initialize newly added layers' weights with the xavier method
        ssd_net.extras.apply(weights_init)
        ssd_net.loc.apply(weights_init)
        ssd_net.conf.apply(weights_init)

    optimizer = optim.SGD(net.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, args.cuda)

    net.train()
    # Loss counters
    loc_loss = 0
    conf_loss = 0
    epoch = 0
    print('Loading the dataset...')
    epoch_size = len(dataset) // args.batch_size
    print('Training SSD on:', dataset.name)
    print('Using the specified args:')
    print(args)

    step_index = 0
    if args.visdom:
        vis_title = 'SSD.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot('Iteration', 'Loss', vis_title,
                                    vis_legend)
        epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)

    # Create batch iterator
    batch_iterator = iter(data_loader)
    for iteration in range(args.start_iter, cfg['max_iter']):
        if args.visdom and iteration != 0 and (iteration % epoch_size == 0):
            update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None,
                            'append', epoch_size)
            # Reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        # Learning rate decay
        if iteration in cfg['lr_steps']:
            step_index += 1
            adjust_learning_rate(optimizer, args.gamma, step_index)

        # Load train data: images is (batch_size, 3, h, w) and targets is a
        # list of length batch_size. Restart the iterator when the loader is
        # exhausted; otherwise next() raises StopIteration after one epoch.
        try:
            images, targets = next(batch_iterator)
        except StopIteration:
            batch_iterator = iter(data_loader)
            images, targets = next(batch_iterator)

        if args.cuda:
            images = images.cuda()
            targets = [ann.cuda() for ann in targets]

        t0 = time.time()
        # Forward pass
        out = net(images)
        # Backward pass
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()
        t1 = time.time()

        loc_loss += loss_l.item()
        conf_loss += loss_c.item()

        if iteration % 10 == 0:
            print('timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||'
                  % (loss.item()), end=' ')

        if args.visdom:
            update_vis_plot(iteration, loss_l.item(), loss_c.item(),
                            iter_plot, epoch_plot, 'append')

        if iteration != 0 and iteration % 5000 == 0:
            print('Saving state, iter:', iteration)
            torch.save(ssd_net.state_dict(),
                       'weights/ssd300_VOC2007_' + repr(iteration) + '.pth')

    torch.save(ssd_net.state_dict(), args.saved + args.dataset + '.pth')
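# adjust_learning_rate is referenced above but not shown. In the ssd.pytorch
# convention it decays the base LR by gamma at each entry in cfg['lr_steps'];
# a sketch of that behaviour (assuming args.lr holds the base rate):
def adjust_learning_rate(optimizer, gamma, step):
    """Set the learning rate to the initial LR decayed by gamma at every
    step specified in cfg['lr_steps']."""
    lr = args.lr * (gamma ** step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr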
def train():
    if args.dataset == 'COCO':
        if args.dataset_root == VOC_ROOT:
            if not os.path.exists(COCO_ROOT):
                parser.error('Must specify dataset_root if specifying dataset')
            print("WARNING: Using default COCO dataset_root because "
                  "--dataset_root was not specified.")
            args.dataset_root = COCO_ROOT
        cfg = coco
        dataset = COCODetection(root=args.dataset_root,
                                transform=SSDAugmentation(cfg['min_dim'],
                                                          MEANS))
    elif args.dataset == 'VOC':
        if args.dataset_root == COCO_ROOT:
            parser.error('Must specify dataset if specifying dataset_root')
        cfg = voc
        dataset = VOCDetection(root=args.dataset_root,
                               transform=SSDAugmentation(cfg['min_dim'],
                                                         MEANS))

    if args.visdom:
        import visdom
        viz = visdom.Visdom()

    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
    net = ssd_net

    if args.cuda:
        net = torch.nn.DataParallel(ssd_net)
        cudnn.benchmark = True

    if args.resume:
        print('Resuming training, loading {}...'.format(args.resume))
        ssd_net.load_weights(args.resume)
    else:
        vgg_weights = torch.load(args.save_folder + args.basenet)
        print('Loading base network...')
        ssd_net.vgg.load_state_dict(vgg_weights)

    if args.cuda:
        net = net.cuda()

    if not args.resume:
        print('Initializing weights...')
        # initialize newly added layers' weights with the xavier method
        ssd_net.extras.apply(weights_init)
        ssd_net.loc.apply(weights_init)
        ssd_net.conf.apply(weights_init)

    optimizer = optim.SGD(net.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, args.cuda)

    net.train()
    # loss counters
    loc_loss = 0
    conf_loss = 0
    epoch = 0
    print('Loading the dataset...')
    epoch_size = len(dataset) // args.batch_size
    print('Training SSD on:', dataset.name)
    print('Using the specified args:')
    print(args)

    step_index = 0
    if args.visdom:
        vis_title = 'SSD.PyTorch on ' + dataset.name
        vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
        iter_plot = create_vis_plot('Iteration', 'Loss', vis_title,
                                    vis_legend)
        epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True,
                                  collate_fn=detection_collate,
                                  pin_memory=True)

    # wrap the loader in an endless generator so next() never raises
    # StopIteration at epoch boundaries, see
    # https://discuss.pytorch.org/t/in-what-condition-the-dataloader-would-raise-stopiteration/17483/2
    def cycle(iterable):
        while True:
            for x in iterable:
                yield x

    # create batch iterator
    batch_iterator = iter(cycle(data_loader))
    for iteration in range(args.start_iter, cfg['max_iter']):
        if args.visdom and iteration != 0 and (iteration % epoch_size == 0):
            update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None,
                            'append', epoch_size)
            # reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        if iteration in cfg['lr_steps']:
            step_index += 1
            adjust_learning_rate(optimizer, args.gamma, step_index)

        # load train data (Variable is deprecated; plain tensors suffice)
        images, targets = next(batch_iterator)
        if args.cuda:
            images = images.cuda()
            targets = [ann.cuda() for ann in targets]

        # forward
        t0 = time.time()
        out = net(images)

        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()
        t1 = time.time()

        # .data[0] was removed in PyTorch 0.4+; use .item() instead
        loc_loss += loss_l.item()
        conf_loss += loss_c.item()

        if iteration % 10 == 0:
            print('timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||'
                  % (loss.item()), end=' ')
            # overwrite the running checkpoint every 10 iterations
            torch.save(ssd_net.state_dict(),
                       args.save_folder + args.dataset + '1.pth')

        if args.visdom:
            update_vis_plot(iteration, loss_l.item(), loss_c.item(),
                            iter_plot, epoch_plot, 'append')

        if iteration != 0 and iteration % 5000 == 0:
            print('Saving state, iter:', iteration)
            torch.save(ssd_net.state_dict(),
                       'weights/ssd300_COCO_' + repr(iteration) + '.pth')
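# create_vis_plot / update_vis_plot are referenced above but not defined in
# this snippet. A minimal sketch of the usual visdom pattern with the same
# call signatures (illustrative; the repo's exact helpers may differ):
import torch
import visdom

viz = visdom.Visdom()

def create_vis_plot(xlabel, ylabel, title, legend):
    # one window with three traces: loc, conf, and total loss
    return viz.line(X=torch.zeros((1,)).cpu(),
                    Y=torch.zeros((1, 3)).cpu(),
                    opts=dict(xlabel=xlabel, ylabel=ylabel,
                              title=title, legend=legend))

def update_vis_plot(iteration, loc, conf, window1, window2, update_type,
                    epoch_size=1):
    # epoch_size > 1 averages the accumulated epoch losses for the epoch plot
    viz.line(X=torch.ones((1, 3)).cpu() * iteration,
             Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu()
               / epoch_size,
             win=window1, update=update_type)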
if args.cuda:
    net = net.cuda()
    cudnn.benchmark = True  # was misspelled 'benckmark', which silently did nothing

if not args.resume:
    print('initialize network...')
    tibnet.loc.apply(tibnet.weights_init)
    tibnet.conf.apply(tibnet.weights_init)

optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
criterion = MultiBoxLoss(cfg, args.dataset, args.cuda)
print('args:\n', args)


def train():
    step_index = 0
    iteration = 0
    net.train()
    for epoch in range(start_epoch, cfg.EPOCHES):
        losses = 0
        for batch_idx, (images, targets) in enumerate(train_loader):
            if args.cuda:
                # Variable is deprecated; plain tensors suffice
                images = images.cuda()
                targets = [ann.cuda() for ann in targets]
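            # --- hedged sketch: the original snippet ends here, mid-loop. ---
            # A typical continuation of the inner training step, assuming the
            # same forward/backward pattern as the other training loops in
            # this collection (illustrative, not the repository's actual code):
            out = net(images)
            optimizer.zero_grad()
            loss_l, loss_c = criterion(out, targets)
            loss = loss_l + loss_c
            loss.backward()
            optimizer.step()
            losses += loss.item()
            iteration += 1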