def train():
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0
    epoch = 0
    min_loss = float('inf')
    print('Loading Dataset...')
    dataset = Detection(args.annoPath, PyramidAugmentation(ssd_dim, means), AnnotationTransform())
    epoch_size = len(dataset) // args.batch_size
    print('Training SSD on', dataset.name)
    step_index = 0
    step_increase = 0
    if args.visdom:
        # initialize visdom loss plot
        lot = viz.line(
            X=torch.zeros((1,)).cpu(),
            Y=torch.zeros((1, 3)).cpu(),
            opts=dict(
                xlabel='Iteration',
                ylabel='Loss',
                title='Current SSD Training Loss',
                legend=['Loc Loss', 'Conf Loss', 'Loss']
            )
        )
        epoch_lot = viz.line(
            X=torch.zeros((1,)).cpu(),
            Y=torch.zeros((1, 3)).cpu(),
            opts=dict(
                xlabel='Epoch',
                ylabel='Loss',
                title='Epoch SSD Training Loss',
                legend=['Loc Loss', 'Conf Loss', 'Loss']
            )
        )
    batch_iterator = None
    data_loader = data.DataLoader(dataset, batch_size, num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate, pin_memory=True)
    for iteration in range(args.start_iter, max_iter):
        t0 = time.time()
        if (not batch_iterator) or (iteration % epoch_size == 0):
            # create batch iterator
            batch_iterator = iter(data_loader)
        if iteration in stepvalues:
            if iteration in stepvalues[0:5]:
                step_increase += 1
                warmup_learning_rate(optimizer, args.lr, step_increase)
            else:
                step_index += 1
                adjust_learning_rate(optimizer, gamma, step_index)
            if args.visdom:
                viz.line(
                    X=torch.ones((1, 3)).cpu() * epoch,
                    Y=torch.Tensor([loc_loss, conf_loss,
                                    loc_loss + conf_loss]).unsqueeze(0).cpu() / epoch_size,
                    win=epoch_lot,
                    update='append'
                )
            # reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1
        # load train data
        images, targets = next(batch_iterator)
        if args.cuda:
            images = Variable(images.cuda())
            targets = [Variable(anno.cuda(), volatile=True) for anno in targets]
        else:
            images = Variable(images)
            targets = [Variable(anno, volatile=True) for anno in targets]
        # forward
        t1 = time.time()
        out = net(images)
        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(tuple(out[0:3]), targets)
        loss_l_head, loss_c_head = criterion(tuple(out[3:6]), targets)
        loss = loss_l + loss_c + 0.5 * loss_l_head + 0.5 * loss_c_head
        if loss.data[0] < min_loss:
            min_loss = loss.data[0]
            print("min_loss: ", min_loss)
            torch.save(ssd_net.state_dict(),
                       args.save_folder + 'best_our_ucsd_Res50_pyramid_aug' + '.pth')
        loss.backward()
        optimizer.step()
        t2 = time.time()
        loc_loss += loss_l.data[0]
        conf_loss += loss_c.data[0]
        if iteration % 50 == 0:
            print('front and back Timer: {} sec.'.format((t2 - t1)))
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]))
            print('Loss conf: {} Loss loc: {}'.format(loss_c.data[0], loss_l.data[0]))
            print('Loss head conf: {} Loss head loc: {}'.format(loss_c_head.data[0], loss_l_head.data[0]))
            print('lr: {}'.format(optimizer.param_groups[0]['lr']))
            if args.visdom and args.send_images_to_visdom:
                random_batch_index = np.random.randint(images.size(0))
                viz.image(images.data[random_batch_index].cpu().numpy())
        if args.visdom:
            viz.line(
                X=torch.ones((1, 3)).cpu() * iteration,
                Y=torch.Tensor([loss_l.data[0], loss_c.data[0],
                                loss_l.data[0] + loss_c.data[0]]).unsqueeze(0).cpu(),
                win=lot,
                update='append'
            )
            # hacky fencepost solution for 0th epoch plot
            if iteration == 0:
                viz.line(
                    X=torch.zeros((1, 3)).cpu(),
                    Y=torch.Tensor([loc_loss, conf_loss, loc_loss + conf_loss]).unsqueeze(0).cpu(),
                    win=epoch_lot,
                    update=True
                )
        if iteration % 500 == 0 or iteration in stepvalues:
            print('Saving state, iter:', iteration)
            torch.save(ssd_net.state_dict(),
                       args.save_folder + 'our_ucsd_Res50_pyramid_aug_' + repr(iteration) + '.pth')
    torch.save(ssd_net.state_dict(), args.save_folder + 'our_ucsd_Res50_pyramid_aug' + '.pth')
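# The helpers warmup_learning_rate() and adjust_learning_rate() called above are not
# defined in this section. The sketch below is a minimal, hypothetical implementation
# consistent with the call sites (a linear warm-up over the first few entries of
# stepvalues, then step decay by gamma); the repository's actual code may differ,
# and the module-level `args.lr` / `warmup_steps=5` used here are assumptions.

def warmup_learning_rate(optimizer, base_lr, step_increase, warmup_steps=5):
    """Linearly ramp the learning rate towards base_lr during warm-up (assumed behaviour)."""
    lr = base_lr * step_increase / warmup_steps
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def adjust_learning_rate(optimizer, gamma, step_index):
    """Decay the initial learning rate by gamma at every step milestone (assumed behaviour)."""
    lr = args.lr * (gamma ** step_index)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr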
def main():
    args.step_values = [int(val) for val in args.step_values.split(',')]
    # args.loss_reset_step = 10
    args.log_step = 10
    args.dataset = args.dataset.lower()
    args.basenet = args.basenet.lower()
    args.bn = abs(args.bn)  # 0 freeze or else use bn
    if args.bn > 0:
        args.bn = 1  # update bn layer set the flag to 1
    args.shared_heads = abs(args.shared_heads)  # 0 no sharing of feature else yes
    if args.shared_heads > 0:
        args.shared_heads = 1

    args.exp_name = 'FPN{:d}-{:s}sh{:02d}-{:s}-bs{:02d}-{:s}-lr{:05d}-bn{:d}'.format(
        args.input_dim, args.anchor_type, args.shared_heads, args.dataset,
        args.batch_size, args.basenet, int(args.lr * 100000), args.bn)

    args.save_root += args.dataset + '/'
    args.save_root = args.save_root + 'cache/' + args.exp_name + '/'
    if not os.path.isdir(args.save_root):  # if save directory doesn't exist create it
        os.makedirs(args.save_root)
    source_dir = args.save_root + '/source/'  # where to save the source
    utils.copy_source(source_dir)

    anchors = 'None'
    with torch.no_grad():
        if args.anchor_type == 'kmeans':
            anchorbox = kanchorBoxes(input_dim=args.input_dim, dataset=args.dataset)
        else:
            anchorbox = anchorBox(args.anchor_type, input_dim=args.input_dim, dataset=args.dataset)
        anchors = anchorbox.forward()
        args.ar = anchorbox.ar
    args.num_anchors = anchors.size(0)

    if args.dataset == 'coco':
        args.train_sets = ['train2017']
        args.val_sets = ['val2017']
    else:
        args.train_sets = ['train2007', 'val2007', 'train2012', 'val2012']
        args.val_sets = ['test2007']

    args.means = [0.485, 0.456, 0.406]
    args.stds = [0.229, 0.224, 0.225]

    print('\nLoading Datasets')
    train_dataset = Detection(args, train=True, image_sets=args.train_sets,
                              transform=Augmentation(args.input_dim, args.means, args.stds))
    print('Done Loading Dataset Train Dataset :::>>>\n', train_dataset.print_str)
    val_dataset = Detection(args, train=False, image_sets=args.val_sets,
                            transform=BaseTransform(args.input_dim, args.means, args.stds),
                            full_test=False)
    print('Done Loading Dataset Validation Dataset :::>>>\n', val_dataset.print_str)

    args.num_classes = len(train_dataset.classes) + 1
    args.classes = train_dataset.classes
    args.head_size = 256

    if args.shared_heads > 0:
        net = build_fpn_shared_heads(args.basenet, args.model_dir, ar=args.ar,
                                     head_size=args.head_size, num_classes=args.num_classes)
    else:
        net = build_fpn_unshared(args.basenet, args.model_dir, ar=args.ar,
                                 head_size=args.head_size, num_classes=args.num_classes)
    net = net.cuda()

    if args.ngpu > 1:
        print('\nLets do dataparallel\n')
        net = torch.nn.DataParallel(net)

    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = MultiBoxLoss()
    scheduler = MultiStepLR(optimizer, milestones=args.step_values, gamma=args.gamma)

    train(args, net, anchors, optimizer, criterion, scheduler, train_dataset, val_dataset)
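# main() above reads its configuration from a module-level `args` namespace that is
# not shown in this section. The sketch below is a hypothetical argparse setup covering
# the fields main() accesses; the flag names mirror those attributes, but every default
# value here is an assumption, not the repository's actual setting.

import argparse

parser = argparse.ArgumentParser(description='FPN detector training (sketch)')
parser.add_argument('--dataset', default='voc', type=str)
parser.add_argument('--basenet', default='resnet50', type=str)
parser.add_argument('--anchor_type', default='kmeans', type=str)
parser.add_argument('--input_dim', default=600, type=int)
parser.add_argument('--batch_size', default=16, type=int)
parser.add_argument('--lr', default=0.001, type=float)
parser.add_argument('--momentum', default=0.9, type=float)
parser.add_argument('--weight_decay', default=5e-4, type=float)
parser.add_argument('--gamma', default=0.1, type=float)
parser.add_argument('--step_values', default='60000,90000', type=str)
parser.add_argument('--bn', default=0, type=int)
parser.add_argument('--shared_heads', default=0, type=int)
parser.add_argument('--ngpu', default=1, type=int)
parser.add_argument('--model_dir', default='', type=str)
parser.add_argument('--save_root', default='./', type=str)
args = parser.parse_args()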
def train():
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0
    epoch = 0
    print('Loading Dataset...')
    dataset = Detection(args.annoPath, PyramidAugmentation(ssd_dim, means), AnnotationTransform())
    print('len(dataset) = ' + str(len(dataset)))
    print(dataset.__getitem__(0))
    epoch_size = len(dataset) // args.batch_size
    print('Training PyramidBox on', dataset.name)
    step_index = 0
    if args.visdom:
        # initialize visdom loss plot
        lot = viz.line(
            X=np.array(torch.zeros((1,)).cpu()),
            Y=np.array(torch.zeros((1, 3)).cpu()),
            opts=dict(
                xlabel='Iteration',
                ylabel='Loss',
                title='Current PyramidBox Training Loss',
                legend=['Loc Loss', 'Conf Loss', 'Loss']
            )
        )
        epoch_lot = viz.line(
            X=np.array(torch.zeros((1,)).cpu()),
            Y=np.array(torch.zeros((1, 3)).cpu()),
            opts=dict(
                xlabel='Epoch',
                ylabel='Loss',
                title='Epoch PyramidBox Training Loss',
                legend=['Loc Loss', 'Conf Loss', 'Loss']
            )
        )
        lr_lot = viz.line(
            X=np.array(torch.zeros((1,)).cpu()),
            Y=np.array(torch.zeros((1, 1)).cpu()),
            opts=dict(
                xlabel='iteration',
                ylabel='learning-rate',
                title='Warm-up',
                legend=['lr']
            )
        )
    batch_iterator = None
    data_loader = data.DataLoader(dataset, batch_size, num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate, pin_memory=True)
    print('data loading finished...')
    for iteration in range(args.start_iter, max_iter):
        t0 = time.time()
        try:
            if (not batch_iterator) or (iteration % epoch_size == 0):
                batch_iterator = iter(data_loader)
            adjust_learning_rate(optimizer, gamma, iteration)
            if iteration in stepvalues:
                step_index += 1
                if args.visdom:
                    viz.line(
                        X=np.array(torch.ones((1, 3)).cpu()) * epoch,
                        Y=np.array(torch.Tensor([loc_loss, conf_loss,
                                                 loc_loss + conf_loss]).unsqueeze(0).cpu()) / epoch_size,
                        win=epoch_lot,
                        update='append'
                    )
                # reset epoch loss counters
                loc_loss = 0
                conf_loss = 0
                epoch += 1
            # load train data
            images, targets = next(batch_iterator)
            if args.cuda:
                images = Variable(images.cuda())
                targets = [Variable(anno.cuda(), volatile=True) for anno in targets]
            else:
                images = Variable(images)
                targets = [Variable(anno, volatile=True) for anno in targets]
            # forward
            t1 = time.time()
            out = net(images)
            # backprop
            optimizer.zero_grad()
            loss_l, loss_c = criterion(tuple(out[0:3]), targets)
            loss_l_head, loss_c_head = criterion(tuple(out[3:6]), targets)
            loss = loss_l + loss_c + 0.5 * loss_l_head + 0.5 * loss_c_head
            loss.backward()
            optimizer.step()
            t2 = time.time()
            loc_loss += loss_l.data[0]
            conf_loss += loss_c.data[0]
            if iteration % 10 == 0:
                print('front and back Timer: {} sec.'.format((t2 - t1)))
                print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]))
                print('Loss conf: {} Loss loc: {}'.format(loss_c.data[0], loss_l.data[0]))
                print('Loss head conf: {} Loss head loc: {}'.format(loss_c_head.data[0], loss_l_head.data[0]))
                print('lr: {}'.format(optimizer.param_groups[0]['lr']))
                if args.visdom and args.send_images_to_visdom:
                    random_batch_index = np.random.randint(images.size(0))
                    viz.image(images.data[random_batch_index].cpu().numpy())
            if args.visdom:
                viz.line(
                    X=np.array(torch.ones((1, 3)).cpu()) * iteration,
                    Y=np.array(torch.Tensor([loss_l.data[0], loss_c.data[0],
                                             loss_l.data[0] + loss_c.data[0]]).unsqueeze(0).cpu()),
                    win=lot,
                    update='append'
                )
                viz.line(
                    X=np.array(torch.ones((1, 1)).cpu()) * iteration,
                    Y=np.array(torch.Tensor([optimizer.param_groups[0]['lr']]).unsqueeze(0).cpu()),
                    win=lr_lot,
                    update='append'
                )
                # hacky fencepost solution for 0th epoch plot
                if iteration == 0:
                    viz.line(
                        X=np.array(torch.zeros((1, 3)).cpu()),
                        Y=np.array(torch.Tensor([loc_loss, conf_loss,
                                                 loc_loss + conf_loss]).unsqueeze(0).cpu()),
                        win=epoch_lot,
                        update=True
                    )
                    viz.line(
                        X=np.array(torch.zeros((1, 1)).cpu()),
                        Y=np.array(torch.Tensor([optimizer.param_groups[0]['lr']]).unsqueeze(0).cpu()),
                        win=lr_lot,
                        update=True
                    )
        except TypeError as e:
            print(e)
            print('-' * 20, 'jump to next iter and log.')
            continue
        except ValueError as e2:
            print(e2)
            print('=' * 20, 'jump to next iter and log.')
            continue
        if iteration % 5000 == 0:
            print('Saving state, iter:', iteration)
            torch.save(ssd_net.state_dict(), args.save_folder + 'Res50_pyramid_' + repr(iteration) + '.pth')
    torch.save(ssd_net.state_dict(), args.save_folder + 'Res50_pyramid_' + '.pth')
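# detection_collate() is passed to the DataLoader above but is not defined in this
# section. Detection batches cannot be stacked naively because each image carries a
# different number of ground-truth boxes; the sketch below is a typical SSD-style
# collate function under that assumption, not necessarily the repository's version.

def detection_collate(batch):
    """Stack images into one tensor and keep per-image annotation tensors in a list."""
    images = []
    targets = []
    for image, annotation in batch:
        images.append(image)
        targets.append(torch.FloatTensor(annotation))
    return torch.stack(images, 0), targets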
def main():
    args.eval_iters = [int(val) for val in args.eval_iters.split(',')]
    # args.loss_reset_step = 10
    args.log_step = 10
    args.dataset = args.dataset.lower()
    args.basenet = args.basenet.lower()
    args.bn = abs(args.bn)  # 0 freeze or else use bn
    if args.bn > 0:
        args.bn = 1  # update bn layer set the flag to 1

    args.exp_name = 'FPN{:d}-{:s}sh{:02d}-{:s}-bs{:02d}-{:s}-lr{:05d}-bn{:d}'.format(
        args.input_dim, args.anchor_type, args.shared_heads, args.dataset,
        args.batch_size, args.basenet, int(args.lr * 100000), args.bn)

    args.save_root += args.dataset + '/'
    args.save_root = args.save_root + 'cache/' + args.exp_name + '/'
    if not os.path.isdir(args.save_root):  # if save directory doesn't exist create it
        os.makedirs(args.save_root)
    source_dir = args.save_root + '/source/'  # where to save the source
    utils.copy_source(source_dir)

    anchors = 'None'
    with torch.no_grad():
        if args.anchor_type == 'kmeans':
            anchorbox = kanchorBoxes(input_dim=args.input_dim, dataset=args.dataset)
        else:
            anchorbox = anchorBox(args.anchor_type, input_dim=args.input_dim, dataset=args.dataset)
        anchors = anchorbox.forward()
        args.ar = anchorbox.ar
    args.num_anchors = anchors.size(0)
    anchors = anchors.cuda(0, non_blocking=True)

    if args.dataset == 'coco':
        args.train_sets = ['train2017']
        args.val_sets = ['val2017']
    else:
        args.train_sets = ['train2007', 'val2007', 'train2012', 'val2012']
        args.val_sets = ['test2007']

    args.means = [0.485, 0.456, 0.406]
    args.stds = [0.229, 0.224, 0.225]

    val_dataset = Detection(args, train=False, image_sets=args.val_sets,
                            transform=BaseTransform(args.input_dim, args.means, args.stds),
                            full_test=False)
    print('Done Loading Dataset Validation Dataset :::>>>\n', val_dataset.print_str)

    args.data_dir = val_dataset.root
    args.num_classes = len(val_dataset.classes) + 1
    args.classes = val_dataset.classes
    args.bias_heads = args.bias_heads > 0
    args.head_size = 256

    if args.shared_heads > 0:
        net = build_fpn_shared_heads(args.basenet, args.model_dir, ar=args.ar,
                                     head_size=args.head_size, num_classes=args.num_classes,
                                     bias_heads=args.bias_heads)
    else:
        net = build_fpn_unshared(args.basenet, args.model_dir, ar=args.ar,
                                 head_size=args.head_size, num_classes=args.num_classes,
                                 bias_heads=args.bias_heads)
    net = net.cuda()

    if args.ngpu > 1:
        print('\nLets do dataparallel\n')
        net = torch.nn.DataParallel(net)
    net.eval()

    for iteration in args.eval_iters:
        args.det_itr = iteration
        log_file = open("{:s}/testing-{:d}.log".format(args.save_root, iteration), "w", 1)
        log_file.write(args.exp_name + '\n')
        args.model_path = args.save_root + '/model_' + repr(iteration) + '.pth'
        log_file.write(args.model_path + '\n')
        net.load_state_dict(torch.load(args.model_path))
        print('Finished loading model %d !' % iteration)

        # Load dataset
        val_data_loader = data_utils.DataLoader(val_dataset, int(args.batch_size / 2),
                                                num_workers=args.num_workers, shuffle=False,
                                                pin_memory=True, collate_fn=custum_collate)
        # evaluation
        torch.cuda.synchronize()
        tt0 = time.perf_counter()
        log_file.write('Testing net \n')
        net.eval()  # switch net to evaluation mode
        if args.dataset != 'coco':
            mAP, ap_all, ap_strs, det_boxes = validate(
                args, net, anchors, val_data_loader, val_dataset,
                iteration, iou_thresh=args.iou_thresh)
        else:
            mAP, ap_all, ap_strs, det_boxes = validate_coco(
                args, net, anchors, val_data_loader, val_dataset,
                iteration, iou_thresh=args.iou_thresh)

        for ap_str in ap_strs:
            print(ap_str)
            log_file.write(ap_str + '\n')
        ptr_str = '\nMEANAP:::=>' + str(mAP) + '\n'
        print(ptr_str)
        log_file.write(ptr_str)

        torch.cuda.synchronize()
        print('Complete set time {:0.2f}'.format(time.perf_counter() - tt0))
        log_file.close()
if args.resume:
    print('Resuming training, loading {}...'.format(args.resume))
    ssd_net.load_weights(args.resume)
else:
    pass

optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=momentum, weight_decay=weight_decay)
# apex opt_level is the letter "O" followed by a digit; the original "02" (zero-two)
# is not a valid level, so it is corrected to "O2" here.
ssd_net, optimizer = amp.initialize(ssd_net, optimizer, opt_level="O2")
criterion = MultiBoxLoss(num_classes, 0.35, True, 0, True, 3, 0.35, False, False, args.cuda)
criterion1 = MultiBoxLoss(num_classes, 0.35, True, 0, True, 3, 0.35, False, True, args.cuda)
dataset = Detection(args.annoPath, PyramidAugmentation(ssd_dim, means), AnnotationTransform())

if args.useMultiProcess:
    # "env://" initialization reads MASTER_ADDR, MASTER_PORT, WORLD_SIZE and RANK from
    # the environment, e.g. when the script is launched with torch.distributed.launch.
    torch.distributed.init_process_group(backend=args.dist_backend, init_method="env://",
                                         world_size=args.world_size, rank=args.device_ids)
    net = torch.nn.parallel.DistributedDataParallel(
        ssd_net, device_ids=tuple(np.arange(0, args.device_ids)))
    # boost computing rate but some randomness vice versa
    cudnn.deterministic = True
    cudnn.benchmark = True
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)


def train():
    net.train()
    # loss counters