def train(
        cfg,
        data_cfg,
        resume=False,
        epochs=100,
        batch_size=16,
        accumulated_batches=1,
        freeze_backbone=False,
        opt=None,
):
    """Train a JDE Darknet model on the datasets listed in ``data_cfg``.

    Args:
        cfg: path to the darknet .cfg model definition (parsed for width/height
            and matched by suffix to pick pretrained backbone weights).
        data_cfg: path to a JSON file with keys 'train' (dataset list files)
            and 'root' (dataset root directory).
        resume: if True, reload model/optimizer/epoch from weights/latest.pt.
        epochs: number of epochs to run *from start_epoch*.
        batch_size: per-step batch size fed to the DataLoader.
        accumulated_batches: number of batches whose gradients are accumulated
            before each optimizer step.
        freeze_backbone: freeze layers below ``cutoff`` during epoch 0.
        opt: parsed CLI options; this function reads opt.lr, opt.epochs,
            opt.unfreeze_bn, opt.print_interval and opt.test_interval.

    Side effects: writes checkpoints under ./weights and logs via ``logger``.
    """
    weights = 'weights'
    mkdir_if_missing(weights)
    latest = osp.join(weights, 'latest.pt')
    torch.backends.cudnn.benchmark = True  # speeds up fixed-size input; unsuitable for multiscale

    # Configure run: read dataset list paths and root directory from the JSON
    # data config, and the input resolution from the darknet model config.
    f = open(data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()
    cfg_dict = parse_model_cfg(cfg)
    img_size = [int(cfg_dict[0]['width']), int(cfg_dict[0]['height'])]

    # Get dataloader
    transforms = T.Compose([T.ToTensor()])
    dataset = JointDataset(dataset_root, trainset_paths, img_size,
                           augment=True, transforms=transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                             num_workers=8, pin_memory=True, drop_last=True,
                                             collate_fn=collate_fn)

    # Initialize model; nID is the number of identities for the re-ID head.
    model = Darknet(cfg_dict, dataset.nID)

    cutoff = -1  # backbone reaches to cutoff layer (-1 = no backbone freeze point)
    start_epoch = 0
    if resume:
        checkpoint = torch.load(latest, map_location='cpu')

        # Load weights to resume from
        model.load_state_dict(checkpoint['model'])
        model.cuda().train()

        # Set optimizer (note: no weight decay on the resume path, unlike the
        # fresh-start path below — NOTE(review): likely unintentional asymmetry)
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()),
                                    lr=opt.lr, momentum=.9)

        start_epoch = checkpoint['epoch'] + 1
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])

        del checkpoint  # current, saved
    else:
        # Initialize model with pretrained backbone weights (optional),
        # selected by the model-config filename suffix.
        if cfg.endswith('yolov3_1088x608.cfg') or \
                cfg.endswith('yolov3_864x480.cfg') or \
                cfg.endswith('yolov3_576.320.cfg') or \
                cfg.endswith('yolov3_864x480_detrac.cfg'):
            print('Load darknet weights')
            load_darknet_weights(model, osp.join(weights, 'darknet53.conv.74'))
            cutoff = 75
        elif cfg.endswith('yolov3-tiny.cfg'):
            load_darknet_weights(model, osp.join(weights, 'yolov3-tiny.conv.15'))
            cutoff = 15

        model.cuda().train()

        # Set optimizer
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()),
                                    lr=opt.lr, momentum=.9, weight_decay=1e-4)

    model = torch.nn.DataParallel(model)

    # Set scheduler: decay lr by 10x at 50% and 75% of opt.epochs.
    # NOTE(review): milestones use opt.epochs while the loop runs the local
    # `epochs` argument — confirm these are meant to be the same value.
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[int(0.5*opt.epochs),
                                                                 int(0.75*opt.epochs)],
                                                     gamma=0.1)

    # An important trick for detection: freeze bn during fine-tuning.
    # NOTE(review): this toggles requires_grad after the optimizer was built;
    # frozen params simply stop receiving gradients.
    if not opt.unfreeze_bn:
        for i, (name, p) in enumerate(model.named_parameters()):
            p.requires_grad = False if 'batch_norm' in name else True

    model_info(model)
    t0 = time.time()
    for epoch in range(epochs):
        epoch += start_epoch  # shift epoch numbering when resuming

        logger.info(('%8s%12s' + '%10s' * 6) % (
            'Epoch', 'Batch', 'box', 'conf', 'id', 'total', 'nTargets', 'time'))

        # Freeze darknet53.conv.74 for first epoch, unfreeze from epoch 1 on.
        if freeze_backbone and (epoch < 2):
            for i, (name, p) in enumerate(model.named_parameters()):
                if int(name.split('.')[2]) < cutoff:  # if layer < 75
                    p.requires_grad = False if (epoch == 0) else True

        ui = -1
        rloss = defaultdict(float)  # running loss
        optimizer.zero_grad()
        for i, (imgs, targets, _, _, targets_len) in enumerate(dataloader):
            if sum([len(x) for x in targets]) < 1:  # if no targets continue
                continue

            # SGD burn-in: warm the lr up quartically over the first ~1000 batches
            # of epoch 0.
            burnin = min(1000, len(dataloader))
            if (epoch == 0) & (i <= burnin):
                lr = opt.lr * (i / burnin) ** 4
                for g in optimizer.param_groups:
                    g['lr'] = lr

            # Compute loss, compute gradient, update parameters
            loss, components = model(imgs.cuda(), targets.cuda(), targets_len.cuda())
            # components holds 5 per-term loss values; mean over the replica axis
            components = torch.mean(components.view(-1, 5), dim=0)
            loss = torch.mean(loss)
            loss.backward()

            # accumulate gradient for x batches before optimizing
            if ((i + 1) % accumulated_batches == 0) or (i == len(dataloader) - 1):
                optimizer.step()
                optimizer.zero_grad()

            # Running epoch-means of tracked metrics (incremental mean update)
            ui += 1
            for ii, key in enumerate(model.module.loss_names):
                rloss[key] = (rloss[key] * ui + components[ii]) / (ui + 1)

            s = ('%8s%12s' + '%10.3g' * 6) % (
                '%g/%g' % (epoch, epochs - 1),
                '%g/%g' % (i, len(dataloader) - 1),
                rloss['box'], rloss['conf'], rloss['id'], rloss['loss'],
                rloss['nT'], time.time() - t0)
            t0 = time.time()
            if i % opt.print_interval == 0:
                logger.info(s)

        # Save latest checkpoint (unwrap DataParallel via .module)
        checkpoint = {'epoch': epoch,
                      'model': model.module.state_dict(),
                      'optimizer': optimizer.state_dict()}
        torch.save(checkpoint, latest)

        # Calculate mAP on the test split every opt.test_interval epochs
        if epoch % opt.test_interval == 0:
            with torch.no_grad():
                mAP, R, P = test.test(cfg, data_cfg, weights=latest,
                                      batch_size=batch_size, print_interval=40)
                test.test_emb(cfg, data_cfg, weights=latest,
                              batch_size=batch_size, print_interval=40)

        # Call scheduler.step() after optimizer.step() with pytorch > 1.1.0
        scheduler.step()
# NOTE(review): this span is a torn fragment. The first part is the
# checkpoint/eval tail of a training loop from a *different* train() variant —
# it references names (`weights_to`, `epoch`, `checkpoint`, `scheduler`,
# `dataset`, `opt`, `save_every`) that are defined in the surrounding,
# unseen function. The trailing __main__ block is cut off mid-string.
copyfile(cfg, weights_to + '/cfg/yolo3.cfg')
copyfile(data_cfg, weights_to + '/cfg/ccmcpe.json')

latest = osp.join(weights_to, 'latest.pt')
torch.save(checkpoint, latest)
if epoch % save_every == 0 and epoch != 0:
    # making the checkpoint lite: drop the optimizer state from the
    # per-epoch snapshot (latest.pt above keeps the full state)
    checkpoint["optimizer"] = []
    torch.save(checkpoint, osp.join(weights_to, "weights_epoch_" + str(epoch) + ".pt"))

# Calculate mAP every opt.test_interval epochs
if epoch % opt.test_interval == 0:
    with torch.no_grad():
        mAP, R, P = test.test(cfg, data_cfg, weights=latest, batch_size=batch_size,
                              img_size=img_size, print_interval=40, nID=dataset.nID)
        test.test_emb(cfg, data_cfg, weights=latest, batch_size=batch_size,
                      img_size=img_size, print_interval=40, nID=dataset.nID)

# Call scheduler.step() after optimizer.step() with pytorch > 1.1.0
scheduler.step()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=30, help='number of epochs')
    parser.add_argument('--batch-size', type=int, default=32, help='size of each image batch')
    parser.add_argument('--accumulated-batches', type=int, default=1,
                        help='number of batches before optimizer step')
    parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')  # Path to the configuration file.
    # NOTE(review): the triple-quoted string below is not terminated within
    # this chunk — its closing quotes lie outside the visible source.
    """ NOTE: In the folder cfg/, currently there are three configuration files: yolov3_576x320.cfg yolov3_864x480.cfg
def train(
        save_path,
        save_every,
        img_size,
        resume,
        epochs,
        batch_size,
        accumulated_batches,
        opt=None
):
    """Train a Jde_RCNN (FPN-backbone JDE) model on MOT16 tracking data,
    optionally mixed with extra detection/re-ID datasets.

    Args:
        save_path: directory under which a per-model weights folder is created.
        save_every: save a named checkpoint every this many epochs.
        img_size: (width, height) input resolution; also embedded in the run name.
        resume: if True, reload model weights and epoch from <weights>/latest.pt.
        epochs: number of epochs to run from the (possibly resumed) start epoch.
        batch_size: DataLoader batch size for both train and val loaders.
        accumulated_batches: batches whose gradients accumulate per optimizer step.
        opt: parsed CLI options; reads opt.gpu, opt.backbone_name, opt.lr,
            opt.all_datasets, opt.model_version, opt.len_embed.

    Side effects: writes model.yaml, latest.pt, per-epoch weights and a
    loss.json log under the run's weights folder.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    model_name = opt.backbone_name + '_img_size' + str(img_size[0]) + '_' + str(img_size[1])
    weights_path = osp.join(save_path, model_name)
    loss_log_path = osp.join(weights_path, 'loss.json')
    mkdir_if_missing(weights_path)

    # Snapshot the run configuration (dumped to model.yaml for fresh runs).
    cfg = {}
    cfg['width'] = img_size[0]
    cfg['height'] = img_size[1]
    cfg['backbone_name'] = opt.backbone_name
    cfg['lr'] = opt.lr
    if resume:
        latest_resume = osp.join(weights_path, 'latest.pt')

    torch.backends.cudnn.benchmark = True
    # root = '/home/hunter/Document/torch'
    root = '/data/dgw'  # NOTE(review): hard-coded dataset root — confirm for deployment

    # Dataset list files: MOT16 tracking splits, plus extra detection/re-ID
    # sets (CalTech, ETH, PRW, CityPersons, CUHK-SYSU) when opt.all_datasets.
    if opt.all_datasets:
        paths_trainset = {'02': './data/track/train/MOT16-02.txt',
                          '04': './data/track/train/MOT16-04.txt',
                          '05': './data/track/train/MOT16-05.txt',
                          '09': './data/track/train/MOT16-09.txt',
                          '10': './data/track/train/MOT16-10.txt',
                          '11': './data/track/train/MOT16-11.txt',
                          '13': './data/track/train/MOT16-13.txt',
                          'CT': './data/detect/CT_train.txt',
                          'ETH': './data/detect/ETH.txt',
                          'PRW': './data/detect/PRW_train.txt',
                          'CP': './data/detect/cp_train.txt',
                          'CS': './data/detect/CUHK_train.txt'}
        paths_valset = {'02': './data/track/val/MOT16-02.txt',
                        '04': './data/track/val/MOT16-04.txt',
                        '05': './data/track/val/MOT16-05.txt',
                        '09': './data/track/val/MOT16-09.txt',
                        '10': './data/track/val/MOT16-10.txt',
                        '11': './data/track/val/MOT16-11.txt',
                        '13': './data/track/val/MOT16-13.txt',
                        'CP': './data/detect/cp_val.txt',
                        'PRW': './data/detect/PRW_val.txt',
                        'CT': './data/detect/CT_val.txt',
                        'CS': './data/detect/CUHK_val.txt'}
    else:
        paths_trainset = {'02': './data/track/train/MOT16-02.txt',
                          '04': './data/track/train/MOT16-04.txt',
                          '05': './data/track/train/MOT16-05.txt',
                          '09': './data/track/train/MOT16-09.txt',
                          '10': './data/track/train/MOT16-10.txt',
                          '11': './data/track/train/MOT16-11.txt',
                          '13': './data/track/train/MOT16-13.txt'}
        paths_valset = {'02': './data/track/val/MOT16-02.txt',
                        '04': './data/track/val/MOT16-04.txt',
                        '05': './data/track/val/MOT16-05.txt',
                        '09': './data/track/val/MOT16-09.txt',
                        '10': './data/track/val/MOT16-10.txt',
                        '11': './data/track/val/MOT16-11.txt',
                        '13': './data/track/val/MOT16-13.txt'}

    transforms = T.Compose([T.ToTensor()])
    trainset = JointDataset(root=root, paths=paths_trainset, img_size=img_size,
                            augment=True, transforms=transforms)
    valset = JointDataset(root=root, paths=paths_valset, img_size=img_size,
                          augment=False, transforms=transforms)

    dataloader_trainset = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True,
                                                      num_workers=8, pin_memory=True, drop_last=True,
                                                      collate_fn=collate_fn)
    dataloader_valset = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=True,
                                                    num_workers=8, pin_memory=True, drop_last=True,
                                                    collate_fn=collate_fn)
    cfg['num_ID'] = trainset.nID
    backbone = resnet_fpn_backbone(opt.backbone_name, True)
    backbone.out_channels = 256

    model = Jde_RCNN(backbone, num_ID=trainset.nID, min_size=img_size[1], max_size=img_size[0],
                     version=opt.model_version, len_embeddings=opt.len_embed)
    model.cuda().train()
    # model = torch.nn.DataParallel(model)
    start_epoch = 0

    optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()),
                                lr=opt.lr, momentum=.9, weight_decay=5e-4)
    # Warm up the lr for 10 epochs, then decay 10x every 10 epochs.
    after_scheduler = StepLR(optimizer, 10, 0.1)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=10,
                                       after_scheduler=after_scheduler)

    if resume:
        checkpoint = torch.load(latest_resume, map_location='cpu')

        # Load weights to resume from (strict=False tolerates head changes;
        # print the load report so mismatches are visible)
        print(model.load_state_dict(checkpoint['model'], strict=False))
        start_epoch = checkpoint['epoch_det']
        del checkpoint  # current, saved
    else:
        with open(osp.join(weights_path, 'model.yaml'), 'w+') as f:
            yaml.dump(cfg, f)

    for epoch in range(epochs):
        # Evaluate on the val split every 3rd epoch (incl. epoch 0).
        model.cuda().eval()
        with torch.no_grad():
            if epoch % 3 == 0:
                # FIX: dropped the original dead `[-1]` subscript on the
                # discarded return value — it did nothing useful and could
                # raise IndexError on an empty result.
                test_emb(model, dataloader_valset, print_interval=50)
                test(model, dataloader_valset, conf_thres=0.5, iou_thres=0.2, print_interval=50)
        scheduler.step(epoch + start_epoch)
        model.cuda().train()
        print('lr: ', optimizer.param_groups[0]['lr'])

        # Per-epoch loss sums (averaged to means after the loop).
        # NOTE(review): 'loss_total' is never produced by the model's loss
        # dict, so it stays 0 in the log.
        loss_epoch_log = dict(loss_total=0, loss_classifier=0, loss_box_reg=0,
                              loss_reid=0, loss_objectness=0, loss_rpn_box_reg=0)

        # FIX: clear any gradients left over from a previous epoch whose
        # trailing batches were skipped before the final optimizer step.
        optimizer.zero_grad()
        num_batches = 0
        for i, (imgs, labels, _, _, targets_len) in enumerate(tqdm(dataloader_trainset)):
            num_batches += 1
            targets = []
            imgs = imgs.cuda()
            labels = labels.cuda()
            flag = False
            for target_len, label in zip(targets_len.view(-1,), labels):
                # convert the input to demanded format
                target = {}
                if target_len == 0:
                    flag = True
                if torch.all(label[0:int(target_len), 1] == -1):
                    flag = True
                target['boxes'] = label[0:int(target_len), 2:6]
                target['ids'] = (label[0:int(target_len), 1]).long()
                target['labels'] = torch.ones_like(target['ids'])
                targets.append(target)
            if flag:
                # skip batches with an empty or id-less image
                continue
            losses = model(imgs, targets)
            # Weighted sum: detection losses plus 0.4 * re-ID loss.
            loss = losses['loss_classifier'] + losses['loss_box_reg'] \
                + losses['loss_objectness'] + losses['loss_rpn_box_reg'] \
                + 0.4 * losses['loss_reid']
            loss.backward()
            # accumulate gradient for x batches before optimizing
            if ((i + 1) % accumulated_batches == 0) or (i == len(dataloader_trainset) - 1):
                optimizer.step()
                optimizer.zero_grad()
            # print and log the loss
            for key, val in losses.items():
                loss_epoch_log[key] = float(val) + loss_epoch_log[key]

        # FIX: average over the number of batches actually iterated. The
        # original divided by `i` — the last batch *index* — which both
        # off-by-ones the mean, crashes with ZeroDivisionError on a
        # single-batch loader and NameError on an empty one.
        for key in loss_epoch_log:
            loss_epoch_log[key] = loss_epoch_log[key] / max(num_batches, 1)
        print("loss in epoch %d: " % (epoch))
        print(loss_epoch_log)

        # Save checkpoints: latest.pt every epoch, a named snapshot every
        # `save_every` epochs.
        epoch_det = epoch + start_epoch
        epoch_reid = epoch + start_epoch
        checkpoint = {'epoch_det': epoch_det,
                      'epoch_reid': epoch_reid,
                      'model': model.state_dict()
                      }
        latest = osp.join(weights_path, 'latest.pt')
        torch.save(checkpoint, latest)
        if epoch % save_every == 0 and epoch != 0:
            torch.save(checkpoint, osp.join(weights_path,
                                            "weights_epoch_" + str(epoch_det) + '_' + str(epoch_reid) + ".pt"))
        # Append this epoch's mean losses to the JSON-lines style log.
        with open(loss_log_path, 'a+') as f:
            f.write('epoch_det:' + str(epoch_det) + ',epoch_reid:' + str(epoch_reid) + '\n')
            json.dump(loss_epoch_log, f)
            f.write('\n')
def train(
        cfg,
        data_cfg,
        weights_from="",
        weights_to="",
        save_every=10,
        img_size=(1088, 608),
        resume=False,
        epochs=100,
        batch_size=16,
        accumulated_batches=1,
        freeze_backbone=False,
        opt=None,
):
    """Train a JDE Darknet model with DistributedDataParallel (nccl backend).

    Args:
        cfg: path to the darknet .cfg model definition; its suffix selects
            which pretrained backbone weights to load.
        data_cfg: JSON file with 'train' (dataset list files) and 'root'.
        weights_from: directory holding latest.pt / backbone weight files.
        weights_to: base output directory; a timestamped run folder is created
            inside it for checkpoints and config copies.
        save_every: save a lightweight named checkpoint every this many epochs.
        img_size: (width, height) passed to the dataset.
        resume: if True, reload model/optimizer/epoch from weights_from/latest.pt.
        epochs: number of epochs to run from start_epoch.
        batch_size: DataLoader batch size.
        accumulated_batches: batches whose gradients accumulate per step.
        freeze_backbone: freeze layers below ``cutoff`` during epoch 0.
        opt: parsed CLI options; reads opt.lr, opt.epochs, opt.unfreeze_bn,
            opt.print_interval and opt.test_interval.
    """
    # Timestamped run directory, e.g. weights_to/run<DD_MM_HH_MM>.
    timme = strftime("%Y-%d-%m %H:%M:%S", gmtime())
    timme = timme[5:-3].replace('-', '_')
    timme = timme.replace(' ', '_')
    timme = timme.replace(':', '_')
    weights_to = osp.join(weights_to, 'run' + timme)
    mkdir_if_missing(weights_to)
    if resume:
        latest_resume = osp.join(weights_from, 'latest.pt')

    torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run.
    # FIX: use a context manager instead of open()/close() so the handle is
    # released even if json.load raises.
    with open(data_cfg) as f:
        data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']

    transforms = T.Compose([T.ToTensor()])

    # Get dataloader
    dataset = JointDataset(dataset_root, trainset_paths, img_size,
                           augment=True, transforms=transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                             num_workers=8, pin_memory=True, drop_last=True,
                                             collate_fn=collate_fn)
    print("batch size", "======", batch_size)

    # Initialize model; nID is the number of identities for the re-ID head.
    model = Darknet(cfg, dataset.nID)

    cutoff = -1  # backbone reaches to cutoff layer (-1 = no backbone freeze point)
    start_epoch = 0
    if resume:
        checkpoint = torch.load(latest_resume, map_location='cpu')

        # Load weights to resume from
        model.load_state_dict(checkpoint['model'])
        model.cuda().train()

        # Set optimizer (no weight decay on the resume path — NOTE(review):
        # asymmetric with the fresh-start path below, likely unintentional)
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()),
                                    lr=opt.lr, momentum=.9)

        start_epoch = checkpoint['epoch'] + 1
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])

        del checkpoint  # current, saved
    else:
        # Initialize model with pretrained backbone weights (optional)
        if cfg.endswith('yolov3.cfg'):
            load_darknet_weights(model, osp.join(weights_from, 'darknet53.conv.74'))
            cutoff = 75
        elif cfg.endswith('yolov3-tiny.cfg'):
            load_darknet_weights(model, osp.join(weights_from, 'yolov3-tiny.conv.15'))
            cutoff = 15

        model.cuda().train()

        # Set optimizer
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()),
                                    lr=opt.lr, momentum=.9, weight_decay=1e-4)

    # model = torch.nn.DataParallel(model)
    torch.distributed.init_process_group(backend="nccl")
    model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)

    # Set scheduler: decay lr by 10x at 50% and 75% of opt.epochs.
    # NOTE(review): milestones use opt.epochs while the loop runs the local
    # `epochs` argument — confirm these are meant to be the same value.
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[int(0.5 * opt.epochs),
                                                                 int(0.75 * opt.epochs)],
                                                     gamma=0.1)

    # An important trick for detection: freeze bn during fine-tuning
    if not opt.unfreeze_bn:
        for i, (name, p) in enumerate(model.named_parameters()):
            p.requires_grad = False if 'batch_norm' in name else True

    # model_info(model)
    t0 = time.time()
    for epoch in range(epochs):
        epoch += start_epoch  # shift epoch numbering when resuming

        logger.info(('%8s%12s' + '%10s' * 6) % (
            'Epoch', 'Batch', 'box', 'conf', 'id', 'total', 'nTargets', 'time'))

        # Freeze darknet53.conv.74 for first epoch, unfreeze from epoch 1 on.
        if freeze_backbone and (epoch < 2):
            for i, (name, p) in enumerate(model.named_parameters()):
                if int(name.split('.')[2]) < cutoff:  # if layer < 75
                    p.requires_grad = False if (epoch == 0) else True

        ui = -1
        rloss = defaultdict(float)  # running loss
        optimizer.zero_grad()
        for i, (imgs, targets, _, _, targets_len) in enumerate(dataloader):
            if sum([len(x) for x in targets]) < 1:  # if no targets continue
                continue

            # SGD burn-in: quartic lr warm-up over the first ~1000 batches of
            # epoch 0.
            # FIX: logical `and` instead of bitwise `&` on booleans — same
            # result here, but `&` does not short-circuit and is error-prone.
            burnin = min(1000, len(dataloader))
            if (epoch == 0) and (i <= burnin):
                lr = opt.lr * (i / burnin) ** 4
                for g in optimizer.param_groups:
                    g['lr'] = lr

            # Compute loss, compute gradient, update parameters
            loss, components = model(imgs.cuda(), targets.cuda(), targets_len.cuda())
            components = torch.mean(components.view(-1, 5), dim=0)
            loss = torch.mean(loss)
            loss.backward()

            # accumulate gradient for x batches before optimizing
            if ((i + 1) % accumulated_batches == 0) or (i == len(dataloader) - 1):
                optimizer.step()
                optimizer.zero_grad()

            # Running epoch-means of tracked metrics (incremental mean update)
            ui += 1
            for ii, key in enumerate(model.module.loss_names):
                rloss[key] = (rloss[key] * ui + components[ii]) / (ui + 1)

            # rloss holds running loss values, mean-updated every batch
            s = ('%8s%12s' + '%10.3g' * 6) % (
                '%g/%g' % (epoch, epochs - 1),
                '%g/%g' % (i, len(dataloader) - 1),
                rloss['box'], rloss['conf'],
                rloss['id'], rloss['loss'],
                rloss['nT'], time.time() - t0)
            t0 = time.time()
            if i % opt.print_interval == 0:
                logger.info(s)

        # Save latest checkpoint (unwrap DistributedDataParallel via .module)
        checkpoint = {'epoch': epoch,
                      'model': model.module.state_dict(),
                      'optimizer': optimizer.state_dict()}

        # FIX: race-free directory creation — os.path.exists + os.mkdir can
        # fail if another rank creates the directory between the two calls.
        os.makedirs(weights_to + '/cfg/', exist_ok=True)
        copyfile(cfg, weights_to + '/cfg/yolov3.cfg')
        copyfile(data_cfg, weights_to + '/cfg/ccmcpe.json')

        latest = osp.join(weights_to, 'latest.pt')
        torch.save(checkpoint, latest)
        if epoch % save_every == 0 and epoch != 0:
            # making the checkpoint lite: drop optimizer state from the
            # per-epoch snapshot (latest.pt keeps the full state)
            checkpoint["optimizer"] = []
            torch.save(checkpoint, osp.join(weights_to, "weights_epoch_" + str(epoch) + ".pt"))

        # Calculate mAP every opt.test_interval epochs
        if epoch % opt.test_interval == 0:
            with torch.no_grad():
                mAP, R, P = test.test(cfg, data_cfg, weights=latest,
                                      batch_size=batch_size, print_interval=40)
                test.test_emb(cfg, data_cfg, weights=latest,
                              batch_size=batch_size, print_interval=40)

        # Call scheduler.step() after optimizer.step() with pytorch > 1.1.0
        scheduler.step()