def build_optimizer(params, cfg, logger=None):
    """Build an optimizer and LR scheduler from a config dict.

    Args:
        params: iterable of parameters (or param groups) to optimize.
        cfg: dict with keys 'name' ('sgd' | 'adam'), 'lr' (sub-dict with
            'base_lr', 'milestones', 'gamma'), 'weight_decay', and — for
            SGD — 'momentum' and 'nesterov'. An optional 'warmup' sub-dict
            ('multiplier', 'epochs') wraps the scheduler in a warmup phase.
        logger: optional logger exposing ``add_line``; when None, nothing
            is logged.

    Returns:
        (optimizer, scheduler) tuple.

    Raises:
        ValueError: if cfg['name'] is not a recognized optimizer.
    """
    if cfg['name'] == 'sgd':
        optimizer = torch.optim.SGD(params=params,
                                    lr=cfg['lr']['base_lr'],
                                    momentum=cfg['momentum'],
                                    weight_decay=cfg['weight_decay'],
                                    nesterov=cfg['nesterov'])
    elif cfg['name'] == 'adam':
        optimizer = torch.optim.Adam(params=params,
                                     lr=cfg['lr']['base_lr'],
                                     weight_decay=cfg['weight_decay'])
    else:
        raise ValueError('Unknown optimizer.')

    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=cfg['lr']['milestones'],
        gamma=cfg['lr']['gamma'])

    # Optionally wrap the step schedule in a gradual warmup phase.
    if 'warmup' in cfg:
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=cfg['warmup']['multiplier'],
            total_epoch=cfg['warmup']['epochs'],
            after_scheduler=scheduler)

    # BUG FIX: logger defaults to None but was dereferenced unconditionally,
    # raising AttributeError whenever the caller omitted it.
    if logger is not None:
        logger.add_line("=" * 30 + " Optimizer " + "=" * 30)
        logger.add_line(str(optimizer))
    return optimizer, scheduler
def main():
    """Train a two-head ResNet classifier.

    Each batch carries two labels per sample (columns 0 and 1 of `labels`);
    the model returns two logit tensors and the losses are summed. Relies on
    module-level globals: `device`, `logger`, `learning_rate`, `num_epochs`,
    `get_train_data_loader`, `ResNet`, `ResidualBlock`,
    `GradualWarmupScheduler`.
    """
    model = ResNet(ResidualBlock).to(device)
    model.reload()  # presumably restores previously saved weights — confirm in ResNet
    model.train()
    logger.info("Train: Init model")
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Warmup (positional: multiplier=8 over 10 epochs) hands off to StepLR.
    scheduler_after = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.5)
    scheduler = GradualWarmupScheduler(optimizer, 8, 10, after_scheduler=scheduler_after)
    train_dataloader = get_train_data_loader()
    loss_best = 1  # initial best-loss threshold; only losses below 1 are saved as "best"
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(tqdm(train_dataloader)):
            images = images.to(device)
            labels = labels.to(device)
            labels = labels.long()
            # Two classification targets per sample.
            label1, label2 = labels[:, 0], labels[:, 1]
            optimizer.zero_grad()
            y1, y2 = model(images)
            loss1, loss2 = criterion(y1, label1), criterion(y2, label2)
            loss = loss1 + loss2
            # outputs = model(images)
            # loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # NOTE(review): scheduler is stepped once per BATCH here, while the
            # after-scheduler's step_size=30 and warmup total_epoch=10 read like
            # epoch units — confirm the intended stepping granularity.
            scheduler.step()
            logger.info(f"epoch: {epoch}, step: {i}, loss: {loss.item()}")
            # NOTE(review): model.save() runs every batch — verify this is not
            # meant to be per-epoch.
            model.save()
            if loss_best > loss.item():
                loss_best = loss.item()
                torch.save(model.state_dict(), "model/best.pkl")
                logger.info("Train: Saved best model")
    torch.save(model.state_dict(), "model/final.pkl")
    logger.info("Train: Saved last model")
def configure_optimizers(self):
    """Create the AdamW optimizer plus a warmup-then-cosine LR schedule.

    Returns the (optimizers, schedulers) pair expected by the training
    framework; the scheduler is stepped per training step.
    """
    opt = torch.optim.AdamW(
        self.parameters(),
        lr=self.args.learning_rate,
        weight_decay=self.args.weight_decay,
    )
    # Cosine restarts over the full training horizon, preceded by warmup.
    cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        opt, self.args.num_train_steps, T_mult=1, eta_min=1e-8)
    warmup = GradualWarmupScheduler(
        opt,
        multiplier=1,
        total_epoch=self.args.warmup_steps,
        after_scheduler=cosine)
    self.scheduler = warmup
    sched_config = {
        'scheduler': warmup,
        'interval': 'step',
    }
    return [opt], [sched_config]
def main():
    """Entry point for (optionally distributed) supernet training.

    Seeds RNGs, builds the model selected by ``args.model_type``, wraps it in
    DDP when multiple GPUs are visible, then runs the epoch loop with periodic
    validation and checkpointing. Relies on module-level globals: `args`,
    model constructors, loaders, `train`/`infer`, `save_checkpoint`,
    `CrossEntropyLossSoft`, `GradualWarmupScheduler`.
    """
    if not torch.cuda.is_available():
        print('no gpu device available')
        sys.exit(1)

    writer = None
    num_gpus = torch.cuda.device_count()
    np.random.seed(args.seed)
    args.gpu = args.local_rank % num_gpus
    args.nprocs = num_gpus
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    cudnn.deterministic = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)

    if args.local_rank == 0:
        # Experiment tag: timestamp plus a random 4-digit suffix.
        args.exp = datetime.datetime.now().strftime("%YY_%mM_%dD_%HH") + "_" + \
            "{:04d}".format(random.randint(0, 1000))
        print('gpu device = %d' % args.gpu)
        # BUG FIX: print() does not %-format its arguments; the old call
        # printed the literal "args = %s" followed by args as a second value.
        print("args = %s" % args)

    if args.model_type == "dynamic":
        model = dynamic_resnet20()
    elif args.model_type == "independent":
        model = Independent_resnet20()
    elif args.model_type == "slimmable":
        model = mutableResNet20()
    elif args.model_type == "original":
        model = resnet20()
    else:
        # BUG FIX: previously only printed "Not Implement" and fell through,
        # causing a confusing NameError on the next line. Fail fast instead.
        raise NotImplementedError("unknown model_type: %s" % args.model_type)

    model = model.cuda(args.gpu)
    if num_gpus > 1:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
        args.world_size = torch.distributed.get_world_size()
        # Per-process batch size under DDP.
        args.batch_size = args.batch_size // args.world_size

    criterion = torch.nn.CrossEntropyLoss().cuda(args.gpu)
    soft_criterion = CrossEntropyLossSoft()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # MultiStepLR after a 5-epoch warmup (multiplier=1 keeps the base LR).
    a_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[60, 120, 160], last_epoch=-1)
    scheduler = GradualWarmupScheduler(
        optimizer, 1, total_epoch=5, after_scheduler=a_scheduler)

    if args.local_rank == 0:
        writer = SummaryWriter(
            "./runs/%s-%05d" %
            (time.strftime("%m-%d", time.localtime()), random.randint(0, 100)))

    # Prepare data.
    train_loader = get_train_loader(args.batch_size, args.local_rank,
                                    args.num_workers)
    # Originally the same as the train batch size; kept smaller now.
    val_loader = get_val_loader(args.batch_size, args.num_workers)
    archloader = ArchLoader("data/Track1_final_archs.json")

    for epoch in range(args.epochs):
        train(train_loader, val_loader, optimizer, scheduler, model,
              archloader, criterion, soft_criterion, args, args.seed, epoch,
              writer)
        scheduler.step()
        if (epoch + 1) % args.report_freq == 0:
            top1_val, top5_val, objs_val = infer(train_loader, val_loader,
                                                 model, criterion, archloader,
                                                 args, epoch)
            if args.local_rank == 0:
                # Only rank 0 logs metrics and writes checkpoints.
                if writer is not None:
                    writer.add_scalar("Val/loss", objs_val, epoch)
                    writer.add_scalar("Val/acc1", top1_val, epoch)
                    writer.add_scalar("Val/acc5", top5_val, epoch)
                save_checkpoint({
                    'state_dict': model.state_dict(),
                }, epoch, args.exp)
def train(save_path, save_every, img_size, resume, epochs, opt=None):
    """Train the flowTracker model on MOT16 optical-flow data.

    Args:
        save_path: root directory for weights/logs.
        save_every: checkpoint every N epochs (in addition to 'latest.pt').
        img_size: (width, height) pair.
        resume: if True, resume from '<weights>/latest.pt'.
        epochs: number of epochs to run (offset by the resumed epoch).
        opt: options namespace providing at least `.gpu` and `.lr`.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    model_name = 'flowNet'
    weights_path = osp.join(save_path, model_name)
    loss_log_path = osp.join(weights_path, 'loss.json')
    mkdir_if_missing(weights_path)

    cfg = {}
    cfg['lr'] = opt.lr
    cfg['height'] = img_size[1]
    cfg['width'] = img_size[0]

    if resume:
        latest_resume = osp.join(weights_path, 'latest.pt')

    torch.backends.cudnn.benchmark = True
    # root = '/home/hunter/Document/torch'
    root = '/data/dgw'
    paths_trainset = './data/flow/MOT16.txt'

    transforms = T.Compose([T.ToTensor()])
    trainset = LoadImagesAndLabels_2(root=root, path=paths_trainset,
                                     img_size=img_size, augment=False,
                                     transforms=transforms)
    dataloader_trainset = torch.utils.data.DataLoader(trainset, batch_size=1,
                                                      shuffle=True)

    model = flowTracker(img_size)
    model.cuda().train()

    start_epoch = 0
    optimizer = torch.optim.SGD(
        filter(lambda x: x.requires_grad, model.parameters()),
        lr=opt.lr, momentum=.9, weight_decay=5e-4)
    after_scheduler = StepLR(optimizer, 10, 0.1)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=10,
                                       total_epoch=10,
                                       after_scheduler=after_scheduler)

    if resume:
        checkpoint = torch.load(latest_resume, map_location='cpu')
        # Load weights to resume from.
        print(model.load_state_dict(checkpoint['model'], strict=False))
        start_epoch = checkpoint['epoch'] + 1
        del checkpoint
    else:
        # Fresh run: record the training config next to the weights.
        with open(osp.join(weights_path, 'model.yaml'), 'w+') as f:
            yaml.dump(cfg, f)

    for epoch in range(epochs):
        epoch = epoch + start_epoch
        print('lr: ', optimizer.param_groups[0]['lr'])
        scheduler.step(epoch)
        loss_epoch_log = 0.0
        n_batches = 0
        for i, (imgs, labels, img_path, _) in enumerate(tqdm(dataloader_trainset)):
            imgs = torch.cat(imgs, dim=0)
            imgs = imgs.permute(1, 0, 2, 3).unsqueeze(0).cuda()
            boxes, target = labels[0][0].cuda(), labels[1][0].cuda()
            loss = model(imgs, boxes, target, img_path)
            if loss is None:
                continue
            # BUG FIX: gradients were never cleared, so every optimizer.step()
            # applied the accumulated gradients of ALL previous batches.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            ## print and log the loss
            if i % 50 == 0:
                print(loss)
            # BUG FIX: accumulate the detached scalar instead of the
            # graph-attached tensor (which leaked autograd graphs all epoch).
            loss_epoch_log += loss.item()
            n_batches += 1
        # BUG FIX: the mean used the last enumerate index `i` — off by one,
        # ZeroDivisionError for a single batch, and it counted skipped batches.
        loss_epoch_log = loss_epoch_log / max(n_batches, 1)
        print("loss in epoch %d: " % (epoch))
        print(loss_epoch_log)

        checkpoint = {'epoch': epoch, 'model': model.state_dict()}
        latest = osp.join(weights_path, 'latest.pt')
        torch.save(checkpoint, latest)
        if epoch % save_every == 0 and epoch != 0:
            torch.save(
                checkpoint,
                osp.join(weights_path,
                         "weights_epoch_" + str(epoch) + ".pt"))
        with open(loss_log_path, 'a+') as f:
            f.write('epoch:' + str(epoch) + '\n')
            json.dump(float(loss_epoch_log), f)
            f.write('\n')
def __init__(self, experiment):
    """Build a trainer from the JSON config `<CONFIG_DIR>/<experiment>.json`.

    Constructs model, optimizer, LR scheduler, datasets, and TensorBoard
    logger, and resumes from the 'model' checkpoint if one exists. Relies on
    module-level names: CONFIG_DIR, CHECKPOINT_DIR, LOG_DIR, IrCsn152,
    GradualWarmupScheduler, SummaryWriter, Adam/SGD/StepLR/
    CosineAnnealingWarmRestarts.
    """
    self.experiment = str(experiment)
    config_file = os.path.join(CONFIG_DIR, experiment + '.json')
    with open(config_file, 'r') as f:
        self.config = json.load(f)
    # Config selectors (strings) and hyperparameters.
    model = self.config['model']
    optimizer = self.config['optimizer']
    scheduler = self.config['scheduler']
    dataset_name = self.config['dataset']
    self.clip_len = int(self.config['clip-length'])
    self.lr = float(self.config['lr'])
    self.weight_decay = float(self.config['weight-decay'])
    self.max_epochs = int(self.config['max-epochs'])
    self.epoch = 0       # current epoch (may be overwritten by checkpoint)
    self.iteration = 0   # global step counter for TensorBoard
    self.train_batch_size = int(self.config['train-batch-size'])
    self.eval_batch_size = int(self.config['test-batch-size'])
    self.warmup_scheduler = bool(self.config['warmup-scheduler'])
    if dataset_name == 'breakfast':
        import data.breakfast_data as dataset_utils
        self.n_classes = dataset_utils.N_CLASSES
    else:
        raise ValueError('no such dataset')
    if model == 'ir-csn':
        import utils.csnet_utils as train_utils
        self.model = IrCsn152(n_classes=dataset_utils.N_CLASSES,
                              clip_len=self.clip_len,
                              crop_size=train_utils.CROP_SIZE)
    else:
        raise ValueError('no such model')
    train_video_files, train_labels, train_video_len_files = dataset_utils.get_train_data(
    )
    test_video_files, test_labels, test_video_len_files = dataset_utils.get_test_data(
    )
    # Scale the batch size by GPU count for DataParallel.
    self.train_batch_size = self.train_batch_size * torch.cuda.device_count(
    )
    device_ids = list(range(torch.cuda.device_count()))
    self.model = nn.DataParallel(self.model, device_ids=device_ids)
    self.model = self.model.cuda()
    self.loss_fn = nn.CrossEntropyLoss()
    if optimizer == 'adam':
        self.optimizer = Adam(self.model.parameters(),
                              lr=self.lr,
                              weight_decay=self.weight_decay)
    elif optimizer == 'sgd':
        self.optimizer = SGD(self.model.parameters(),
                             lr=self.lr,
                             weight_decay=self.weight_decay,
                             momentum=0)
    else:
        raise ValueError('no such optimizer')
    if scheduler == 'step':
        step_size = self.config['lr-decay-step-size']
        lr_decay = self.config['lr-decay-rate']
        self.scheduler = StepLR(self.optimizer,
                                gamma=lr_decay,
                                step_size=step_size)
    elif scheduler == 'half-cosine':
        # T_0 = number of training videos — presumably one restart per
        # epoch-worth of steps; confirm against the step cadence.
        self.scheduler = CosineAnnealingWarmRestarts(
            self.optimizer, T_0=len(train_video_files))
    else:
        raise ValueError('no such scheduler')
    if self.warmup_scheduler:
        # Warmup is only supported on top of the half-cosine schedule.
        if scheduler == 'half-cosine':
            self.scheduler = GradualWarmupScheduler(
                self.optimizer,
                multiplier=8,
                total_epoch=10,
                after_scheduler=self.scheduler)
        else:
            raise ValueError('no such scheduler')
    self.checkpoint_dir = os.path.join(CHECKPOINT_DIR, self.experiment)
    # Resume silently if a 'model' checkpoint exists.
    self.load_checkpoint(ckpt_name='model')
    self.log_dir = os.path.join(LOG_DIR, self.experiment)
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)
    self.tensorboard_logger = SummaryWriter(self.log_dir)
    if model == 'ir-csn':
        self.train_dataset = train_utils.TrainDataset(
            video_files=train_video_files,
            labels=train_labels,
            video_len_files=train_video_len_files,
            resize=train_utils.RESIZE,
            crop_size=train_utils.CROP_SIZE,
            clip_len=train_utils.CLIP_LEN)
        # evaluation datasets
        self.train_eval_dataset = train_utils.EvalDataset(
            video_files=train_video_files,
            labels=train_labels,
            video_len_files=train_video_len_files,
            resize=train_utils.RESIZE,
            crop_size=train_utils.CROP_SIZE,
            clip_len=train_utils.CLIP_LEN,
            n_clips=train_utils.N_EVAL_CLIPS)
        self.test_eval_dataset = train_utils.EvalDataset(
            video_files=test_video_files,
            labels=test_labels,
            video_len_files=test_video_len_files,
            resize=train_utils.RESIZE,
            crop_size=train_utils.CROP_SIZE,
            clip_len=train_utils.CLIP_LEN,
            n_clips=train_utils.N_EVAL_CLIPS)
    else:
        raise ValueError('no such model... how did you even get here...')
class Trainer:
    """Config-driven trainer for video classification (ir-CSN on Breakfast).

    Reads `<CONFIG_DIR>/<experiment>.json`, builds model/optimizer/scheduler/
    datasets, trains for `max-epochs`, evaluates by averaging per-clip logits
    per video, and checkpoints model/optimizer/scheduler state each epoch.
    """

    def __init__(self, experiment):
        """Build the trainer from the experiment's JSON config and resume
        from the 'model' checkpoint if one exists."""
        self.experiment = str(experiment)
        config_file = os.path.join(CONFIG_DIR, experiment + '.json')
        with open(config_file, 'r') as f:
            self.config = json.load(f)
        # Config selectors (strings) and hyperparameters.
        model = self.config['model']
        optimizer = self.config['optimizer']
        scheduler = self.config['scheduler']
        dataset_name = self.config['dataset']
        self.clip_len = int(self.config['clip-length'])
        self.lr = float(self.config['lr'])
        self.weight_decay = float(self.config['weight-decay'])
        self.max_epochs = int(self.config['max-epochs'])
        self.epoch = 0       # current epoch (may be overwritten by checkpoint)
        self.iteration = 0   # global step counter for TensorBoard
        self.train_batch_size = int(self.config['train-batch-size'])
        self.eval_batch_size = int(self.config['test-batch-size'])
        self.warmup_scheduler = bool(self.config['warmup-scheduler'])
        if dataset_name == 'breakfast':
            import data.breakfast_data as dataset_utils
            self.n_classes = dataset_utils.N_CLASSES
        else:
            raise ValueError('no such dataset')
        if model == 'ir-csn':
            import utils.csnet_utils as train_utils
            self.model = IrCsn152(n_classes=dataset_utils.N_CLASSES,
                                  clip_len=self.clip_len,
                                  crop_size=train_utils.CROP_SIZE)
        else:
            raise ValueError('no such model')
        train_video_files, train_labels, train_video_len_files = dataset_utils.get_train_data(
        )
        test_video_files, test_labels, test_video_len_files = dataset_utils.get_test_data(
        )
        # Scale the batch size by GPU count for DataParallel.
        self.train_batch_size = self.train_batch_size * torch.cuda.device_count(
        )
        device_ids = list(range(torch.cuda.device_count()))
        self.model = nn.DataParallel(self.model, device_ids=device_ids)
        self.model = self.model.cuda()
        self.loss_fn = nn.CrossEntropyLoss()
        if optimizer == 'adam':
            self.optimizer = Adam(self.model.parameters(),
                                  lr=self.lr,
                                  weight_decay=self.weight_decay)
        elif optimizer == 'sgd':
            self.optimizer = SGD(self.model.parameters(),
                                 lr=self.lr,
                                 weight_decay=self.weight_decay,
                                 momentum=0)
        else:
            raise ValueError('no such optimizer')
        if scheduler == 'step':
            step_size = self.config['lr-decay-step-size']
            lr_decay = self.config['lr-decay-rate']
            self.scheduler = StepLR(self.optimizer,
                                    gamma=lr_decay,
                                    step_size=step_size)
        elif scheduler == 'half-cosine':
            self.scheduler = CosineAnnealingWarmRestarts(
                self.optimizer, T_0=len(train_video_files))
        else:
            raise ValueError('no such scheduler')
        if self.warmup_scheduler:
            # Warmup is only supported on top of the half-cosine schedule.
            if scheduler == 'half-cosine':
                self.scheduler = GradualWarmupScheduler(
                    self.optimizer,
                    multiplier=8,
                    total_epoch=10,
                    after_scheduler=self.scheduler)
            else:
                raise ValueError('no such scheduler')
        self.checkpoint_dir = os.path.join(CHECKPOINT_DIR, self.experiment)
        self.load_checkpoint(ckpt_name='model')
        self.log_dir = os.path.join(LOG_DIR, self.experiment)
        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)
        self.tensorboard_logger = SummaryWriter(self.log_dir)
        if model == 'ir-csn':
            self.train_dataset = train_utils.TrainDataset(
                video_files=train_video_files,
                labels=train_labels,
                video_len_files=train_video_len_files,
                resize=train_utils.RESIZE,
                crop_size=train_utils.CROP_SIZE,
                clip_len=train_utils.CLIP_LEN)
            # evaluation datasets
            self.train_eval_dataset = train_utils.EvalDataset(
                video_files=train_video_files,
                labels=train_labels,
                video_len_files=train_video_len_files,
                resize=train_utils.RESIZE,
                crop_size=train_utils.CROP_SIZE,
                clip_len=train_utils.CLIP_LEN,
                n_clips=train_utils.N_EVAL_CLIPS)
            self.test_eval_dataset = train_utils.EvalDataset(
                video_files=test_video_files,
                labels=test_labels,
                video_len_files=test_video_len_files,
                resize=train_utils.RESIZE,
                crop_size=train_utils.CROP_SIZE,
                clip_len=train_utils.CLIP_LEN,
                n_clips=train_utils.N_EVAL_CLIPS)
        else:
            raise ValueError('no such model... how did you even get here...')

    def train(self):
        """Run the epoch loop from the (possibly resumed) epoch to max_epochs,
        evaluating and checkpointing after every epoch.

        NOTE(review): self.scheduler is built and checkpointed but never
        stepped anywhere in this class, so the LR stays constant — confirm
        whether a scheduler.step() call is missing here.
        """
        start_epoch = self.epoch
        for i in range(start_epoch, self.max_epochs):
            print('INFO: epoch {0}/{1}'.format(i + 1, self.max_epochs))
            self.epoch += 1
            self.train_step()
            self.eval_step()
            # Save both a rolling 'model' checkpoint and a per-epoch snapshot.
            self.save_checkpoint(ckpt_name='model')
            self.save_checkpoint(ckpt_name='model-{}'.format(self.epoch))

    def train_step(self):
        """Train for one epoch and log per-iteration losses to TensorBoard."""
        print('INFO: training at epoch {}'.format(self.epoch))
        dataloader = tdata.DataLoader(self.train_dataset,
                                      batch_size=self.train_batch_size,
                                      collate_fn=self.train_dataset.collate_fn,
                                      shuffle=True,
                                      num_workers=12,
                                      drop_last=True)
        i = 0
        epoch_losses = []
        self.model.train()
        pbar = tqdm(dataloader)
        for frames, labels in pbar:
            frames = frames.cuda()
            labels = labels.cuda()
            self.optimizer.zero_grad()
            logits = self.model(frames)
            loss = self.loss_fn(logits, labels)
            loss.backward()
            self.optimizer.step()
            i += 1
            loss = loss.item()
            pbar.set_postfix({'loss:': loss})
            epoch_losses.append(loss)
        epoch_loss = np.mean(epoch_losses)
        print('INFO: training loss: {}'.format(epoch_loss))
        # Replay the collected losses so each gets its own global-step index.
        for loss in epoch_losses:
            train_log = {'loss': loss}
            self.iteration += 1
            self.tensorboard_logger.add_scalars(
                '{}:train'.format(self.experiment), train_log, self.iteration)

    def eval_step(self, evaluate_train=True):
        """Evaluate on the test set (and optionally the train set) and log
        accuracies to TensorBoard under the current epoch."""
        print('INFO: evaluating...')
        self.model.eval()
        test_accuracy = self.eval_test()
        eval_log = {'test-accuracy': test_accuracy}
        if evaluate_train:
            eval_log['train-accuracy'] = self.eval_train()
        self.tensorboard_logger.add_scalars(
            '{}:evaluation'.format(self.experiment), eval_log, self.epoch)

    def eval_train(self):
        """Evaluate on the training videos; returns accuracy and writes a
        per-video prediction JSON to LOG_DIR."""
        print('INFO: evaluating train dataset...')
        prediction_file = os.path.join(
            LOG_DIR, 'epoch-{0}-train-prediction.json'.format(self.epoch))
        train_accuracy = self.eval_dataset(self.train_eval_dataset,
                                           prediction_file)
        print('INFO: epoch {0} train accuracy: {1}'.format(
            self.epoch, train_accuracy))
        return train_accuracy

    def eval_test(self):
        """Evaluate on the test videos; returns accuracy and writes a
        per-video prediction JSON to LOG_DIR."""
        print('INFO: evaluating test dataset...')
        prediction_file = os.path.join(
            LOG_DIR, 'epoch-{0}-test-prediction.json'.format(self.epoch))
        test_accuracy = self.eval_dataset(self.test_eval_dataset,
                                          prediction_file)
        print('INFO: epoch {0} test accuracy: {1}'.format(
            self.epoch, test_accuracy))
        return test_accuracy

    def eval_dataset(self, dataset, prediction_file):
        """Score `dataset` by averaging clip logits per video.

        Each video's prediction is argmax of the running mean of its clips'
        logits. Writes {video: {label, n_clips, prediction}} to
        `prediction_file` and returns the video-level accuracy.
        """
        dataloader = tdata.DataLoader(dataset=dataset,
                                      batch_size=self.eval_batch_size,
                                      shuffle=False,
                                      num_workers=12,
                                      pin_memory=True,
                                      collate_fn=dataset.collate_fn)
        prediction_dict = dict()
        for i, video in enumerate(dataset.video_files):
            video_prediction = {
                'label': int(dataset.labels[i]),
                'n_clips': 0,
                'logit': None
            }
            prediction_dict[video] = video_prediction
        with torch.no_grad():
            for clips, clip_files in tqdm(dataloader):
                n_clips = clips.shape[0]
                clips = clips.cuda()
                logits = self.model(clips).detach().cpu()
                # update prediction dict.
                for i in range(n_clips):
                    logit = logits[i]
                    clip_file = clip_files[i]
                    video_prediction = prediction_dict[clip_file]
                    if video_prediction['n_clips'] == 0:
                        video_prediction['logit'] = logit
                        video_prediction['n_clips'] = 1
                    else:
                        # keep a running average of logits
                        video_prediction['n_clips'] += 1
                        n_clips = video_prediction['n_clips']
                        video_prediction['logit'] = video_prediction['logit'] * ((n_clips - 1) / n_clips) + \
                            logit / n_clips
        n_correct = 0
        n_videos = len(dataset.video_files)
        for video, video_dict in prediction_dict.items():
            logit = video_dict['logit']
            label = int(video_dict['label'])
            prediction = int(torch.argmax(logit).item())
            video_dict['prediction'] = prediction
            if label == prediction:
                n_correct += 1
            # Drop the tensor so the dict is JSON-serializable.
            del video_dict['logit']
        accuracy = n_correct / n_videos
        with open(prediction_file, 'w') as f:
            json.dump(prediction_dict, f)
        return accuracy

    def save_checkpoint(self, ckpt_name):
        """Save model/optimizer/scheduler state plus epoch and iteration
        counters to `<checkpoint_dir>/<ckpt_name>.pth`."""
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        checkpoint_file = os.path.join(self.checkpoint_dir,
                                       ckpt_name + '.pth')
        checkpoint_dict = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
            'epoch': self.epoch,
            'iteration': self.iteration
        }
        torch.save(checkpoint_dict, checkpoint_file)
        print('INFO: saved checkpoint {}'.format(checkpoint_file))

    def load_checkpoint(self, ckpt_name='model'):
        """Restore state from `<checkpoint_dir>/<ckpt_name>.pth` if present;
        otherwise warn and continue with fresh state."""
        checkpoint_file = os.path.join(self.checkpoint_dir,
                                       ckpt_name + '.pth')
        if not os.path.exists(checkpoint_file):
            print('WARNING: checkpoint does not exist. Continuing...')
        else:
            checkpoint_dict = torch.load(checkpoint_file,
                                         map_location='cuda:{}'.format(0))
            self.model.load_state_dict(checkpoint_dict['model'])
            self.optimizer.load_state_dict(checkpoint_dict['optimizer'])
            self.scheduler.load_state_dict(checkpoint_dict['scheduler'])
            self.epoch = checkpoint_dict['epoch']
            self.iteration = checkpoint_dict['iteration']

    def __del__(self):
        # Best-effort cleanup of the TensorBoard writer and the model.
        self.tensorboard_logger.close()
        del self.model
def train(
        save_path,
        save_every,
        img_size,
        resume,
        epochs,
        batch_size,
        accumulated_batches,
        opt=None):
    """Train the Jde_RCNN joint detection/re-ID model on MOT16 (+ optional
    extra detection datasets).

    Args:
        save_path: root directory for weights/logs.
        save_every: checkpoint every N epochs (in addition to 'latest.pt').
        img_size: (width, height) pair.
        resume: if True, resume from '<weights>/latest.pt'.
        epochs: number of epochs to run.
        batch_size: dataloader batch size.
        accumulated_batches: optimizer steps once per this many batches
            (gradient accumulation).
        opt: options namespace (gpu, lr, backbone_name, model_version,
            len_embed, all_datasets).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    model_name = opt.backbone_name + '_img_size' + str(img_size[0]) + '_' + str(img_size[1])
    weights_path = osp.join(save_path, model_name)
    loss_log_path = osp.join(weights_path, 'loss.json')
    mkdir_if_missing(weights_path)

    cfg = {}
    cfg['width'] = img_size[0]
    cfg['height'] = img_size[1]
    cfg['backbone_name'] = opt.backbone_name
    cfg['lr'] = opt.lr

    if resume:
        latest_resume = osp.join(weights_path, 'latest.pt')

    torch.backends.cudnn.benchmark = True
    # root = '/home/hunter/Document/torch'
    root = '/data/dgw'

    # Dataset file lists: MOT16 tracking splits, plus extra detection/re-ID
    # datasets when opt.all_datasets is set.
    if opt.all_datasets:
        paths_trainset = {'02': './data/track/train/MOT16-02.txt',
                          '04': './data/track/train/MOT16-04.txt',
                          '05': './data/track/train/MOT16-05.txt',
                          '09': './data/track/train/MOT16-09.txt',
                          '10': './data/track/train/MOT16-10.txt',
                          '11': './data/track/train/MOT16-11.txt',
                          '13': './data/track/train/MOT16-13.txt',
                          'CT': './data/detect/CT_train.txt',
                          'ETH': './data/detect/ETH.txt',
                          'PRW': './data/detect/PRW_train.txt',
                          'CP': './data/detect/cp_train.txt',
                          'CS': './data/detect/CUHK_train.txt'}
        paths_valset = {'02': './data/track/val/MOT16-02.txt',
                        '04': './data/track/val/MOT16-04.txt',
                        '05': './data/track/val/MOT16-05.txt',
                        '09': './data/track/val/MOT16-09.txt',
                        '10': './data/track/val/MOT16-10.txt',
                        '11': './data/track/val/MOT16-11.txt',
                        '13': './data/track/val/MOT16-13.txt',
                        'CP': './data/detect/cp_val.txt',
                        'PRW': './data/detect/PRW_val.txt',
                        'CT': './data/detect/CT_val.txt',
                        'CS': './data/detect/CUHK_val.txt'}
    else:
        paths_trainset = {'02': './data/track/train/MOT16-02.txt',
                          '04': './data/track/train/MOT16-04.txt',
                          '05': './data/track/train/MOT16-05.txt',
                          '09': './data/track/train/MOT16-09.txt',
                          '10': './data/track/train/MOT16-10.txt',
                          '11': './data/track/train/MOT16-11.txt',
                          '13': './data/track/train/MOT16-13.txt'}
        paths_valset = {'02': './data/track/val/MOT16-02.txt',
                        '04': './data/track/val/MOT16-04.txt',
                        '05': './data/track/val/MOT16-05.txt',
                        '09': './data/track/val/MOT16-09.txt',
                        '10': './data/track/val/MOT16-10.txt',
                        '11': './data/track/val/MOT16-11.txt',
                        '13': './data/track/val/MOT16-13.txt'}

    transforms = T.Compose([T.ToTensor()])
    trainset = JointDataset(root=root, paths=paths_trainset, img_size=img_size,
                            augment=True, transforms=transforms)
    valset = JointDataset(root=root, paths=paths_valset, img_size=img_size,
                          augment=False, transforms=transforms)
    dataloader_trainset = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=8,
        pin_memory=True, drop_last=True, collate_fn=collate_fn)
    dataloader_valset = torch.utils.data.DataLoader(
        valset, batch_size=batch_size, shuffle=True, num_workers=8,
        pin_memory=True, drop_last=True, collate_fn=collate_fn)
    cfg['num_ID'] = trainset.nID

    backbone = resnet_fpn_backbone(opt.backbone_name, True)
    backbone.out_channels = 256
    model = Jde_RCNN(backbone, num_ID=trainset.nID, min_size=img_size[1],
                     max_size=img_size[0], version=opt.model_version,
                     len_embeddings=opt.len_embed)
    model.cuda().train()
    # model = torch.nn.DataParallel(model)

    start_epoch = 0
    optimizer = torch.optim.SGD(
        filter(lambda x: x.requires_grad, model.parameters()),
        lr=opt.lr, momentum=.9, weight_decay=5e-4)
    after_scheduler = StepLR(optimizer, 10, 0.1)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=10,
                                       total_epoch=10,
                                       after_scheduler=after_scheduler)

    if resume:
        checkpoint = torch.load(latest_resume, map_location='cpu')
        # Load weights to resume from.
        print(model.load_state_dict(checkpoint['model'], strict=False))
        start_epoch = checkpoint['epoch_det']
        del checkpoint
    else:
        # Fresh run: record the training config next to the weights.
        with open(osp.join(weights_path, 'model.yaml'), 'w+') as f:
            yaml.dump(cfg, f)

    for epoch in range(epochs):
        # Periodic validation (embedding quality + detection metrics).
        model.cuda().eval()
        with torch.no_grad():
            if epoch % 3 == 0:
                test_emb(model, dataloader_valset, print_interval=50)
                test(model, dataloader_valset, conf_thres=0.5, iou_thres=0.2,
                     print_interval=50)
        scheduler.step(epoch + start_epoch)
        model.cuda().train()
        print('lr: ', optimizer.param_groups[0]['lr'])

        loss_epoch_log = dict(loss_total=0, loss_classifier=0,
                              loss_box_reg=0, loss_reid=0,
                              loss_objectness=0, loss_rpn_box_reg=0)
        n_batches = 0
        for i, (imgs, labels, _, _, targets_len) in enumerate(tqdm(dataloader_trainset)):
            targets = []
            imgs = imgs.cuda()
            labels = labels.cuda()
            flag = False
            for target_len, label in zip(targets_len.view(-1,), labels):
                ## convert the input to demanded format
                target = {}
                if target_len == 0:
                    flag = True
                if torch.all(label[0:int(target_len), 1] == -1):
                    flag = True
                target['boxes'] = label[0:int(target_len), 2:6]
                target['ids'] = (label[0:int(target_len), 1]).long()
                target['labels'] = torch.ones_like(target['ids'])
                targets.append(target)
            # Skip batches that contain an empty or all-unlabeled sample.
            if flag:
                continue
            losses = model(imgs, targets)
            loss = losses['loss_classifier'] + losses['loss_box_reg'] \
                + losses['loss_objectness'] + losses['loss_rpn_box_reg'] \
                + 0.4 * losses['loss_reid']
            loss.backward()
            # Gradient accumulation: step every `accumulated_batches` batches.
            if ((i + 1) % accumulated_batches == 0) or (i == len(dataloader_trainset) - 1):
                optimizer.step()
                optimizer.zero_grad()
            ## print and log the loss
            for key, val in losses.items():
                loss_epoch_log[key] = float(val) + loss_epoch_log[key]
            # BUG FIX: 'loss_total' was initialized but never accumulated.
            loss_epoch_log['loss_total'] = float(loss) + loss_epoch_log['loss_total']
            n_batches += 1
        # BUG FIX: the mean used the last enumerate index `i` — off by one,
        # ZeroDivisionError for a single batch, and it counted skipped batches.
        for key, val in loss_epoch_log.items():
            loss_epoch_log[key] = loss_epoch_log[key] / max(n_batches, 1)
        print("loss in epoch %d: " % (epoch))
        print(loss_epoch_log)

        epoch_det = epoch + start_epoch
        epoch_reid = epoch + start_epoch
        checkpoint = {'epoch_det': epoch_det,
                      'epoch_reid': epoch_reid,
                      'model': model.state_dict()}
        latest = osp.join(weights_path, 'latest.pt')
        torch.save(checkpoint, latest)
        if epoch % save_every == 0 and epoch != 0:
            torch.save(checkpoint,
                       osp.join(weights_path,
                                "weights_epoch_" + str(epoch_det) + '_' + str(epoch_reid) + ".pt"))
        with open(loss_log_path, 'a+') as f:
            f.write('epoch_det:' + str(epoch_det) + ',epoch_reid:' + str(epoch_reid) + '\n')
            json.dump(loss_epoch_log, f)
            f.write('\n')