def __init__(self, unsupervised_path):
    super().__init__()
    self.save_hyperparameters()
    self.feat_ext = LinearClassifier(network='s3d',
                                     num_class=101,
                                     dropout=0.9,
                                     use_dropout=True,
                                     use_final_bn=False,
                                     use_l2_norm=False)
    checkpoint = torch.load(unsupervised_path)
    state_dict = checkpoint['state_dict']
    # rename MoCo query-encoder weights so they match the classifier backbone
    new_dict = {}
    for k, v in state_dict.items():
        k = k.replace('encoder_q.0.', 'backbone.')
        new_dict[k] = v
    state_dict = new_dict
    neq_load_customized(self.feat_ext, state_dict, verbose=False)
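# Hedged sketch, not the repository's actual helper: `neq_load_customized` is called above and
# throughout this section, but its definition is not shown here. The call sites appear to rely
# on a "non-equal" (partial) load: copy only the checkpoint tensors whose key and shape match
# the target model, and report what was skipped. A minimal version of that idea:
def _partial_load_sketch(model, pretrained_dict, verbose=True):
    """Copy matching-key, matching-shape tensors into `model`; leave the rest at their init."""
    model_dict = model.state_dict()
    matched = {k: v for k, v in pretrained_dict.items()
               if k in model_dict and v.shape == model_dict[k].shape}
    if verbose:
        skipped = sorted(set(pretrained_dict) - set(matched))
        print('Partially loaded %d/%d tensors; skipped keys: %s'
              % (len(matched), len(pretrained_dict), skipped))
    model_dict.update(matched)
    model.load_state_dict(model_dict)
    return model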
def main():
    args = get_args()

    # Fix randomness
    seed = args.seed
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # ---------------------------- Prepare model ----------------------------- #
    if args.local_rank <= 0:
        print_r(args, 'Preparing model')
    model = models.Model(args)
    model = model.to(args.device)

    params = model.parameters()
    optimizer = geoopt.optim.RiemannianAdam(params, lr=args.lr, weight_decay=args.wd, stabilize=10)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[80, 150], gamma=0.1)

    best_acc = 0
    iteration = 0

    # --- restart training --- #
    if args.resume:
        if os.path.isfile(args.resume):
            print_r(args, f"=> loading resumed checkpoint '{args.resume}'")
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            scheduler.load_state_dict(checkpoint['scheduler'])
            if not args.reset_lr:  # if didn't reset lr, load old optimizer
                optimizer.load_state_dict(checkpoint['optimizer'])
            else:
                print_r(args, f'==== Restart optimizer with a learning rate {args.lr} ====')
            print_r(args, f"=> loaded resumed checkpoint '{args.resume}' (epoch {checkpoint['epoch']})")
        else:
            print_r(args, f"[Warning] no checkpoint found at '{args.resume}'", print_no_verbose=True)
    elif args.pretrain:  # resume overwrites this
        if os.path.isfile(args.pretrain):
            print_r(args, f"=> loading pretrained checkpoint '{args.pretrain}'")
            checkpoint = torch.load(args.pretrain, map_location=torch.device('cpu'))
            model = neq_load_customized(args, model, checkpoint['state_dict'], parts='all',
                                        size_diff=args.final_2dim or args.feature_dim != -1)
            print_r(args, f"=> loaded pretrained checkpoint '{args.pretrain}' (epoch {checkpoint['epoch']})")
        else:
            print_r(args, f"=> no checkpoint found at '{args.pretrain}'", print_no_verbose=True)

    if args.only_train_linear:
        for name, param in model.named_parameters():  # deleted 'module'
            if 'network_class' not in name:
                param.requires_grad = False

    print_r(args, '\n==== parameter names and whether they require gradient ====\n')
    for name, param in model.named_parameters():
        print_r(args, (name, param.requires_grad))

    print_r(args, '\n==== start dataloading ====\n')
    if args.local_rank != -1:
        from torch.nn.parallel import DistributedDataParallel as DDP
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) \
            if not args.not_track_running_stats else model
        model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank)
        args.parallel = 'ddp'
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
        args.parallel = 'dp'
    else:
        args.parallel = 'none'

    # ---------------------------- Prepare dataset ----------------------------- #
    splits = ['train', 'val', 'test']
    loaders = {
        split: datasets.get_data(args, split,
                                 return_label=args.use_labels,
                                 hierarchical_label=args.hierarchical_labels,
                                 action_level_gt=args.action_level_gt,
                                 num_workers=args.num_workers,
                                 path_dataset=args.path_dataset)
        for split in splits
    }

    # setup tools
    img_path, model_path = set_path(args)
    writer_val = SummaryWriter(
        log_dir=os.path.join(img_path, 'val') if not args.debug else '/tmp'
    ) if args.local_rank <= 0 else None
    writer_train = SummaryWriter(
        log_dir=os.path.join(img_path, 'train') if not args.debug else '/tmp'
    ) if args.local_rank <= 0 else None

    # ---------------------------- Prepare trainer and run ----------------------------- #
    if args.local_rank <= 0:
        print_r(args, 'Preparing trainer')
    trainer = Trainer(args, model, optimizer, loaders, iteration, best_acc,
                      writer_train, writer_val, img_path, model_path, scheduler)

    if args.test:
        trainer.test()
    else:
        trainer.train()
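# Hedged sketch: presumed entry point for the script above. `get_args()` is assumed to parse
# the CLI flags referenced in main() (seed, lr, wd, resume, pretrain, local_rank, ...); the
# actual launcher (e.g. torch.distributed.launch) may differ.
if __name__ == '__main__':
    main()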
def main_worker(gpu, ngpus_per_node, args):
    best_acc = 0
    args.gpu = gpu

    if args.distributed:
        if args.local_rank != -1:
            args.rank = args.local_rank
            args.gpu = args.local_rank
        elif 'SLURM_PROCID' in os.environ:  # slurm scheduler
            args.rank = int(os.environ['SLURM_PROCID'])
            args.gpu = args.rank % torch.cuda.device_count()
        elif args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
            if args.multiprocessing_distributed:
                # For multiprocessing distributed training, rank needs to be the
                # global rank among all the processes
                args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    args.print = args.gpu == 0
    # suppress printing if not master
    if (args.multiprocessing_distributed and args.gpu != 0) or \
       (args.local_rank != -1 and args.gpu != 0) or \
       ('SLURM_PROCID' in os.environ and args.rank != 0):
        def print_pass(*args):
            pass
        builtins.print = print_pass

    ### model ###
    print("=> creating {} model with '{}' backbone".format(args.model, args.net))
    if args.model == 'coclr':
        model = CoCLR(args.net, args.moco_dim, args.moco_k, args.moco_m, args.moco_t,
                      topk=args.topk, reverse=args.reverse)
        if args.reverse:
            print('[Warning] using RGB-Mining to help flow')
        else:
            print('[Warning] using Flow-Mining to help RGB')
    else:
        raise NotImplementedError
    args.num_seq = 2
    print('Re-write num_seq to %d' % args.num_seq)

    args.img_path, args.model_path, args.exp_path = set_path(args)
    # print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            model_without_ddp = model.module
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
            model_without_ddp = model.module
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    ### optimizer ###
    params = []
    for name, param in model.named_parameters():
        params.append({'params': param})

    print('\n===========Check Grad============')
    for name, param in model.named_parameters():
        print(name, param.requires_grad)
    print('=================================\n')

    optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    args.iteration = 1

    ### data ###
    transform_train = get_transform('train', args)
    train_loader = get_dataloader(get_data(transform_train, 'train', args), 'train', args)
    transform_train_cuda = transforms.Compose([
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225], channel=1)])
    n_data = len(train_loader.dataset)

    print('===================================')

    lr_scheduler = None

    ### restart training ###
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch'] + 1
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            state_dict = checkpoint['state_dict']

            try:
                model_without_ddp.load_state_dict(state_dict)
            except:
                print('[WARNING] Non-Equal load for resuming training!')
                neq_load_customized(model_without_ddp, state_dict, verbose=True)
            print("=> load resumed checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

            try:
                optimizer.load_state_dict(checkpoint['optimizer'])
            except:
                print('[WARNING] Not loading optimizer states')
        else:
            print("[Warning] no checkpoint found at '{}', use random init".format(args.resume))

    elif args.pretrain != ['random', 'random']:
        # first path: weights to be trained
        # second path: weights as the oracle, not trained
        if os.path.isfile(args.pretrain[1]):  # second network --> load as sampler
            checkpoint = torch.load(args.pretrain[1], map_location=torch.device('cpu'))
            second_dict = checkpoint['state_dict']
            new_dict = {}
            for k, v in second_dict.items():  # only take the encoder_q
                if 'encoder_q.' in k:
                    k = k.replace('encoder_q.', 'sampler.')
                    new_dict[k] = v
            second_dict = new_dict

            new_dict = {}  # remove queue, queue_ptr
            for k, v in second_dict.items():
                if 'queue' not in k:
                    new_dict[k] = v
            second_dict = new_dict
            print("=> Use Oracle checkpoint '{}' (epoch {})".format(args.pretrain[1], checkpoint['epoch']))
        else:
            print("=> NO Oracle checkpoint found at '{}', use random init".format(args.pretrain[1]))
            second_dict = {}

        if os.path.isfile(args.pretrain[0]):  # first network --> load both encoder q & k
            checkpoint = torch.load(args.pretrain[0], map_location=torch.device('cpu'))
            first_dict = checkpoint['state_dict']

            new_dict = {}  # remove queue, queue_ptr
            for k, v in first_dict.items():
                if 'queue' not in k:
                    new_dict[k] = v
            first_dict = new_dict

            # update both q and k with q
            new_dict = {}
            for k, v in first_dict.items():  # only take the encoder_q
                if 'encoder_q.' in k:
                    new_dict[k] = v
                    k = k.replace('encoder_q.', 'encoder_k.')
                    new_dict[k] = v
            first_dict = new_dict
            print("=> Use Training checkpoint '{}' (epoch {})".format(args.pretrain[0], checkpoint['epoch']))
        else:
            print("=> NO Training checkpoint found at '{}', use random init".format(args.pretrain[0]))
            first_dict = {}

        state_dict = {**first_dict, **second_dict}
        try:
            del state_dict['queue_label']  # always re-fill the queue
        except:
            pass
        neq_load_customized(model_without_ddp, state_dict, verbose=True)

    else:
        print("=> train from scratch")

    torch.backends.cudnn.benchmark = True

    # tensorboard plot tools
    writer_train = SummaryWriter(logdir=os.path.join(args.img_path, 'train'))
    args.train_plotter = TB.PlotterThread(writer_train)

    ### main loop ###
    for epoch in range(args.start_epoch, args.epochs):
        np.random.seed(epoch)
        random.seed(epoch)
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        _, train_acc = train_one_epoch(train_loader, model, criterion, optimizer,
                                       transform_train_cuda, epoch, args)

        if (epoch % args.save_freq == 0) or (epoch == args.epochs - 1):
            # save check_point on rank==0 worker
            if (not args.multiprocessing_distributed and args.rank == 0) \
               or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
                is_best = train_acc > best_acc
                best_acc = max(train_acc, best_acc)
                state_dict = model_without_ddp.state_dict()
                save_dict = {
                    'epoch': epoch,
                    'state_dict': state_dict,
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                    'iteration': args.iteration}
                save_checkpoint(save_dict, is_best, gap=args.save_freq,
                                filename=os.path.join(args.model_path, 'epoch%d.pth.tar' % epoch),
                                keep_all='k400' in args.dataset)

    print('Training from ep %d to ep %d finished' % (args.start_epoch, args.epochs))
    sys.exit(0)
def main(args):
    if args.gpu is None:
        args.gpu = str(os.environ["CUDA_VISIBLE_DEVICES"])
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    device = torch.device('cuda')

    best_acc = 0
    torch.manual_seed(0)
    np.random.seed(0)
    random.seed(0)

    num_gpu = len(str(args.gpu).split(','))
    args.batch_size = num_gpu * args.batch_size
    print('=> Effective BatchSize = %d' % args.batch_size)
    args.img_path, args.model_path, args.exp_path = set_path(args)

    ### classifier model ###
    num_class_dict = {
        'ucf101': 101, 'hmdb51': 51, 'k400': 400,
        'ucf101-f': 101, 'hmdb51-f': 51, 'k400-f': 400
    }
    args.num_class = num_class_dict[args.dataset]

    if args.train_what == 'last':  # for linear probe
        args.final_bn = True
        args.final_norm = True
        args.use_dropout = False
    else:  # for training the entire network
        args.final_bn = False
        args.final_norm = False
        args.use_dropout = True

    if args.model == 'lincls':
        model = LinearClassifier(network=args.net,
                                 num_class=args.num_class,
                                 dropout=args.dropout,
                                 use_dropout=args.use_dropout,
                                 use_final_bn=args.final_bn,
                                 use_l2_norm=args.final_norm)
    else:
        raise NotImplementedError

    model.to(device)

    ### optimizer ###
    if args.train_what == 'last':
        print('=> [optimizer] only train last layer')
        params = []
        for name, param in model.named_parameters():
            if 'backbone' in name:
                param.requires_grad = False
            else:
                params.append({'params': param})
    elif args.train_what == 'ft':
        print('=> [optimizer] finetune backbone with smaller lr')
        params = []
        for name, param in model.named_parameters():
            if 'backbone' in name:
                params.append({'params': param, 'lr': args.lr / 10})
            else:
                params.append({'params': param})
    else:  # train all
        params = []
        print('=> [optimizer] train all layer')
        for name, param in model.named_parameters():
            params.append({'params': param})

    if args.train_what == 'last':
        print('\n===========Check Grad============')
        for name, param in model.named_parameters():
            if param.requires_grad:
                print(name, param.requires_grad)
        print('=================================\n')

    if args.optim == 'adam':
        optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(params, lr=args.lr, weight_decay=args.wd, momentum=0.9)
    else:
        raise NotImplementedError

    model = torch.nn.DataParallel(model)
    model_without_dp = model.module

    ce_loss = nn.CrossEntropyLoss()
    args.iteration = 1

    ### test: higher priority ###
    if args.test:
        if os.path.isfile(args.test):
            print("=> loading testing checkpoint '{}'".format(args.test))
            checkpoint = torch.load(args.test, map_location=torch.device('cpu'))
            epoch = checkpoint['epoch']
            state_dict = checkpoint['state_dict']

            if args.retrieval_ucf or args.retrieval_full:  # if directly test on pretrained network
                new_dict = {}
                for k, v in state_dict.items():
                    k = k.replace('encoder_q.0.', 'backbone.')
                    new_dict[k] = v
                state_dict = new_dict

            try:
                model_without_dp.load_state_dict(state_dict)
            except:
                neq_load_customized(model_without_dp, state_dict, verbose=True)
        else:
            print("[Warning] no checkpoint found at '{}'".format(args.test))
            epoch = 0
            print("[Warning] if test random init weights, press c to continue")
            import ipdb
            ipdb.set_trace()

        args.logger = Logger(path=os.path.dirname(args.test))
        args.logger.log('args=\n\t\t' + '\n\t\t'.join(
            ['%s:%s' % (str(k), str(v)) for k, v in vars(args).items()]))

        transform_test_cuda = transforms.Compose([
            T.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225], channel=1)
        ])

        if args.retrieval:
            test_retrieval(model, ce_loss, transform_test_cuda, device, epoch, args)
        elif args.center_crop or args.five_crop or args.ten_crop:
            transform = get_transform('test', args)
            test_dataset = get_data(transform, 'test', args)
            test_10crop(test_dataset, model, ce_loss, transform_test_cuda, device, epoch, args)
        else:
            raise NotImplementedError

        sys.exit(0)

    ### data ###
    transform_train = get_transform('train', args)
    train_loader = get_dataloader(get_data(transform_train, 'train', args), 'train', args)
    transform_val = get_transform('val', args)
    val_loader = get_dataloader(get_data(transform_val, 'val', args), 'val', args)

    transform_train_cuda = transforms.Compose([
        T.RandomHorizontalFlip(),
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225], channel=1)
    ])  # ImageNet
    transform_val_cuda = transforms.Compose([
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225], channel=1)
    ])  # ImageNet

    print('===================================')

    ### restart training ###
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            state_dict = checkpoint['state_dict']

            try:
                model_without_dp.load_state_dict(state_dict)
            except:
                print('[WARNING] resuming training with different weights')
                neq_load_customized(model_without_dp, state_dict, verbose=True)
            print("=> load resumed checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

            try:
                optimizer.load_state_dict(checkpoint['optimizer'])
            except:
                print('[WARNING] failed to load optimizer state, initialize optimizer')
        else:
            print("[Warning] no checkpoint found at '{}', use random init".format(args.resume))

    elif args.pretrain:
        if os.path.isfile(args.pretrain):
            checkpoint = torch.load(args.pretrain, map_location='cpu')
            state_dict = checkpoint['state_dict']

            new_dict = {}
            for k, v in state_dict.items():
                k = k.replace('encoder_q.0.', 'backbone.')
                new_dict[k] = v
            state_dict = new_dict

            try:
                model_without_dp.load_state_dict(state_dict)
            except:
                neq_load_customized(model_without_dp, state_dict, verbose=True)
            print("=> loaded pretrained checkpoint '{}' (epoch {})".format(args.pretrain, checkpoint['epoch']))
        else:
            print("[Warning] no checkpoint found at '{}', use random init".format(args.pretrain))
    else:
        print("=> train from scratch")

    torch.backends.cudnn.benchmark = True

    # plot tools
    writer_val = SummaryWriter(logdir=os.path.join(args.img_path, 'val'))
    writer_train = SummaryWriter(logdir=os.path.join(args.img_path, 'train'))
    args.val_plotter = TB.PlotterThread(writer_val)
    args.train_plotter = TB.PlotterThread(writer_train)

    args.logger = Logger(path=args.img_path)
    args.logger.log('args=\n\t\t' + '\n\t\t'.join(
        ['%s:%s' % (str(k), str(v)) for k, v in vars(args).items()]))

    # main loop
    for epoch in range(args.start_epoch, args.epochs):
        np.random.seed(epoch)
        random.seed(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        train_one_epoch(train_loader, model, ce_loss, optimizer,
                        transform_train_cuda, device, epoch, args)

        if epoch % args.eval_freq == 0:
            _, val_acc = validate(val_loader, model, ce_loss,
                                  transform_val_cuda, device, epoch, args)

            # save check_point
            is_best = val_acc > best_acc
            best_acc = max(val_acc, best_acc)
            state_dict = model_without_dp.state_dict()
            save_dict = {
                'epoch': epoch,
                'state_dict': state_dict,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'iteration': args.iteration}
            save_checkpoint(save_dict, is_best, 1,
                            filename=os.path.join(args.model_path, 'epoch%d.pth.tar' % epoch),
                            keep_all=False)

    print('Training from ep %d to ep %d finished' % (args.start_epoch, args.epochs))
    sys.exit(0)
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    if args.local_rank != -1:
        args.gpu = args.local_rank
    torch.cuda.set_device(args.gpu)

    best_acc = 0

    args.print = args.gpu == 0
    # suppress printing if not master
    if (args.multiprocessing_distributed and args.gpu != 0) or \
       (args.local_rank != -1 and args.gpu != 0):
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        if args.local_rank != -1:
            args.rank = args.local_rank
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    ### model ###
    print("=> creating {} model with '{}' backbone".format(args.model, args.net))
    if args.model == 'infonce':
        model = InfoNCE(args.net, args.moco_dim, args.moco_k, args.moco_m, args.moco_t)
    elif args.model == 'ubernce':
        model = UberNCE(args.net, args.moco_dim, args.moco_k, args.moco_m, args.moco_t)
    else:
        raise NotImplementedError
    args.num_seq = 2
    print('Re-write num_seq to %d' % args.num_seq)

    args.img_path, args.model_path, args.exp_path = set_path(args)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            model_without_ddp = model.module
        else:
            model.cuda()
            model = torch.nn.parallel.DistributedDataParallel(model)
            model_without_ddp = model.module
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    ### optimizer ###
    params = []
    if args.train_what == 'all':
        for name, param in model.named_parameters():
            params.append({'params': param})
    else:
        raise NotImplementedError

    print('\n===========Check Grad============')
    for name, param in model.named_parameters():
        if not param.requires_grad:
            print(name, param.requires_grad)
    print('=================================\n')

    optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    args.iteration = 1

    ### data ###
    transform_train = get_transform('train', args)
    train_loader = get_dataloader(get_data(transform_train, 'train', args), 'train', args)
    transform_train_cuda = transforms.Compose([
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225], channel=1)])
    n_data = len(train_loader.dataset)

    print('===================================')

    lr_scheduler = None  # learning rate is handled by adjust_learning_rate below

    ### restart training ###
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch'] + 1
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            state_dict = checkpoint['state_dict']

            try:
                model_without_ddp.load_state_dict(state_dict)
            except:
                print('[WARNING] resuming training with different weights')
                neq_load_customized(model_without_ddp, state_dict, verbose=True)
            print("=> load resumed checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

            try:
                optimizer.load_state_dict(checkpoint['optimizer'])
            except:
                print('[WARNING] failed to load optimizer state, initialize optimizer')
        else:
            print("[Warning] no checkpoint found at '{}', use random init".format(args.resume))

    elif args.pretrain:
        if os.path.isfile(args.pretrain):
            checkpoint = torch.load(args.pretrain, map_location=torch.device('cpu'))
            state_dict = checkpoint['state_dict']
            try:
                model_without_ddp.load_state_dict(state_dict)
            except:
                neq_load_customized(model_without_ddp, state_dict, verbose=True)
            print("=> loaded pretrained checkpoint '{}' (epoch {})".format(args.pretrain, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}', use random init".format(args.pretrain))
    else:
        print("=> train from scratch")

    torch.backends.cudnn.benchmark = True

    # tensorboard plot tools
    writer_train = SummaryWriter(logdir=os.path.join(args.img_path, 'train'))
    args.train_plotter = TB.PlotterThread(writer_train)

    ### main loop ###
    for epoch in range(args.start_epoch, args.epochs):
        np.random.seed(epoch)
        random.seed(epoch)
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        _, train_acc = train_one_epoch(train_loader, model, criterion, optimizer,
                                       lr_scheduler, transform_train_cuda, epoch, args)

        if (epoch % args.save_freq == 0) or (epoch == args.epochs - 1):
            # save check_point on rank==0 worker
            if (not args.multiprocessing_distributed and args.rank == 0) \
               or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
                is_best = train_acc > best_acc
                best_acc = max(train_acc, best_acc)
                state_dict = model_without_ddp.state_dict()
                save_dict = {
                    'epoch': epoch,
                    'state_dict': state_dict,
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                    'iteration': args.iteration}
                save_checkpoint(save_dict, is_best, gap=args.save_freq,
                                filename=os.path.join(args.model_path, 'epoch%d.pth.tar' % epoch),
                                keep_all='k400' in args.dataset)

    print('Training from ep %d to ep %d finished' % (args.start_epoch, args.epochs))
    sys.exit(0)
def main(args):
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    device = torch.device('cuda')
    num_gpu = len(str(args.gpu).split(','))
    args.batch_size = num_gpu * args.batch_size

    if args.dataset == 'ucf101':
        args.num_class = 101
    elif args.dataset == 'hmdb51':
        args.num_class = 51

    ### classifier model ###
    if args.model == 'lc':
        model = LC(sample_size=args.img_dim,
                   num_seq=args.num_seq,
                   seq_len=args.seq_len,
                   network=args.net,
                   num_class=args.num_class,
                   dropout=args.dropout,
                   train_what=args.train_what)
    else:
        raise ValueError('wrong model!')

    model.to(device)
    model = nn.DataParallel(model)
    model_without_dp = model.module
    criterion = nn.CrossEntropyLoss()

    ### optimizer ###
    params = None
    if args.train_what == 'ft':
        print('=> finetune backbone with smaller lr')
        params = []
        for name, param in model.module.named_parameters():
            if ('resnet' in name) or ('rnn' in name):
                params.append({'params': param, 'lr': args.lr / 10})
            else:
                params.append({'params': param})
    elif args.train_what == 'last':
        print('=> train only last layer')
        params = []
        for name, param in model.named_parameters():
            if ('bone' in name) or ('agg' in name) or ('mb' in name) or ('network_pred' in name):
                param.requires_grad = False
            else:
                params.append({'params': param})
    else:
        pass  # train all layers

    print('\n===========Check Grad============')
    for name, param in model.named_parameters():
        print(name, param.requires_grad)
    print('=================================\n')

    if params is None:
        params = model.parameters()

    optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)

    ### scheduler ###
    if args.dataset == 'hmdb51':
        step = args.schedule
        if step == []:
            step = [150, 250]
        lr_lambda = lambda ep: MultiStepLR_Restart_Multiplier(ep, gamma=0.1, step=step, repeat=1)
    elif args.dataset == 'ucf101':
        step = args.schedule
        if step == []:
            step = [300, 400]
        lr_lambda = lambda ep: MultiStepLR_Restart_Multiplier(ep, gamma=0.1, step=step, repeat=1)
    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
    print('=> Using scheduler at {} epochs'.format(step))

    args.old_lr = None
    best_acc = 0
    args.iteration = 1

    ### if in test mode ###
    if args.test:
        if os.path.isfile(args.test):
            print("=> loading test checkpoint '{}'".format(args.test))
            checkpoint = torch.load(args.test, map_location=torch.device('cpu'))
            try:
                model_without_dp.load_state_dict(checkpoint['state_dict'])
            except:
                print('=> [Warning]: weight structure is not equal to test model; Load anyway ==')
                model_without_dp = neq_load_customized(model_without_dp, checkpoint['state_dict'])
            epoch = checkpoint['epoch']
            print("=> loaded testing checkpoint '{}' (epoch {})".format(args.test, checkpoint['epoch']))
        elif args.test == 'random':
            epoch = 0
            print("=> loaded random weights")
        else:
            print("=> no checkpoint found at '{}'".format(args.test))
            sys.exit(0)

        args.logger = Logger(path=os.path.dirname(args.test))
        _, test_dataset = get_data(None, 'test')
        test_loss, test_acc = test(test_dataset, model, criterion, device, epoch, args)
        sys.exit()

    ### restart training ###
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading resumed checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch']
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            model_without_dp.load_state_dict(checkpoint['state_dict'])
            try:
                optimizer.load_state_dict(checkpoint['optimizer'])
            except:
                print('[WARNING] Not loading optimizer states')
            print("=> loaded resumed checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            sys.exit(0)

    if (not args.resume) and args.pretrain:
        if args.pretrain == 'random':
            print('=> using random weights')
        elif os.path.isfile(args.pretrain):
            print("=> loading pretrained checkpoint '{}'".format(args.pretrain))
            checkpoint = torch.load(args.pretrain, map_location=torch.device('cpu'))
            model_without_dp = neq_load_customized(model_without_dp, checkpoint['state_dict'])
            print("=> loaded pretrained checkpoint '{}' (epoch {})".format(args.pretrain, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrain))
            sys.exit(0)

    ### data ###
    transform = transforms.Compose([
        A.RandomSizedCrop(consistent=True, size=224, p=1.0),
        A.Scale(size=(args.img_dim, args.img_dim)),
        A.RandomHorizontalFlip(consistent=True),
        A.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25, p=0.3, consistent=True),
        A.ToTensor(),
        A.Normalize()
    ])
    val_transform = transforms.Compose([
        A.RandomSizedCrop(consistent=True, size=224, p=0.3),
        A.Scale(size=(args.img_dim, args.img_dim)),
        A.RandomHorizontalFlip(consistent=True),
        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.3, consistent=True),
        A.ToTensor(),
        A.Normalize()
    ])

    train_loader, _ = get_data(transform, 'train')
    val_loader, _ = get_data(val_transform, 'val')

    # setup tools
    args.img_path, args.model_path = set_path(args)
    args.writer_val = SummaryWriter(logdir=os.path.join(args.img_path, 'val'))
    args.writer_train = SummaryWriter(logdir=os.path.join(args.img_path, 'train'))
    torch.backends.cudnn.benchmark = True

    ### main loop ###
    for epoch in range(args.start_epoch, args.epochs):
        train_loss, train_acc = train_one_epoch(train_loader, model, criterion,
                                                optimizer, device, epoch, args)
        val_loss, val_acc = validate(val_loader, model, criterion, device, epoch, args)
        lr_scheduler.step(epoch)

        # save check_point
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)
        save_dict = {
            'epoch': epoch,
            'backbone': args.net,
            'state_dict': model_without_dp.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
            'iteration': args.iteration}
        save_checkpoint(save_dict, is_best,
                        filename=os.path.join(args.model_path, 'epoch%s.pth.tar' % str(epoch)),
                        keep_all=False)

    print('Training from ep %d to ep %d finished' % (args.start_epoch, args.epochs))
    sys.exit(0)
def main(args):
    model = CoCLR(args.net, args.moco_dim, args.moco_k, args.moco_m, args.moco_t,
                  topk=args.topk, reverse=args.reverse)
    if args.reverse:
        print('[Warning] using RGB-Mining to help flow')
    else:
        print('[Warning] using Flow-Mining to help RGB')
    args.num_seq = 2

    args.img_path, args.model_path, args.exp_path = set_path(args)
    args.writer_train = SummaryWriter(logdir='runs')

    torch.cuda.set_device(args.gpu)
    model = model.cuda(args.gpu)
    model_without_ddp = model  # no DDP wrapper in this single-GPU variant

    params = []
    for name, param in model.named_parameters():
        params.append({'params': param})
    optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    args.iteration = 1
    best_acc = 0  # initialized before the resume block so a resumed best_acc is kept

    ### data ###
    transform_train = get_transform('train', args)
    train_loader = get_dataloader(get_data(transform_train, args), args)
    transform_train_cuda = transforms.Compose([
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225], channel=1)
    ])
    n_data = len(train_loader.dataset)

    ### restart training ###
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch'] + 1
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            state_dict = checkpoint['state_dict']

            try:
                model_without_ddp.load_state_dict(state_dict)
            except:
                print('[WARNING] Non-Equal load for resuming training!')
                neq_load_customized(model_without_ddp, state_dict, verbose=True)
            print("=> load resumed checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
            optimizer.load_state_dict(checkpoint['optimizer'])

    elif args.pretrain != ['random', 'random']:
        # first path: weights to be trained
        # second path: weights as the oracle, not trained
        if os.path.isfile(args.pretrain[1]):  # second network --> load as sampler
            checkpoint = torch.load(args.pretrain[1], map_location=torch.device('cpu'))
            second_dict = checkpoint['state_dict']
            new_dict = {}
            for k, v in second_dict.items():  # only take the encoder_q
                if 'encoder_q.' in k:
                    k = k.replace('encoder_q.', 'sampler.')
                    new_dict[k] = v
            second_dict = new_dict

            new_dict = {}  # remove queue, queue_ptr
            for k, v in second_dict.items():
                if 'queue' not in k:
                    new_dict[k] = v
            second_dict = new_dict
        else:
            print("=> NO Oracle checkpoint found at '{}', use random init".format(args.pretrain[1]))
            second_dict = {}

        if os.path.isfile(args.pretrain[0]):  # first network --> load both encoder q & k
            checkpoint = torch.load(args.pretrain[0], map_location=torch.device('cpu'))
            first_dict = checkpoint['state_dict']

            new_dict = {}  # remove queue, queue_ptr
            for k, v in first_dict.items():
                if 'queue' not in k:
                    new_dict[k] = v
            first_dict = new_dict

            # update both q and k with q
            new_dict = {}
            for k, v in first_dict.items():  # only take the encoder_q
                if 'encoder_q.' in k:
                    new_dict[k] = v
                    k = k.replace('encoder_q.', 'encoder_k.')
                    new_dict[k] = v
            first_dict = new_dict
        else:
            print("=> NO Training checkpoint found at '{}', use random init".format(args.pretrain[0]))
            first_dict = {}

        state_dict = {**first_dict, **second_dict}
        try:
            del state_dict['queue_label']  # always re-fill the queue
        except:
            pass
        neq_load_customized(model_without_ddp, state_dict, verbose=True)

    torch.backends.cudnn.benchmark = True

    ### main loop ###
    for epoch in range(args.start_epoch, args.epochs):
        np.random.seed(epoch)
        random.seed(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        _, train_acc = train_one_epoch(train_loader, model, criterion, optimizer,
                                       transform_train_cuda, epoch, args)

        if (epoch % args.save_freq == 0) or (epoch == args.epochs - 1):
            is_best = train_acc > best_acc
            best_acc = max(train_acc, best_acc)
            state_dict = model.state_dict()
            save_dict = {
                'epoch': epoch,
                'state_dict': state_dict,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'iteration': args.iteration}
            save_checkpoint(save_dict, is_best, gap=args.save_freq,
                            filename=os.path.join(args.model_path, 'epoch%d.pth.tar' % epoch),
                            keep_all=False)

    print('Training from ep %d to ep %d finished' % (args.start_epoch, args.epochs))
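# Hedged sketch, not the repository's actual utility: `save_checkpoint` is called above with
# (save_dict, is_best, gap, filename=..., keep_all=...) but is not defined in this section.
# A minimal implementation consistent with those call sites writes the checkpoint, optionally
# removes the previous periodic file, and keeps a separate copy of the best weights.
import os
import shutil
import torch

def _save_checkpoint_sketch(state, is_best=False, gap=1,
                            filename='models/checkpoint.pth.tar', keep_all=False):
    torch.save(state, filename)
    # drop the checkpoint saved `gap` epochs ago unless asked to keep everything
    previous = os.path.join(os.path.dirname(filename),
                            'epoch%d.pth.tar' % (state['epoch'] - gap))
    if not keep_all and previous != filename and os.path.exists(previous):
        os.remove(previous)
    if is_best:
        shutil.copyfile(filename,
                        os.path.join(os.path.dirname(filename), 'model_best.pth.tar'))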