def train_detector(cfg):
    # model
    detector = build_detector(cfg.detector)
    detector.cuda()

    # data
    train_dataset = build_dataset(cfg.dataset.train)
    val_dataset = build_dataset(cfg.dataset.val)
    train_dataloader = DataLoader(train_dataset, batch_size=cfg.img_batch_size)
    val_dataloader = DataLoader(val_dataset, batch_size=cfg.img_batch_size)

    # runner
    runner = Runner(detector, batch_processor, cfg.optimizer, cfg.work_dir)
    runner.register_training_hooks(lr_config=cfg.lr_hook_cfg,
                                   optimizer_config=cfg.optimizer_hook_cfg,
                                   checkpoint_config=cfg.checkpoint_hook_cfg,
                                   log_config=cfg.log_hooks_cfg)

    # checkpoint
    if cfg.load_from and os.path.exists(cfg.load_from):
        load_checkpoint(detector, cfg.load_from, logger=runner.logger)
        runner.logger.info('Load checkpoint from %s...' % cfg.load_from)

    # start training
    runner.run([train_dataloader, val_dataloader],
               [('train', 1), ('val', 1)], cfg.epoch)

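# The Runner above expects a `batch_processor` callable that is not shown in
# this snippet. A minimal mmcv-style sketch follows; the exact signature and
# loss handling are assumptions, not this repository's actual implementation:
def batch_processor(model, data, train_mode):
    # forward pass; detectors typically return a dict of named loss tensors
    losses = model(**data)
    loss = sum(v.mean() for v in losses.values())
    log_vars = {k: v.mean().item() for k, v in losses.items()}
    return dict(loss=loss, log_vars=log_vars, num_samples=len(data['img']))
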
def main():
    # enable mixed-precision computation if desired
    if args.amp:
        mixed_precision.enable_mixed_precision()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # get the dataset
    dataset = get_dataset(args.dataset)
    _, test_loader, _ = build_dataset(dataset=dataset,
                                      batch_size=args.batch_size,
                                      input_dir=args.input_dir)

    torch_device = torch.device("cuda")
    checkpointer = Checkpointer()
    model = checkpointer.restore_model_from_checkpoint(args.checkpoint_path)
    model = model.to(torch_device)
    model, _ = mixed_precision.initialize(model, None)

    test_stats = AverageMeterSet()
    test(model, test_loader, torch_device, test_stats)
    stat_str = test_stats.pretty_string(ignore=model.tasks)
    print(stat_str)

def do_projection(dataset, do_pca=False):
    batch_size = 128
    num_batch_pca = 20
    counter = 0
    _, test_loader, num_classes = datasets.build_dataset(dataset, batch_size)
    all_feats = []
    classes_list = []
    out_embeddings = 'analysis/pca_embeds.npy'
    iter_dataset = iter(test_loader)
    with torch.no_grad():
        while counter < num_batch_pca:
            images, classes = next(iter_dataset)
            counter += 1
            global_ft = model(images)
            all_feats.append(global_ft.cpu().numpy())
            classes_list += [int(x) for x in classes.cpu().numpy()]
    all_feats = np.concatenate(all_feats, 0)
    if do_pca:
        pca = obtain_PCA_embedding(all_feats)
        xy = obtain_2d_coords(images, model, pca)
    else:
        xy = sklearn.manifold.TSNE().fit_transform(all_feats)
    return xy, classes_list

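# `obtain_PCA_embedding` is referenced but not defined in this snippet. A
# minimal scikit-learn sketch of the intended behavior (the component count
# and return convention are assumptions):
from sklearn.decomposition import PCA

def obtain_PCA_embedding(feats, n_components=2):
    # fit a low-dimensional PCA projection on the collected features
    pca = PCA(n_components=n_components)
    pca.fit(feats)
    return pca
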
def main(args):
    if args.dataset not in _available_datasets:
        raise NotImplementedError
    dataset = build_dataset(
        name=args.dataset,
        shape=[args.height, args.width],
    )
    model = build_mobilenetv3(
        args.model_type,
        input_shape=(args.height, args.width, dataset["channels"]),
        num_classes=dataset["num_classes"],
        width_multiplier=args.width_multiplier,
    )
    if args.optimizer not in _available_optimizers:
        raise NotImplementedError
    model.load_weights(args.model_path)
    model.compile(
        optimizer=_available_optimizers.get(args.optimizer)(args.lr),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    model.evaluate(
        dataset["test"].make_one_shot_iterator(),
        steps=(dataset["num_test"] // args.valid_batch_size) + 1,
    )

def main():
    # create target output dir if it doesn't exist yet
    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    # enable mixed-precision computation if desired; --amp and --apex are
    # mutually exclusive
    amp = ""
    if args.amp:
        amp = "torch"
        if args.apex:
            print("Error: Cannot use both --amp and --apex.")
            exit()
    if args.apex:
        amp = "apex"
        mixed_precision.enable_mixed_precision()

    # set the RNG seeds (probably more hidden elsewhere...)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # get the dataset
    dataset = get_dataset(args.dataset)
    encoder_size = get_encoder_size(dataset)

    # get a helper object for tensorboard logging
    log_dir = os.path.join(args.output_dir, args.run_name)
    stat_tracker = StatTracker(log_dir=log_dir)

    # get dataloaders for training and testing
    train_loader, test_loader, num_classes = \
        build_dataset(dataset=dataset,
                      batch_size=args.batch_size,
                      input_dir=args.input_dir,
                      labeled_only=args.classifiers)

    torch_device = torch.device('cuda')
    checkpointer = Checkpointer(args.output_dir)
    if args.cpt_load_path:
        model = checkpointer.restore_model_from_checkpoint(
            args.cpt_load_path, training_classifier=args.classifiers)
    else:
        # create new model with random parameters
        model = Model(ndf=args.ndf, n_classes=num_classes, n_rkhs=args.n_rkhs,
                      tclip=args.tclip, n_depth=args.n_depth,
                      encoder_size=encoder_size, use_bn=(args.use_bn == 1))
        model.init_weights(init_scale=1.0)
        checkpointer.track_new_model(model)
    model = model.to(torch_device)

    # select which type of training to do
    task = train_classifiers if args.classifiers else train_self_supervised
    task(model, args.learning_rate, dataset, train_loader, test_loader,
         stat_tracker, checkpointer, args.output_dir, torch_device, amp)

def build_data_loader(args, phase='train'):
    # shuffle only during training
    data_loader = data.DataLoader(
        dataset=build_dataset(args, phase),
        batch_size=args.batch_size,
        shuffle=(phase == 'train'),
        num_workers=args.num_workers,
    )
    return data_loader

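# Example usage (a hypothetical `args` namespace carrying batch_size and
# num_workers is assumed):
#   train_loader = build_data_loader(args, phase='train')
#   val_loader = build_data_loader(args, phase='val')
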
def creat_dataset(self, config):
    print("creating data")
    self.transform_train, self.transform_test = build_aug(config.transform)
    self.dataset_train, self.dataset_test = build_dataset(
        config, self.transform_train, self.transform_test)
    self.data_loader = DataLoader(self.dataset_train,
                                  batch_size=config.batch_size,
                                  shuffle=True)

def main():
    # create target output dir if it doesn't exist yet
    if not os.path.isdir(args['output_dir']):
        os.mkdir(args['output_dir'])

    # enable mixed-precision computation if desired
    if args['amp']:
        mixed_precision.enable_mixed_precision()

    # set the RNG seeds (probably more hidden elsewhere...)
    torch.manual_seed(args['seed'])
    torch.cuda.manual_seed(args['seed'])

    # get the dataset
    dataset = get_dataset(args['dataset'])
    encoder_size = get_encoder_size(dataset)

    # get a helper object for tensorboard logging
    log_dir = os.path.join(args['output_dir'], args['run_name'])
    stat_tracker = StatTracker(log_dir=log_dir)

    # get dataloaders for training and testing
    train_loader, test_loader, num_classes = \
        build_dataset(dataset=dataset,
                      batch_size=args['batch_size'],
                      input_dir=args['input_dir'],
                      labeled_only=args['classifiers'])

    torch_device = torch.device('cuda')
    checkpointer = Checkpointer(args['output_dir'])
    if args['cpt_load_path']:
        model = checkpointer.restore_model_from_checkpoint(
            args['cpt_load_path'], training_classifier=args['classifiers'])
    else:
        # create new model with random parameters
        model = Model(ndf=args['ndf'], n_classes=num_classes,
                      n_rkhs=args['n_rkhs'], tclip=args['tclip'],
                      n_depth=args['n_depth'], encoder_size=encoder_size,
                      use_bn=(args['use_bn'] == 1))
        model.init_weights(init_scale=1.0)
        checkpointer.track_new_model(model)
    model = model.to(torch_device)

    # select which type of training to do; the classifier task takes priority,
    # then the decoder task, otherwise self-supervised training
    if args['classifiers']:
        task = train_classifiers
    elif args['decoder']:
        task = train_decoder
    else:
        task = train_self_supervised
    task(model, args['learning_rate'], dataset, train_loader, test_loader,
         stat_tracker, checkpointer, args['output_dir'], torch_device)

def build_dataloader(config, num_workers, distributed):
    import torch.utils.data as data
    import torch.utils.data.distributed
    import datasets

    dataset = build_dataset(config)
    loader = get_loader(
        dataset=dataset,
        dataset_config=config,
        num_dataloader_workers=num_workers,
        pin_memory=False,  ### Questionable
    )
    return loader

def set_up(self):
    # model
    self.model = build_model(self.cfg)
    self.logger.info(f"Building model {self.cfg.model.name} ...")

    # mutator
    # self.logger.info('Cell choices: {}'.format(model.layers[0].nodes[0].cell_x.op_choice.choices))
    self.mutator = build_mutator(self.model, self.cfg)
    for x in self.mutator.mutables:
        if isinstance(x, nni.nas.pytorch.mutables.LayerChoice):
            self.logger.info('Cell choices: {}'.format(x.choices))
            break
    self.logger.info(f"Building mutator {self.cfg.mutator.name} ...")

    # dataset
    self.batch_size = self.cfg.dataset.batch_size
    self.workers = self.cfg.dataset.workers
    self.dataset_train, self.dataset_valid = build_dataset(self.cfg)
    self.logger.info(f"Building dataset {self.cfg.dataset.name} ...")

    # loss
    self.loss = build_loss_fn(self.cfg)
    self.logger.info(f"Building loss function {self.cfg.loss.name} ...")

    # optimizer
    self.optimizer = generate_optimizer(
        model=self.model,
        optim_name=self.cfg.optim.name,
        lr=self.cfg.optim.base_lr,
        momentum=self.cfg.optim.momentum,
        weight_decay=self.cfg.optim.weight_decay)
    self.logger.info(f"Building optimizer {self.cfg.optim.name} ...")

    # scheduler
    self.scheduler_params = parse_cfg_for_scheduler(
        self.cfg, self.cfg.optim.scheduler.name)
    self.lr_scheduler = generate_scheduler(self.optimizer,
                                           self.cfg.optim.scheduler.name,
                                           **self.scheduler_params)
    self.logger.info(
        f"Building optimizer scheduler {self.cfg.optim.scheduler.name} ...")

    # miscellaneous
    self.num_epochs = self.cfg.trainer.num_epochs
    self.log_frequency = self.cfg.logger.log_frequency
    self.start_epoch = 0

def benchmark():
    args, _ = get_benckmark_arg_parser().parse_known_args()
    main_args = get_main_args_parser().parse_args(_)
    assert args.warm_iters < args.num_iters and args.num_iters > 0 and args.warm_iters >= 0
    assert args.batch_size > 0
    assert args.resume is None or os.path.exists(args.resume)
    dataset = build_dataset('val', main_args)
    model, _, _ = build_model(main_args)
    model.cuda()
    model.eval()
    if args.resume is not None:
        ckpt = torch.load(args.resume,
                          map_location=lambda storage, loc: storage)
        model.load_state_dict(ckpt['model'])
    inputs = nested_tensor_from_tensor_list(
        [dataset.__getitem__(0)[0].cuda() for _ in range(args.batch_size)])
    t = measure_average_inference_time(model, inputs, args.num_iters,
                                       args.warm_iters)
    # average seconds per batch -> images per second
    return 1.0 / t * args.batch_size

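# Example driver: since measure_average_inference_time returns seconds per
# batch, the value returned by benchmark() is a throughput in images per
# second (this driver itself is an assumption, not part of the original):
if __name__ == '__main__':
    fps = benchmark()
    print('Inference throughput: {:.2f} images/s'.format(fps))
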
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    # if args.distributed:
    #     sampler_train = DistributedSampler(dataset_train)
    #     sampler_val = DistributedSampler(dataset_val, shuffle=False)
    # else:
    #     sampler_train = torch.utils.data.RandomSampler(dataset_train)
    #     sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    # batch_sampler_train = torch.utils.data.BatchSampler(
    #     sampler_train, args.batch_size, drop_last=True)
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    # data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
    #                                collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn_leimao,
                                 num_workers=args.num_workers)

    for inputs, labels in data_loader_val:
        print("---------------------")
        print(inputs.shape)
        print(labels)

def main():
    # create target output dir if it doesn't exist yet
    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    # enable mixed-precision computation if desired
    if args.amp:
        mixed_precision.enable_mixed_precision()

    # set the RNG seeds (probably more hidden elsewhere...)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # get the dataset
    dataset = get_dataset(args.dataset)

    # get a helper object for tensorboard logging
    log_dir = os.path.join(args.output_dir, args.run_name)
    stat_tracker = StatTracker(log_dir=log_dir)

    # get dataloaders for training and testing
    train_loader, test_loader, num_classes = \
        build_dataset(dataset=dataset,
                      batch_size=args.batch_size,
                      input_dir=args.input_dir,
                      labeled_only=args.classifiers)

    torch_device = torch.device('cuda')

    # create new model with random parameters
    model = Model(ndf=args.ndf, n_classes=num_classes, n_rkhs=args.n_rkhs,
                  tclip=args.tclip, n_depth=args.n_depth, dataset=dataset,
                  use_bn=(args.use_bn == 1))

    # restore model parameters from a checkpoint if requested
    checkpoint = Checkpoint(model, args.cpt_load_path, args.output_dir)

    model = model.to(torch_device)

    # select which type of training to do
    task = train_classifiers if args.classifiers else train_self_supervised

    # do the real stuff...
    task(model, args.learning_rate, dataset, train_loader, test_loader,
         stat_tracker, checkpoint, args.output_dir, torch_device)

def main(args):
    if args.dataset not in _available_datasets:
        raise NotImplementedError
    dataset = build_dataset(name=args.dataset,
                            shape=(args.height, args.width),
                            train_batch_size=args.train_batch_size,
                            valid_batch_size=args.valid_batch_size)
    model = build_mobilenetv3(
        args.model_type,
        input_shape=(args.height, args.width, dataset["channels"]),
        num_classes=dataset["num_classes"],
        width_multiplier=args.width_multiplier,
        l2_reg=args.l2_reg,
    )
    if args.optimizer not in _available_optimizers:
        raise NotImplementedError
    model.compile(
        optimizer=_available_optimizers.get(args.optimizer)(args.lr),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=args.logdir),
    ]
    model.fit(
        dataset["train"].make_one_shot_iterator(),
        steps_per_epoch=(dataset["num_train"] // args.train_batch_size) + 1,
        epochs=args.num_epoch,
        validation_data=dataset["test"],
        validation_steps=(dataset["num_test"] // args.valid_batch_size) + 1,
        callbacks=callbacks,
    )
    model.save_weights(
        f"mobilenetv3_{args.model_type}_{args.dataset}_{args.num_epoch}.h5")

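# The `_available_optimizers` registry used above is not shown; a plausible
# sketch as a plain name-to-constructor mapping (its real contents are an
# assumption):
_available_optimizers = {
    "adam": tf.keras.optimizers.Adam,
    "sgd": tf.keras.optimizers.SGD,
    "rmsprop": tf.keras.optimizers.RMSprop,
}
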
def test(opt):
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    Dataset = build_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    Logger(opt)
    Detector = build_detector(opt.task)

    split = 'val' if not opt.trainval else 'test'
    dataset = Dataset(opt, split)
    detector = Detector(opt)

    results = {}
    num_iters = len(dataset)
    bar = Bar('{}'.format(opt.exp_id), max=num_iters)
    time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
    avg_time_stats = {t: AverageMeter() for t in time_stats}
    for ind in range(num_iters):
        img_id = dataset.images[ind]
        img_info = dataset.coco.loadImgs(ids=[img_id])[0]
        img_path = os.path.join(dataset.img_dir, img_info['file_name'])
        if opt.task == 'ddd':
            ret = detector.run(img_path, img_info['calib'])
        else:
            ret = detector.run(img_path)
        results[img_id] = ret['results']

        Bar.suffix = '[{0}/{1}]|Tot: {total:} |ETA: {eta:} '.format(
            ind, num_iters, total=bar.elapsed_td, eta=bar.eta_td)
        for t in avg_time_stats:
            avg_time_stats[t].update(ret[t])
            Bar.suffix = Bar.suffix + '|{} {:.3f} '.format(
                t, avg_time_stats[t].avg)
        bar.next()
    bar.finish()
    dataset.run_eval(results, opt.save_dir)

def prefetch_test(opt):
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    Dataset = build_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    Logger(opt)
    Detector = build_detector(opt.task)

    split = 'val' if not opt.trainval else 'test'
    dataset = Dataset(opt, split)
    detector = Detector(opt)

    data_loader = torch.utils.data.DataLoader(
        PrefetchDataset(opt, dataset, detector.pre_process),
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True)

    results = {}
    num_iters = len(dataset)
    bar = Bar('{}'.format(opt.exp_id), max=num_iters)
    time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
    avg_time_stats = {t: AverageMeter() for t in time_stats}
    for ind, (img_id, pre_processed_images) in enumerate(data_loader):
        ret = detector.run(pre_processed_images)
        results[img_id.numpy().astype(np.int32)[0]] = ret['results']
        Bar.suffix = '[{0}/{1}]|Tot: {total:} |ETA: {eta:} '.format(
            ind, num_iters, total=bar.elapsed_td, eta=bar.eta_td)
        for t in avg_time_stats:
            avg_time_stats[t].update(ret[t])
            Bar.suffix = Bar.suffix + '|{} {tm.val:.3f}s ({tm.avg:.3f}s) '.format(
                t, tm=avg_time_stats[t])
        bar.next()
    bar.finish()
    dataset.run_eval(results, opt.save_dir)

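# Sketch of the PrefetchDataset wrapper used above (CenterNet-style; the exact
# fields and pre_process signature are assumptions): it moves image loading
# and pre-processing into DataLoader workers so the GPU never waits on I/O.
import os
import cv2
import torch

class PrefetchDataset(torch.utils.data.Dataset):
    def __init__(self, opt, dataset, pre_process_func):
        self.images = dataset.images
        self.load_image_func = dataset.coco.loadImgs
        self.img_dir = dataset.img_dir
        self.pre_process_func = pre_process_func
        self.opt = opt

    def __getitem__(self, index):
        img_id = self.images[index]
        img_info = self.load_image_func(ids=[img_id])[0]
        img_path = os.path.join(self.img_dir, img_info['file_name'])
        image = cv2.imread(img_path)
        # pre-process once per test scale so the detector can consume the
        # batch directly
        images, meta = {}, {}
        for scale in self.opt.test_scales:
            images[scale], meta[scale] = self.pre_process_func(image, scale)
        return img_id, {'images': images, 'image': image, 'meta': meta}

    def __len__(self):
        return len(self.images)
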
def init_basic_settings(self):
    """Init train_epochs, device, loss_fn, dataset, and dataloaders."""
    # train epochs (default to 1 when the config does not provide it)
    try:
        self.train_epochs = self.cfg.args.train_epochs
    except AttributeError:
        self.train_epochs = 1

    # device
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    self.logger.info(f"Using device: {self.device}")

    # loss_fn
    self.loss_fn = build_loss_fn(self.cfg)
    self.loss_fn.to(self.device)
    self.logger.info("Building loss function ...")

    # dataset
    self.train_dataset, self.test_dataset = build_dataset(self.cfg)

    # dataloader
    self.train_loader = torch.utils.data.DataLoader(
        self.train_dataset,
        batch_size=self.cfg.dataset.batch_size,
        shuffle=True,
        num_workers=self.cfg.dataset.workers,
        pin_memory=True)
    self.test_loader = torch.utils.data.DataLoader(
        self.test_dataset,
        batch_size=self.cfg.dataset.batch_size,
        shuffle=False,
        num_workers=self.cfg.dataset.workers,
        pin_memory=True)
    self.logger.info("Building dataset and dataloader ...")

def main(args):
    utils.init_distributed_mode(args)
    update_config_from_file(args.cfg)

    print(args)
    args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    if args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
        if args.dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print(
                    'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=int(2 * args.batch_size),
        sampler=sampler_val,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False)

    mixup_fn = None
    mixup_active = (args.mixup > 0 or args.cutmix > 0.
                    or args.cutmix_minmax is not None)
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print("Creating SuperVisionTransformer")
    print(cfg)
    model = Vision_TransformerSuper(
        img_size=args.input_size,
        patch_size=args.patch_size,
        embed_dim=cfg.SUPERNET.EMBED_DIM,
        depth=cfg.SUPERNET.DEPTH,
        num_heads=cfg.SUPERNET.NUM_HEADS,
        mlp_ratio=cfg.SUPERNET.MLP_RATIO,
        qkv_bias=True,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        gp=args.gp,
        num_classes=args.nb_classes,
        max_relative_position=args.max_relative_position,
        relative_position=args.relative_position,
        change_qkv=args.change_qkv,
        abs_pos=not args.no_abs_pos)

    choices = {
        'num_heads': cfg.SEARCH_SPACE.NUM_HEADS,
        'mlp_ratio': cfg.SEARCH_SPACE.MLP_RATIO,
        'embed_dim': cfg.SEARCH_SPACE.EMBED_DIM,
        'depth': cfg.SEARCH_SPACE.DEPTH
    }

    model.to(device)
    if args.teacher_model:
        teacher_model = create_model(
            args.teacher_model,
            pretrained=True,
            num_classes=args.nb_classes,
        )
        teacher_model.to(device)
        teacher_loss = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        teacher_model = None
        teacher_loss = None

    model_ema = None

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()
    lr_scheduler, _ = create_scheduler(args, optimizer)

    # criterion = LabelSmoothingCrossEntropy()
    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    output_dir = Path(args.output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
    # save config for later experiments
    with open(output_dir / "config.yaml", 'w') as f:
        f.write(args_text)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema,
                                               checkpoint['model_ema'])

    retrain_config = None
    if args.mode == 'retrain' and "RETRAIN" in cfg:
        retrain_config = {
            'layer_num': cfg.RETRAIN.DEPTH,
            'embed_dim': [cfg.RETRAIN.EMBED_DIM] * cfg.RETRAIN.DEPTH,
            'num_heads': cfg.RETRAIN.NUM_HEADS,
            'mlp_ratio': cfg.RETRAIN.MLP_RATIO
        }
    if args.eval:
        print(retrain_config)
        test_stats = evaluate(data_loader_val,
                              model,
                              device,
                              mode=args.mode,
                              retrain_config=retrain_config)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print("Start training")
    start_time = time.time()
    max_accuracy = 0.0

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            loss_scaler,
            args.clip_grad,
            model_ema,
            mixup_fn,
            amp=args.amp,
            teacher_model=teacher_model,
            teach_loss=teacher_loss,
            choices=choices,
            mode=args.mode,
            retrain_config=retrain_config,
        )

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        # 'model_ema': get_state_dict(model_ema),
                        'scaler': loss_scaler.state_dict(),
                        'args': args,
                    }, checkpoint_path)

        test_stats = evaluate(data_loader_val,
                              model,
                              device,
                              amp=args.amp,
                              choices=choices,
                              mode=args.mode,
                              retrain_config=retrain_config)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        max_accuracy = max(max_accuracy, test_stats["acc1"])
        print(f'Max accuracy: {max_accuracy:.2f}%')

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

def main():
    args = parse_args()
    args.pretrain = False
    root_path = 'exps/exp_{}'.format(args.exp)
    if not os.path.exists(root_path):
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    base_lr = args.lr  # base learning rate

    train_dataset, val_dataset = build_dataset(args.dataset, args.data_root,
                                               args.train_list)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=args.num_workers,
                                             pin_memory=True)
    model = VNet(args.n_channels, args.n_classes).cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=0.0005)
    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7)
    model = torch.nn.DataParallel(model)
    model.train()

    if args.resume is None:
        assert os.path.exists(args.load_path)
        state_dict = model.state_dict()
        print("Loading weights...")
        pretrain_state_dict = torch.load(args.load_path,
                                         map_location="cpu")['state_dict']
        # drop pretrained keys that do not exist in the current model
        for k in list(pretrain_state_dict.keys()):
            if k not in state_dict:
                del pretrain_state_dict[k]
        model.load_state_dict(pretrain_state_dict)
        print("Loaded weights")
    else:
        print("Resuming from {}".format(args.resume))
        checkpoint = torch.load(args.resume, map_location="cpu")
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        model.load_state_dict(checkpoint['state_dict'])

    logger = Logger(root_path)
    saver = Saver(root_path)
    for epoch in range(args.start_epoch, args.epochs):
        train(model, train_loader, optimizer, logger, args, epoch)
        validate(model, val_loader, optimizer, logger, saver, args, epoch)
        adjust_learning_rate(args, optimizer, epoch)

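# `adjust_learning_rate` is called above (and in the distributed variant
# further below) but not shown. A common step-decay sketch; the actual
# schedule in this codebase may differ:
def adjust_learning_rate(args, optimizer, epoch):
    # decay the base learning rate by 10x every 30 epochs (assumed interval)
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
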
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print('Loading data')
    dataset_train = build_dataset(args.train_set, args.dataset_year, args)
    dataset_val = build_dataset(args.val_set, args.dataset_year, args)
    base_ds = get_coco_api_from_dataset(dataset_val)

    print('Creating data loaders')
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True,
    )
    data_loader_train = DataLoader(
        dataset_train,
        batch_sampler=batch_sampler_train,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )
    data_loader_val = DataLoader(
        dataset_val,
        args.batch_size,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )

    print('Creating model; args.return_criterion is always set to True')
    args.return_criterion = True
    model = yolov5s(num_classes=args.num_classes)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu],
        )
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params,
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    if args.lr_scheduler == 'cosine':
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.t_max)
    elif args.lr_scheduler == 'multi-step':
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=args.lr_steps,
            gamma=args.lr_gamma,
        )
    else:
        raise ValueError(f'scheduler {args.lr_scheduler} not supported')

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_val, base_ds, device)
        return

    print('Start training')
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader_train, device, epoch,
                        args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch,
                },
                output_dir.joinpath(f'model_{epoch}.pth'),
            )

        # evaluate after every epoch
        # evaluate(model, criterion, data_loader_val, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f'Training time {total_time_str}')

def main():
    args = parse_args()
    if args.turnon < 0:
        args.pretrain = True
    else:
        args.pretrain = False
    print("Using GPU: {}".format(args.local_rank))

    root_path = 'exps/exp_{}'.format(args.exp)
    if args.local_rank == 0 and not os.path.exists(root_path):
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_dataset, val_dataset = build_dataset(args.dataset, args.data_root,
                                               args.train_list,
                                               sampling=args.sampling)
    args.world_size = len(args.gpu.split(","))
    if args.world_size > 1:
        os.environ['MASTER_PORT'] = args.port
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group('nccl')
        device = torch.device('cuda:{}'.format(args.local_rank))
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=len(args.gpu.split(",")),
            rank=args.local_rank)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        sampler=train_sampler,
        num_workers=args.num_workers,
        pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=args.num_workers,
                                             pin_memory=True)
    model = VNet(args.n_channels, args.n_classes, input_size=64,
                 pretrain=True).cuda(args.local_rank)
    model_ema = VNet(args.n_channels, args.n_classes, input_size=64,
                     pretrain=True).cuda(args.local_rank)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=0.0005)
    if args.world_size > 1:
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)
        model_ema = DDP(model_ema,
                        device_ids=[args.local_rank],
                        output_device=args.local_rank,
                        find_unused_parameters=True)

    model.train()
    model_ema.load_state_dict(model.state_dict())
    print("Loaded weights")

    logger = Logger(root_path)
    saver = Saver(root_path, save_freq=args.save_freq)
    if args.sampling == 'default':
        contrast = RGBMoCo(128, K=4096,
                           T=args.temperature).cuda(args.local_rank)
    elif args.sampling == 'layerwise':
        contrast = RGBMoCoNew(128, K=4096,
                              T=args.temperature).cuda(args.local_rank)
    else:
        raise ValueError("unsupported sampling method")
    criterion = torch.nn.CrossEntropyLoss()

    flag = False
    for epoch in range(args.start_epoch, args.epochs):
        # the sampler only exists in the distributed setting
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        train(model, model_ema, train_loader, optimizer, logger, saver, args,
              epoch, contrast, criterion)
        validate(model_ema, val_loader, optimizer, logger, saver, args, epoch)
        adjust_learning_rate(args, optimizer, epoch)

def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    # Save our Wandb metadata
    if not args.no_wb:
        wandb.init(entity='dl-project',
                   project='dl-final-project',
                   name=args.wb_name,
                   notes=args.wb_notes,
                   reinit=True)
        wandb.config.epochs = args.epochs

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    # visualize_video(model, postprocessors)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of trainable params:', n_parameters)
    wandb.config.n_parameters = n_parameters
    wandb.config.n_trainable_parameters = n_parameters  # better name

    # Log total # of model parameters (including frozen) to W&B
    n_total_parameters = sum(p.numel() for p in model.parameters())
    print('total number of parameters:', n_total_parameters)
    wandb.config.n_total_parameters = n_total_parameters

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    # For visualization we want the raw images without any normalization or
    # random resizing
    dataset_val_without_resize = CocoDetection(
        "data/coco/val2017",
        annFile="data/coco/annotations/instances_val2017.json",
        transforms=T.Compose([T.ToTensor()]))

    # Save metadata about training + val datasets and batch size
    wandb.config.len_dataset_train = len(dataset_train)
    wandb.config.len_dataset_val = len(dataset_val)
    wandb.config.batch_size = args.batch_size

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_backbone_names)
            and p.requires_grad
        ],
        "lr": args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr * args.lr_linear_proj_mult,
    }]

    # Not sure if we should save all hyperparameters in wandb.config?
    # just start with a few important ones
    wandb.config.lr = args.lr
    wandb.config.lr_backbone = args.lr_backbone

    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            # print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing an experiment that resumes from a
            # checkpoint and also modifies the lr scheduler (e.g., decrease lr
            # in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print(
                    'Warning: (hack) args.override_resumed_lr_drop is set to True, '
                    'so args.lr_drop would override lr_drop in resumed lr_scheduler.'
                )
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'],
                        optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

    if args.eval:
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)

        test_stats, coco_evaluator = evaluate(model, criterion,
                                              postprocessors,
                                              data_loader_val, base_ds,
                                              device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_file_for_wb = str(
                output_dir / f'{wandb.run.id}_checkpoint{epoch:04}.pth')
            checkpoint_paths = [
                output_dir / 'checkpoint.pth', checkpoint_file_for_wb
            ]
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
            # Save model checkpoint to W&B
            wandb.save(checkpoint_file_for_wb)

        # Generate visualizations for fixed(?) set of images every epoch
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)

        test_stats, coco_evaluator = evaluate(model, criterion,
                                              postprocessors,
                                              data_loader_val, base_ds,
                                              device, args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        # Save the COCO metrics properly
        metric_name = [
            "AP", "AP50", "AP75", "APs", "APm", "APl", "AR@1", "AR@10",
            "AR@100", "ARs", "ARm", "ARl"
        ]
        for i, metric_val in enumerate(log_stats["test_coco_eval_bbox"]):
            log_stats[metric_name[i]] = metric_val

        if not args.no_wb:
            wandb.log(log_stats)

        print("train_loss: ", log_stats['train_loss'])

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            wandb.save(str(output_dir / "log.txt"))

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    eval_filename_for_wb = f'{wandb.run.id}_eval_{epoch:04}.pth'
                    eval_path_for_wb = str(output_dir / "eval" /
                                           eval_filename_for_wb)
                    filenames = ['latest.pth', eval_filename_for_wb]
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)
                    # TODO not sure if this file will end up being too big.
                    # I think it's the COCO precision/recall metrics in some
                    # format... let's track it just in case to start!
                    wandb.save(eval_path_for_wb)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

def main(args):
    utils.init_distributed_mode(args)
    print(args)

    if args.distillation_type != 'none' and args.finetune and not args.eval:
        raise NotImplementedError(
            "Finetuning with distillation not yet supported")

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
        if args.dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print(
                    'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        sampler=sampler_val,
        batch_size=int(1.5 * args.batch_size),
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False)

    mixup_fn = None
    mixup_active = (args.mixup > 0 or args.cutmix > 0.
                    or args.cutmix_minmax is not None)
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=None,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.finetune, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')

        checkpoint_model = checkpoint['model']
        state_dict = model.state_dict()
        for k in [
                'head.weight', 'head.bias', 'head_dist.weight',
                'head_dist.bias'
        ]:
            if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]

        # interpolate position embedding
        pos_embed_checkpoint = checkpoint_model['pos_embed']
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int(
            (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches**0.5)
        # class_token and dist_token are kept unchanged
        extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
        # only the position tokens are interpolated
        pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
        pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size,
                                        embedding_size).permute(0, 3, 1, 2)
        pos_tokens = torch.nn.functional.interpolate(
            pos_tokens, size=(new_size, new_size), mode='bicubic',
            align_corners=False)
        pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
        new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
        checkpoint_model['pos_embed'] = new_pos_embed

        model.load_state_dict(checkpoint_model, strict=False)

    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but
        # before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()
    lr_scheduler, _ = create_scheduler(args, optimizer)

    criterion = LabelSmoothingCrossEntropy()
    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    teacher_model = None
    if args.distillation_type != 'none':
        assert args.teacher_path, 'need to specify teacher-path when using distillation'
        print(f"Creating teacher model: {args.teacher_model}")
        teacher_model = create_model(
            args.teacher_model,
            pretrained=False,
            num_classes=args.nb_classes,
            global_pool='avg',
        )
        if args.teacher_path.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.teacher_path, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.teacher_path, map_location='cpu')
        teacher_model.load_state_dict(checkpoint['model'])
        teacher_model.to(device)
        teacher_model.eval()

    # wrap the criterion in our custom DistillationLoss, which
    # just dispatches to the original criterion if args.distillation_type is 'none'
    criterion = DistillationLoss(criterion, teacher_model,
                                 args.distillation_type,
                                 args.distillation_alpha,
                                 args.distillation_tau)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema,
                                               checkpoint['model_ema'])
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            loss_scaler, args.clip_grad, model_ema, mixup_fn,
            set_training_mode=args.finetune == ''  # keep in eval mode during finetuning
        )

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / ('checkpoint_%04d.pth' % epoch)]
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'model_ema': get_state_dict(model_ema),
                        'scaler': loss_scaler.state_dict(),
                        'args': args,
                    }, checkpoint_path)

        if not args.train_without_eval:
            test_stats = evaluate(data_loader_val, model, device)
            print(
                f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
            )
            max_accuracy = max(max_accuracy, test_stats["acc1"])
            print(f'Max accuracy: {max_accuracy:.2f}%')

            log_stats = {
                **{f'train_{k}': v for k, v in train_stats.items()},
                **{f'test_{k}': v for k, v in test_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }
        else:
            log_stats = {
                **{f'train_{k}': v for k, v in train_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_test = build_dataset(image_set='test', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
        sampler_test = samplers.DistributedSampler(dataset_test,
                                                   shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)
    data_loader_test = DataLoader(dataset_test,
                                  args.batch_size,
                                  sampler=sampler_test,
                                  drop_last=False,
                                  collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers,
                                  pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_backbone_names)
            and p.requires_grad
        ],
        "lr": args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr * args.lr_linear_proj_mult,
    }]

    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # drop every classification head (decoder levels 0-6, both the
        # auxiliary and final heads) so they are re-initialized for the new
        # label set
        for i in range(7):
            for prefix in ('transformer.decoder.class_embed.', 'class_embed.'):
                del checkpoint["model"][f"{prefix}{i}.weight"]
                del checkpoint["model"][f"{prefix}{i}.bias"]
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        # if len(missing_keys) > 0:
        #     print('Missing Keys: {}'.format(missing_keys))
        # if len(unexpected_keys) > 0:
        #     print('Unexpected Keys: {}'.format(unexpected_keys))
        # if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        #     import copy
        #     p_groups = copy.deepcopy(optimizer.param_groups)
        #     optimizer.load_state_dict(checkpoint['optimizer'])
        #     for pg, pg_old in zip(optimizer.param_groups, p_groups):
        #         pg['lr'] = pg_old['lr']
        #         pg['initial_lr'] = pg_old['initial_lr']
        #     # print(optimizer.param_groups)
        #     lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        #     # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
        #     args.override_resumed_lr_drop = True
        #     if args.override_resumed_lr_drop:
        #         print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
        #         lr_scheduler.step_size = args.lr_drop
        #         lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        #     lr_scheduler.step(lr_scheduler.last_epoch)
        #     args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion,
                                              postprocessors,
                                              data_loader_val, base_ds,
                                              device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    if args.test:
        test_stats, coco_evaluator = evaluate(model, criterion,
                                              postprocessors,
                                              data_loader_test, base_ds,
                                              device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion,
                                              postprocessors,
                                              data_loader_val, base_ds,
                                              device, args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) print(args) device = torch.device(args.device) # Fix the seed for reproducibility. seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, { "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_train = build_dataset(image_set='train', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) # Load from pretrained DETR model. assert args.num_queries == 100, args.num_queries assert args.enc_layers == 6 and args.dec_layers == 6 assert args.backbone in ['resnet50', 'resnet101', 'swin'], args.backbone if args.backbone == 'resnet50': pretrain_model = './data/detr_coco/detr-r50-e632da11.pth' elif args.backbone == 'resnet101': pretrain_model = './data/detr_coco/detr-r101-2c7b67e5.pth' else: pretrain_model = None if pretrain_model is not None: pretrain_dict = torch.load(pretrain_model, map_location='cpu')['model'] my_model_dict = model_without_ddp.state_dict() pretrain_dict = {k: v for k, v in pretrain_dict.items() if k in my_model_dict} my_model_dict.update(pretrain_dict) model_without_ddp.load_state_dict(my_model_dict) output_dir = Path(args.output_dir) if args.resume: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch( model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] # extra checkpoint at the LR drop, every 100 epochs, and every 10 epochs after the LR drop if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') if (epoch + 1) > args.lr_drop and (epoch + 1) % 10 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master({ 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) log_stats = 
{**{f'train_{k}': v for k, v in train_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters} if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
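# The optimizer setup shared by these training entry points, isolated as a
# sketch: two parameter groups so the (pretrained) backbone trains at a lower
# learning rate than the transformer and heads. The 1e-4/1e-5 defaults here
# are illustrative, not this repository's configured values.
import torch

def build_optimizer(model, lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4, lr_drop=200):
    param_dicts = [
        # everything except the backbone uses the base lr
        {"params": [p for n, p in model.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        # backbone parameters get their own, smaller lr
        {"params": [p for n, p in model.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=lr, weight_decay=weight_decay)
    # StepLR scales every group's lr by gamma (default 0.1) once per lr_drop epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, lr_drop)
    return optimizer, lr_scheduler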
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) wandb.init(project="qpic-project", entity="sangbaeklee", group="experiment_qpic") wandb.config = { "learning_rate": args.lr, "epochs": args.epochs, "batch_size": args.batch_size, } if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) wandb.watch(model) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if not args.hoi: if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 elif args.pretrained: checkpoint = torch.load(args.pretrained, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model'], strict=False) if args.eval: if args.hoi: test_stats = evaluate_hoi(args.dataset_file, model, postprocessors, data_loader_val, args.subject_category_id, device) return else: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) 
if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) if args.hoi: test_stats = evaluate_hoi(args.dataset_file, model, postprocessors, data_loader_val, args.subject_category_id, device) coco_evaluator = None else: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.dataset_file == 'hico': wandb.log({ "loss": train_stats['loss'], "mAP": test_stats['mAP'], "mAP rare": test_stats['mAP rare'], "mAP non-rare": test_stats['mAP non-rare'], "mean max recall": test_stats['mean max recall'] }) elif args.dataset_file == 'vcoco': vcoco_keys = [ 'mAP_all', 'mAP_thesis', 'AP_hold_obj', 'AP_stand', 'AP_sit_instr', 'AP_ride_instr', 'AP_walk', 'AP_look_obj', 'AP_hit_instr', 'AP_hit_obj', 'AP_eat_obj', 'AP_eat_instr', 'AP_jump_instr', 'AP_lay_instr', 'AP_talk_on_phone_instr', 'AP_carry_obj', 'AP_throw_obj', 'AP_catch_obj', 'AP_cut_instr', 'AP_cut_obj', 'AP_run', 'AP_work_on_computer_instr', 'AP_ski_instr', 'AP_surf_instr', 'AP_skateboard_instr', 'AP_smile', 'AP_drink_instr', 'AP_kick_obj', 'AP_point_instr', 'AP_read_obj', 'AP_snowboard_instr' ] wandb.log({**{k: test_stats[k] for k in vcoco_keys}, 'loss': train_stats['loss']}) else: continue if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ['latest.pth'] if epoch % 50 == 0: filenames.append(f'{epoch:03}.pth') for name in filenames: torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name) total_time = time.time() - 
start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
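# A possible alternative to hand-enumerating every AP key when logging HOI
# metrics to wandb: forward whatever matching scalars the evaluator produced.
# A sketch, assuming test_stats maps metric names (prefixed 'AP_'/'mAP') to
# scalars as in the vcoco branch above; `log_hoi_stats` is hypothetical.
import wandb

def log_hoi_stats(test_stats, train_stats):
    payload = {k: v for k, v in test_stats.items() if k.startswith(("AP_", "mAP"))}
    payload["loss"] = train_stats["loss"]
    wandb.log(payload)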
def main(args): utils.init_distributed_mode(args) if args.output_dir is None: args.output_dir = os.path.expanduser( '~/Data/AI2Thor_detection_features/') if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_all = build_dataset(image_set='all', args=args) if args.distributed: sampler_all = DistributedSampler(dataset_all, shuffle=False) else: sampler_all = torch.utils.data.SequentialSampler(dataset_all) data_loader_all = DataLoader(dataset_all, args.batch_size, sampler=sampler_all, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) base_ds = get_coco_api_from_dataset(dataset_all) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model'], strict=False) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 print("Start extracting features") start_time = time.time() extract_feature(model, criterion, postprocessors, data_loader_all, base_ds, device, args.output_dir) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Extracting features time {}'.format(total_time_str)) print('Start combining files') start_time = time.time() data_dir = os.path.expanduser('~/Data/AI2Thor_offline_data_2.0.2/') combine_files(args, data_dir) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Combining files time {}'.format(total_time_str))
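# Feature extraction above depends on deterministic, order-preserving batches
# so that per-image features line up with dataset indices. A minimal sketch of
# that loader setup under the same assumptions (a collate_fn as in the
# script); `build_extraction_loader` is a hypothetical helper.
from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

def build_extraction_loader(dataset, batch_size, distributed, collate_fn=None, num_workers=2):
    if distributed:
        # shuffle=False keeps each rank's shard in dataset order
        sampler = DistributedSampler(dataset, shuffle=False)
    else:
        sampler = SequentialSampler(dataset)
    return DataLoader(dataset, batch_size, sampler=sampler, drop_last=False,
                      collate_fn=collate_fn, num_workers=num_workers)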
def main(args): bz = args.batch_size lr = args.lr if args.cuda: if torch.cuda.device_count() >= 1: utils.init_distributed_mode(args) device = torch.device(args.device) else: device = torch.device('cpu') # fix the seed for reproducibility if args.cuda: seed = args.seed + utils.get_rank() else: seed = args.seed torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) # set up model model, criterion, postprocessors = build_model(args) model_without_ddp = model if args.cuda and args.distributed: if args.mp: model = torch.nn.parallel.DistributedDataParallel(model) else: model = torch.nn.parallel.DistributedDataParallel( model.to(args.gpu), device_ids=[args.gpu], find_unused_parameters=True) model_without_ddp = model.module elif args.cuda: model = model.to(device) n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) # set up model training param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "joiner" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "joiner" in n and p.requires_grad ], "lr": args.lr_joiner, }, ] # datasets build dataset_train = build_dataset(mode="training", args=args) dataset_test = build_dataset(mode="testing", args=args) if args.cuda and args.distributed: sampler_train = DistributedSampler(dataset_train, shuffle=False) sampler_test = DistributedSampler(dataset_test, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_test = torch.utils.data.SequentialSampler(dataset_test) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_test = DataLoader(dataset_test, 1, sampler=sampler_test, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) # output and checkpoints directory checkpoint_dir = args.output_dir if not os.path.exists(checkpoint_dir): try: os.makedirs(checkpoint_dir) except OSError: pass if args.resume: checkpoint = Path(args.resume) assert checkpoint.exists() checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 print("Start Training") start_time = time.time() optimizer.zero_grad() for epoch in range(args.start_epoch, args.epochs): if args.cuda and args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(epoch, args.clip_max_norm, model, criterion, data_loader_train, optimizer, lr_scheduler, device) if args.output_dir: checkpoint_dir = Path(checkpoint_dir) checkpoint_paths = [checkpoint_dir / 'checkpoint.pth'] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or ( epoch + 1) % args.save_checkpoint_every == 0: checkpoint_paths.append(checkpoint_dir / f'checkpoint{epoch:05}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': 
lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) if (epoch + 1) % args.eval_interval == 0: # evaluation test_stats = evaluate(epoch, model, criterion, postprocessors, data_loader_test, args.output_dir, args.dataset, device) log_stats = { **{'train_' + str(k): v for k, v in train_stats.items()}, **{'test_' + str(k): v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): with (checkpoint_dir / 'log.json').open("a") as f: f.write(json.dumps(log_stats) + "\n") lr_scheduler.step() total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
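# The resume logic these entry points repeat, gathered into one sketch:
# model weights are always restored, while the optimizer, scheduler, and
# epoch counter are restored only when training continues. Names here are
# illustrative, not this repository's API.
import torch

def resume_from(path, model, optimizer, lr_scheduler, evaluating=False):
    checkpoint = torch.load(path, map_location="cpu")
    model.load_state_dict(checkpoint["model"])
    start_epoch = 0
    if not evaluating and all(k in checkpoint for k in ("optimizer", "lr_scheduler", "epoch")):
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        start_epoch = checkpoint["epoch"] + 1  # resume after the saved epoch
    return start_epoch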
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop, gamma=0.9) dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) output_dir = output_dir / f"{args.backbone}_{args.transformer_type}" if args.output_dir: output_dir.mkdir(parents=True, exist_ok=True) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, 
args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / f'checkpoint_{epoch}.pth'] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch}_extra.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ['latest.pth'] if epoch % 50 == 0: filenames.append(f'{epoch:03}.pth') for name in filenames: torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
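# `utils.save_on_master` appears throughout these loops; under DDP only rank 0
# should write checkpoints, or every process races on the same file. A sketch
# of the usual implementation of that guard (not necessarily this repo's).
import torch
import torch.distributed as dist

def is_main_process():
    # treat single-process runs (no initialized process group) as "main"
    if not (dist.is_available() and dist.is_initialized()):
        return True
    return dist.get_rank() == 0

def save_on_master(obj, path):
    if is_main_process():
        torch.save(obj, path)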
def main(args): utils.init_distributed_mode(args) print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) # random.seed(seed) cudnn.benchmark = True dataset_train, args.nb_classes = build_dataset(is_train=True, args=args) dataset_val, _ = build_dataset(is_train=False, args=args) if True: # args.distributed: num_tasks = utils.get_world_size() global_rank = utils.get_rank() if args.repeated_aug: sampler_train = RASampler(dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True) else: sampler_train = torch.utils.data.DistributedSampler( dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) data_loader_train = torch.utils.data.DataLoader( dataset_train, sampler=sampler_train, batch_size=args.batch_size, num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=True, ) data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=int( 1.5 * args.batch_size), shuffle=False, num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=False) mixup_fn = None mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None if mixup_active: mixup_fn = Mixup(mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, label_smoothing=args.smoothing, num_classes=args.nb_classes) print(f"Creating model: {args.model}") model = create_model( args.model, pretrained=False, num_classes=args.nb_classes, drop_rate=args.drop, drop_path_rate=args.drop_path, drop_block_rate=args.drop_block, ) # TODO: finetuning model.to(device) model_ema = None if args.model_ema: # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper model_ema = ModelEma(model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else '', resume='') model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size( ) / 512.0 args.lr = linear_scaled_lr optimizer = create_optimizer(args, model) loss_scaler = NativeScaler() lr_scheduler, _ = create_scheduler(args, optimizer) criterion = LabelSmoothingCrossEntropy() if args.mixup > 0.: # smoothing is handled with mixup label transform criterion = SoftTargetCrossEntropy() elif args.smoothing: criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing) else: criterion = torch.nn.CrossEntropyLoss() output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.model_ema: utils._load_checkpoint_for_ema(model_ema, checkpoint['model_ema']) if args.eval: test_stats = evaluate(data_loader_val, model, device) print( f"Accuracy of 
the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%" ) return print("Start training") start_time = time.time() max_accuracy = 0.0 for epoch in range(args.start_epoch, args.epochs): if args.distributed: data_loader_train.sampler.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, loss_scaler, args.clip_grad, model_ema, mixup_fn) lr_scheduler.step(epoch) if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'model_ema': get_state_dict(model_ema), 'args': args, }, checkpoint_path) test_stats = evaluate(data_loader_val, model, device) print( f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%" ) max_accuracy = max(max_accuracy, test_stats["acc1"]) print(f'Max accuracy: {max_accuracy:.2f}%') log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
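# The linear learning-rate scaling applied above, as a standalone helper:
# the configured lr is treated as the value for a reference total batch size
# of 512 and scaled by the effective batch size across all processes.
def scale_lr(base_lr, batch_size, world_size, reference=512.0):
    return base_lr * batch_size * world_size / reference

# e.g. scale_lr(5e-4, 64, 8) -> 5e-4 (64 * 8 == 512, so the lr is unchanged)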