def main(args):
    exp_start_time = time.time()
    global best_prec1
    best_prec1 = 0

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        args.local_rank = int(os.environ['LOCAL_RANK'])

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    if args.amp and args.fp16:
        print("Please use only one of the --fp16/--amp flags")
        exit(1)

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed + args.local_rank)
        torch.cuda.manual_seed(args.seed + args.local_rank)
        np.random.seed(seed=args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

        def _worker_init_fn(id):
            np.random.seed(seed=args.seed + args.local_rank + id)
            random.seed(args.seed + args.local_rank + id)
    else:

        def _worker_init_fn(id):
            pass

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")

    if args.optimizer_batch_size < 0:
        batch_size_multiplier = 1
    else:
        tbs = args.world_size * args.batch_size
        if args.optimizer_batch_size % tbs != 0:
            print(
                "Warning: simulated batch size {} is not divisible by actual batch size {}"
                .format(args.optimizer_batch_size, tbs))

        batch_size_multiplier = int(args.optimizer_batch_size / tbs)
        print("BSM: {}".format(batch_size_multiplier))

    pretrained_weights = None
    if args.pretrained_weights:
        if os.path.isfile(args.pretrained_weights):
            print("=> loading pretrained weights from '{}'".format(
                args.pretrained_weights))
            pretrained_weights = torch.load(args.pretrained_weights)
        else:
            print("=> no pretrained weights found at '{}'".format(
                args.pretrained_weights))

    start_epoch = 0
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model_state = checkpoint['state_dict']
            optimizer_state = checkpoint['optimizer']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            model_state = None
            optimizer_state = None
    else:
        model_state = None
        optimizer_state = None

    loss = nn.CrossEntropyLoss
    if args.mixup > 0.0:
        loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
    elif args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)

    model_and_loss = ModelAndLoss((args.arch, args.model_config),
                                  loss,
                                  pretrained_weights=pretrained_weights,
                                  cuda=True,
                                  fp16=args.fp16)

    # Create data loaders and optimizers as needed
    if args.data_backend == 'pytorch':
        get_train_loader = get_pytorch_train_loader
        get_val_loader = get_pytorch_val_loader
    elif args.data_backend == 'dali-gpu':
        get_train_loader = get_dali_train_loader(dali_cpu=False)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == 'dali-cpu':
        get_train_loader = get_dali_train_loader(dali_cpu=True)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == 'syntetic':
        get_val_loader = get_syntetic_loader
        get_train_loader = get_syntetic_loader

    train_loader, train_loader_len = get_train_loader(args.data,
                                                      args.batch_size,
                                                      1000,
                                                      args.mixup > 0.0,
                                                      workers=args.workers,
                                                      fp16=args.fp16)
    if args.mixup != 0.0:
        train_loader = MixUpWrapper(args.mixup, 1000, train_loader)

    val_loader, val_loader_len = get_val_loader(args.data,
                                                args.batch_size,
                                                1000,
                                                False,
                                                workers=args.workers,
                                                fp16=args.fp16)

    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger = log.Logger(args.print_freq, [
            dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT,
                                   step_format=log.format_step),
            dllogger.JSONStreamBackend(
                dllogger.Verbosity.VERBOSE,
                os.path.join(args.workspace, args.raport_file))
        ])
    else:
        logger = log.Logger(args.print_freq, [])

    logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)

    optimizer = get_optimizer(list(model_and_loss.model.named_parameters()),
                              args.fp16,
                              args.lr,
                              args.momentum,
                              args.weight_decay,
                              nesterov=args.nesterov,
                              bn_weight_decay=args.bn_weight_decay,
                              state=optimizer_state,
                              static_loss_scale=args.static_loss_scale,
                              dynamic_loss_scale=args.dynamic_loss_scale)

    if args.lr_schedule == 'step':
        lr_policy = lr_step_policy(args.lr, [30, 60, 80],
                                   0.1,
                                   args.warmup,
                                   logger=logger)
    elif args.lr_schedule == 'cosine':
        lr_policy = lr_cosine_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     logger=logger)
    elif args.lr_schedule == 'linear':
        lr_policy = lr_linear_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     logger=logger)

    if args.amp:
        model_and_loss, optimizer = amp.initialize(
            model_and_loss,
            optimizer,
            opt_level="O2",
            loss_scale="dynamic"
            if args.dynamic_loss_scale else args.static_loss_scale)

    if args.distributed:
        model_and_loss.distributed()

    model_and_loss.load_model_state(model_state)

    train_loop(model_and_loss,
               optimizer,
               lr_policy,
               train_loader,
               val_loader,
               args.epochs,
               args.fp16,
               logger,
               should_backup_checkpoint(args),
               use_amp=args.amp,
               batch_size_multiplier=batch_size_multiplier,
               start_epoch=start_epoch,
               best_prec1=best_prec1,
               prof=args.prof,
               skip_training=args.evaluate,
               skip_validation=args.training_only,
               save_checkpoints=args.save_checkpoints and not args.evaluate,
               checkpoint_dir=args.workspace)
    exp_duration = time.time() - exp_start_time
    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger.end()
    print("Experiment ended")
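
# Illustrative sketch only (not part of this repo's code): how a gradient-accumulation
# multiplier like the `batch_size_multiplier` computed above is typically consumed
# inside a training loop. The real logic lives in train_loop(); the names `model`,
# `criterion`, `optimizer`, and `loader` below are hypothetical placeholders.
def _example_accumulation_step(model, criterion, optimizer, loader, batch_size_multiplier):
    optimizer.zero_grad()
    for i, (images, target) in enumerate(loader):
        loss = criterion(model(images), target)
        # Divide so the accumulated gradient averages over the simulated batch,
        # i.e. optimizer_batch_size = world_size * batch_size * batch_size_multiplier.
        (loss / batch_size_multiplier).backward()
        # Step the optimizer only once per `batch_size_multiplier` iterations.
        if (i + 1) % batch_size_multiplier == 0:
            optimizer.step()
            optimizer.zero_grad()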
def main(args, model_args):
    exp_start_time = time.time()
    global best_prec1
    best_prec1 = 0

    args.distributed = False
    if "WORLD_SIZE" in os.environ:
        args.distributed = int(os.environ["WORLD_SIZE"]) > 1
        args.local_rank = int(os.environ["LOCAL_RANK"])
    else:
        args.local_rank = 0

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend="nccl", init_method="env://")
        args.world_size = torch.distributed.get_world_size()

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed + args.local_rank)
        torch.cuda.manual_seed(args.seed + args.local_rank)
        np.random.seed(seed=args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

        def _worker_init_fn(id):
            np.random.seed(seed=args.seed + args.local_rank + id)
            random.seed(args.seed + args.local_rank + id)
    else:

        def _worker_init_fn(id):
            pass

    if args.static_loss_scale != 1.0:
        if not args.amp:
            print("Warning: if --amp is not used, static_loss_scale will be ignored.")

    if args.optimizer_batch_size < 0:
        batch_size_multiplier = 1
    else:
        tbs = args.world_size * args.batch_size
        if args.optimizer_batch_size % tbs != 0:
            print(
                "Warning: simulated batch size {} is not divisible by actual batch size {}"
                .format(args.optimizer_batch_size, tbs))

        batch_size_multiplier = int(args.optimizer_batch_size / tbs)
        print("BSM: {}".format(batch_size_multiplier))

    start_epoch = 0
    # optionally resume from a checkpoint
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            start_epoch = checkpoint["epoch"]
            best_prec1 = checkpoint["best_prec1"]
            model_state = checkpoint["state_dict"]
            optimizer_state = checkpoint["optimizer"]
            if "state_dict_ema" in checkpoint:
                model_state_ema = checkpoint["state_dict_ema"]
            else:
                model_state_ema = None
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint["epoch"]))
            if start_epoch >= args.epochs:
                print(
                    f"Launched training for {args.epochs}, checkpoint already run {start_epoch}"
                )
                exit(1)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            model_state = None
            model_state_ema = None
            optimizer_state = None
    else:
        model_state = None
        model_state_ema = None
        optimizer_state = None

    loss = nn.CrossEntropyLoss
    if args.mixup > 0.0:
        loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
    elif args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)

    memory_format = (torch.channels_last
                     if args.memory_format == "nhwc" else torch.contiguous_format)

    model = available_models()[args.arch](**{
        k: v if k != "pretrained" else v and
        (not args.distributed or dist.get_rank() == 0)
        for k, v in model_args.__dict__.items()
    })

    image_size = (args.image_size if args.image_size is not None else
                  model.arch.default_image_size)

    model_and_loss = ModelAndLoss(model,
                                  loss,
                                  cuda=True,
                                  memory_format=memory_format)

    if args.use_ema is not None:
        model_ema = deepcopy(model_and_loss)
        ema = EMA(args.use_ema)
    else:
        model_ema = None
        ema = None

    # Create data loaders and optimizers as needed
    if args.data_backend == "pytorch":
        get_train_loader = get_pytorch_train_loader
        get_val_loader = get_pytorch_val_loader
    elif args.data_backend == "dali-gpu":
        get_train_loader = get_dali_train_loader(dali_cpu=False)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "dali-cpu":
        get_train_loader = get_dali_train_loader(dali_cpu=True)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "syntetic":
        get_val_loader = get_syntetic_loader
        get_train_loader = get_syntetic_loader
    else:
        print("Bad databackend picked")
        exit(1)

    train_loader, train_loader_len = get_train_loader(
        args.data,
        image_size,
        args.batch_size,
        model_args.num_classes,
        args.mixup > 0.0,
        interpolation=args.interpolation,
        augmentation=args.augmentation,
        start_epoch=start_epoch,
        workers=args.workers,
        memory_format=memory_format,
    )
    if args.mixup != 0.0:
        train_loader = MixUpWrapper(args.mixup, train_loader)

    val_loader, val_loader_len = get_val_loader(
        args.data,
        image_size,
        args.batch_size,
        model_args.num_classes,
        False,
        interpolation=args.interpolation,
        workers=args.workers,
        memory_format=memory_format,
    )

    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger = log.Logger(
            args.print_freq,
            [
                dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT,
                                       step_format=log.format_step),
                dllogger.JSONStreamBackend(
                    dllogger.Verbosity.VERBOSE,
                    os.path.join(args.workspace, args.raport_file),
                ),
            ],
            start_epoch=start_epoch - 1,
        )
    else:
        logger = log.Logger(args.print_freq, [], start_epoch=start_epoch - 1)

    logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
    logger.log_parameter(
        {f"model.{k}": v for k, v in model_args.__dict__.items()},
        verbosity=dllogger.Verbosity.DEFAULT,
    )

    optimizer = get_optimizer(
        list(model_and_loss.model.named_parameters()),
        args.lr,
        args=args,
        state=optimizer_state,
    )

    if args.lr_schedule == "step":
        lr_policy = lr_step_policy(args.lr, [30, 60, 80],
                                   0.1,
                                   args.warmup,
                                   logger=logger)
    elif args.lr_schedule == "cosine":
        lr_policy = lr_cosine_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     end_lr=args.end_lr,
                                     logger=logger)
    elif args.lr_schedule == "linear":
        lr_policy = lr_linear_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     logger=logger)

    scaler = torch.cuda.amp.GradScaler(
        init_scale=args.static_loss_scale,
        growth_factor=2,
        backoff_factor=0.5,
        growth_interval=100 if args.dynamic_loss_scale else 1000000000,
        enabled=args.amp,
    )

    if args.distributed:
        model_and_loss.distributed(args.gpu)

    model_and_loss.load_model_state(model_state)
    if (ema is not None) and (model_state_ema is not None):
        print("load ema")
        ema.load_state_dict(model_state_ema)

    train_loop(
        model_and_loss,
        optimizer,
        scaler,
        lr_policy,
        train_loader,
        val_loader,
        logger,
        should_backup_checkpoint(args),
        ema=ema,
        model_ema=model_ema,
        steps_per_epoch=train_loader_len,
        use_amp=args.amp,
        batch_size_multiplier=batch_size_multiplier,
        start_epoch=start_epoch,
        end_epoch=min((start_epoch + args.run_epochs), args.epochs)
        if args.run_epochs != -1 else args.epochs,
        early_stopping_patience=args.early_stopping_patience,
        best_prec1=best_prec1,
        prof=args.prof,
        skip_training=args.evaluate,
        skip_validation=args.training_only,
        save_checkpoints=args.save_checkpoints and not args.evaluate,
        checkpoint_dir=args.workspace,
        checkpoint_filename=args.checkpoint_filename,
    )
    exp_duration = time.time() - exp_start_time
    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger.end()
    print("Experiment ended")
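
# Illustrative sketch only: how a torch.cuda.amp.GradScaler such as the `scaler`
# created above is typically used together with autocast inside a training step.
# The repo's actual step is implemented in train_loop(); `model_and_loss_fn`,
# `optimizer`, `images`, and `target` below are hypothetical placeholders.
def _example_amp_step(model_and_loss_fn, optimizer, scaler, images, target):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = model_and_loss_fn(images, target)
    scaler.scale(loss).backward()   # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)          # unscales gradients; skips the step on inf/nan
    scaler.update()                 # adjust the loss scale for the next iteration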
def prepare_for_training(args, model_args, model_arch):
    args.distributed = False
    if "WORLD_SIZE" in os.environ:
        args.distributed = int(os.environ["WORLD_SIZE"]) > 1
        args.local_rank = int(os.environ["LOCAL_RANK"])
    else:
        args.local_rank = 0

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend="nccl", init_method="env://")
        args.world_size = torch.distributed.get_world_size()

    affinity = set_affinity(args.gpu, mode=args.gpu_affinity)
    print(f"Training process {args.local_rank} affinity: {affinity}")

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed + args.local_rank)
        torch.cuda.manual_seed(args.seed + args.local_rank)
        np.random.seed(seed=args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

        def _worker_init_fn(id):
            # Worker process should inherit its affinity from parent
            affinity = os.sched_getaffinity(0)
            print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}")

            np.random.seed(seed=args.seed + args.local_rank + id)
            random.seed(args.seed + args.local_rank + id)
    else:

        def _worker_init_fn(id):
            # Worker process should inherit its affinity from parent
            affinity = os.sched_getaffinity(0)
            print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}")

    if args.static_loss_scale != 1.0:
        if not args.amp:
            print("Warning: if --amp is not used, static_loss_scale will be ignored.")

    if args.optimizer_batch_size < 0:
        batch_size_multiplier = 1
    else:
        tbs = args.world_size * args.batch_size
        if args.optimizer_batch_size % tbs != 0:
            print(
                "Warning: simulated batch size {} is not divisible by actual batch size {}".format(
                    args.optimizer_batch_size, tbs
                )
            )

        batch_size_multiplier = int(args.optimizer_batch_size / tbs)
        print("BSM: {}".format(batch_size_multiplier))

    start_epoch = 0
    # optionally resume from a checkpoint
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)
            )
            start_epoch = checkpoint["epoch"]
            best_prec1 = checkpoint["best_prec1"]
            model_state = checkpoint["state_dict"]
            optimizer_state = checkpoint["optimizer"]
            if "state_dict_ema" in checkpoint:
                model_state_ema = checkpoint["state_dict_ema"]
            else:
                model_state_ema = None
            print(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint["epoch"]
                )
            )
            if start_epoch >= args.epochs:
                print(
                    f"Launched training for {args.epochs}, checkpoint already run {start_epoch}"
                )
                exit(1)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            model_state = None
            model_state_ema = None
            optimizer_state = None
    else:
        model_state = None
        model_state_ema = None
        optimizer_state = None

    loss = nn.CrossEntropyLoss
    if args.mixup > 0.0:
        loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
    elif args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)

    memory_format = (
        torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format
    )

    model = model_arch(
        **{
            k: v
            if k != "pretrained"
            else v and (not args.distributed or dist.get_rank() == 0)
            for k, v in model_args.__dict__.items()
        }
    )

    image_size = (
        args.image_size
        if args.image_size is not None
        else model.arch.default_image_size
    )

    scaler = torch.cuda.amp.GradScaler(
        init_scale=args.static_loss_scale,
        growth_factor=2,
        backoff_factor=0.5,
        growth_interval=100 if args.dynamic_loss_scale else 1000000000,
        enabled=args.amp,
    )

    executor = Executor(
        model,
        loss(),
        cuda=True,
        memory_format=memory_format,
        amp=args.amp,
        scaler=scaler,
        divide_loss=batch_size_multiplier,
        ts_script=args.jit == "script",
    )

    # Create data loaders and optimizers as needed
    if args.data_backend == "pytorch":
        get_train_loader = get_pytorch_train_loader
        get_val_loader = get_pytorch_val_loader
    elif args.data_backend == "dali-gpu":
        get_train_loader = get_dali_train_loader(dali_cpu=False)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "dali-cpu":
        get_train_loader = get_dali_train_loader(dali_cpu=True)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "syntetic":
        get_val_loader = get_syntetic_loader
        get_train_loader = get_syntetic_loader
    else:
        print("Bad databackend picked")
        exit(1)

    train_loader, train_loader_len = get_train_loader(
        args.data,
        image_size,
        args.batch_size,
        model_args.num_classes,
        args.mixup > 0.0,
        interpolation=args.interpolation,
        augmentation=args.augmentation,
        start_epoch=start_epoch,
        workers=args.workers,
        _worker_init_fn=_worker_init_fn,
        memory_format=memory_format,
        prefetch_factor=args.prefetch,
    )
    if args.mixup != 0.0:
        train_loader = MixUpWrapper(args.mixup, train_loader)

    val_loader, val_loader_len = get_val_loader(
        args.data,
        image_size,
        args.batch_size,
        model_args.num_classes,
        False,
        interpolation=args.interpolation,
        workers=args.workers,
        _worker_init_fn=_worker_init_fn,
        memory_format=memory_format,
        prefetch_factor=args.prefetch,
    )

    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger = log.Logger(
            args.print_freq,
            [
                dllogger.StdOutBackend(
                    dllogger.Verbosity.DEFAULT, step_format=log.format_step
                ),
                dllogger.JSONStreamBackend(
                    dllogger.Verbosity.VERBOSE,
                    os.path.join(args.workspace, args.raport_file),
                ),
            ],
            start_epoch=start_epoch - 1,
        )
    else:
        logger = log.Logger(args.print_freq, [], start_epoch=start_epoch - 1)

    logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
    logger.log_parameter(
        {f"model.{k}": v for k, v in model_args.__dict__.items()},
        verbosity=dllogger.Verbosity.DEFAULT,
    )

    optimizer = get_optimizer(
        list(executor.model.named_parameters()),
        args.lr,
        args=args,
        state=optimizer_state,
    )

    if args.lr_schedule == "step":
        lr_policy = lr_step_policy(args.lr, [30, 60, 80], 0.1, args.warmup)
    elif args.lr_schedule == "cosine":
        lr_policy = lr_cosine_policy(
            args.lr, args.warmup, args.epochs, end_lr=args.end_lr
        )
    elif args.lr_schedule == "linear":
        lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs)

    if args.distributed:
        executor.distributed(args.gpu)

    if model_state is not None:
        executor.model.load_state_dict(model_state)

    trainer = Trainer(
        executor,
        optimizer,
        grad_acc_steps=batch_size_multiplier,
        ema=args.use_ema,
    )

    if (args.use_ema is not None) and (model_state_ema is not None):
        trainer.ema_executor.model.load_state_dict(model_state_ema)

    return (
        trainer,
        lr_policy,
        train_loader,
        train_loader_len,
        val_loader,
        logger,
        start_epoch,
    )
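
# Illustrative sketch only: the `_worker_init_fn` built in prepare_for_training is
# forwarded to the PyTorch DataLoader via its standard `worker_init_fn` argument,
# which is called once in every worker process with that worker's id. The `dataset`
# and loader parameters below are hypothetical placeholders, not this repo's loaders.
def _example_loader(dataset, batch_size, workers, _worker_init_fn):
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=workers,
        worker_init_fn=_worker_init_fn,  # per-worker seeding / affinity logging
        pin_memory=True,
        drop_last=True,
    )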
def main(args):
    exp_start_time = time.time()
    global best_prec1
    best_prec1 = 0

    args.distributed = False
    if "WORLD_SIZE" in os.environ:
        args.distributed = int(os.environ["WORLD_SIZE"]) > 1
        args.local_rank = int(os.environ["LOCAL_RANK"])

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend="nccl", init_method="env://")
        args.world_size = torch.distributed.get_world_size()

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed + args.local_rank)
        torch.cuda.manual_seed(args.seed + args.local_rank)
        np.random.seed(seed=args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

        def _worker_init_fn(id):
            def handler(signum, frame):
                print(f"Worker {id} received signal {signum}")

            signal.signal(signal.SIGTERM, handler)
            np.random.seed(seed=args.seed + args.local_rank + id)
            random.seed(args.seed + args.local_rank + id)
    else:

        def _worker_init_fn(id):
            def handler(signum, frame):
                print(f"Worker {id} received signal {signum}")

            signal.signal(signal.SIGTERM, handler)

    if args.static_loss_scale != 1.0:
        if not args.amp:
            print("Warning: if --amp is not used, static_loss_scale will be ignored.")

    if args.optimizer_batch_size < 0:
        batch_size_multiplier = 1
    else:
        tbs = args.world_size * args.batch_size
        if args.optimizer_batch_size % tbs != 0:
            print(
                "Warning: simulated batch size {} is not divisible by actual batch size {}"
                .format(args.optimizer_batch_size, tbs))

        batch_size_multiplier = int(args.optimizer_batch_size / tbs)
        print("BSM: {}".format(batch_size_multiplier))

    pretrained_weights = None
    if args.pretrained_weights:
        if os.path.isfile(args.pretrained_weights):
            print("=> loading pretrained weights from '{}'".format(
                args.pretrained_weights))
            pretrained_weights = torch.load(args.pretrained_weights)
            # Temporary fix to allow NGC checkpoint loading
            pretrained_weights = {
                k.replace("module.", ""): v
                for k, v in pretrained_weights.items()
            }
        else:
            print("=> no pretrained weights found at '{}'".format(
                args.pretrained_weights))

    start_epoch = 0
    # optionally resume from a checkpoint
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            start_epoch = checkpoint["epoch"]
            best_prec1 = checkpoint["best_prec1"]
            model_state = checkpoint["state_dict"]
            optimizer_state = checkpoint["optimizer"]
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint["epoch"]))
            if start_epoch >= args.epochs:
                print(
                    f"Launched training for {args.epochs}, checkpoint already run {start_epoch}"
                )
                exit(1)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            model_state = None
            optimizer_state = None
    else:
        model_state = None
        optimizer_state = None

    loss = nn.CrossEntropyLoss
    if args.mixup > 0.0:
        loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
    elif args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)

    memory_format = (torch.channels_last
                     if args.memory_format == "nhwc" else torch.contiguous_format)

    model_and_loss = ModelAndLoss(
        (args.arch, args.model_config, args.num_classes),
        loss,
        pretrained_weights=pretrained_weights,
        cuda=True,
        memory_format=memory_format,
    )

    # Create data loaders and optimizers as needed
    if args.data_backend == "pytorch":
        get_train_loader = get_pytorch_train_loader
        get_val_loader = get_pytorch_val_loader
    elif args.data_backend == "dali-gpu":
        get_train_loader = get_dali_train_loader(dali_cpu=False)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "dali-cpu":
        get_train_loader = get_dali_train_loader(dali_cpu=True)
        get_val_loader = get_dali_val_loader()
    elif args.data_backend == "syntetic":
        get_val_loader = get_syntetic_loader
        get_train_loader = get_syntetic_loader
    else:
        print("Bad databackend picked")
        exit(1)

    train_loader, train_loader_len = get_train_loader(
        args.data,
        args.batch_size,
        args.num_classes,
        args.mixup > 0.0,
        start_epoch=start_epoch,
        workers=args.workers,
        memory_format=memory_format,
    )
    if args.mixup != 0.0:
        train_loader = MixUpWrapper(args.mixup, train_loader)

    val_loader, val_loader_len = get_val_loader(
        args.data,
        args.batch_size,
        args.num_classes,
        False,
        workers=args.workers,
        memory_format=memory_format,
    )

    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger = log.Logger(
            args.print_freq,
            [
                dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT,
                                       step_format=log.format_step),
                dllogger.JSONStreamBackend(
                    dllogger.Verbosity.VERBOSE,
                    os.path.join(args.workspace, args.raport_file),
                ),
            ],
            start_epoch=start_epoch - 1,
        )
    else:
        logger = log.Logger(args.print_freq, [], start_epoch=start_epoch - 1)

    logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)

    optimizer = get_optimizer(
        list(model_and_loss.model.named_parameters()),
        args.lr,
        args.momentum,
        args.weight_decay,
        nesterov=args.nesterov,
        bn_weight_decay=args.bn_weight_decay,
        state=optimizer_state,
    )

    if args.lr_schedule == "step":
        lr_policy = lr_step_policy(args.lr, [30, 60, 80],
                                   0.1,
                                   args.warmup,
                                   logger=logger)
    elif args.lr_schedule == "cosine":
        lr_policy = lr_cosine_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     logger=logger)
    elif args.lr_schedule == "linear":
        lr_policy = lr_linear_policy(args.lr,
                                     args.warmup,
                                     args.epochs,
                                     logger=logger)

    scaler = torch.cuda.amp.GradScaler(
        init_scale=args.static_loss_scale,
        growth_factor=2,
        backoff_factor=0.5,
        growth_interval=100 if args.dynamic_loss_scale else 1000000000,
        enabled=args.amp,
    )

    if args.distributed:
        model_and_loss.distributed(args.gpu)

    model_and_loss.load_model_state(model_state)

    train_loop(
        model_and_loss,
        optimizer,
        scaler,
        lr_policy,
        train_loader,
        val_loader,
        logger,
        should_backup_checkpoint(args),
        use_amp=args.amp,
        batch_size_multiplier=batch_size_multiplier,
        start_epoch=start_epoch,
        end_epoch=(start_epoch + args.run_epochs)
        if args.run_epochs != -1 else args.epochs,
        best_prec1=best_prec1,
        prof=args.prof,
        skip_training=args.evaluate,
        skip_validation=args.training_only,
        save_checkpoints=args.save_checkpoints and not args.evaluate,
        checkpoint_dir=args.workspace,
        checkpoint_filename=args.checkpoint_filename,
    )
    exp_duration = time.time() - exp_start_time
    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        logger.end()
    print("Experiment ended")
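
# Illustrative sketch only: the resume paths above expect a checkpoint dict with
# the keys they read after torch.load ("epoch", "best_prec1", "state_dict",
# "optimizer", and optionally "state_dict_ema" in the EMA-aware variants). A file
# of that shape could be produced roughly as below; the repo's own checkpointing
# happens inside train_loop() and its helpers, and `model`, `optimizer`, `epoch`,
# `best_prec1`, and `path` here are hypothetical placeholders.
def _example_save_checkpoint(model, optimizer, epoch, best_prec1, path):
    torch.save(
        {
            "epoch": epoch + 1,           # epoch to resume from
            "best_prec1": best_prec1,     # best top-1 accuracy seen so far
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        path,
    )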