import os

import torch
from torch.utils.tensorboard import SummaryWriter  # tensorboardX in older setups

# Project-level helpers referenced below (args, cfg, create_model, init_weights,
# create_logger, get_rank, create_dataloader, create_optimizer, load_checkpoint,
# create_scheduler, train_utils) are assumed to come from the surrounding package.


def train():
    print(args.local_rank)
    torch.cuda.set_device(args.local_rank)

    # create dataloader & network & optimizer
    model, model_fn_decorator, net_func = create_model(cfg)
    init_weights(model, init_type='kaiming')
    model.cuda()

    root_result_dir = args.output_dir
    os.makedirs(root_result_dir, exist_ok=True)

    log_file = os.path.join(root_result_dir, 'log_train.txt')
    logger = create_logger(log_file, get_rank())
    logger.info('**********************Start logging**********************')
    logger.info('TRAINED MODEL: {}'.format(net_func))

    # log the visible GPUs and all argument / config values to file
    gpu_list = os.environ.get('CUDA_VISIBLE_DEVICES', 'ALL')
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    logger.info('***********************config info***********************')
    for key, val in vars(cfg).items():
        logger.info('{:16} {}'.format(key, val))

    # tensorboard logging on the main process only
    if get_rank() == 0:
        tb_log = SummaryWriter(log_dir=os.path.join(root_result_dir, 'tensorboard'))
    else:
        tb_log = None

    train_loader, test_loader = create_dataloader()
    # train_loader, test_loader = create_dataloader_Insensee()
    optimizer = create_optimizer(model)

    # resume from checkpoint if one was given
    start_epoch = it = best_res = 0
    last_epoch = -1
    if args.ckpt is not None:
        pure_model = model
        it, start_epoch, best_res = load_checkpoint(pure_model, optimizer, args.ckpt, logger)
        last_epoch = start_epoch + 1

    lr_scheduler = create_scheduler(optimizer, last_epoch=last_epoch)
    # lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.98, last_epoch=-1)
    criterion = None

    # start training
    logger.info('**********************Start training**********************')
    ckpt_dir = os.path.join(root_result_dir, 'ckpt')
    os.makedirs(ckpt_dir, exist_ok=True)

    trainer = train_utils.Trainer(model,
                                  model_fn=model_fn_decorator(),
                                  criterion=criterion,
                                  optimizer=optimizer,
                                  ckpt_dir=ckpt_dir,
                                  lr_scheduler=lr_scheduler,
                                  model_fn_eval=model_fn_decorator(),
                                  tb_log=tb_log,
                                  logger=logger,
                                  eval_frequency=1,
                                  cfg=cfg)
    trainer.train(start_it=it,
                  start_epoch=start_epoch,
                  n_epochs=args.epochs,
                  train_loader=train_loader,
                  test_loader=test_loader,
                  ckpt_save_interval=args.ckpt_save_interval,
                  best_res=best_res)

    logger.info('**********************End training**********************')
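# ---------------------------------------------------------------------------
# NOTE: train() above relies on a module-level `args`. A minimal argparse
# sketch consistent with the fields it reads (--local_rank is supplied by
# torch.distributed.launch; the defaults here are illustrative assumptions,
# not the project's actual values):
# ---------------------------------------------------------------------------
import argparse

parser = argparse.ArgumentParser(description='training entry point')
parser.add_argument('--local_rank', type=int, default=0,
                    help='set by torch.distributed.launch for multi-GPU runs')
parser.add_argument('--output_dir', type=str, default='./output',
                    help='root directory for logs, tensorboard and checkpoints')
parser.add_argument('--ckpt', type=str, default=None,
                    help='checkpoint to resume training from')
parser.add_argument('--epochs', type=int, default=100,
                    help='total number of training epochs')
parser.add_argument('--ckpt_save_interval', type=int, default=1,
                    help='save a checkpoint every N epochs')
args = parser.parse_args()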
    # --- tail of a second train() variant (joint training with BN-momentum
    # and LR warm-up schedulers); the earlier part of the function is not in
    # the source. The `if` header below is reconstructed from the surviving
    # `else` branch and its keyword arguments; the condition and the
    # CosineWarmupLR name are assumptions. ---
    if cfg.TRAIN.LR_WARMUP and cfg.TRAIN.OPTIMIZER != 'adam_onecycle':
        lr_warmup_scheduler = train_utils.CosineWarmupLR(
            optimizer,
            T_max=cfg.TRAIN.WARMUP_EPOCH * len(train_loader),
            eta_min=cfg.TRAIN.WARMUP_MIN)
    else:
        lr_warmup_scheduler = None

    # start training
    logger.info('**********************Start training**********************')
    ckpt_dir = os.path.join(root_result_dir, 'ckpt')
    os.makedirs(ckpt_dir, exist_ok=True)

    trainer = train_utils.Trainer(
        model,
        train_functions.model_joint_fn_decorator(),
        optimizer,
        ckpt_dir=ckpt_dir,
        lr_scheduler=lr_scheduler,
        bnm_scheduler=bnm_scheduler,
        model_fn_eval=train_functions.model_joint_fn_decorator(),
        tb_log=tb_log,
        eval_frequency=1,
        lr_warmup_scheduler=lr_warmup_scheduler,
        warmup_epoch=cfg.TRAIN.WARMUP_EPOCH,
        grad_norm_clip=cfg.TRAIN.GRAD_NORM_CLIP)

    trainer.train(
        it, start_epoch, args.epochs, train_loader, test_loader,
        ckpt_save_interval=args.ckpt_save_interval,
        lr_scheduler_each_iter=(cfg.TRAIN.OPTIMIZER == 'adam_onecycle'))
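# ---------------------------------------------------------------------------
# NOTE: a hypothetical sketch of the scheduler-stepping policy implied by
# `lr_scheduler_each_iter` above, not the actual train_utils.Trainer code:
# OneCycle-style schedules ('adam_onecycle') step the LR every iteration,
# while classic epoch-based schedules step once per epoch.
# ---------------------------------------------------------------------------
def run_epochs(train_one_batch, lr_scheduler, train_loader,
               start_epoch, n_epochs, lr_scheduler_each_iter):
    for epoch in range(start_epoch, n_epochs):
        for batch in train_loader:
            train_one_batch(batch)
            if lr_scheduler_each_iter:
                lr_scheduler.step()  # e.g. OneCycle: LR changes per iteration
        if not lr_scheduler_each_iter:
            lr_scheduler.step()  # e.g. StepLR / cosine: LR changes per epoch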