def test_extract_ImageNet100_CMC(self):
    """
    Usage:
        proj_root=moco-exp
        python template_lib/modelarts/scripts/copy_tool.py \
          -s s3://bucket-7001/ZhouPeng/codes/$proj_root -d /cache/$proj_root -t copytree
        cd /cache/$proj_root
        export CUDA_VISIBLE_DEVICES=0
        export TIME_STR=0
        export PYTHONPATH=./
        python -c "from template_lib.proj.imagenet.tests.test_imagenet import Testing_PrepareImageNet;\
          Testing_PrepareImageNet().test_extract_ImageNet100_CMC()"
    :return:
    """
    if 'CUDA_VISIBLE_DEVICES' not in os.environ:
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    if 'TIME_STR' not in os.environ:
        os.environ['TIME_STR'] = '1' if utils.is_debugging() else '0'
    from template_lib.v2.config_cfgnode.argparser import (
        get_command_and_outdir, setup_outdir_and_yaml, get_append_cmd_str, start_cmd_run)
    from template_lib.v2.config_cfgnode import update_parser_defaults_from_yaml, global_cfg
    from template_lib.modelarts import modelarts_utils
    from distutils.dir_util import copy_tree

    command, outdir = get_command_and_outdir(self, func_name=sys._getframe().f_code.co_name, file=__file__)
    argv_str = f"""
                --tl_config_file template_lib/proj/imagenet/tests/configs/PrepareImageNet.yaml
                --tl_command {command}
                --tl_outdir {outdir}
                """
    args, cfg = setup_outdir_and_yaml(argv_str, return_cfg=True)

    modelarts_utils.setup_tl_outdir_obs(global_cfg)
    modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)
    modelarts_utils.prepare_dataset(global_cfg.get('modelarts_download', {}), global_cfg=global_cfg)

    train_dir = f'{cfg.data_dir}/train'
    val_dir = f'{cfg.data_dir}/val'
    save_train_dir = f'{cfg.saved_dir}/train'
    save_val_dir = f'{cfg.saved_dir}/val'
    os.makedirs(save_train_dir, exist_ok=True)
    os.makedirs(save_val_dir, exist_ok=True)

    with open(cfg.class_list_file, 'r') as f:
        class_list = f.readlines()
    for class_subdir in tqdm.tqdm(class_list):
        # Each line is "<wnid> <label>"; only the wnid names the class directory.
        class_subdir, _ = class_subdir.strip().split()
        train_class_dir = f'{train_dir}/{class_subdir}'
        save_train_class_dir = f'{save_train_dir}/{class_subdir}'
        copy_tree(train_class_dir, save_train_class_dir)

        val_class_dir = f'{val_dir}/{class_subdir}'
        save_val_class_dir = f'{save_val_dir}/{class_subdir}'
        copy_tree(val_class_dir, save_val_class_dir)

    modelarts_utils.prepare_dataset(global_cfg.get('modelarts_upload', {}), global_cfg=global_cfg, download=False)
    modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)
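
# A standalone sketch of the class-subset extraction above, assuming the CMC
# ImageNet-100 class list has one "<wnid> <label>" pair per line (e.g.
# "n01558993 0"). File names and directory layout here are illustrative, and
# shutil.copytree(dirs_exist_ok=True) (Python >= 3.8) stands in for the
# deprecated distutils.dir_util.copy_tree.
import shutil

def extract_class_subset(class_list_file, src_split_dir, dst_split_dir):
    # Keep only the wnid column; skip blank lines.
    with open(class_list_file) as f:
        wnids = [line.split()[0] for line in f if line.strip()]
    for wnid in wnids:
        shutil.copytree(f'{src_split_dir}/{wnid}', f'{dst_split_dir}/{wnid}',
                        dirs_exist_ok=True)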
def test_extract_ImageNet_1000x50(self):
    """
    Usage:
        proj_root=moco-exp
        python template_lib/modelarts/scripts/copy_tool.py \
          -s s3://bucket-7001/ZhouPeng/codes/$proj_root -d /cache/$proj_root -t copytree
        cd /cache/$proj_root
        export CUDA_VISIBLE_DEVICES=0
        export TIME_STR=0
        export PYTHONPATH=./
        python -c "from template_lib.proj.imagenet.tests.test_imagenet import Testing_PrepareImageNet;\
          Testing_PrepareImageNet().test_extract_ImageNet_1000x50()"
    :return:
    """
    if 'CUDA_VISIBLE_DEVICES' not in os.environ:
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    if 'TIME_STR' not in os.environ:
        os.environ['TIME_STR'] = '1' if utils.is_debugging() else '0'
    from template_lib.v2.config_cfgnode.argparser import (
        get_command_and_outdir, setup_outdir_and_yaml, get_append_cmd_str, start_cmd_run)
    from template_lib.v2.config_cfgnode import update_parser_defaults_from_yaml, global_cfg
    from template_lib.modelarts import modelarts_utils

    command, outdir = get_command_and_outdir(self, func_name=sys._getframe().f_code.co_name, file=__file__)
    argv_str = f"""
                --tl_config_file template_lib/proj/imagenet/tests/configs/PrepareImageNet.yaml
                --tl_command {command}
                --tl_outdir {outdir}
                """
    args, cfg = setup_outdir_and_yaml(argv_str, return_cfg=True)
    global_cfg.merge_from_dict(cfg)
    global_cfg.merge_from_dict(vars(args))

    modelarts_utils.setup_tl_outdir_obs(global_cfg)
    modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)
    modelarts_utils.prepare_dataset(global_cfg.get('modelarts_download', {}), global_cfg=global_cfg)

    train_dir = f'{cfg.data_dir}/train'
    counter_cls = 0
    for rootdir, subdirs, files in os.walk(train_dir):
        # Leaf directories (no subdirectories) are the per-class image folders.
        if len(subdirs) == 0:
            counter_cls += 1
            # Sort for a deterministic subset, then keep the first num_per_class images.
            extracted_files = sorted(files)[:cfg.num_per_class]
            for file in tqdm.tqdm(extracted_files, desc=f'class: {counter_cls}'):
                img_path = os.path.join(rootdir, file)
                img_rel_path = os.path.relpath(img_path, cfg.data_dir)
                saved_img_dir = f'{cfg.saved_dir}/{os.path.dirname(img_rel_path)}'
                os.makedirs(saved_img_dir, exist_ok=True)
                shutil.copy(img_path, saved_img_dir)

    modelarts_utils.prepare_dataset(global_cfg.get('modelarts_upload', {}), global_cfg=global_cfg, download=False)
    modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)
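
# The 1000x50 subsampling logic above, isolated into a hedged standalone
# sketch: walk the train split, and in every leaf (class) directory copy the
# first num_per_class images, sorted for determinism. Paths are illustrative.
import os
import shutil

def subsample_imagenet(data_dir, saved_dir, num_per_class=50):
    train_dir = os.path.join(data_dir, 'train')
    for rootdir, subdirs, files in os.walk(train_dir):
        if subdirs:  # only leaf directories hold images
            continue
        for name in sorted(files)[:num_per_class]:
            rel_dir = os.path.relpath(rootdir, data_dir)
            dst_dir = os.path.join(saved_dir, rel_dir)
            os.makedirs(dst_dir, exist_ok=True)
            shutil.copy(os.path.join(rootdir, name), dst_dir)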
def main():
    parser = build_parser()
    args, _ = parser.parse_known_args()
    is_main_process = args.local_rank == 0
    update_parser_defaults_from_yaml(parser, is_main_process=is_main_process)

    if is_main_process:
        modelarts_utils.setup_tl_outdir_obs(global_cfg)
        modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)
        modelarts_utils.prepare_dataset(global_cfg.get('modelarts_download', {}), global_cfg=global_cfg)

    args = parser.parse_args()
    setup_runtime(seed=args.seed)

    distributed = ddp_utils.is_distributed()
    if distributed:
        dist_utils.init_dist(args.launcher, backend='nccl')
        # Important: use a different random seed for every process.
        torch.manual_seed(args.seed + dist.get_rank())

    # dataset
    dataset = torch_data_utils.ImageListDataset(meta_file=global_cfg.image_list_file)
    if distributed:
        sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False)
    else:
        sampler = None
    train_loader = data_utils.DataLoader(
        dataset, batch_size=1, shuffle=False, sampler=sampler,
        num_workers=args.num_workers, pin_memory=False)

    # Smoke test: fetch one batch.
    data_iter = iter(train_loader)
    data = next(data_iter)

    if is_main_process:
        modelarts_utils.prepare_dataset(global_cfg.get('modelarts_upload', {}), global_cfg=global_cfg, download=False)
        modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)
    if distributed:
        dist.barrier()
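
# Why DistributedSampler(shuffle=False) above gives each rank a disjoint,
# ordered shard: a minimal single-process demonstration with explicit
# num_replicas/rank (toy dataset; no process group needed).
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(10))
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=False)
loader = DataLoader(dataset, batch_size=1, sampler=sampler)
print([batch[0].item() for batch in loader])  # rank 0 sees indices [0, 2, 4, 6, 8]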
def main():
    update_parser_defaults_from_yaml(parser)
    args = parser.parse_args()
    global_cfg.merge_from_dict(vars(args))

    modelarts_utils.setup_tl_outdir_obs(global_cfg)
    modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)
    modelarts_utils.prepare_dataset(global_cfg.get('modelarts_download', {}), global_cfg=global_cfg)

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly.
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes:
        # one main_worker process per GPU.
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call the main_worker function.
        main_worker(args.gpu, ngpus_per_node, args)
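
# A minimal sketch of the torch.multiprocessing.spawn launch pattern above:
# one process per local GPU, with the global rank derived as
# node_rank * ngpus_per_node + local_gpu (matching main_worker's rank
# computation). Values here are toy.
import torch.multiprocessing as mp

def _toy_worker(gpu, ngpus_per_node, node_rank):
    global_rank = node_rank * ngpus_per_node + gpu
    print(f'local gpu {gpu} -> global rank {global_rank}')

if __name__ == '__main__':
    ngpus = 2  # the real code uses torch.cuda.device_count()
    mp.spawn(_toy_worker, nprocs=ngpus, args=(ngpus, 0))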
def main():
    logger = logging.getLogger('tl')
    modelarts_utils.setup_tl_outdir_obs(cfg=global_cfg)

    old_command = ''
    # Create bash_command.sh
    bash_file = os.path.join(global_cfg.tl_outdir, f'bash_{global_cfg.number}.sh')
    open(bash_file, 'w').close()

    config_file = f'{os.path.dirname(global_cfg.tl_saved_config_file)}/c_{global_cfg.number}.yaml'
    shutil.copy(global_cfg.tl_saved_config_file, config_file)
    global_cfg.tl_saved_config_file = config_file
    global_cfg.tl_saved_config_file_old = global_cfg.tl_saved_config_file + '.old'

    # Copy outdir to outdir_obs, and bash_file to outdir_obs.
    modelarts_utils.modelarts_sync_results_dir(cfg=global_cfg, join=True)
    # Disable moxing copy_parallel output:
    # logger.disabled = True

    while True:
        try:
            try:
                import moxing as mox
                time.sleep(global_cfg.time_interval)
                # Copy outdir_obs back to outdir.
                mox.file.copy_parallel(global_cfg.tl_outdir_obs, global_cfg.tl_outdir)
            except Exception:
                # Restore the config and bash files if the sync removed them.
                if not os.path.exists(global_cfg.tl_saved_config_file):
                    os.rename(global_cfg.tl_saved_config_file_old, global_cfg.tl_saved_config_file)
                if not os.path.exists(bash_file):
                    open(bash_file, 'w').close()

            # Parse the command.
            if not os.path.exists(bash_file) or not os.path.exists(global_cfg.tl_saved_config_file):
                continue
            shutil.copy(bash_file, os.curdir)
            try:
                with open(global_cfg.tl_saved_config_file, 'rt') as handle:
                    config = yaml.safe_load(handle)
                config = EasyDict(config)
                command = getattr(getattr(config, global_cfg.tl_command), 'command')
            except Exception:
                logger.warning('Parse config.yaml error!')
                command = old_command

            # Execute the command only if it changed.
            if command != old_command:
                old_command = command
                if isinstance(command, list) and command[0].startswith(('bash', )):
                    p = Worker(name='Command worker', args=(command[0], ))
                    p.start()
                elif isinstance(command, list) and len(command) == 1:
                    if command[0] == 'exit':
                        exit(0)
                    command = list(map(str, command))
                    err_f = open(os.path.join(global_cfg.tl_outdir, 'err.txt'), 'w')
                    try:
                        cwd = os.getcwd()
                        return_str = subprocess.check_output(command, encoding='utf-8', cwd=cwd, shell=True)
                        print(return_str, file=err_f, flush=True)
                    except subprocess.CalledProcessError as e:
                        print("Oops!\n", e.output, "\noccurred.", file=err_f, flush=True)
                        print(e.returncode, file=err_f, flush=True)
                    err_f.close()
                elif isinstance(command, list) and len(command) > 1:
                    command = list(map(str, command))
                    # Only the first element is executed.
                    command = [command[0]]
                    print('===Execute: %s' % command)
                    err_f = open(os.path.join(global_cfg.tl_outdir, 'err.txt'), 'w')
                    try:
                        cwd = os.getcwd()
                        return_str = subprocess.check_output(command, encoding='utf-8', cwd=cwd, shell=True)
                        print(return_str, file=err_f, flush=True)
                    except subprocess.CalledProcessError as e:
                        print("Oops!\n", e.output, "\noccurred.", file=err_f, flush=True)
                        print(e.returncode, file=err_f, flush=True)
                    err_f.close()
                logger.info('EE')

            # Sync outdir to outdir_obs:
            # remove the config file from outdir,
            os.rename(global_cfg.tl_saved_config_file, global_cfg.tl_saved_config_file_old)
            # remove bash_file from outdir,
            os.remove(bash_file)
            try:
                mox.file.copy_parallel(global_cfg.tl_outdir, global_cfg.tl_outdir_obs)
            except Exception:
                pass
        except Exception as e:
            if str(e) == 'server is not set correctly':
                print(str(e))
            else:
                import traceback
                logger.warning(traceback.format_exc())
                modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)
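
# The polling loop above assumes a config contract where each tl_command entry
# carries a 'command' list. A hedged sketch of that parse step (the YAML
# contents below are invented for illustration):
import yaml
from easydict import EasyDict

text = """
run_train:
  command: ['bash exp/train.sh']
"""
config = EasyDict(yaml.safe_load(text))
print(config.run_train.command)  # ['bash exp/train.sh'] -> handled by the bash branch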
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.gpu == 0:
        update_parser_defaults_from_yaml(parser)
        global_cfg.merge_from_dict(vars(args))
        modelarts_utils.setup_tl_outdir_obs(global_cfg)
        modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)
    logger = logging.getLogger('tl')

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes.
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    model = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim,
                              args.moco_k, args.moco_m, args.moco_t, args.mlp)
    logger.info(model)
    modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True, is_main_process=(args.gpu == 0))

    if args.distributed:
        # For multiprocessing distributed, the DistributedDataParallel constructor
        # should always set the single device scope; otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set.
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map the model to be loaded onto the specified single GPU.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    if args.aug_plus:
        # MoCo v2's aug: similar to SimCLR https://arxiv.org/abs/2002.05709
        augmentation = [
            transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
            transforms.RandomApply(
                [transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)],  # not strengthened
                p=0.8),
            transforms.RandomGrayscale(p=0.2),
            transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.5),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize
        ]
    else:
        # MoCo v1's aug: the same as InstDisc https://arxiv.org/abs/1805.01978
        augmentation = [
            transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize
        ]

    train_dataset = datasets.ImageFolder(
        traindir, moco.loader.TwoCropsTransform(transforms.Compose(augmentation)))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best=False,
                filename=f'{args.tl_ckptdir}/checkpoint_{epoch:04d}.pth.tar')
            modelarts_utils.modelarts_sync_results_dir(
                global_cfg, join=False, is_main_process=(args.gpu == 0))
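
# For reference, a minimal sketch of moco.loader.TwoCropsTransform as used
# above, following the public MoCo implementation: the same augmentation
# pipeline is applied twice to yield a query/key pair.
class TwoCropsTransform:
    """Take two random crops of one image as the query and the key."""

    def __init__(self, base_transform):
        self.base_transform = base_transform

    def __call__(self, x):
        q = self.base_transform(x)
        k = self.base_transform(x)
        return [q, k]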
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    update_parser_defaults_from_yaml(parser, is_main_process=(gpu == 0))
    logger = logging.getLogger('tl')
    if args.gpu == 0:
        modelarts_utils.setup_tl_outdir_obs(global_cfg)
        modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True)

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes.
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()

    # freeze all layers but the last fc
    for name, param in model.named_parameters():
        if name not in ['fc.weight', 'fc.bias']:
            param.requires_grad = False
    # init the fc layer
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()
    logger.info(model)

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            logger.info("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}

            logger.info("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, the DistributedDataParallel constructor
        # should always set the single device scope; otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set.
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map the model to be loaded onto the specified single GPU.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    # Validation is handled by EvalImageNet instead of a plain val_loader:
    # val_loader = torch.utils.data.DataLoader(
    #     datasets.ImageFolder(valdir, transforms.Compose([
    #         transforms.Resize(256),
    #         transforms.CenterCrop(224),
    #         transforms.ToTensor(),
    #         normalize,
    #     ])),
    #     batch_size=args.batch_size, shuffle=False,
    #     num_workers=args.workers, pin_memory=True)
    eval_imagenet = EvalImageNet(valdir=valdir, gpu_id=gpu)

    if args.evaluate:
        eval_imagenet.validate(model=model, epoch=0)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = eval_imagenet.validate(model=model, epoch=epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                filename=f"{args.tl_ckptdir}/checkpoint.pth.tar")
            modelarts_utils.modelarts_sync_results_dir(
                global_cfg, join=False, is_main_process=(args.gpu == 0))
            if epoch == args.start_epoch:
                # verify that the frozen backbone has not drifted
                sanity_check(model.state_dict(), args.pretrained)

    modelarts_utils.modelarts_sync_results_dir(global_cfg, join=True, is_main_process=(args.gpu == 0))
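
# A hedged sketch of what the sanity_check(...) call above verifies, following
# the public MoCo lincls script: after linear-probe training, every non-fc
# weight must still be bit-identical to the pre-trained encoder_q. The helper
# name is illustrative, not the project's function.
import torch

def sanity_check_sketch(state_dict, pretrained_weights):
    checkpoint = torch.load(pretrained_weights, map_location="cpu")
    state_dict_pre = checkpoint['state_dict']
    for k in list(state_dict.keys()):
        if 'fc.weight' in k or 'fc.bias' in k:
            continue  # only the linear classifier is allowed to change
        k_pre = 'module.encoder_q.' + (k[len('module.'):] if k.startswith('module.') else k)
        assert (state_dict[k].cpu() == state_dict_pre[k_pre]).all(), \
            '{} changed during linear classifier training.'.format(k)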