def main():
    # Build model.
    model = model_builder.build_model(cfg=cfg)
    # Read checkpoint.
    ckpt = torch.load(
        cfg.MODEL.PATH2CKPT,
        map_location=torch.device("cpu")) if cfg.GENERAL.RESUME else {}
    if cfg.GENERAL.RESUME:
        with utils.log_info(msg="Load pre-trained model.", level="INFO", state=True):
            model.load_state_dict(ckpt["model"])
    # Set device.
    model, device = utils.set_device(model, cfg.GENERAL.GPU)
    try:
        test_data_loader = data_loader.build_data_loader(
            cfg, cfg.DATA.DATASET, "test")
        generate(cfg=cfg, model=model, data_loader=test_data_loader, device=device)
    except Exception:
        utils.notify("Cannot build data loader for test set.", level="ERROR")
        raise
def main():
    # Set logger to record information.
    utils.check_env(cfg)
    logger = Logger(cfg)
    logger.log_info(cfg)
    metrics_handler = MetricsHandler(cfg.metrics)
    # utils.pack_code(cfg, logger=logger)
    # Build model.
    model = model_builder.build_model(cfg=cfg, logger=logger)
    optimizer = optimizer_helper.build_optimizer(cfg=cfg, model=model)
    lr_scheduler = lr_scheduler_helper.build_scheduler(cfg=cfg, optimizer=optimizer)
    # Read checkpoint.
    ckpt = torch.load(cfg.model.path2ckpt) if cfg.gnrl.resume else {}
    if cfg.gnrl.resume:
        with logger.log_info(msg="Load pre-trained model.", level="INFO", state=True, logger=logger):
            model.load_state_dict(ckpt["model"])
            optimizer.load_state_dict(ckpt["optimizer"])
            lr_scheduler.load_state_dict(ckpt["lr_scheduler"])
    # Set device.
    model, device = (utils.set_pipline(model, cfg) if cfg.gnrl.PIPLINE
                     else utils.set_device(model, cfg.gnrl.cuda))
    resume_epoch = ckpt["epoch"] if cfg.gnrl.resume else 0
    loss_fn = loss_fn_helper.build_loss_fn(cfg=cfg)
    # Prepare datasets.
    train_loaders, valid_loaders, test_loaders = dict(), dict(), dict()
    for dataset in cfg.data.datasets:
        if cfg.data[dataset].TRAIN:
            try:
                train_loaders[dataset] = data_loader.build_data_loader(
                    cfg, dataset, "train")
            except Exception:
                utils.notify(msg="Failed to build train loader of %s" % dataset)
        if cfg.data[dataset].VALID:
            try:
                valid_loaders[dataset] = data_loader.build_data_loader(
                    cfg, dataset, "valid")
            except Exception:
                utils.notify(msg="Failed to build valid loader of %s" % dataset)
        if cfg.data[dataset].TEST:
            try:
                test_loaders[dataset] = data_loader.build_data_loader(
                    cfg, dataset, "test")
            except Exception:
                utils.notify(msg="Failed to build test loader of %s" % dataset)
    # Train, evaluate model and save checkpoint.
    for epoch in range(1, cfg.train.max_epoch + 1):
        if resume_epoch >= epoch:
            continue
        eval_kwargs = {
            "epoch": epoch, "cfg": cfg, "model": model, "loss_fn": loss_fn,
            "device": device, "metrics_handler": metrics_handler,
            "logger": logger, "save": cfg.save.save,
        }
        train_kwargs = {
            "epoch": epoch, "cfg": cfg, "model": model, "loss_fn": loss_fn,
            "optimizer": optimizer, "device": device, "lr_scheduler": lr_scheduler,
            "metrics_handler": metrics_handler, "logger": logger,
        }
        ckpt_kwargs = {
            "epoch": epoch, "cfg": cfg, "model": model.state_dict(),
            "metrics_handler": metrics_handler, "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
        }
        for dataset in cfg.data.datasets:
            if cfg.data[dataset].TRAIN:
                utils.notify("Train on %s" % dataset)
                train_one_epoch(data_loader=train_loaders[dataset], **train_kwargs)
                utils.save_ckpt(path2file=cfg.model.path2ckpt, **ckpt_kwargs)
                if epoch in cfg.gnrl.ckphs:
                    utils.save_ckpt(path2file=os.path.join(
                        cfg.model.ckpts,
                        cfg.gnrl.id + "_" + str(epoch).zfill(5) + ".pth"), **ckpt_kwargs)
        for dataset in cfg.data.datasets:
            if cfg.data[dataset].TEST:
                utils.notify("Evaluating test set of %s" % dataset, logger=logger)
                evaluate(data_loader=test_loaders[dataset], phase="test", **eval_kwargs)
        for dataset in cfg.data.datasets:
            if cfg.data[dataset].VALID:
                utils.notify("Evaluating valid set of %s" % dataset, logger=logger)
                evaluate(data_loader=valid_loaders[dataset], phase="valid", **eval_kwargs)
    # End of train-valid loop.
    eval_kwargs = {
        "epoch": epoch, "cfg": cfg, "model": model, "loss_fn": loss_fn,
        "device": device, "metrics_handler": metrics_handler,
        "logger": logger, "save": cfg.save.save,
    }
    for dataset in cfg.data.datasets:
        if cfg.data[dataset].VALID:
            utils.notify("Evaluating valid set of %s" % dataset, logger=logger)
            evaluate(data_loader=valid_loaders[dataset], phase="valid", **eval_kwargs)
    for dataset in cfg.data.datasets:
        if cfg.data[dataset].TEST:
            utils.notify("Evaluating test set of %s" % dataset, logger=logger)
            evaluate(data_loader=test_loaders[dataset], phase="test", **eval_kwargs)
    for dataset in cfg.data.datasets:
        if "train" in cfg.data[dataset].INFER:
            utils.notify("Inference on train set of %s" % dataset)
            inference(data_loader=train_loaders[dataset], phase="infer_train", **eval_kwargs)
        if "valid" in cfg.data[dataset].INFER:
            utils.notify("Inference on valid set of %s" % dataset)
            inference(data_loader=valid_loaders[dataset], phase="infer_valid", **eval_kwargs)
        if "test" in cfg.data[dataset].INFER:
            utils.notify("Inference on test set of %s" % dataset)
            inference(data_loader=test_loaders[dataset], phase="infer_test", **eval_kwargs)
    return None
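# For reference, a minimal sketch of the configuration fields read by the entry point
# above. The attribute-style access (cfg.gnrl.resume, cfg.data[dataset].TRAIN, ...)
# suggests an attribute-dict wrapper; the field names below are taken from the code,
# but every value is an illustrative placeholder, not a project default.
cfg_sketch = {
    "gnrl": {"resume": False, "cuda": [0], "PIPLINE": False,
             "ckphs": [10, 20], "id": "experiment_id"},
    "model": {"path2ckpt": "checkpoints/latest.pth", "ckpts": "checkpoints/"},
    "train": {"max_epoch": 20},
    "save": {"save": True},
    "metrics": ["PSNR", "SSIM"],  # placeholder metric names
    "data": {
        "datasets": ["DatasetA"],
        "DatasetA": {"TRAIN": True, "VALID": True, "TEST": False,
                     "INFER": ["valid", "test"]},
    },
}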
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size
    # rescale base lr
    args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (
        max(1, args.batch_size_total // 256))

    # set random seed, make sure all random subgraphs generated would be the same
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu is not None:
        torch.cuda.manual_seed(args.seed)

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')
    logger.info(
        f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, "
        f"num_nodes {args.num_nodes}, gpu per node {ngpus_per_node}, "
        f"world size {args.world_size}")

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  # local rank
    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(args.gpu)
    soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max,
                                               args.iw_clip).cuda(args.gpu)
    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    # load dataset; train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(f'building optimizer and lr scheduler, '
                f'local rank {args.gpu}, global rank {args.rank}, '
                f'world_size {args.world_size}')
    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args,
                                 soft_criterion=soft_criterion, lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
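# main_worker is written to be spawned once per local GPU. A minimal launch sketch,
# assuming the usual torch.multiprocessing entry point; parse_args() and the
# world_size arithmetic here are assumptions, not the project's actual launcher.
import torch
import torch.multiprocessing as mp

if __name__ == "__main__":
    args = parse_args()  # hypothetical: however the surrounding script builds `args`
    ngpus_per_node = torch.cuda.device_count()
    # one process per local GPU; the total world size spans all machines
    args.world_size = ngpus_per_node * args.num_nodes
    # Each spawned process receives its local GPU index as the first positional
    # argument, matching the main_worker(gpu, ngpus_per_node, args) signature above.
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))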
run_args = parser.parse_args()

if __name__ == '__main__':
    args = setup(run_args.config_file)
    args.model = run_args.model
    args.gpu = run_args.gpu

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    args.__dict__['active_subnet'] = args.__dict__['pareto_models'][args.model]
    print(args.active_subnet)

    train_loader, val_loader, train_sampler = build_data_loader(args)

    # init static attentivenas model with weights inherited from the supernet
    model = models.model_factory.create_model(args)
    model.to(args.gpu)
    model.eval()

    # bn running stats calibration following Slimmable (https://arxiv.org/abs/1903.05134)
    # please consider trying a different random seed if you see a small accuracy drop
    with torch.no_grad():
        model.reset_running_stats_for_calibration()
        for batch_idx, (images, _) in enumerate(train_loader):
            if batch_idx >= args.post_bn_calibration_batch_num:
                break
            images = images.cuda(args.gpu, non_blocking=True)
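# The excerpt above ends inside the calibration loop; presumably each batch is only
# pushed through a forward pass so the BatchNorm running statistics are re-estimated
# on real data. A self-contained sketch of that step under this assumption
# (calibrate_bn is a hypothetical helper, not part of the codebase;
# reset_running_stats_for_calibration is the supernet method used above).
import torch

def calibrate_bn(model, loader, gpu, num_batches):
    model.eval()
    model.reset_running_stats_for_calibration()  # method provided by the supernet model
    with torch.no_grad():
        for batch_idx, (images, _) in enumerate(loader):
            if batch_idx >= num_batches:
                break
            images = images.cuda(gpu, non_blocking=True)
            model(images)  # forward only; no loss, no backward pass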
def main():
    # Set logger to record information.
    logger = Logger(cfg)
    logger.log_info(cfg)
    metrics_logger = Metrics()
    utils.pack_code(cfg, logger=logger)
    # Build model.
    model = model_builder.build_model(cfg=cfg, logger=logger)
    # Read checkpoint.
    ckpt = torch.load(cfg.MODEL.PATH2CKPT) if cfg.GENERAL.RESUME else {}
    if cfg.GENERAL.RESUME:
        model.load_state_dict(ckpt["model"])
    resume_epoch = ckpt["epoch"] if cfg.GENERAL.RESUME else 0
    optimizer = (ckpt["optimizer"] if cfg.GENERAL.RESUME
                 else optimizer_helper.build_optimizer(cfg=cfg, model=model))
    # lr_scheduler = ckpt["lr_scheduler"] if cfg.GENERAL.RESUME else lr_scheduler_helper.build_scheduler(cfg=cfg, optimizer=optimizer)
    lr_scheduler = lr_scheduler_helper.build_scheduler(cfg=cfg, optimizer=optimizer)
    lr_scheduler.sychronize(resume_epoch)
    loss_fn = (ckpt["loss_fn"] if cfg.GENERAL.RESUME
               else loss_fn_helper.build_loss_fn(cfg=cfg))
    # Set device.
    model, device = utils.set_device(model, cfg.GENERAL.GPU)
    # Prepare datasets.
    if cfg.GENERAL.TRAIN:
        try:
            train_data_loader = data_loader.build_data_loader(
                cfg, cfg.DATA.DATASET, "train")
        except Exception:
            logger.log_info("Cannot build train dataset.")
    if cfg.GENERAL.VALID:
        try:
            valid_data_loader = data_loader.build_data_loader(
                cfg, cfg.DATA.DATASET, "valid")
        except Exception:
            logger.log_info("Cannot build valid dataset.")
    if cfg.GENERAL.TEST:
        try:
            test_data_loader = data_loader.build_data_loader(
                cfg, cfg.DATA.DATASET, "test")
        except Exception:
            logger.log_info("Cannot build test dataset.")
    # Train, evaluate model and save checkpoint.
    for epoch in range(cfg.TRAIN.MAX_EPOCH):
        if resume_epoch >= epoch:
            continue
        try:
            train_one_epoch(
                epoch=epoch,
                cfg=cfg,
                model=model,
                data_loader=train_data_loader,
                device=device,
                loss_fn=loss_fn,
                optimizer=optimizer,
                lr_scheduler=lr_scheduler,
                metrics_logger=metrics_logger,
                logger=logger,
            )
        except Exception:
            logger.log_info("Failed to train model.")
        optimizer.zero_grad()
        with torch.no_grad():
            utils.save_ckpt(
                path2file=os.path.join(
                    cfg.MODEL.CKPT_DIR,
                    cfg.GENERAL.ID + "_" + str(epoch).zfill(3) + ".pth"),
                logger=logger,
                model=model.state_dict(),
                epoch=epoch,
                optimizer=optimizer,
                lr_scheduler=lr_scheduler,  # NOTE Need attribdict>=0.0.5
                loss_fn=loss_fn,
                metrics=metrics_logger,
            )
        try:
            evaluate(
                epoch=epoch,
                cfg=cfg,
                model=model,
                data_loader=valid_data_loader,
                device=device,
                loss_fn=loss_fn,
                metrics_logger=metrics_logger,
                phase="valid",
                logger=logger,
                save=cfg.SAVE.SAVE,
            )
        except Exception:
            logger.log_info("Failed to evaluate model.")
        with torch.no_grad():
            utils.save_ckpt(
                path2file=os.path.join(
                    cfg.MODEL.CKPT_DIR,
                    cfg.GENERAL.ID + "_" + str(epoch).zfill(3) + ".pth"),
                logger=logger,
                model=model.state_dict(),
                epoch=epoch,
                optimizer=optimizer,
                lr_scheduler=lr_scheduler,  # NOTE Need attribdict>=0.0.5
                loss_fn=loss_fn,
                metrics=metrics_logger,
            )
        # If the test set has target images, evaluate and save them; otherwise just try
        # to generate output images.
        if cfg.DATA.DATASET == "DualPixelNTIRE2021":
            try:
                generate(
                    cfg=cfg,
                    model=model,
                    data_loader=valid_data_loader,
                    device=device,
                    phase="valid",
                    logger=logger,
                )
            except Exception:
                logger.log_info(
                    "Failed to generate output images of valid set of NTIRE2021.")
        try:
            evaluate(
                epoch=epoch,
                cfg=cfg,
                model=model,
                data_loader=test_data_loader,
                device=device,
                loss_fn=loss_fn,
                metrics_logger=metrics_logger,
                phase="test",
                logger=logger,
                save=True,
            )
        except Exception:
            logger.log_info("Failed to test model; trying to generate images instead.")
            try:
                generate(
                    cfg=cfg,
                    model=model,
                    data_loader=test_data_loader,
                    device=device,
                    phase="test",
                    logger=logger,
                )
            except Exception:
                logger.log_info("Cannot generate output images of test set.")
    return None
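# The resume branch at the top of main() reads torch.load(...)["model"], ["epoch"],
# ["optimizer"], ["loss_fn"], which suggests utils.save_ckpt simply bundles its keyword
# arguments and serializes them. A hypothetical sketch under that assumption; not the
# project's actual implementation.
import torch

def save_ckpt(path2file, logger=None, **states):
    # Pack everything passed by keyword into one dict so the keys used at resume
    # time ("model", "epoch", "optimizer", ...) line up with the kwargs used above.
    torch.save(states, path2file)
    if logger is not None:
        logger.log_info("Saved checkpoint to %s" % path2file)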
def eval_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging("stdout.log", 'w')

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    torch.cuda.set_device(args.gpu)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # build the supernet
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)
    model = comm.get_parallel_model(model, args.gpu)  # local rank

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss().cuda()

    # load dataset; train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)

    assert args.resume
    # reload the model weights
    model.module.load_weights_from_pretrained_models(args.resume)

    if train_sampler:
        train_sampler.set_epoch(0)

    targeted_min_flops = args.evo_search.targeted_min_flops
    targeted_max_flops = args.evo_search.targeted_max_flops

    # run evolutionary search
    parent_popu = []
    for idx in range(args.evo_search.parent_popu_size):
        if idx == 0:
            cfg = model.module.sample_min_subnet()
        else:
            cfg = model.module.sample_active_subnet_within_range(
                targeted_min_flops, targeted_max_flops)
        cfg['net_id'] = f'net_{idx % args.world_size}_evo_0_{idx}'
        parent_popu.append(cfg)

    pareto_global = {}
    for evo in range(args.evo_search.evo_iter):
        # partition the set of candidate sub-networks
        # and send them to each GPU for parallel evaluation

        # sub-networks to be evaluated on GPU {args.rank}
        my_subnets_to_be_evaluated = {}
        n_evaluated = len(parent_popu) // args.world_size * args.world_size
        for cfg in parent_popu[:n_evaluated]:
            if cfg['net_id'].startswith(f'net_{args.rank}_'):
                my_subnets_to_be_evaluated[cfg['net_id']] = cfg

        # aggregating all evaluation results
        eval_results = attentive_nas_eval.validate(
            my_subnets_to_be_evaluated,
            train_loader,
            val_loader,
            model,
            criterion,
            args,
            logger,
        )

        # update the Pareto frontier
        # in this case, we search the best FLOPs vs. accuracy trade-offs
        for cfg in eval_results:
            f = round(cfg['flops'] / args.evo_search.step) * args.evo_search.step
            if f not in pareto_global or pareto_global[f]['acc1'] < cfg['acc1']:
                pareto_global[f] = cfg

        # next batch of sub-networks to be evaluated
        parent_popu = []

        # mutate
        for idx in range(args.evo_search.mutate_size):
            while True:
                old_cfg = random.choice(list(pareto_global.values()))
                cfg = model.module.mutate_and_reset(old_cfg, prob=args.evo_search.mutate_prob)
                flops = model.module.compute_active_subnet_flops()
                if flops >= targeted_min_flops and flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_mutate_{idx}'
            parent_popu.append(cfg)

        # cross over
        for idx in range(args.evo_search.crossover_size):
            while True:
                cfg1 = random.choice(list(pareto_global.values()))
                cfg2 = random.choice(list(pareto_global.values()))
                cfg = model.module.crossover_and_reset(cfg1, cfg2)
                flops = model.module.compute_active_subnet_flops()
                if flops >= targeted_min_flops and flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_crossover_{idx}'
            parent_popu.append(cfg)
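# The Pareto-front update near the top of the loop above buckets candidates by FLOPs
# (rounded to the nearest multiple of evo_search.step) and keeps the most accurate
# sub-network per bucket. The same logic as a small standalone function; this is a
# restatement of the existing loop, not an additional API.
def update_pareto_front(pareto_global, eval_results, step):
    for cfg in eval_results:
        bucket = round(cfg['flops'] / step) * step  # FLOPs bucket of width `step`
        if bucket not in pareto_global or pareto_global[bucket]['acc1'] < cfg['acc1']:
            pareto_global[bucket] = cfg
    return pareto_global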