def distribute_run(self):
    """Train under ``tf.distribute.MirroredStrategy`` on all visible GPUs.

    The per-replica batch sizes from ``self.args`` are scaled by the number
    of replicas because tf.distribute datasets are batched *globally* and
    then split across replicas.
    """
    strategy = tf.distribute.MirroredStrategy()
    # Global batch = per-replica batch * replica count.
    train_global_batch = self.args.train_batch * strategy.num_replicas_in_sync
    val_global_batch = self.args.val_batch * strategy.num_replicas_in_sync
    # Fix: local variable was misspelled `train_date`; renamed `train_data`.
    train_data, train_batch_num, val_data, val_batch_num = get_datasets(
        name=self.args.dataset,
        train_batch=train_global_batch,
        val_batch=val_global_batch)
    with strategy.scope():
        # Model and optimizer variables must be created inside the strategy
        # scope so they are mirrored across replicas.
        model = get_net(arch=self.args.arch,
                        num_layers=self.args.num_layers,
                        num_experts=self.args.num_experts,
                        num_classes=self.args.num_classes)
        model.build(input_shape=(None, 32, 32, 3))  # CIFAR-sized inputs
        model.summary()
        optimizer = tf.keras.optimizers.SGD(learning_rate=self.args.lr,
                                            momentum=0.9,
                                            decay=0.0001,
                                            nesterov=True)
    # Note: the trainer receives the *per-replica* batch sizes; the dataset
    # was built with the global ones above.
    dis_trainer = DisTrainer(strategy=strategy,
                             model=model,
                             optimizer=optimizer,
                             epochs=self.args.epochs,
                             val_data=val_data,
                             train_batch=self.args.train_batch,
                             val_batch=self.args.val_batch,
                             train_data=train_data,
                             log_dir=self.log_dir,
                             model_save_path=self.model_save_path,
                             train_batch_num=train_batch_num,
                             val_batch_num=val_batch_num)
    dis_trainer(resume=self.args.resume, val=self.args.val)
def run(self):
    """Single-device training entry point.

    Builds the dataset pipeline, model, and SGD optimizer from
    ``self.args`` and delegates the epoch loop to ``Trainer``.
    """
    # Fix: local variable was misspelled `train_date`; renamed `train_data`.
    train_data, train_batch_num, val_data, val_batch_num = get_datasets(
        name=self.args.dataset,
        train_batch=self.args.train_batch,
        val_batch=self.args.val_batch)
    model = get_net(arch=self.args.arch,
                    num_layers=self.args.num_layers,
                    num_experts=self.args.num_experts,
                    num_classes=self.args.num_classes)
    model.build(input_shape=(None, 32, 32, 3))  # CIFAR-sized inputs
    model.summary()
    optimizer = tf.keras.optimizers.SGD(learning_rate=self.args.lr,
                                        momentum=0.9,
                                        decay=0.0001,
                                        nesterov=True)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      epochs=self.args.epochs,
                      val_data=val_data,
                      train_batch=self.args.train_batch,
                      val_batch=self.args.val_batch,
                      train_data=train_data,
                      log_dir=self.log_dir,
                      model_save_path=self.model_save_path,
                      train_batch_num=train_batch_num,
                      val_batch_num=val_batch_num)
    trainer(resume=self.args.resume, val=self.args.val)
def setup_net(snapshot):
    """Quickly create a network for the given snapshot.

    Arguments:
        snapshot {string} -- Input snapshot, IE. kitti_best.pth

    Returns:
        (net, img_transform, args) -- PyTorch model, the image transform
        function, and the Args object the network was built from.
    """
    cudnn.benchmark = False
    torch.cuda.empty_cache()
    # Save dir and architecture are fixed; only the snapshot varies.
    args = Args('./save', 'network.deepv3.DeepWV3Plus', snapshot)
    assert_and_infer_cfg(args, train_mode=False)
    # get net
    net = network.get_net(args, criterion=None)
    net = torch.nn.DataParallel(net).cuda()
    print('Net built.')
    net, _ = restore_snapshot(net, optimizer=None, snapshot=snapshot,
                              restore_optimizer_bool=False)
    net.eval()
    print('Net restored.')
    # get data
    # ImageNet mean/std normalization.
    mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    img_transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(*mean_std)])
    return net, img_transform, args
def main():
    """Train (or benchmark-evaluate) the segmentation network.

    Sets up config, tensorboard writer, loaders, loss, network, and
    optimizer from the module-level ``args``, then either runs a one-off
    evaluation (``--evaluate``) or the full epoch loop.
    """
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)
    torch.cuda.empty_cache()

    if args.evaluate:
        # Early evaluation for benchmarking.
        # Fix: `epoch` was referenced here before assignment (NameError);
        # use epoch 0 for the one-off benchmark validation pass.
        validate(val_loader, net, criterion_val, optim, 0, writer)
        evaluate(val_loader, net)
        return

    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Publish the current epoch through the (normally immutable) config.
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()
        train(train_loader, net, criterion, optim, epoch, writer)
        validate(val_loader, net, criterion_val, optim, epoch, writer)
def main(): """ Main Function """ # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer assert_and_infer_cfg(args) prep_experiment(args, parser) writer = None _, _, _, extra_val_loaders, _ = datasets.setup_loaders(args) criterion, criterion_val = loss.get_loss(args) criterion_aux = loss.get_loss_aux(args) net = network.get_net(args, criterion, criterion_aux) optim, scheduler = optimizer.get_optimizer(args, net) net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net) net = network.warp_network_in_dataparallel(net, args.local_rank) epoch = 0 i = 0 if args.snapshot: epoch, mean_iu = optimizer.load_weights(net, optim, scheduler, args.snapshot, args.restore_optimizer) print("#### iteration", i) torch.cuda.empty_cache() # Main Loop # for epoch in range(args.start_epoch, args.max_epoch): for dataset, val_loader in extra_val_loaders.items(): print("Extra validating... This won't save pth file") validate(val_loader, dataset, net, criterion_val, optim, scheduler, epoch, writer, i, save_pth=False)
def main_worker(gpu, ngpus_per_node):
    """Per-GPU training worker for DistributedDataParallel.

    Args:
        gpu: local GPU index; also used as the process rank (single node).
        ngpus_per_node: world size for the process group.
    """
    print('Use GPU:', gpu)
    # One process per GPU on a single node; rank == local GPU index.
    dist.init_process_group(backend='nccl',
                            init_method='tcp://localhost:23456',
                            world_size=ngpus_per_node, rank=gpu)
    print('Group initialized.')
    model_dnet = network.get_net()
    saver = M.Saver(model_dnet)
    saver.restore('./model/')  # resume from the latest checkpoint dir
    # model_dnet.bn_eps(1e-5)
    model = loss.ModelWithLoss(model_dnet)  # wrap so forward returns the loss
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    model.train()
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    print('Model get.')
    loader, sampler = datareader.get_train_dataloader(64)
    optim = torch.optim.AdamW(model.parameters(), lr=config.init_lr)
    # NOTE(review): training starts at a hard-coded epoch 75 — presumably to
    # match the restored checkpoint; confirm before reuse.
    for e in range(75, config.max_epoch):
        print('Replica:%d Epoch:%d' % (gpu, e))
        sampler.set_epoch(e)  # reshuffle the distributed sampler each epoch
        for i, (img, hmap) in enumerate(loader):
            # print(img.shape, hmap.shape, hmap_match.shape)
            optim.zero_grad()
            hmap_loss, outs = model(img, hmap)
            hmap_loss = hmap_loss.mean()  # average loss across the batch dim
            loss_total = hmap_loss
            loss_total.backward()
            optim.step()
            lr = optim.param_groups[0]['lr']
            if i % 200 == 0 and gpu == 0:
                # Dump prediction/ground-truth visualizations from rank 0 only.
                if not os.path.exists('./outputs/'):
                    os.mkdir('./outputs/')
                visutil.vis_batch(img, outs, './outputs/%d_out.jpg' % i)
                visutil.vis_batch(img, hmap, './outputs/%d_gt.jpg' % i)
            if i % 20 == 0:
                curr_time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
                print('%s Replica:%d Progress:%d/%d LsC:%.3e LR:%.1e' %
                      (curr_time, gpu, i, len(loader), hmap_loss, lr))

        if e in config.lr_epoch:
            # Step-decay: divide the learning rate by 10 at configured epochs.
            newlr = lr * 0.1
            for param_group in optim.param_groups:
                param_group['lr'] = newlr
        if e % config.save_interval == 0 and gpu == 0:
            # Random stamp avoids clobbering checkpoints from the same epoch.
            stamp = random.randint(0, 1000000)
            saver.save('./model/%d_%d.pth' % (e, stamp))
def get_net(optimizer=None, criterion=None):
    """Build the training network and load weights from ``args.snapshot``.

    Args:
        optimizer: optional optimizer handed to ``restore_snapshot`` (its
            state is not restored either way).
        criterion: loss module forwarded to ``network.get_net``.

    Returns:
        The network with snapshot weights loaded.
    """
    model = network.get_net(args, criterion=criterion)
    restored, _ = restore_snapshot(model,
                                   optimizer=optimizer,
                                   snapshot=args.snapshot,
                                   restore_optimizer_bool=False)
    # restored.train()  # left to the caller
    return restored
def get_net():
    """Build the network for evaluation.

    Converts BatchNorm to SyncBatchNorm, wraps the model for distributed
    execution on ``args.local_rank``, restores ``args.snapshot`` weights
    (optimizer/scheduler state is discarded), and switches to eval mode.

    Returns:
        The restored network in eval mode.
    """
    logging.info('Load model file: %s', args.snapshot)
    model = network.get_net(args, criterion=None)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = network.warp_network_in_dataparallel(model, args.local_rank)
    model, _, _, _, _ = restore_snapshot(model,
                                         optimizer=None,
                                         scheduler=None,
                                         snapshot=args.snapshot,
                                         restore_optimizer_bool=False)
    model.eval()
    return model
def main(): """ Main Function """ # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer assert_and_infer_cfg(args) writer = prep_experiment(args, parser) train_loader, val_loader, train_obj = datasets.setup_loaders(args) criterion, criterion_val = loss.get_loss(args) net = network.get_net(args, criterion) optim, scheduler = optimizer.get_optimizer(args, net) if args.fix_bn: net.apply(set_bn_eval) print("Fix bn for finetuning") if args.fp16: net, optim = amp.initialize(net, optim, opt_level="O1") net = network.wrap_network_in_dataparallel(net, args.apex) if args.snapshot: optimizer.load_weights(net, optim, args.snapshot, args.restore_optimizer) if args.evaluateF: assert args.snapshot is not None, "must load weights for evaluation" evaluate(val_loader, net, args) return torch.cuda.empty_cache() # Main Loop for epoch in range(args.start_epoch, args.max_epoch): # Update EPOCH CTR cfg.immutable(False) cfg.EPOCH = epoch cfg.immutable(True) scheduler.step() train(train_loader, net, optim, epoch, writer) if args.apex: train_loader.sampler.set_epoch(epoch + 1) validate(val_loader, net, criterion_val, optim, epoch, writer) if args.class_uniform_pct: if epoch >= args.max_cu_epoch: train_obj.build_epoch(cut=True) if args.apex: train_loader.sampler.set_num_samples() else: train_obj.build_epoch()
def get_net():
    """Build the network for evaluation.

    Wraps the model either in the project's gather-less ``MyDataParallel``
    (pooling inference mode) or in plain ``torch.nn.DataParallel``, restores
    ``args.snapshot`` weights, and switches to eval mode.

    Returns:
        The restored network in eval mode.
    """
    logging.info('Load model file: %s', args.snapshot)
    model = network.get_net(args, criterion=None)
    if args.inference_mode == 'pooling':
        wrapped = MyDataParallel(model, gather=False).cuda()
    else:
        wrapped = torch.nn.DataParallel(model).cuda()
    wrapped, _ = restore_snapshot(wrapped,
                                  optimizer=None,
                                  snapshot=args.snapshot,
                                  restore_optimizer_bool=False)
    wrapped.eval()
    return wrapped
def get_segmentation(self):
    # Get Segmentation Net
    # Builds the Cityscapes segmentation network from self.opt, restores the
    # configured snapshot, and stores the eval-mode model on self.seg_net.
    assert_and_infer_cfg(self.opt, train_mode=False)
    self.opt.dataset_cls = cityscapes
    net = network.get_net(self.opt, criterion=None)
    net = torch.nn.DataParallel(net).cuda()
    print('Segmentation Net Built.')

    # NOTE(review): joining cwd with the module dir looks redundant —
    # os.path.join drops earlier components when a later one is absolute;
    # verify the intended snapshot location.
    snapshot = os.path.join(os.getcwd(), os.path.dirname(__file__),
                            self.opt.snapshot)
    self.seg_net, _ = restore_snapshot(net, optimizer=None, snapshot=snapshot,
                                       restore_optimizer_bool=False)
    self.seg_net.eval()
    print('Segmentation Net Restored.')
def main(eval_args=None):
    """Evaluate the segmentation network on the validation set.

    Args:
        eval_args: optional argument list (currently unused; kept for the
            rest_communication.py caller's interface).

    Returns:
        Whatever ``evaluate`` returns for the validation loader.
    """
    # Parse arguments from rest_communication.py
    #args = parser.parse_args(eval_args)
    # Fix: compare to None with `is`, not `==` (PEP 8).
    if args.snapshot is None:
        # Default to the bundled best Cityscapes checkpoint.
        args.snapshot = "checkpoints/best_cityscapes_checkpoint.pth"
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    net = restore_snapshot(net)
    torch.cuda.empty_cache()
    return evaluate(val_loader, net)
def main(): """ Main Function """ # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer assert_and_infer_cfg(args) writer = prep_experiment(args, parser) train_loader, val_loader, train_obj = datasets.setup_loaders(args) criterion, criterion_val = loss.get_loss(args) net = network.get_net(args, criterion) optim, scheduler = optimizer.get_optimizer(args, net) if args.fp16: net, optim = amp.initialize(net, optim, opt_level="O1") net = network.warp_network_in_dataparallel(net, args.apex) if args.snapshot: optimizer.load_weights(net, optim, args.snapshot, args.restore_optimizer) torch.cuda.empty_cache() # Main Loop for epoch in range(args.start_epoch, args.max_epoch): # Update EPOCH CTR cfg.immutable(False) cfg.EPOCH = epoch cfg.immutable(True) #snapshot="/srv/beegfs02/scratch/language_vision/data/Sound_Event_Prediction/audio/semanticPred/logs/ckpt/default/Omni-network.deepv3_audioBG_Spec_diffmask_Comp_noBG_Paralleltask_depth_noSeman.DeepWV3Plus/models_depth/SOP_epoch_"+str(14)+".pth" #optimizer.load_weights(net, optim, # snapshot, args.restore_optimizer) scheduler.step() train(train_loader, net, optim, epoch, writer) if args.apex: train_loader.sampler.set_epoch(epoch + 1) validate(val_loader, net, criterion_val, optim, epoch, writer) if args.class_uniform_pct: if epoch >= args.max_cu_epoch: train_obj.build_epoch(cut=True) if args.apex: train_loader.sampler.set_num_samples() else: train_obj.build_epoch()
def main():
    """
    Main Function

    Trainer with three resume paths (cluster auto-resume, --resume
    checkpoint, weights-only --snapshot), optional model summary, two
    evaluation modes (--eval val / --eval folder), and the training loop.
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True, hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    # Resume priority: cluster auto-resume > explicit --resume > --snapshot.
    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        # Print MACs/params for a 1024x2048 input, then exit.
        print(str(net))
        from pytorchOpCounter.thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        # Fast-forward the LR schedule to the resume epoch.
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    # --eval val                       just run validation
    # --eval val --dump_assets         dump all images and assets
    # --eval folder                    just dump all basic images
    # --eval folder --dump_assets      dump all images and assets
    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        # NOTE(review): raising a str is invalid in Python 3 (this line would
        # itself raise TypeError); should likely be a ValueError.
        raise 'unknown eval option {}'.format(args.eval)

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        # Rebuild the epoch's sample list according to the sampling mode.
        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            # Distributed sampler: reshuffle for the next epoch.
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
# Demo setup: build the KITTI DeepWV3Plus network, load the best checkpoint,
# and prepare the input image transform.
import network
from optimizer import restore_snapshot
from datasets import cityscapes, kitti  # We only need BN layer
from config import infer_cfg, cfg

infer_cfg(train_mode=False)
cudnn.benchmark = False
torch.cuda.empty_cache()

img_dir = './test_imgs/'

# get net
arch = 'network.deepv3.DeepWV3Plus'
dataset_cls = kitti
net = network.get_net(arch, dataset_cls, criterion=None)
net = torch.nn.DataParallel(net).cuda()
print('Net built.')
ckpt_path = './kitti_best.pth'
# Load raw state dict directly rather than via restore_snapshot.
ckpt = torch.load(ckpt_path)
net.load_state_dict(ckpt['state_dict'])
#net, _ = restore_snapshot(net, optimizer=None, snapshot=ckpt_path, restore_optimizer_bool=False)
net.eval()
print('Net restored.')

# get data
demo_img_path = os.path.join(img_dir, 'kitti-13.png')
# ImageNet mean/std normalization.
mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
img_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(*mean_std)])
def main():
    """
    Main Function

    Inference-only entry point for KITTI-style folders: restores a
    checkpoint and dumps predictions for the fixed 7481-image split into
    image_2/ and image_3/ under args.result_dir.
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=False, hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    # Resume priority: cluster auto-resume > explicit --resume > --snapshot.
    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        # Fast-forward the LR schedule to the resume epoch.
        scheduler.step(args.start_epoch)

    if args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        # validate(val_loader, net, criterion=None, optim=None, epoch=0,
        #          calc_metrics=False, dump_assets=args.dump_assets,
        #          dump_all_images=True)
        if not os.path.exists(args.result_dir + 'image_2/'):
            os.mkdir(args.result_dir + 'image_2/')
        if not os.path.exists(args.result_dir + 'image_3/'):
            os.mkdir(args.result_dir + 'image_3/')
        # KITTI object-detection split size (left/right camera folders).
        num_image = 7481
        for idx in tqdm(range(num_image)):
            sample_idx = "%06d" % idx
            eval_minibatch(sample_idx, "image_2/", net, args)
            eval_minibatch(sample_idx, "image_3/", net, args)
        return 0
    elif args.eval is not None:
        # NOTE(review): raising a str is invalid in Python 3 (this line would
        # itself raise TypeError); should likely be a ValueError.
        raise 'unknown eval option {}'.format(args.eval)
def convert_segmentation_model(model_name='segmentation.onnx'):
    """Export the Cityscapes segmentation net to ONNX and sanity-check it.

    Builds the net from the module-level ``opt``, restores ``opt.snapshot``,
    exports to ``model_name`` with a dynamic batch axis, then compares
    ONNXRuntime output against PyTorch within rtol/atol 1e-03.
    """
    assert_and_infer_cfg(opt, train_mode=False)
    cudnn.benchmark = False
    torch.cuda.empty_cache()

    # Get segmentation Net
    opt.dataset_cls = cityscapes
    net = network.get_net(opt, criterion=None)
    net = torch.nn.DataParallel(net).cuda()
    print('Segmentation Net built.')
    net, _ = restore_snapshot(net, optimizer=None, snapshot=opt.snapshot,
                              restore_optimizer_bool=False)
    net.eval()
    print('Segmentation Net Restored.')

    # Input to the model
    batch_size = 1
    x = torch.randn(batch_size, 3, 1024, 2048, requires_grad=True).cuda()
    torch_out = net(x)

    # Export the model (unwrap DataParallel via net.module).
    torch.onnx.export(
        net.module,  # model being run
        x,  # model input (or a tuple for multiple inputs)
        model_name,  # where to save the model (can be a file or file-like object)
        export_params=True,  # store the trained parameter weights inside the model file
        opset_version=11,  # the ONNX version to export the model to
        do_constant_folding=True,  # whether to execute constant folding for optimization
        input_names=['input'],  # the model's input names
        output_names=['output'],  # the model's output names
        dynamic_axes={
            'input': {0: 'batch_size'},  # variable length axes
            'output': {0: 'batch_size'}
        })

    ort_session = onnxruntime.InferenceSession(model_name)

    def to_numpy(tensor):
        # Detach first if the tensor participates in autograd.
        return tensor.detach().cpu().numpy(
        ) if tensor.requires_grad else tensor.cpu().numpy()

    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
    ort_outs = ort_session.run(None, ort_inputs)

    # compare ONNX Runtime and PyTorch results
    np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0],
                               rtol=1e-03, atol=1e-03)

    print(
        "Exported model has been tested with ONNXRuntime, and the result looks good!"
    )
def main():
    """
    Main Function

    Multi-task trainer: one network with two heads (semantic + traversability)
    and two losses; the commented-out duplicates are remnants of an earlier
    two-optimizer experiment.
    """
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    # args2 = copy.deepcopy(args)
    assert_and_infer_cfg(args)
    # assert_and_infer_cfg(args2)
    # args2.dataset = 'kitti_trav'
    # print(args.dataset)
    # print(args2.dataset)
    writer = prep_experiment(args, parser)
    # writer = prep_experiment(args2, parser)

    # Dataset
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    # train_loader2, val_loader2, train_obj2 = datasets.setup_loaders(args2)

    # Two losses: one per task head.
    criterion, criterion_val = loss.get_loss(args, data_type='semantic')
    criterion2, criterion_val2 = loss.get_loss(args, data_type='trav')
    net = network.get_net(args, criterion, criterion2)

    #parameters list
    # param1_lists = list(net.mod1.parameters()) + list(net.mod2.parameters()) + list(net.mod3.parameters()) + list(net.mod4.parameters()) + list(net.mod5.parameters()) + list(net.mod6.parameters()) + list(net.mod7.parameters()) + list(net.pool2.parameters()) + list(net.pool3.parameters()) + list(net.aspp.parameters()) + list(net.bot_fine.parameters()) + list(net.bot_aspp.parameters()) + list(net.final.parameters()) + [log_sigma_A]
    # param2_lists = list(net.mod1.parameters()) + list(net.mod2.parameters()) + list(net.mod3.parameters()) + list(net.mod4.parameters()) + list(net.mod5.parameters()) + list(net.mod6.parameters()) + list(net.mod7.parameters()) + list(net.pool2.parameters()) + list(net.pool3.parameters()) + list(net.aspp.parameters()) + list(net.bot_fine.parameters()) + list(net.bot_aspp.parameters()) + list(net.final2.parameters()) + [log_sigma_B]

    #optimizers
    optim, scheduler = optimizer.get_optimizer(args, net)
    # optim2, scheduler2 = optimizer.get_optimizer(args, param2_lists)

    if args.fp16:
        # Mixed-precision training via NVIDIA apex AMP.
        net, optim = amp.initialize(net, optim, opt_level="O1")

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.snapshot:
        optimizer.load_weights(net, optim,
                               args.snapshot, args.snapshot2,
                               args.restore_optimizer)
        # optimizer.load_weights(net, optim2,
        #                        args.snapshot, args.snapshot2, args.restore_optimizer)

    torch.cuda.empty_cache()
    # Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR through the (normally immutable) config.
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()
        train(train_loader, net, optim, epoch, writer)
        if args.apex:
            # Distributed sampler: reshuffle for the next epoch.
            train_loader.sampler.set_epoch(epoch + 1)
            # train_loader2.sampler.set_epoch(epoch + 1)
        validate(val_loader, net, criterion_val, criterion_val2, optim,
                 epoch, writer)
        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                # train_obj2.build_epoch(cut=True)
                if args.apex:
                    train_loader.sampler.set_num_samples()
                    # train_loader2.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
# start experiment n_pool = len(Y_tr) n_test = len(Y_te) print('number of labeled pool: {}'.format(NUM_INIT_LB)) print('number of unlabeled pool: {}'.format(n_pool - NUM_INIT_LB)) print('number of testing pool: {}'.format(n_test)) # generate initial labeled pool idxs_lb = np.zeros(n_pool, dtype=bool) idxs_tmp = np.arange(n_pool) np.random.shuffle(idxs_tmp) idxs_lb[idxs_tmp[:NUM_INIT_LB]] = True # load network net = get_net(DATA_NAME) handler = get_handler(DATA_NAME) strategy = RandomSampling(X_tr, Y_tr, idxs_lb, net, handler, args) # print info print(DATA_NAME) print(type(strategy).__name__) # round 0 accuracy strategy.set_test_data(x=X_te, y=Y_te) strategy.train() P = strategy.predict(X_te, Y_te) acc = np.zeros(NUM_ROUND + 1) acc[0] = 1.0 * (torch.max(Y_te, 1)[1] == P).sum().item() / len(Y_te) print('Round 0\ntesting accuracy {}'.format(acc[0]))
def main():
    ''' Main Function

    GSCNN entry point: in "test" mode, dump raw segmentation/edge predictions
    for every validation image and exit; otherwise optionally benchmark-
    evaluate or run the full training loop.
    '''
    #Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args,parser)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = loss.get_loss(args)
    net = network.get_net(args, criterion)
    optim, scheduler = optimizer.get_optimizer(args, net)
    torch.cuda.empty_cache()

    if args.mode=="test":
        test_sv_path = args.test_sv_path
        print(f"Saving prediction {test_sv_path}")
        net.eval()
        for vi, data in enumerate(tqdm(val_loader)):
            input, mask, img_name, img_path = data
            # Expect NCHW input and NHW mask with matching spatial dims.
            assert len(input.size()) == 4 and len(mask.size()) == 3
            assert input.size()[2:] == mask.size()[1:]
            b, h, w = mask.size()
            batch_pixel_size = input.size(0) * input.size(2) * input.size(3)
            input, mask_cuda = input.cuda(), mask.cuda()

            with torch.no_grad():
                seg_out, edge_out = net(input)    # output = (1, 19, 713, 713)

            seg_predictions = seg_out.data.cpu().numpy()
            edge_predictions = edge_out.cpu().numpy()

            for i in range(b):
                _,file_name = os.path.split(img_path[i])
                file_name = file_name.replace("jpg","png")
                # First 5 chars of the path identify the sequence folder.
                seq = img_path[i][:5]
                seg_path = os.path.join(test_sv_path,"gscnn","seg",seq)
                if not os.path.exists(seg_path):
                    os.makedirs(seg_path)
                edge_path = os.path.join(test_sv_path,"gscnn","edge",seq)
                edgenp_path = os.path.join(test_sv_path,"gscnn","edgenp",seq)
                if not os.path.exists(edge_path):
                    os.makedirs(edge_path)
                    os.makedirs(edgenp_path)
                # Argmax over the class axis gives uint8 index maps, which
                # are replicated to 3 channels for image export.
                seg_arg = np.argmax(seg_predictions[i],axis=0).astype(np.uint8)
                edge_arg = np.argmax(edge_predictions[i],axis=0).astype(np.uint8)
                seg_img = np.stack((seg_arg,seg_arg,seg_arg),axis=2)
                edge_img = np.stack((edge_arg,edge_arg,edge_arg),axis=2)
                seg_img = Image.fromarray(seg_img)
                seg_img.save(os.path.join(seg_path,file_name))
                edge_img = Image.fromarray(edge_img)
                edge_img.save(os.path.join(edge_path,file_name))
                # NOTE(review): the raw edge array is saved under edge_path
                # even though edgenp_path was created above — possibly meant
                # to be edgenp_path; confirm before changing.
                np.save(os.path.join(edge_path,file_name.replace("png","npy")),edge_predictions[i])
        return

    if args.evaluate:
        # Early evaluation for benchmarking
        default_eval_epoch = 1
        validate(val_loader, net, criterion_val, optim,
                 default_eval_epoch, writer)
        evaluate(val_loader, net)
        return

    #Main Loop
    for epoch in range(args.start_epoch, args.max_epoch):
        # Update EPOCH CTR through the (normally immutable) config.
        cfg.immutable(False)
        cfg.EPOCH = epoch
        cfg.immutable(True)

        scheduler.step()
        train(train_loader, net, criterion, optim, epoch, writer)
        validate(val_loader, net, criterion_val, optim, epoch, writer)
# Demo script: parse CLI args, build the Cityscapes network, restore the
# snapshot, and prepare a single image for prediction.
from datasets import cityscapes
from config import assert_and_infer_cfg

parser = argparse.ArgumentParser(description='demo')
parser.add_argument('--demo-image', type=str, default='',
                    help='path to demo image', required=True)
parser.add_argument('--snapshot', type=str,
                    default='./pretrained_models/cityscapes_best_wideresnet38.pth',
                    help='pre-trained checkpoint', required=True)
parser.add_argument('--arch', type=str, default='network.deepv3.DeepWV3Plus',
                    help='network architecture used for inference')
parser.add_argument('--save-dir', type=str, default='./save',
                    help='path to save your results')
args = parser.parse_args()
assert_and_infer_cfg(args, train_mode=False)

cudnn.benchmark = False
torch.cuda.empty_cache()

# get net
args.dataset_cls = cityscapes
net = network.get_net(args, criterion=None)
net = torch.nn.DataParallel(net).cuda()
print('Net built.')
net, _ = restore_snapshot(net, optimizer=None, snapshot=args.snapshot,
                          restore_optimizer_bool=False)
net.eval()
print('Net restored.')

# get data
# ImageNet mean/std normalization.
mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
img_transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize(*mean_std)])

img = Image.open(args.demo_image).convert('RGB')
img_tensor = img_transform(img)

# predict
with torch.no_grad():
    # Add the batch dimension and move to GPU.
    img = img_tensor.unsqueeze(0).cuda()
def main():
    """
    Main Function

    Horovod-based training/benchmark entry point: initializes Horovod, pins
    this process to its local GPU, wraps the optimizer with
    hvd.DistributedOptimizer (fp16 allreduce compression), trains for the
    configured epochs, checkpoints from rank 0, and accumulates per-epoch
    timing/loss/IoU stats into a pickled benchmark dict (optionally merged
    into a CSV).
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    hvd.init()
    torch.manual_seed(999999)
    #if args.cuda:
    args.cuda = True
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    #torch.cuda.manual_seed(args.seed)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True, hparams=vars(args),
                    global_rank=args.global_rank)
    #print("vefore assert and infer")
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    # args.ngpu = torch.cuda.device_count()
    # args.best_record = {'mean_iu': -1, 'epoch': 0}
    #print("before datasets / loss")
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    # Checkpoint filename encodes the MPI world size so runs at different
    # scales do not collide.
    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-hvd-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, " \
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    # todo: HeAT fixes -- urgent -- DDDP / optim / scheduler
    net = network.get_net(args, criterion)
    # net = net.to(device)
    # todo: optim -> direct wrap after this, scheduler stays the same?
    optim, scheduler = get_optimizer(args, net)

    # if args.fp16:
    #     net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)
    # Gradients are compressed to fp16 for the allreduce.
    compression = hvd.Compression.fp16  # if args.fp16_allreduce else hvd.Compression.none
    optim = hvd.DistributedOptimizer(
        optim,
        named_parameters=net.named_parameters(),
        compression=compression,
        backward_passes_per_step=1,  # args.batches_per_allreduce,
        op=hvd.Average,
        gradient_predivide_factor=1.0,  # args.gradient_predivide_factor)
    )
    #print("after hvd optimizer setup")

    if args.summary:
        # Print MACs/params for a 1024x2048 input, then exit.
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        #net.loat_state_dict(checkpoint["state_dict"])
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()
    #print("before parameter broadcasts")
    #hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    #hvd.broadcast_optimizer_state(optim, root_rank=0)

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)

    #net = net.cuda()
    # There are 4 options for evaluation:
    # --eval val                       just run validation
    # --eval val --dump_assets         dump all images and assets
    # --eval folder                    just dump all basic images
    # --eval folder --dump_assets      dump all images and assets
    # todo: HeAT fixes -- not urgent --
    # if args.eval == 'val':
    #     if args.dump_topn:
    #         validate_topn(val_loader, net, criterion_val, optim, 0, args)
    #     else:
    #         validate(val_loader, net, criterion=criterion_val, optim=optim, epoch=0,
    #                  dump_assets=args.dump_assets,
    #                  dump_all_images=args.dump_all_images,
    #                  calc_metrics=not args.no_metrics)
    #     return 0
    # elif args.eval == 'folder':
    #     # Using a folder for evaluation means to not calculate metrics
    #     validate(val_loader, net, criterion=None, optim=None, epoch=0,
    #              calc_metrics=False, dump_assets=args.dump_assets,
    #              dump_all_images=True)
    #     return 0
    # elif args.eval is not None:
    #     raise 'unknown eval option {}'.format(args.eval)

    scaler = None  #amp.GradScaler()
    args.amp = False  #True

    # Benchmark bookkeeping, keyed by node count.
    nodes = str(int(hvd.size() / torch.cuda.device_count()))
    cwd = os.getcwd()
    fname = cwd + "/" + nodes + "-hvd-citys-benchmark"
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
    print0("Output dict:", fname)
    # train_losses, train_btimes, train_ttime = [], [], []
    # val_losses, val_iu, val_ttime = [], [], []
    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)

        # Rebuild the epoch's sample list according to the sampling mode.
        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass

        ls, bt, btt = train(train_loader, net, optim, epoch, scaler)
        # dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)
        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, net, criterion_val, optim, epoch)
        if args.lr_schedule == "plateau":
            scheduler.step(ls)  # val_loss)
        else:
            scheduler.step()

        if args.rank == 0:
            # Only rank 0 writes checkpoints.
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": net.state_dict(),
                "optimizer": optim.state_dict(),
                # "skip_stable": optim.stability.get_dict()
            })
        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)
        if args.rank == 0:
            save_obj(out_dict, fname)
    if args.rank == 0:
        # Print the per-epoch results table and optionally merge into the
        # cumulative benchmark CSV.
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None,
                               "display.max_columns",
                               None):  # more options can be specified also
            print(df)
        if args.benchmarking:
            try:
                fulldf = pd.read_csv(cwd + "/hvd-bench-results.csv")
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/hvd-bench-results.csv")
# Demo setup: build the Cityscapes segmentation network from TestOptions,
# restore the snapshot, and list the input image folder.
sys.path.insert(0, './image_segmentation')
import network
from optimizer import restore_snapshot
from datasets import cityscapes
from config import assert_and_infer_cfg

# NOTE(review): rebinding the class name to its instance shadows the
# TestOptions class for the rest of this module — confirm nothing below
# needs the class itself.
TestOptions = TestOptions()
opt = TestOptions.parse()
assert_and_infer_cfg(opt, train_mode=False)
cudnn.benchmark = False
torch.cuda.empty_cache()

# Get segmentation Net
opt.dataset_cls = cityscapes
net = network.get_net(opt, criterion=None)
net = torch.nn.DataParallel(net).cuda()
print('Segmentation Net built.')
net, _ = restore_snapshot(net, optimizer=None, snapshot=opt.snapshot,
                          restore_optimizer_bool=False)
net.eval()
print('Segmentation Net Restored.')

# Get RGB Original Images
data_dir = opt.demo_folder
images = os.listdir(data_dir)
if len(images) == 0:
    print('There are no images at directory %s. Check the data path.' % (data_dir))
def main():
    """
    Main Function.

    Full training / evaluation driver:
      1. Initialize logging (logx) and optionally AutoResume.
      2. Build loaders, losses, network, and the (NASA-style) optimizer.
      3. Restore state from an auto-resume checkpoint, --resume checkpoint,
         or --snapshot (weights only), in that priority order.
      4. If --eval is given, run the requested validation mode and exit.
      5. Otherwise run the epoch loop: (re)build the epoch's dataset,
         train, periodically validate, step the scheduler, and honor
         early-termination requests.

    Reads module-level `args`; returns 0 on the eval/termination paths,
    otherwise None after the final epoch.
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        # Auto-resume: restore checkpoint path, tensorboard dir and epoch
        # recorded by a previous run, then request net+optimizer restore.
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        # Explicit resume: restart at the epoch after the checkpointed one.
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        # Snapshot: load weights only (no optimizer state, epoch stays 0).
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    # define the NASA optimizer parameter
    iter_tot = len(train_loader) * args.max_epoch
    # tau = args.tau_factor/sqrt(iter_tot)
    tau = 1
    net = network.get_net(args, criterion)
    k = 1
    # optim, scheduler = get_optimizer(args, net)
    optim, scheduler = get_optimizer(args, net, tau, k)

    # Visualize feature maps
    # activation = {}
    # def get_activation(name):
    #     def hook(model, input, output):
    #         activation[name] = output.detach()
    #     return hook
    # net.layer[0].register_forward_hook(get_activation('conv1'))
    # data, _ = dataset[0]
    # data.unsqueeze_(0)
    # output = model(data)
    # act = activation['conv1'].squeeze()
    # fig, axarr = plt.subplots(act.size(0))
    # for idx in range(act.size(0)):
    #     axarr[idx].imshow(act[idx])

    if args.fp16:
        # Mixed-precision wrapping must happen before DataParallel wrapping.
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        # One-shot FLOPs/params summary at 640x640, then exit.
        from thop import profile
        img = torch.randn(1, 3, 640, 640).cuda()
        mask = torch.randn(1, 1, 640, 640).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        # Fast-forward the scheduler to the resumed epoch.
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    if args.eval == 'test':
        # NOTE(review): `city` is not defined in this block — presumably a
        # module-level grid object; confirm it exists before using this path.
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True, testing=True, grid=city)
        return 0

    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim,
                     epoch=0, dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=criterion_val, optim=optim,
                 epoch=0, calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        # BUGFIX: raising a plain string is a TypeError in Python 3 and hides
        # the real error message; raise a proper exception instead.
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        # Rebuild the epoch's sample set according to coarse/class-uniform
        # sampling options; apex samplers need their sample count refreshed.
        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
def main():
    """
    Main Function.

    Iteration-driven training loop (runs until `args.max_iter`):
      * builds loaders, losses (main + auxiliary), the network wrapped in
        SyncBatchNorm + DataParallel, and the optimizer/scheduler;
      * optionally restores weights (and iteration counter) from a snapshot;
      * each pass: records the current iteration in the global cfg, trains
        one epoch, optionally recomputes covariance-statistics masks
        (ISW path), checkpoints on rank 0, and rebuilds the class-uniform
        epoch sampling;
      * afterwards runs validation over the main and extra val loaders.

    Reads module-level `args`, `parser`, `cfg`; returns None.
    """
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)

    train_loader, val_loaders, train_obj, extra_val_loaders, covstat_val_loaders = datasets.setup_loaders(
        args)

    criterion, criterion_val = loss.get_loss(args)
    criterion_aux = loss.get_loss_aux(args)
    net = network.get_net(args, criterion, criterion_aux)

    optim, scheduler = optimizer.get_optimizer(args, net)

    # Sync BN across processes, then wrap for (distributed) data parallelism.
    net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = network.warp_network_in_dataparallel(net, args.local_rank)
    epoch = 0
    i = 0  # global iteration counter

    if args.snapshot:
        epoch, mean_iu = optimizer.load_weights(net, optim, scheduler,
                                                args.snapshot,
                                                args.restore_optimizer)
        if args.restore_optimizer is True:
            # Resume mid-run: reconstruct the iteration counter from the
            # restored epoch (assumes a constant number of iters per epoch).
            iter_per_epoch = len(train_loader)
            i = iter_per_epoch * epoch
        else:
            epoch = 0

    print("#### iteration", i)
    torch.cuda.empty_cache()

    # Main Loop
    # for epoch in range(args.start_epoch, args.max_epoch):
    while i < args.max_iter:
        # Update EPOCH CTR: the global cfg is immutable by default, so it is
        # briefly unlocked to record the current iteration.
        cfg.immutable(False)
        cfg.ITER = i
        cfg.immutable(True)

        # train() returns the updated global iteration counter.
        i = train(train_loader, net, optim, epoch, writer, scheduler,
                  args.max_iter)
        train_loader.sampler.set_epoch(epoch + 1)

        # Recompute the ISW mask matrices: either periodically (dynamic) or
        # once at epoch == cov_stat_epoch (static).
        if (args.dynamic and args.use_isw and epoch % (args.cov_stat_epoch + 1) == args.cov_stat_epoch) \
           or (args.dynamic is False and args.use_isw and epoch == args.cov_stat_epoch):
            net.module.reset_mask_matrix()
            for trial in range(args.trials):
                for dataset, val_loader in covstat_val_loaders.items():
                    # For get the statistics of covariance
                    validate_for_cov_stat(val_loader, dataset, net,
                                          criterion_val, optim, scheduler,
                                          epoch, writer, i, save_pth=False)
                    net.module.set_mask_matrix()

        if args.local_rank == 0:
            print("Saving pth file...")
            # Checkpoint without running metric evaluation (empty dump list).
            evaluate_eval(args, net, optim, scheduler, None, None, [],
                          writer, epoch, "None", None, i, save_pth=True)

        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()

        epoch += 1

    # Validation after epochs
    if len(val_loaders) == 1:
        # Run validation only one time - To save models
        for dataset, val_loader in val_loaders.items():
            validate(val_loader, dataset, net, criterion_val, optim,
                     scheduler, epoch, writer, i)
    else:
        if args.local_rank == 0:
            print("Saving pth file...")
            evaluate_eval(args, net, optim, scheduler, None, None, [],
                          writer, epoch, "None", None, i, save_pth=True)

    for dataset, val_loader in extra_val_loaders.items():
        print("Extra validating... This won't save pth file")
        validate(val_loader, dataset, net, criterion_val, optim, scheduler,
                 epoch, writer, i, save_pth=False)
def main():
    """
    Main Function.

    HeAT/DASO distributed training driver:
      1. Derive the local rank/device from the global rank and GPU count,
         and initialize the NCCL process group when multiple GPUs exist.
      2. Build loaders, losses, the network, and a DASO data-parallel
         optimizer wrapping the local optimizer.
      3. Optionally resume from a node-count-specific checkpoint file or a
         weights-only snapshot.
      4. If --eval is given, run the requested validation mode and exit.
      5. Otherwise run the epoch loop with AMP grad scaling, recording
         per-epoch loss/IoU/timing into a pickled benchmark dict, and on
         rank 0 finally print (and optionally CSV-append) the results.

    Reads module-level `args`; returns 0 on the eval paths, else None.
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    # NOTE(review): if device_count() == 0 the modulo below raises
    # ZeroDivisionError — this script assumes at least one GPU.
    loc_dist = True if args.gpus > 1 else False
    loc_rank = rank % args.gpus
    args.gpu = loc_rank
    args.local_rank = loc_rank
    if loc_dist:
        # Multi-GPU node: one process per GPU, NCCL over the IB interface.
        device = "cuda:" + str(loc_rank)
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "19500"
        os.environ["NCCL_SOCKET_IFNAME"] = "ib"
        torch.cuda.set_device(device)
        torch.distributed.init_process_group(backend="nccl", rank=loc_rank,
                                             world_size=args.gpus)
        # torch.cuda.set_device(device)
    elif args.gpus == 1:
        args.gpus = torch.cuda.device_count()
        device = "cuda:0"
        args.local_rank = 0
        torch.cuda.set_device(device)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    # args.ngpu = torch.cuda.device_count()
    # args.best_record = {'mean_iu': -1, 'epoch': 0}
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    # Checkpoint file name is keyed on the MPI world size so runs with
    # different node counts do not clobber each other.
    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-heat-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, "
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        # Snapshot: load weights only.
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    net = network.get_net(args, criterion)
    net = net.to(device)
    # args.lr = (1. / args.world_size * (5 * (args.world_size - 1) / 6.)) * 0.0125 * args.world_size

    optim, scheduler = get_optimizer(args, net)
    # the scheduler in this code is only run at the end of each epoch
    # todo: make heat an option not this whole file
    # if args.heat:
    dp_optim = ht.optim.DASO(
        local_optimizer=optim,
        total_epochs=args.max_epoch,
        max_global_skips=4,
    )
    # if args.no_cycling:
    dp_optim.disable_cycling(global_skips=args.batch_skip,
                             batches_to_wait=args.gs)
    # this is where the network is wrapped with DDDP (w/apex) or DP
    htnet = ht.nn.DataParallelMultiGPU(net, comm=ht.MPI_WORLD,
                                       optimizer=dp_optim)

    if args.summary:
        # One-shot FLOPs/params summary at full Cityscapes resolution, then exit.
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
        dp_optim.stability.load_dict(checkpoint["skip_stable"])
    if args.restore_net:
        # restore_net(net, checkpoint)
        htnet.load_state_dict(checkpoint["state_dict"])
        # dp_optim.module.load_state_dist(checkpoint["state_dict"])
    # htnet = ht.nn.DataParallelMultiGPU(net, ht.MPI_WORLD, dp_optim)

    if args.init_decoder:
        net.module.init_mods()
    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    # todo: HeAT fixes -- not urgent --
    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim,
                     epoch=0, dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        # BUGFIX: raising a plain string is a TypeError in Python 3 and hides
        # the real error message; raise a proper exception instead.
        raise ValueError('unknown eval option {}'.format(args.eval))

    scaler = amp.GradScaler()
    if dp_optim.comm.rank == 0:
        print("scheduler", args.lr_schedule)
    dp_optim.add_scaler(scaler)

    # Benchmark bookkeeping: one dict keyed by node count, pickled per epoch.
    nodes = str(int(dp_optim.comm.size / torch.cuda.device_count()))
    cwd = os.getcwd()
    fname = cwd + "/" + nodes + "-heat-citys-benchmark"
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
    print0("Output dict:", fname)

    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)

        # Rebuild the epoch's sample set according to coarse/class-uniform
        # sampling options.
        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass

        ls, bt, btt = train(train_loader, htnet, dp_optim, epoch, scaler)
        dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)
        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, htnet, criterion_val, dp_optim,
                                epoch)

        if args.lr_schedule == "plateau":
            if dp_optim.comm.rank == 0:
                print("loss", ls, 'best:',
                      scheduler.best * (1. - scheduler.threshold),
                      scheduler.num_bad_epochs)
            scheduler.step(ls)  # val_loss)
        else:
            scheduler.step()

        if args.rank == 0:
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": htnet.state_dict(),
                "optimizer": optim.state_dict(),
                "skip_stable": dp_optim.stability.get_dict()
            })

        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)
        if args.rank == 0:
            save_obj(out_dict, fname)

    if args.rank == 0:
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None,
                               "display.max_columns",
                               None):  # more options can be specified also
            print(df)
        if args.benchmarking:
            # Append this run's columns to the accumulated benchmark CSV.
            try:
                fulldf = pd.read_csv(cwd + "/heat-bench-results.csv")
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/heat-bench-results.csv")
default='./save', help='path to save your results') args = parser.parse_args() assert_and_infer_cfg(args, train_mode=False) cudnn.benchmark = False torch.cuda.empty_cache() # setup logger date_str = str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) log_dir = os.path.join(args.save_dir, "log") os.makedirs(log_dir, exist_ok=True) save_log('log', log_dir, date_str, rank=0) # get net args.dataset_cls = cityscapes net = get_net(args, criterion=None) net = torch.nn.DataParallel(net).cuda() logging.info('Net built.') net, _ = restore_snapshot(net, optimizer=None, snapshot=args.snapshot, restore_optimizer_bool=False) net.eval() logging.info('Net restored.') # get data data_dir = args.demo_folder images = os.listdir(data_dir) if len(images) == 0: logging.info('There are no images at directory %s. Check the data path.' % (data_dir))