    num_workers=4)

# ------------ preparation ------------
# Build the model selected in cfg.json.
if exp_cfg['model'] == "scnn":
    net = SCNN(resize_shape, pretrained=True)
elif exp_cfg['model'] == "enet_sad":
    net = ENet_SAD(resize_shape, sad=True, dataset=dataset_name)
else:
    raise Exception(
        "Model does not match: 'model' in 'cfg.json' should be 'scnn' or 'enet_sad'."
    )
net = net.to(device)
net = torch.nn.DataParallel(net)

optimizer = optim.SGD(net.parameters(), **exp_cfg['optim'])
lr_scheduler = PolyLR(optimizer, 0.9, **exp_cfg['lr_scheduler'])
best_val_loss = 1e6


def train(epoch):
    print("Train Epoch: {}".format(epoch))
    net.train()
    train_loss = 0
    train_loss_seg = 0
    train_loss_exist = 0
    progressbar = tqdm(range(len(train_loader)))

    for batch_idx, sample in enumerate(train_loader):
        img = sample['img'].to(device)
        segLabel = sample['segLabel'].to(device)
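# A minimal sketch of the PolyLR scheduler used above: polynomial decay of the
# learning rate with power 0.9. This is an assumption about its behaviour, not
# the repository's actual implementation; the `max_iter` keyword is presumed
# to arrive via exp_cfg['lr_scheduler'].
from torch.optim.lr_scheduler import _LRScheduler

class PolyLR(_LRScheduler):
    """lr = base_lr * (1 - step / max_iter) ** power"""

    def __init__(self, optimizer, power, max_iter, last_epoch=-1):
        self.power = power
        self.max_iter = max_iter
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Decay each base learning rate toward zero over max_iter steps.
        factor = max(0.0, 1.0 - self.last_epoch / self.max_iter) ** self.power
        return [base_lr * factor for base_lr in self.base_lrs]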
    collate_fn=dataset.collate, num_workers=4)

# ------------ preparation ------------
# Number of segmentation classes: use the dataset's own count when it
# provides one, otherwise fall back to 5.
seg_classes = getattr(dataset, 'seg_classes', 5)
net = SCNN(resize_shape, pretrained=True, seg_classes=seg_classes,
           weights=Dataset_Type.get_weights(exp_cfg['dataset']['other']['seg_mode']))
net = net.to(device)
# net = torch.nn.DataParallel(net)

optimizer = optim.Adam(net.parameters(), **exp_cfg['optim'])
lr_scheduler = PolyLR(optimizer, 0.9, **exp_cfg['lr_scheduler'])
best_val_loss = 1e6


def train(epoch):
    print("Train Epoch: {}".format(epoch))
    net.train()
    train_loss = 0
    train_loss_seg = 0
    train_loss_exist = 0
    progressbar = tqdm(range(len(train_loader)))

    for batch_idx, sample in enumerate(train_loader):
        img = sample['img'].to(device)
        segLabel = sample['segLabel'].to(device)
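# A hedged illustration of the cfg.json these scripts read. Every concrete
# value below is an assumption; only the key layout is inferred from the
# exp_cfg lookups above ('model', 'optim', 'lr_scheduler', and
# 'dataset' -> 'other' -> 'seg_mode').
#
# {
#     "model": "scnn",
#     "optim": {"lr": 0.01, "weight_decay": 1e-4},
#     "lr_scheduler": {"max_iter": 30000},
#     "dataset": {"other": {"seg_mode": "default"}}
# }
import json

with open("cfg.json") as f:
    exp_cfg = json.load(f)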
net = SCNN(resize_shape, pretrained=True)
lr_scaler = 1
if torch.cuda.is_available():
    net.cuda()
    # Horovod: scale the learning rate as per the number of local devices.
    if hvd.nccl_built():
        lr_scaler = hvd.local_size()
net = torch.nn.DataParallel(net)

lr = exp_cfg['optim']['lr']
momentum = exp_cfg['optim']['momentum']
weight_decay = exp_cfg['optim']['weight_decay']
nesterov = exp_cfg['optim']['nesterov']

# Horovod: scale learning rate by lr_scaler.
optimizer = optim.SGD(net.parameters(), lr=lr * lr_scaler, momentum=momentum,
                      weight_decay=weight_decay, nesterov=nesterov)

# Horovod: broadcast parameters & optimizer state from rank 0 to all workers.
hvd.broadcast_parameters(net.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Horovod: (optional) compression algorithm.
# compression = hvd.Compression.fp16

# Horovod: wrap optimizer with DistributedOptimizer (keyword arguments
# assumed here, following Horovod's documented API).
gradient_predivide_factor = 1.0
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=net.named_parameters(),
    gradient_predivide_factor=gradient_predivide_factor)
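# A minimal sketch of the Horovod setup that would typically precede the
# snippet above, following the standard Horovod PyTorch examples. The names
# `train_dataset` and `batch_size` are placeholders, not from the original.
import horovod.torch as hvd
import torch
import torch.utils.data.distributed

hvd.init()
if torch.cuda.is_available():
    # Pin each worker process to its local GPU.
    torch.cuda.set_device(hvd.local_rank())

# Partition the dataset across workers so each rank sees a distinct shard.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, sampler=train_sampler)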
val_bdd100k = BDDDataset(image_path=bdd100k_val_img_path,
                         drivable_path=bdd100k_val_dl_path)
val_bdd100k_dataset_loader = DataLoader(dataset=val_bdd100k, **params)

# Declare model & optimizer
net = SCNN(resize_shape, pretrained=True)
net = net.to(device)
# torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size)
# torch.cuda.set_device()
# net = torch.nn.parallel.DistributedDataParallel(net)
# net = torch.nn.DataParallel(net)
# net.eval()

tensorboard = SummaryWriter(exp_dir + "tb/")
optimizer = optim.SGD(net.parameters(), **optim_set)
lr_scheduler = PolyLR(optimizer, 0.9, **lr_set)
best_val_loss = 1000


# @profile
def train(epoch):
    print("Train Epoch: {}".format(epoch))
    net.train()
    train_loss = 0
    train_loss_seg = 0
    # train_loss_exist = 0
    epoch_accuracy = 0
    progressbar = tqdm(range(len(train_bdd100k_dataset_loader)))

    # Training loop
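    # A hedged sketch of the per-epoch bookkeeping that plausibly follows the
    # training loop above (the snippet ends before the loop body). The scalar
    # tags are assumptions; `tensorboard`, `optimizer`, `lr_scheduler`, and
    # the accumulators come from this script.
    tensorboard.add_scalar("train/loss", train_loss, epoch)
    tensorboard.add_scalar("train/accuracy", epoch_accuracy, epoch)
    tensorboard.add_scalar("train/lr", optimizer.param_groups[0]["lr"], epoch)
    lr_scheduler.step()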