def builder(args):
    # Build the MagFace model and wrap only its feature extractor in DDP;
    # the module named by args.parallel_module_name is moved to the GPU
    # but left outside the DDP wrapper.
    model = MagFaceBuilder(args)
    model.features = parallel.DistributedDataParallel(
        model.features.to(args.gpu),
        device_ids=[args.gpu],
        output_device=args.gpu)
    getattr(model, args.parallel_module_name).to(args.gpu)
    return model
def builder(args):
    # Same as above, but first convert every BatchNorm layer to SyncBatchNorm
    # so batch statistics are synchronized across processes.
    model = MagFaceBuilder(args)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(args.gpu)
    model.features = parallel.DistributedDataParallel(
        model.features.to(args.gpu),
        device_ids=[args.gpu],
        output_device=args.gpu)
    getattr(model, args.parallel_module_name).to(args.gpu)
    return model
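# A hypothetical launch helper showing the distributed state the two builders
# above assume: an initialized NCCL process group and a per-rank `args.gpu`.
# This is a sketch; `setup_distributed` and the way `args.gpu` is derived are
# illustrative assumptions, not part of the original code.
import torch
import torch.distributed as dist

def setup_distributed(args):
    # One process per GPU; rank and world size come from the launcher's
    # environment variables (e.g. when started via torchrun).
    dist.init_process_group(backend='nccl')
    args.gpu = dist.get_rank() % torch.cuda.device_count()
    torch.cuda.set_device(args.gpu)
    return args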
def _decorate_model(self, parallel_decorate=True):
    self.logging('=' * 20 + 'Decorate Model' + '=' * 20)

    if self.setting.fp16:
        self.model.half()

    self.model.to(self.device)
    self.logging('Set model device to {}'.format(str(self.device)))

    if parallel_decorate:
        if self.in_distributed_mode():
            self.model = para.DistributedDataParallel(
                self.model,
                device_ids=[self.setting.local_rank],
                output_device=self.setting.local_rank)
            self.logging('Wrap distributed data parallel')
            # self.logging('In Distributed Mode, but do not use DistributedDataParallel Wrapper')
        elif self.n_gpu > 1:
            self.model = para.DataParallel(self.model)
            self.logging('Wrap data parallel')
        else:
            self.logging('Do not wrap parallel layers')
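# A plausible implementation of the `in_distributed_mode` helper used above,
# assuming the trainer records a non-negative local_rank only when launched in
# distributed mode. This is an assumption about the surrounding class, not
# code from the original repository.
def in_distributed_mode(self):
    return self.setting.local_rank is not None and self.setting.local_rank >= 0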
def __main__():
    args = get_args_parser()

    # Distributed setup: one process per GPU, NCCL backend.
    dist.init_process_group(backend='nccl')
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    set_random_seed(args.random_seed + dist.get_rank())
    torch.cuda.set_device(torch.device('cuda:{}'.format(dist.get_rank())))
    dist_logger = DistributedLogger(args.name, args.output_base_path,
                                    args.master_rank, args.use_tensorboard)

    # Data: DistributedSampler shards each dataset across processes.
    train_dataset = TrainDataset(args.dataset_root, args.dataset_year,
                                 (args.input_size_h, args.input_size_w),
                                 args.pooler_size)
    train_sampler = data.distributed.DistributedSampler(train_dataset)
    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       num_workers=args.num_workers,
                                       sampler=train_sampler,
                                       pin_memory=True,
                                       drop_last=True)
    val_dataset = ValDataset(args.dataset_root, args.dataset_year,
                             (args.input_size_h, args.input_size_w))
    val_sampler = data.distributed.DistributedSampler(val_dataset)
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.num_workers,
                                     pin_memory=True,
                                     sampler=val_sampler)

    # Model: wrap the whole network in DDP on this process's GPU.
    model = BlendMask(len(COCO_CLASSES), args.fpn_channels,
                      args.bases_module_channels, args.num_bases,
                      args.atten_size, args.pooler_size).cuda()
    # model.load_state_dict(torch.load(f'./output/{args.name}/model/param.pth'))
    model = parallel.DistributedDataParallel(model,
                                             device_ids=[dist.get_rank()],
                                             find_unused_parameters=True)
    criterion = Criterion(args.focal_alpha, args.focal_gamma)

    # Optimizer: bias parameters get their own lr/weight-decay multipliers.
    optim_parameters = [{
        'params': [
            p for n, p in model.module.named_parameters()
            if not n.endswith('bias') and p.requires_grad
        ]
    }, {
        'params': [
            p for n, p in model.module.named_parameters()
            if n.endswith('bias') and p.requires_grad
        ],
        'lr': args.lr * args.bias_lr_mul,
        'weight_decay': args.weight_decay * args.bias_weight_decay_mul
    }]
    optimizer = optim.SGD(optim_parameters, lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    lr_lambda = utils.lr_lambda.get_warm_up_multi_step_lr_lambda(
        len(train_dataloader), args.warm_up_epoch, args.warm_up_ratio,
        args.milestones, args.step_gamma)
    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    nms_cfg = {
        'nms_pre': args.nms_pre,
        'cls_score_thr': args.nms_cls_score_thr,
        'iou_thr': args.nms_iou_thr
    }

    for epoch_idx in range(args.epochs):
        # set_epoch reshuffles each sampler's shard assignment every epoch.
        train_sampler.set_epoch(epoch_idx)
        val_sampler.set_epoch(epoch_idx)
        engine.train_one_epoch(model, criterion, optimizer, lr_scheduler,
                               train_dataloader, epoch_idx, dist_logger)
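# A sketch of what utils.lr_lambda.get_warm_up_multi_step_lr_lambda could look
# like, assuming linear warm-up over the first warm_up_epoch epochs followed by
# a step_gamma decay at each milestone epoch. Since the scheduler above is
# stepped once per batch, the lambda receives a batch index, which is converted
# back to a fractional epoch here. Illustrative, not the original helper.
def get_warm_up_multi_step_lr_lambda(steps_per_epoch, warm_up_epoch,
                                     warm_up_ratio, milestones, step_gamma):
    def lr_lambda(step):
        epoch = step / steps_per_epoch
        if epoch < warm_up_epoch:
            # Ramp the multiplier linearly from warm_up_ratio up to 1.
            return warm_up_ratio + (1.0 - warm_up_ratio) * epoch / warm_up_epoch
        # Multiply by step_gamma once for every milestone already passed.
        return step_gamma ** sum(1 for m in milestones if epoch >= m)
    return lr_lambda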
def __init__(
    self,
    model,
    loss_fn,
    optimizer,
    train_loader,
    eval_loader,
    lr_scheduler=None,
    clip_grad_norm=None,
    skip_grad_norm=None,
    sample_epochs=None,
    sample_fn=None,
    log_dir=None,
    save_checkpoint_epochs=1,
    n_gpus=0,
    device_id=None,
):
    """Initializes a new Trainer instance.

    Args:
        model: Model to train and evaluate.
        loss_fn: A `fn(inputs, targets, predictions)->output`. The output can
            either be a single loss Tensor or a metrics dictionary containing
            multiple Tensors. The dictionary must contain a `loss` key which
            will be used as the primary loss for backprop.
        optimizer: Optimizer to use when training.
        train_loader: DataLoader for the training set.
        eval_loader: DataLoader for the evaluation set.
        lr_scheduler: A torch.optim.lr_scheduler whose step() method is called
            after every batch.
        clip_grad_norm: L2 norm to scale gradients to if their norm is greater.
        skip_grad_norm: Maximum L2 norm above which gradients are discarded.
        sample_epochs: Number of epochs to wait between generating new image
            samples and logging them to TensorBoard. If not `None`,
            `sample_fn` must be provided.
        sample_fn: A `fn(model)->Tensor` which returns an NCHW Tensor of
            images to log to TensorBoard.
        log_dir: The directory where checkpoints and TensorBoard metrics are
            logged. If `None` a temporary directory is created (note that this
            directory is not cleaned up automatically).
        save_checkpoint_epochs: Number of epochs to wait between checkpoints.
            Note that this does not affect TensorBoard logging frequency.
        n_gpus: The number of GPUs to use for training and evaluation. If 0,
            the CPUs are used instead.
        device_id: When running on multiple GPUs, the id of the GPU device
            this Trainer instance is running on.
    """
    self.loss_fn = loss_fn
    self.train_loader = train_loader
    self.eval_loader = eval_loader
    self.clip_grad_norm = clip_grad_norm
    self.skip_grad_norm = skip_grad_norm
    self.log_dir = log_dir or tempfile.mkdtemp()
    self.save_checkpoint_epochs = save_checkpoint_epochs
    self.sample_epochs = sample_epochs
    self.sample_fn = sample_fn
    if self.sample_epochs:
        msg = "sample_fn cannot be None if sample_epochs is not None"
        assert self.sample_fn, msg

    self.device = "cuda" if n_gpus > 0 else "cpu"
    self.device_id = 0 if device_id is None and n_gpus == 1 else device_id
    model = model.to(self.device)
    if n_gpus > 1:
        assert device_id is not None, "'device_id' must be provided if n_gpus > 1."
        model = parallel.DistributedDataParallel(
            model, device_ids=[self.device_id], output_device=self.device_id)

    # Trainer state saved during checkpointing.
    self.model = model
    self.optimizer = optimizer
    self.lr_scheduler = lr_scheduler
    self._step = 0
    self._epoch = 0
    self._examples_processed = 0
    self._time_taken = 0
    self._summary_writer = tensorboard.SummaryWriter(self.log_dir, max_queue=100)
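# Hypothetical CPU-only usage of the Trainer above. The tiny model, random
# tensors, and cross-entropy loss are placeholders for illustration only.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data

xs, ys = torch.randn(64, 784), torch.randint(0, 10, (64,))
train_loader = data.DataLoader(data.TensorDataset(xs, ys), batch_size=8)
eval_loader = data.DataLoader(data.TensorDataset(xs, ys), batch_size=8)

model = nn.Linear(784, 10)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def loss_fn(inputs, targets, predictions):
    # Single-Tensor variant of the loss contract described in the docstring.
    return nn.functional.cross_entropy(predictions, targets)

trainer = Trainer(model=model, loss_fn=loss_fn, optimizer=optimizer,
                  train_loader=train_loader, eval_loader=eval_loader,
                  n_gpus=0)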
import torch
import torch.nn as nn
import torch.nn.parallel as par
import torch.optim as optim

# set up model and optimizer; note that DistributedDataParallel requires an
# already-initialized process group (see the setup sketch below)
net = nn.Linear(10, 10)
net = par.DistributedDataParallel(net)
opt = optim.SGD(net.parameters(), lr=0.01)

# run forward pass
inp = torch.randn(20, 10)
exp = torch.randn(20, 10)
out = net(inp)

# run backward pass
nn.MSELoss()(out, exp).backward()

# update parameters
opt.step()
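# The minimal example above constructs DistributedDataParallel without showing
# the required process-group initialization. A single-process setup that would
# make it runnable might look like this, executed before wrapping the model;
# the address and port values are illustrative.
import os
import torch.distributed as dist

os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group(backend='gloo', rank=0, world_size=1)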