def test_horovod_broadcast_grad(self):
    """Test the correctness of the broadcast gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # Only tensors of floating point dtype can require gradients.
    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)
        broadcasted_tensor = hvd.broadcast(tensor, root_rank)
        # Seed the backward pass with a gradient of the same dtype/device
        # as the output tensor.
        broadcasted_tensor.backward(torch.ones([17] * dim).type(dtype))
        grad_out = tensor.grad.data.cpu().numpy()

        c = size if rank == root_rank else 0
        expected = np.ones([17] * dim) * c
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def test_horovod_allreduce_grad(self):
    """Test the correctness of the allreduce gradient."""
    hvd.init()
    size = hvd.size()

    # Only tensors of floating point dtype can require gradients.
    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)
        summed = hvd.allreduce(tensor, average=False)

        summed.backward(torch.ones([17] * dim).type(dtype))
        grad_out = tensor.grad.data.cpu().numpy()

        expected = np.ones([17] * dim) * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
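# A minimal numpy sketch (not part of the test suite) of why the expected
# gradient above is `size`: hvd.allreduce(x, average=False) computes
# y = sum over ranks of x_r, so dy/dx_r is the identity on every rank, and
# Horovod implements the backward pass as another sum-allreduce of the
# upstream gradient. With an upstream gradient of ones on every rank:
import numpy as np

world_size = 4                                      # assumed world size
upstream_grads = [np.ones(3) for _ in range(world_size)]
grad_on_each_rank = np.sum(upstream_grads, axis=0)  # backward allreduce
assert (grad_on_each_rank == world_size).all()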
def test_horovod_allgather(self): """Test that the allgather correctly gathers 1D, 2D, 3D tensors.""" hvd.init() rank = hvd.rank() size = hvd.size() dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor, torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor] if torch.cuda.is_available(): dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor, torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor, torch.cuda.DoubleTensor] dims = [1, 2, 3] for dtype, dim in itertools.product(dtypes, dims): tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank) tensor = tensor.type(dtype) gathered = hvd.allgather(tensor) assert list(gathered.shape) == [17 * size] + [17] * (dim - 1) for i in range(size): rank_tensor = gathered[i * 17:(i + 1) * 17] assert list(rank_tensor.shape) == [17] * dim, \ 'hvd.allgather produces incorrect gathered shape' assert rank_tensor.data.min() == i, 'hvd.allgather produces incorrect gathered tensor' assert rank_tensor.data.max() == i, 'hvd.allgather produces incorrect gathered tensor'
def test_horovod_allreduce_cpu_gpu_error(self): """Test that the allreduce raises an error if different ranks try to perform reduction on CPU and GPU.""" # Only do this test if there are GPUs available. if not torch.cuda.is_available(): return hvd.init() rank = hvd.rank() size = hvd.size() # This test does not apply if there is only one worker. if size == 1: return # Same rank, different dimension dims = [17] * 3 if rank % 2 == 0: tensor = torch.cuda.FloatTensor(*dims) else: tensor = torch.FloatTensor(*dims) try: hvd.allreduce(tensor) assert False, 'hvd.allreduce did not throw error' except torch.FatalError: pass
def test_horovod_allreduce_inplace(self):
    """Test that the in-place allreduce correctly sums 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    dtypes = [torch.IntTensor, torch.LongTensor,
              torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                   torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        tensor = tensor.type(dtype)
        multiplied = tensor * size
        hvd.allreduce_(tensor, average=False)
        max_difference = tensor.sub(multiplied).max()

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                  torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break

        assert max_difference <= threshold, 'hvd.allreduce_ produces incorrect results'
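# Hedged note on the thresholds above: integer allreduce is exact, so any
# nonzero difference is a genuine failure (threshold 0), while float32
# summation reorders additions across ranks and accumulates rounding error
# as the worker count grows, hence the size-dependent tolerances. A tiny
# single-process illustration of order-dependent float32 summation:
import numpy as np
vals = np.random.RandomState(0).rand(10000).astype(np.float32)
print(abs(float(vals.sum()) - float(np.sort(vals).sum())))  # typically nonzero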
def test_horovod_allreduce_error(self): """Test that the allreduce raises an error if different ranks try to send tensors of different rank or dimension.""" hvd.init() rank = hvd.rank() size = hvd.size() # This test does not apply if there is only one worker. if size == 1: return # Same rank, different dimension torch.manual_seed(1234) dims = [17 + rank] * 3 tensor = torch.FloatTensor(*dims).random_(-100, 100) try: hvd.allreduce(tensor) assert False, 'hvd.allreduce did not throw error' except torch.FatalError: pass # Same number of elements, different rank torch.manual_seed(1234) if rank == 0: dims = [17, 23 * 57] else: dims = [17, 23, 57] tensor = torch.FloatTensor(*dims).random_(-100, 100) try: hvd.allreduce(tensor) assert False, 'hvd.allreduce did not throw error' except torch.FatalError: pass
def test_horovod_broadcast_inplace(self): """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors.""" hvd.init() rank = hvd.rank() size = hvd.size() # This test does not apply if there is only one worker. if size == 1: return dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor, torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor] if torch.cuda.is_available(): dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor, torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor, torch.cuda.DoubleTensor] dims = [1, 2, 3] root_ranks = list(range(size)) for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks): tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank) root_tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(root_rank) tensor = tensor.type(dtype) root_tensor = root_tensor.type(dtype) broadcasted_tensor = hvd.broadcast_(tensor, root_rank) assert (tensor == broadcasted_tensor).min() == 1, \ 'hvd.broadcast does not modify source tensor' assert (broadcasted_tensor == root_tensor).min() == 1, \ 'hvd.broadcast produces incorrect broadcasted tensor'
def backward(ctx, grad_output): grad_reduced = allreduce(grad_output, average=False) dim_t = torch.IntTensor([ctx.dim]) dim = allgather(dim_t).view(size()) r = rank() offset = torch.sum(dim.narrow(0, 0, r)).data[0] if r != 0 else 0 return grad_reduced.narrow(0, offset, ctx.dim), None
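# Hedged numpy sketch of the allgather backward above: the upstream gradient
# is first sum-allreduced, then each rank keeps the slice of the result that
# corresponds to its own contribution to the gathered tensor. Offsets are
# recovered from the allgathered per-rank first-dim sizes, assumed here to
# be [3, 2, 4]:
import numpy as np

dims = [3, 2, 4]                        # per-rank first-dim sizes (assumed)
grad_output = np.arange(sum(dims))      # stand-in for the allreduced grad
offsets = np.cumsum([0] + dims[:-1])
for r, d in enumerate(dims):
    grad_input_r = grad_output[offsets[r]:offsets[r] + d]
    assert grad_input_r.shape == (d,)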
def test_horovod_broadcast_rank_error(self): """Test that the broadcast returns an error if different ranks specify different root rank.""" hvd.init() rank = hvd.rank() size = hvd.size() # This test does not apply if there is only one worker. if size == 1: return tensor = torch.FloatTensor(*([17] * 3)).fill_(1) try: hvd.broadcast(tensor, rank) assert False, 'hvd.broadcast did not throw error' except torch.FatalError: pass
def test_horovod_allreduce_async_fused(self): """Test that the allreduce correctly sums 1D, 2D, 3D tensors with Tensor Fusion.""" hvd.init() size = hvd.size() dtypes = [torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor] if torch.cuda.is_available(): dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor, torch.cuda.DoubleTensor] dims = [1, 2, 3] tests = [] is_hvd_poll_false_once = False for dtype, dim in itertools.product(dtypes, dims): torch.manual_seed(1234) tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100) tensor = tensor.type(dtype) handle = hvd.allreduce_async(tensor, average=False) if not hvd.poll(handle): is_hvd_poll_false_once = True multiplied = tensor * size tests.append((dtype, multiplied, handle)) # Make sure it's an asynchronous operation. assert is_hvd_poll_false_once, 'hvd.poll() always returns True, not an async op?' for dtype, multiplied, handle in tests: summed = hvd.synchronize(handle) max_difference = summed.sub(multiplied).max() # Threshold for floating point equality depends on number of # ranks, since we're comparing against precise multiplication. if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor, torch.cuda.IntTensor, torch.cuda.LongTensor]: threshold = 0 elif size < 10: threshold = 1e-4 elif size < 15: threshold = 5e-4 else: break assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def test_horovod_allgather_grad(self):
    """Test the correctness of the allgather gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # Only tensors of floating point dtype can require gradients.
    dtypes = [torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
        tensor_sizes = tensor_sizes[:size]

        tensor = torch.FloatTensor(
            *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)

        grad_list = []
        # Note: the loop variable must not shadow `size` (the world size),
        # which is used to compute the expected gradient below.
        for r, tensor_size in enumerate(tensor_sizes):
            grad_list.append(torch.ones([tensor_size] + [17] * (dim - 1)) * r)
        grad_ys = torch.cat(grad_list, dim=0).type(dtype)

        gathered = hvd.allgather(tensor)
        gathered.backward(grad_ys)
        grad_out = tensor.grad.data.cpu().numpy()

        expected = np.ones(
            [tensor_sizes[rank]] + [17] * (dim - 1)
        ) * rank * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def test_horovod_broadcast_error(self): """Test that the broadcast returns an error if any dimension besides the first is different among the tensors being broadcasted.""" hvd.init() rank = hvd.rank() size = hvd.size() # This test does not apply if there is only one worker. if size == 1: return tensor_size = [17] * 3 tensor_size[1] = 10 * (rank + 1) tensor = torch.FloatTensor(*tensor_size).fill_(1).mul_(rank) try: hvd.broadcast(tensor, 0) assert False, 'hvd.broadcast did not throw error' except torch.FatalError: pass
def test_horovod_broadcast_type_error(self): """Test that the broadcast returns an error if the types being broadcasted differ among the processes""" hvd.init() rank = hvd.rank() size = hvd.size() # This test does not apply if there is only one worker. if size == 1: return tensor_size = [17] * 3 if rank % 2 == 0: tensor = torch.IntTensor(*tensor_size) else: tensor = torch.FloatTensor(*tensor_size) try: hvd.broadcast(tensor, 0) assert False, 'hvd.broadcast did not throw error' except torch.FatalError: pass
def test_horovod_allreduce_type_error(self): """Test that the allreduce raises an error if different ranks try to send tensors of different type.""" hvd.init() rank = hvd.rank() size = hvd.size() # This test does not apply if there is only one worker. if size == 1: return # Same rank, different dimension dims = [17] * 3 if rank % 2 == 0: tensor = torch.IntTensor(*dims) else: tensor = torch.FloatTensor(*dims) try: hvd.allreduce(tensor) assert False, 'hvd.allreduce did not throw error' except torch.FatalError: pass
def test_horovod_allgather_variable_size(self): """Test that the allgather correctly gathers 1D, 2D, 3D tensors, even if those tensors have different sizes along the first dim.""" hvd.init() rank = hvd.rank() size = hvd.size() dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor, torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor] if torch.cuda.is_available(): dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor, torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor, torch.cuda.DoubleTensor] dims = [1, 2, 3] for dtype, dim in itertools.product(dtypes, dims): # Support tests up to MPI Size of 35 if size > 35: break tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5 tensor_sizes = tensor_sizes[:size] tensor = torch.FloatTensor( *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank) tensor = tensor.type(dtype) gathered = hvd.allgather(tensor) expected_size = sum(tensor_sizes) assert list(gathered.shape) == [expected_size] + [17] * (dim - 1) for i in range(size): rank_size = [tensor_sizes[i]] + [17] * (dim - 1) rank_tensor = gathered[sum( tensor_sizes[:i]):sum(tensor_sizes[:i + 1])] assert list(rank_tensor.shape) == rank_size assert rank_tensor.data.min() == i assert rank_tensor.data.max() == i
def test_horovod_allreduce_multi_gpu(self): """Test that the allreduce works on multiple GPUs.""" # Only do this test if there are GPUs available. if not torch.cuda.is_available(): return hvd.init() local_rank = hvd.local_rank() size = hvd.size() iter = 0 dtypes = [torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor, torch.cuda.DoubleTensor] dims = [1, 2, 3] for dtype, dim in itertools.product(dtypes, dims): iter += 1 torch.manual_seed(1234) tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100) device = local_rank * 2 + (iter + local_rank) % 2 tensor = tensor.cuda(device).type(dtype) multiplied = tensor * size hvd.allreduce_(tensor, average=False) max_difference = tensor.sub(multiplied).max() # Threshold for floating point equality depends on number of # ranks, since we're comparing against precise multiplication. if size <= 3 or dtype in [torch.cuda.IntTensor, torch.cuda.LongTensor]: threshold = 0 elif size < 10: threshold = 1e-4 elif size < 15: threshold = 5e-4 else: break assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def synchronize(self):
    if hvd.size() > 1:
        for p in self._handles:
            handle = self._handles[p]
            synchronize(handle)
            p_size = np.prod(p.size())
            if self._use_allgather and p_size > self._plan1:
                torch.cuda.synchronize()
                begin_time_sync = time.time()
                # decompress
                name = self._parameter_names.get(p)
                g_size = p.grad.data.size()
                p_flatten = p.grad.data.view(-1)
                p_flatten.zero_()
                torch.cuda.synchronize()
                begin_unpack_time = time.time()
                if self._use_gpu:
                    count_nnz = 0
                    if p_size > self._plan3:
                        offset = 0
                        for node_idx in range(hvd.size()):
                            msg_size = self._compressed_msg[name][
                                offset].type('torch.cuda.LongTensor')
                            offset += 1
                            p_flatten[self._compressed_msg[name][
                                offset:offset + msg_size].type('torch.cuda.LongTensor')] += \
                                self._compressed_msg[name][
                                    offset + msg_size:offset + 2 * msg_size]
                            offset += msg_size * 2
                            count_nnz += msg_size
                    else:
                        msg_size = self._compressed_msg_size[name]
                        for node_idx in range(hvd.size()):
                            p_flatten[self._compressed_msg[name][
                                node_idx * msg_size * 2:
                                node_idx * msg_size * 2 + msg_size].type('torch.cuda.LongTensor')] += \
                                self._compressed_msg[name][
                                    node_idx * msg_size * 2 + msg_size:
                                    node_idx * msg_size * 2 + 2 * msg_size]
                # if hvd.rank() == 0:
                #     print("sparsity ", name, check_sparsity(p_flatten))
                p.grad.data = p_flatten.view(g_size)
                torch.cuda.synchronize()
                self.unpack_time += time.time() - begin_unpack_time
                torch.cuda.synchronize()
                self.pruning_time += time.time() - begin_time_sync
                if self._debug:
                    diff = torch.sum(self._v_ref[name] - p.grad.data)
                    if torch.abs(diff) > 1e-3:
                        print("error diff is, ", diff, name, p.size())
            else:
                pass
    self._handles.clear()
def test_resnet(affine, track_running_stats): torch.manual_seed(0) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False if torch.cuda.is_available(): device = torch.device('cuda:%d' % hvd.rank()) else: device = torch.device('cpu') if hvd.rank() == 0: print('affine:', affine, 'track_running_stats', track_running_stats) bn = functools.partial(nn.BatchNorm2d, track_running_stats=track_running_stats, affine=affine) sync_bn = functools.partial(SynchronizedBatchNorm2d, track_running_stats=track_running_stats, affine=affine) # prepare model model = models.resnet18(norm_layer=bn).to(device) sync_model = models.resnet18(norm_layer=sync_bn).to(device) sync_model.load_state_dict(model.state_dict()) # print(sync_model) # prepare inputs num_samples = 8 num_steps = 10 inputs = torch.rand(num_steps, num_samples, 3, 32, 32).float().to(device) start_idx = hvd.rank() * int(num_samples / hvd.size()) end_idx = (hvd.rank() + 1) * int(num_samples / hvd.size()) # test inference if hvd.rank() == 0: print('[INFERENCE PHASE-1]') model.eval() with torch.no_grad(): for i in range(num_steps): t1 = time.time() outputs = model(inputs[i]) t2 = (time.time() - t1) * 1000 if hvd.rank() == 0: view('model.outputs.%d-%.4f' % (i, t2), outputs) sync_model.eval() with torch.no_grad(): for i in range(num_steps): t1 = time.time() outputs = sync_model(inputs[i]) t2 = (time.time() - t1) * 1000 if hvd.rank() == 0: view('sync_model.outputs.%d-%.4f' % (i, t2), outputs) # test training if hvd.rank() == 0: print('[TRAINING PHASE]') # using pytorch-official version model.train() optimizer = torch.optim.SGD(model.parameters(), lr=0.1) for i in range(num_steps): t1 = time.time() outputs = model(inputs[i]) t2 = (time.time() - t1) * 1000 loss = outputs.mean() optimizer.zero_grad() loss.backward() if hvd.rank() == 0: view('model.outputs.%d-%.4f' % (i, t2), outputs) optimizer.step() # using sync-version sync_model.train() optimizer = torch.optim.SGD(sync_model.parameters(), lr=0.1) optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=sync_model.named_parameters()) hvd.broadcast_parameters(sync_model.state_dict(), root_rank=0) for i in range(num_steps): t1 = time.time() outputs = sync_model(inputs[i, start_idx:end_idx]) t2 = (time.time() - t1) * 1000 loss = outputs.mean() optimizer.zero_grad() loss.backward() outputs = hvd.allgather(outputs) if hvd.rank() == 0: view('sync_model.outputs.%d-%.4f' % (i, t2), outputs) optimizer.step() # test inference if hvd.rank() == 0: print('[INFERENCE PHASE-2]') model.eval() with torch.no_grad(): for i in range(num_steps): t1 = time.time() outputs = model(inputs[i]) t2 = (time.time() - t1) * 1000 if hvd.rank() == 0: view('model.outputs.%d-%.4f' % (i, t2), outputs) sync_model.eval() with torch.no_grad(): for i in range(num_steps): t1 = time.time() outputs = sync_model(inputs[i]) t2 = (time.time() - t1) * 1000 if hvd.rank() == 0: view('sync_model.outputs.%d-%.4f' % (i, t2), outputs) if hvd.rank() == 0: # for key, value in sync_model.state_dict().items(): # print(key, value.shape) print('\n')
models.squeezenet: models.squeezenet.__all__[1:], models.vgg: models.vgg.__all__[1:], models.mobilenet: models.mobilenet.__all__[1:], models.shufflenetv2: models.shufflenetv2.__all__[1:] } precisions = ["float", "half"] for precision in precisions: for model_type in MODEL_LIST.keys(): for model_name in MODEL_LIST[model_type]: # Set up standard model. model = getattr(model_type, model_name)() model = getattr(model, precision)() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 if args.cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer(
np.random.seed(seed) torch.backends.cudnn.deterministic = True # Distributed: set up horovod over multiple gpu's if distributed: import horovod.torch as hvd # initialize horovod hvd.init() # pin gpu to "local rank" (see Horovod documentation) torch.cuda.set_device(hvd.local_rank()) print(f"My local rank is {hvd.local_rank()}") # distribute mini-batches over the different gpu's batch_size //= hvd.size() # string-tag for logging tag = f'nz{nz}' # define the "root process": only one of the gpu's has to log relevant values # set only one gpu as root process root_process = True if distributed and not hvd.rank() == 0: root_process = False # set GPU/CPU options use_cuda = torch.cuda.is_available() cudastring = "cuda" if distributed else f"cuda:{gpu}" device = torch.device(cudastring if use_cuda else "cpu")
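# Hedged arithmetic check for the batch-size split above: dividing by
# hvd.size() keeps the *global* batch size constant as workers are added,
# so each GPU processes a proportionally smaller mini-batch per step.
# For example, a global batch of 256 on 8 GPUs leaves 32 samples per GPU:
assert 256 // 8 == 32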
def horovod_train(self, model): # call setup after the ddp process has connected if not self.testing: self.setup('fit') model.setup('fit') if torch.cuda.is_available() and self.on_gpu: # Horovod: pin GPU to local rank assert self.root_gpu == hvd.local_rank() torch.cuda.set_device(self.root_gpu) model.cuda(self.root_gpu) # avoid duplicating progress bar if hvd.rank() != 0 and self.progress_bar_callback is not None: self.progress_bar_callback.disable() # CHOOSE OPTIMIZER # allow for lr schedulers as well self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers( model) # Horovod: scale the learning rate by the number of workers to account for # increased total batch size for optimizer in self.optimizers: for param_group in optimizer.param_groups: param_group['lr'] *= hvd.size() # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR for scheduler in self.lr_schedulers: scheduler = scheduler['scheduler'] if isinstance(scheduler, _LRScheduler): scheduler.base_lrs = [ lr * hvd.size() for lr in scheduler.base_lrs ] if self.use_amp: model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level) self.optimizers = optimizers self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers) # Horovod: broadcast parameters & optimizer state to ensure consistent initialization hvd.broadcast_parameters(model.state_dict(), root_rank=0) for optimizer in self.optimizers: hvd.broadcast_optimizer_state(optimizer, root_rank=0) def filter_named_parameters(model, optimizer): opt_params = set([ p for group in optimizer.param_groups for p in group.get('params', []) ]) return [(name, p) for name, p in model.named_parameters() if p in opt_params] # Horovod: wrap optimizers to perform gradient aggregation via allreduce self.optimizers = [ hvd.DistributedOptimizer(optimizer, named_parameters=filter_named_parameters( model, optimizer)) for optimizer in self.optimizers ] # Update logger rank info from Horovod to avoid race conditions from different ranks # creating directories / writing files in the same locations. self.global_rank = hvd.rank() rank_zero_only.rank = self.global_rank with ExitStack() as stack: for optimizer in self.optimizers: # Synchronization will be performed explicitly following backward() stack.enter_context(optimizer.skip_synchronize()) result = self.run_pretrain_routine(model) # Make sure all workers have finished training before returning to the user hvd.join() return result
if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) test_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs) class Net(nn.Module):
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if hvd.rank() != 0: LOGGER.disabled = True hps_file = f'{opts.output_dir}/log/hps.json' model_opts = Struct(load_json(hps_file)) model_config = f'{opts.output_dir}/log/model_config.json' # load DBs and image dirs video_ids = get_video_ids(opts.query_txt_db) if opts.task != "didemo_video_only": video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db, model_opts.vfeat_interval, model_opts) else: txt_meta = load_json(os.path.join(opts.query_txt_db, "meta.json")) video_db = load_video_only_dataset(opts.vfeat_db, txt_meta, model_opts.vfeat_interval, model_opts) assert opts.split in opts.query_txt_db q_txt_db = QueryTokLmdb(opts.query_txt_db, -1) if opts.task != "didemo_video_only": inf_dataset = VcmrFullEvalDataset else: inf_dataset = VcmrVideoOnlyFullEvalDataset eval_dataset = inf_dataset(video_ids, video_db, q_txt_db, distributed=model_opts.distributed_eval) # Prepare model if exists(opts.checkpoint): ckpt_file = opts.checkpoint else: ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt' checkpoint = torch.load(ckpt_file) img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings" + ".position_embeddings.weight") assert img_pos_embed_weight_key in checkpoint max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key]) model = HeroForVcmr.from_pretrained( model_config, state_dict=checkpoint, vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len, lw_neg_ctx=model_opts.lw_neg_ctx, lw_neg_q=model_opts.lw_neg_q, lw_st_ed=0, ranking_loss_type=model_opts.ranking_loss_type, use_hard_negative=False, hard_pool_size=model_opts.hard_pool_size, margin=model_opts.margin, use_all_neg=model_opts.use_all_neg, drop_svmr_prob=model_opts.drop_svmr_prob) model.to(device) if opts.fp16: model = amp.initialize(model, enabled=opts.fp16, opt_level='O2') eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=vcmr_full_eval_collate) eval_dataloader = PrefetchLoader(eval_dataloader) _, results = validate_full_vcmr(model, eval_dataloader, opts.split, opts, model_opts) result_dir = f'{opts.output_dir}/results_{opts.split}' if not exists(result_dir) and rank == 0: os.makedirs(result_dir) all_results = list(concat(all_gather_list(results))) if hvd.rank() == 0: save_json(all_results, f'{result_dir}/results_{opts.checkpoint}_all.json') LOGGER.info('All results written......')
# This flag allows you to enable the inbuilt cudnn
# auto-tuner to find the best algorithm to use for your hardware.
cudnn.benchmark = True

if hvd.rank() == 0:
    # Announce
    print(args)
    # Init tensorboard
    rmtree(args.tensorboard_path, ignore_errors=True)
    writer = SummaryWriter(args.tensorboard_path)

# DataLoader
train_dataset = WFLW('train', path=args.data_dir)
train_sampler = DistributedSampler(dataset=train_dataset,
                                   num_replicas=hvd.size(),
                                   rank=hvd.rank())
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=args.per_batch,
                          sampler=train_sampler)

# Model
# Note: the norm implementation here is known to be problematic.
models = [
    BoundaryHeatmapEstimator(
        args.img_channels,
        args.hourglass_channels,
        args.boundary,
    ).cuda(),
]
models.append(LandmarksRegressor(channels=args.hourglass_channels).cuda())
def main(args): # Create a model, synthetic data, and a guide. pyro.set_rng_seed(args.seed) model = Model(args.size) covariates = torch.randn(args.size) data = model(covariates) guide = AutoNormal(model) if args.horovod: # Initialize Horovod and set PyTorch globals. import horovod.torch as hvd hvd.init() torch.set_num_threads(1) if args.cuda: torch.cuda.set_device(hvd.local_rank()) if args.cuda: torch.set_default_tensor_type("torch.cuda.FloatTensor") device = torch.tensor(0).device if args.horovod: # Initialize parameters and broadcast to all workers. guide(covariates[:1], data[:1]) # Initializes model and guide. hvd.broadcast_parameters(guide.state_dict(), root_rank=0) hvd.broadcast_parameters(model.state_dict(), root_rank=0) # Create an ELBO loss and a Pyro optimizer. elbo = Trace_ELBO() optim = Adam({"lr": args.learning_rate}) if args.horovod: # Wrap the basic optimizer in a distributed optimizer. optim = HorovodOptimizer(optim) # Create a dataloader. dataset = torch.utils.data.TensorDataset(covariates, data) if args.horovod: # Horovod requires a distributed sampler. sampler = torch.utils.data.distributed.DistributedSampler( dataset, hvd.size(), hvd.rank()) else: sampler = torch.utils.data.RandomSampler(dataset) config = {"batch_size": args.batch_size, "sampler": sampler} if args.cuda: config["num_workers"] = 1 config["pin_memory"] = True # Try to use forkserver to spawn workers instead of fork. if (hasattr(mp, "_supports_context") and mp._supports_context and "forkserver" in mp.get_all_start_methods()): config["multiprocessing_context"] = "forkserver" dataloader = torch.utils.data.DataLoader(dataset, **config) # Run stochastic variational inference. svi = SVI(model, guide, optim, elbo) for epoch in range(args.num_epochs): if args.horovod: # Set rng seeds on distributed samplers. This is required. sampler.set_epoch(epoch) for step, (covariates_batch, data_batch) in enumerate(dataloader): loss = svi.step(covariates_batch.to(device), data_batch.to(device)) if args.horovod: # Optionally average loss metric across workers. # You can do this with arbitrary torch.Tensors. loss = torch.tensor(loss) loss = hvd.allreduce(loss, "loss") loss = loss.item() # Print only on the rank=0 worker. if step % 100 == 0 and hvd.rank() == 0: print("epoch {} step {} loss = {:0.4g}".format(epoch, step, loss)) else: if step % 100 == 0: print("epoch {} step {} loss = {:0.4g}".format(epoch, step, loss)) if args.horovod: # After we're done with the distributed parts of the program, # we can shutdown all but the rank=0 worker. hvd.shutdown() if hvd.rank() != 0: return if args.outfile: print("saving to {}".format(args.outfile)) torch.save({"model": model, "guide": guide}, args.outfile)
def split_by_rank(data):
    # Each worker keeps a contiguous, equally-sized shard of the data;
    # any remainder at the tail is dropped.
    points_per_rank = len(data) // hvd.size()
    first = hvd.rank() * points_per_rank
    last = first + points_per_rank
    return data[first:last]
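# Hedged usage sketch (assuming hvd.init() has been called): with 4 workers
# and 10 data points, each rank keeps a contiguous 2-point shard and the
# trailing remainder is dropped by the integer division:
#
#   data = list(range(10))
#   split_by_rank(data)   # rank 0 -> [0, 1], rank 1 -> [2, 3], ...
#                         # points 8 and 9 are dropped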
def get_world_size(self) -> int: return hvd.size()
def __init__(self,
             model,
             lr=0.1,
             factor_decay=0.95,
             damping=0.001,
             kl_clip=0.001,
             fac_update_freq=10,
             kfac_update_freq=100,
             batch_averaged=True,
             diag_blocks=1,
             diag_warmup=0,
             distribute_layer_factors=None,
             sparse=False,
             sparse_ratio=0.01,
             exclude_parts=''):

    if not 0.0 <= lr:
        raise ValueError("Invalid learning rate: {}".format(lr))
    if not 0.0 < factor_decay <= 1:
        raise ValueError("Invalid factor decay rate: {}".format(factor_decay))
    if not 0.0 < damping:
        raise ValueError("Invalid damping: {}".format(damping))
    if not 0.0 < kl_clip:
        raise ValueError("Invalid clipping value: {}".format(kl_clip))
    if not 0 < fac_update_freq:
        raise ValueError("Invalid factor update frequency: {}".format(fac_update_freq))
    if not 0 < kfac_update_freq:
        raise ValueError("Invalid K-FAC update frequency: {}".format(kfac_update_freq))
    if not 0 == kfac_update_freq % fac_update_freq:
        print("WARNING: it is suggested that kfac_update_freq be a multiple of fac_update_freq")
    if not 0 < diag_blocks:
        raise ValueError("Invalid diagonal block approx count: {}".format(diag_blocks))
    if not 1 == diag_blocks:
        print("WARNING: diag_blocks > 1 is experimental and may give poor results.")

    # For compatibility with `KFACParamScheduler`
    defaults = dict(lr=lr,
                    damping=damping,
                    fac_update_freq=fac_update_freq,
                    kfac_update_freq=kfac_update_freq)

    super(KFAC, self).__init__(model.parameters(), defaults)

    self.computeA = ComputeA()
    self.computeG = ComputeG()
    self.known_modules = {'Linear', 'Conv2d'}
    self.modules = []
    self.module_names = []
    self.name_module_map = {}
    self.module_name_map = {}
    self._register_modules(model)

    self.fw_merged_comm = MergedCommAllReduce(self.module_names,
                                              prefix='forward',
                                              merge=True,
                                              single_layer=False)
    self.bw_merged_comm = MergedCommAllReduce(self.module_names,
                                              prefix='backward',
                                              merge=False,
                                              single_layer=False)
    self.inverseA_merged_comm = MergedCommBcast(self.module_names, prefix='inverseA')
    self.inverseG_merged_comm = MergedCommBcast(self.module_names, prefix='inverseG')
    self.multi_comm = MultiTensorComm()
    self.steps = 0

    # Dictionaries keyed by `module` to store the factors and
    # eigendecompositions
    self.m_a, self.m_g = {}, {}
    self.m_A, self.m_G = {}, {}
    self.m_QA, self.m_QG = {}, {}
    self.m_dA_ranks = {}
    self.m_dG_ranks = {}
    self.module_ranks = None

    self.sparse = sparse
    self.sparse_ratio = sparse_ratio
    self.residualsA, self.residualsG = {}, {}

    self.factor_decay = factor_decay
    self.kl_clip = kl_clip
    self.fac_update_freq = fac_update_freq
    self.kfac_update_freq = kfac_update_freq
    self.diag_blocks = diag_blocks
    self.diag_warmup = diag_warmup
    self.batch_averaged = batch_averaged

    # Compute ideal value for `distribute_layer_factors` based on
    # registered module count
    if distribute_layer_factors is None:
        self.distribute_layer_factors = hvd.size() > len(self.modules)
    else:
        self.distribute_layer_factors = distribute_layer_factors

    self.have_cleared_Q = self.diag_warmup == 0
    self.eps = 1e-10  # for numerical stability
    self.rank_iter = cycle(list(range(hvd.size())))
parser.add_argument('--max_grad_norm', default=1.0, type=float, help='')
args = parser.parse_args()
if args.device == 'cuda':
    args.device = 'cuda' if cuda.is_available() else 'cpu'

torch.manual_seed(args.seed)
hvd.init()
if cuda.is_available():
    torch.cuda.set_device(hvd.local_rank())

tokenizer = BertTokenizer.from_pretrained(args.tokenizer_model)
tokenizer.add_tokens("[GENERATE_ARTICLE]")
args.tokenizer = tokenizer
args.rank_size = hvd.size()
print(args)

train_df = pd.read_csv(args.train_df_path)
train_loader, train_sampler = get_dataLoader(train_df)

model = ConditionalGenerationModel(**vars(args)).to(args.device)
# model = torch.nn.DataParallel(model).to(args.device)  # the data-parallel run hangs; unclear why

compression = hvd.Compression.fp16 if args.compression_fp16 else hvd.Compression.none
optim = torch.optim.SGD(model.parameters(),
                        lr=args.base_lr * hvd.local_size() * args.batches_per_allreduce,
                        momentum=args.momentum,
                        weight_decay=args.wd)
optim = hvd.DistributedOptimizer(
and 'forkserver' in mp.get_all_start_methods()): kwargs['multiprocessing_context'] = 'forkserver' train_dataset = \ datasets.ImageFolder(args.train_dir, transform=transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ])) # Horovod: use DistributedSampler to partition data among workers. Manually specify # `num_replicas=hvd.size()` and `rank=hvd.rank()`. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=allreduce_batch_size, sampler=train_sampler, **kwargs) val_dataset = \ datasets.ImageFolder(args.val_dir, transform=transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ])) val_sampler = torch.utils.data.distributed.DistributedSampler(
def train_fn(data_dir=None, seed=42, use_cuda=False, batch_size=64, use_adasum=False, lr=0.01, momentum=0.5, num_epochs=10, log_interval=10): # Horovod: initialize library. hvd.init() torch.manual_seed(seed) if use_cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {} data_dir = data_dir or "./data" with FileLock(os.path.expanduser("~/.horovod_lock")): train_dataset = \ datasets.MNIST(data_dir, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs) model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not use_adasum else 1 if use_cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=lr * lr_scaler, momentum=momentum) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), op=hvd.Adasum if use_adasum else hvd.Average) for epoch in range(1, num_epochs + 1): model.train() # Horovod: set epoch to sampler for shuffling. train_sampler.set_epoch(epoch) for batch_idx, (data, target) in enumerate(train_loader): if use_cuda: data, target = data.cuda(), target.cuda() optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() optimizer.step() if batch_idx % log_interval == 0: # Horovod: use train_sampler to determine the number of # examples in this worker's partition. print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( epoch, batch_idx * len(data), len(train_sampler), 100. * batch_idx / len(train_loader), loss.item()))
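# Hedged usage note: train_fn expects to run under a Horovod launcher. With
# the standard CLI and a hypothetical wrapper script train.py that calls
# train_fn(), the invocation would look like:
#
#   horovodrun -np 4 python train.py
#
# Each of the 4 processes calls hvd.init(), receives its own rank and
# local_rank, and sees 1/4 of MNIST via the DistributedSampler.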
parser.add_argument( '--exit-mode', default='exception', help='means used to cause a worker to exit [exception | kill]') args = parser.parse_args() hvd.init() batch_size = 32 data = torch.randn(batch_size, 2) target = torch.LongTensor(batch_size).random_() % 2 lr = 0.001 model = torch.nn.Sequential(torch.nn.Linear(2, 2)) optimizer = torch.optim.SGD(model.parameters(), lr=lr * hvd.size()) optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters()) hostname = os.environ.get('HOROVOD_HOSTNAME') start_rank = int(os.environ.get('HOROVOD_RANK', 0)) discovery_schedule = json.loads(args.discovery_schedule) epoch_to_hosts = { epoch: hosts for epoch, hosts in discovery_schedule if epoch is not None } default_hosts = discovery_schedule[-1][1] if discovery_schedule else [] exit_schedule = json.loads(args.exit_schedule) if args.exit_schedule else {}
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

print('cwd:', os.getcwd())
train_dataset = torchvision.datasets.CIFAR10(
    root='~/distributed-training/data',
    train=True,
    download=False,
    transform=transform)

# Horovod: use DistributedSampler to partition data among workers. Manually specify
# `num_replicas=hvd.size()` and `rank=hvd.rank()`.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=allreduce_batch_size,
                                           sampler=train_sampler,
                                           **kwargs)

# Set up standard ResNet-101 model.
model = models.resnet101()

if args.cuda:
    # Move model to GPU.
    model.cuda()

# Horovod: scale learning rate by the number of GPUs.
# Gradient Accumulation: scale learning rate by batches_per_allreduce
def setup(self, model): # call setup after the ddp process has connected self.trainer.call_setup_hook(model) if torch.cuda.is_available() and self.trainer.on_gpu: # Horovod: pin GPU to local rank assert self.trainer.root_gpu == hvd.local_rank() torch.cuda.set_device(self.trainer.root_gpu) model.cuda(self.trainer.root_gpu) # avoid duplicating progress bar if hvd.rank() != 0 and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() # CHOOSE OPTIMIZER # allow for lr schedulers as well optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers( model) self.trainer.optimizers = optimizers self.trainer.lr_schedulers = lr_schedulers self.trainer.optimizer_frequencies = optimizer_frequencies # Horovod: scale the learning rate by the number of workers to account for # increased total batch size for optimizer in self.trainer.optimizers: for param_group in optimizer.param_groups: param_group['lr'] *= hvd.size() # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR for scheduler in self.trainer.lr_schedulers: scheduler = scheduler['scheduler'] if isinstance(scheduler, _LRScheduler): scheduler.base_lrs = [ lr * hvd.size() for lr in scheduler.base_lrs ] if self.trainer.amp_backend: model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level) self.trainer.optimizers = optimizers self.trainer.reinit_scheduler_properties( self.trainer.optimizers, self.trainer.lr_schedulers) # Horovod: broadcast parameters & optimizer state to ensure consistent initialization hvd.broadcast_parameters(model.state_dict(), root_rank=0) for optimizer in self.trainer.optimizers: hvd.broadcast_optimizer_state(optimizer, root_rank=0) def filter_named_parameters(model, optimizer): opt_params = set([ p for group in optimizer.param_groups for p in group.get('params', []) ]) return [(name, p) for name, p in model.named_parameters() if p in opt_params] # Horovod: wrap optimizers to perform gradient aggregation via allreduce self.trainer.optimizers = [ hvd.DistributedOptimizer(optimizer, named_parameters=filter_named_parameters( model, optimizer)) for optimizer in self.trainer.optimizers ] # Update logger rank info from Horovod to avoid race conditions from different ranks # creating directories / writing files in the same locations. self.trainer.global_rank = hvd.rank() rank_zero_only.rank = self.trainer.global_rank self.trainer.model = model
tokenizer = BertTokenizer.from_pretrained(args.pretrained_weights) model = SurveyClassifier.from_pretrained(args.pretrained_weights) model.to(device) hvd.broadcast_parameters(model.state_dict(), root_rank=0) print("Shuffling") shuffle(training_data) ############################################################################## print("Training specialty model") loss_fn = torch.nn.MSELoss() optimizer = torch.optim.Adam( filter(lambda x: x.requires_grad, model.parameters()), lr=args.learning_rate * hvd.size(), ) optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), ) hvd.broadcast_optimizer_state(optimizer, root_rank=0) best_validation_loss = None best_model = None for epoch in range(args.epochs): if epoch == args.unfreeze_bert_epoch: model.unfreeze_layers_starting_with(11) if epoch == args.unfreeze_bert_epoch * 2: model.unfreeze_layers_starting_with(10) for phase in ["train", "validate"]:
def evaluate(args):
    # initialize the Horovod library
    hvd.init()
    # Horovod limits CPU threads to be used per worker
    torch.set_num_threads(1)

    if hvd.local_rank() == 0 and not os.path.exists(args.dir):
        # create 16 random image, mask pairs for evaluation
        print(f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(16):
            im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))

    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    val_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    val_transforms = Compose([
        LoadNiftid(keys=["img", "seg"]),
        AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
        ScaleIntensityd(keys="img"),
        ToTensord(keys=["img", "seg"]),
    ])

    # create an evaluation dataset
    val_ds = Dataset(data=val_files, transform=val_transforms)
    # create an evaluation data sampler
    val_sampler = DistributedSampler(val_ds, shuffle=False,
                                     num_replicas=hvd.size(), rank=hvd.rank())
    # when supported, use "forkserver" to spawn dataloader workers instead of "fork" to prevent
    # issues with Infiniband implementations that are not fork-safe
    multiprocessing_context = None
    if hasattr(mp, "_supports_context") and mp._supports_context and \
            "forkserver" in mp.get_all_start_methods():
        multiprocessing_context = "forkserver"
    # sliding window inference needs to input 1 image in every iteration
    val_loader = DataLoader(
        val_ds,
        batch_size=1,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        sampler=val_sampler,
        multiprocessing_context=multiprocessing_context,
    )
    dice_metric = DiceMetric(include_background=True, reduction="mean")
    post_trans = Compose([Activations(sigmoid=True), AsDiscrete(threshold_values=True)])

    # create the UNet model and move it to this worker's GPU
    device = torch.device(f"cuda:{hvd.local_rank()}")
    torch.cuda.set_device(device)
    model = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    if hvd.rank() == 0:
        # load model parameters for evaluation
        model.load_state_dict(torch.load("final_model.pth"))
    # Horovod broadcasts parameters
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    model.eval()
    with torch.no_grad():
        # define PyTorch Tensor to record metrics result at each GPU
        # the first value is `sum` of all dice metric, the second value is `count` of not_nan items
        metric = torch.zeros(2, dtype=torch.float, device=device)
        for val_data in val_loader:
            val_images, val_labels = val_data["img"].to(device), val_data["seg"].to(device)
            # define sliding window size and batch size for windows inference
            roi_size = (96, 96, 96)
            sw_batch_size = 4
            val_outputs = sliding_window_inference(val_images, roi_size, sw_batch_size, model)
            val_outputs = post_trans(val_outputs)
            value, not_nans = dice_metric(y_pred=val_outputs, y=val_labels)
            value = value.squeeze()
            metric[0] += value * not_nans
            metric[1] += not_nans
        # synchronize all processes and reduce the results
        print(f"metric in rank {hvd.rank()}: sum={metric[0].item()}, count={metric[1].item()}")
        avg_metric = hvd.allreduce(metric, name="mean_dice")
        if hvd.rank() == 0:
            print(f"average metric: sum={avg_metric[0].item()}, count={avg_metric[1].item()}")
            print("evaluation metric:", (avg_metric[0] / avg_metric[1]).item())
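# Hedged sketch of why reducing the [sum, count] pair is correct even though
# hvd.allreduce averages by default: both entries are divided by the same
# world size, so their ratio is unchanged and equals the global mean Dice:
#
#   (mean_r sum_r) / (mean_r count_r) = (sum_r sum_r) / (sum_r count_r)
#
import numpy as np
sums = np.array([3.0, 4.0])      # per-rank dice sums (assumed, 2 ranks)
counts = np.array([5.0, 6.0])    # per-rank not-NaN counts (assumed)
assert np.isclose(sums.mean() / counts.mean(), sums.sum() / counts.sum())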
def step(self, closure=None, epoch=None):
    """Perform one K-FAC step

    Note:
    - this function should always be called before `optimizer.step()`
    - gradients must be averaged across ranks before calling `step()`

    Args:
      closure: for compatibility with the base optimizer class.
          `closure` is ignored by KFAC
      epoch (int, optional): epoch to use for determining when to end
          the `diag_warmup` period. `epoch` is not necessary if not using
          `diag_warmup`
    """

    # Update params, used for compatibility with `KFACParamScheduler`
    group = self.param_groups[0]
    self.lr = group['lr']
    self.damping = group['damping']
    self.fac_update_freq = group['fac_update_freq']
    self.kfac_update_freq = group['kfac_update_freq']

    updates = {}
    handles = []

    if epoch is None:
        if self.diag_warmup > 0:
            print("WARNING: diag_warmup > 0 but epoch was not passed to "
                  "KFAC.step(). Defaulting to no diag_warmup")
        diag_blocks = self.diag_blocks
    else:
        diag_blocks = self.diag_blocks if epoch >= self.diag_warmup else 1

    if hvd.size() > 1 and self.steps % self.fac_update_freq == 0:
        self.fw_merged_comm.synchronize()
        self.bw_merged_comm.synchronize()

        # Compute A and G after aggregation of a and g
        for module in self.modules:
            a = self.m_a[module]
            g = self.m_g[module]
            if hvd.rank() == 0:
                logger.info('a Name: %s, shape %s', module, a.shape)
                logger.info('g Name: %s, shape %s', module, g.shape)
            A = torch.einsum('ki,kj->ij', a, a / a.size(0))
            G = torch.einsum('ki,kj->ij', g, g / g.size(0))
            update_running_avg(A, self.m_A[module], self.factor_decay)
            update_running_avg(G, self.m_G[module], self.factor_decay)

    # if we are switching from no diag approx to approx, we need to clear
    # off-block-diagonal elements
    if not self.have_cleared_Q and \
            epoch == self.diag_warmup and \
            self.steps % self.kfac_update_freq == 0:
        self._clear_eigen()
        self.have_cleared_Q = True

    if self.steps % self.kfac_update_freq == 0:
        # reset rank iter so device get the same layers
        # to compute to take advantage of caching
        self.rank_iter.reset()

        handles = []
        # eigen_ranks = self._generate_eigen_ranks(epoch)
        eigen_ranks = self._generate_eigen_ranks_uniform(epoch)
        # eigen_ranks = self._generate_eigen_ranks_naive(epoch)

        for module in self.modules:
            ranks_a, ranks_g = eigen_ranks[module]
            self.m_dA_ranks[module] = ranks_a[0]
            self.m_dG_ranks[module] = ranks_g[0]
            rank_a = ranks_a[0]
            rank_g = ranks_g[0]

            name = self.module_name_map[module]
            self._update_inverse_A(module, ranks_a)
            self._update_inverse_G(module, ranks_g)
            # Per-factor broadcasts via inverseA/inverseG_merged_comm were
            # replaced by a single fused broadcast of both factors below.
            if hvd.size() > 1 and rank_g >= 0:
                self.multi_comm.bcast_async_(
                    [name], [self.m_QA[module], self.m_QG[module]], rank_g)

    if hvd.size() > 1 and self.steps % self.kfac_update_freq == 0:
        self.multi_comm.synchronize()

    for i, module in enumerate(self.modules):
        grad = self._get_grad(module)
        precon_grad = self._get_preconditioned_grad(module, grad)
        updates[module] = precon_grad

    self._update_scale_grad(updates)

    self.steps += 1
def hook(*ignore): assert p not in self._handles assert not p.grad.requires_grad name = self._parameter_names.get(p) p_size = np.prod(p.size()) torch.cuda.synchronize() begin_time = time.time() if self._use_allgather and p_size > 1024: # fjr compress grad p.grad.data.add_(torch.mul(p.data, self._weight_decay)) p.grad.data.div_(hvd.size()) if self._use_nesterov: self._U[name] = torch.mul( torch.add(self._U[name], p.grad.data), self._momentum) self._V[name] = self._V[name] + self._U[name] + p.grad.data else: self._U[ name] = self._momentum * self._U[name] + p.grad.data self._V[name] = self._V[name] + self._U[name] compressed_val = [] compressed_idx = [] torch.cuda.synchronize() begin_select_time = time.time() if self._flag[name] == 1: self._masks[name], compressed_val, compressed_idx = \ select_topk_truncated_mean(self._V[name], 0.001, self._masks[name]) self._flag[name] = 0 else: self._masks[name], compressed_val, compressed_idx = \ select_lowk_truncated_mean(self._V[name], 0.001, self._masks[name]) self._flag[name] = 1 torch.cuda.synchronize() end_select_time = time.time() self.select_time += end_select_time - begin_select_time p.grad.data = torch.mean(compressed_val) * (1.0 - self._masks[name]) handle = allreduce_async_(p.grad.data, average=False) self._handles[p] = handle self._V[name].mul_(self._masks[name]) self._U[name].mul_(self._masks[name]) torch.cuda.synchronize() begin_comm_time = time.time() #if hvd.size() > 1: # self._compressed_msg_size[name] = len(compressed_idx) # if self._use_gpu: # compressed_msg = torch.cat([compressed_idx.type('torch.cuda.FloatTensor'), compressed_val]) # else: # compressed_msg = torch.cat([compressed_idx.type('torch.FloatTensor'), compressed_val]) # handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name) # self._handles[p] = handle torch.cuda.synchronize() end_comm_time = time.time() self.pack_time += end_comm_time - begin_comm_time else: p.grad.data.add_(torch.mul(p.data, self._weight_decay)) if self._use_nesterov: self._U[name] = torch.mul( torch.add(self._U[name], p.grad.data), self._momentum) #self._V[name] = self._U[name] + p.grad.data p.grad.data = self._U[name] + p.grad.data else: self._U[ name] = self._momentum * self._U[name] + p.grad.data #self._V[name] = self._U[name] p.grad.data = self._U[name] #compressed_msg = torch.randn(100).cuda() #handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name) if hvd.size() > 1: handle = allreduce_async_(p.grad.data, average=True, name=name) self._handles[p] = handle torch.cuda.synchronize() end_time = time.time() self.pruning_time += end_time - begin_time
# set model model = torchnet.Net() # load model to predict if prediction: model.load_state_dict(torch.load(this_path + "/torchmodel.pth")) elif args.net_name is None: # set model model = netdataloader.Net() # load model to predict if prediction: model.load_state_dict(torch.load(this_path + "/torchmodel.pth")) ##### HOROVOD ##### if prediction: test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) else: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) if validation: valid_sampler = torch.utils.data.distributed.DistributedSampler( valid_dataset, num_replicas=hvd.size(), rank=hvd.rank()) kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {} # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent # issues with Infiniband implementations that are not fork-safe if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and mp._supports_context and 'forkserver' in mp.get_all_start_methods()): kwargs['multiprocessing_context'] = 'forkserver' if prediction:
def train():
    os.environ["CUDA_VISIBLE_DEVICES"] = '6, 7'
    args = parse_args()
    # 1. Initialize Horovod.
    hvd.init()
    # 2. Pin each process to its own GPU; local_rank() is the index of this
    #    process on the current node.
    torch.cuda.set_device(hvd.local_rank())
    # torch.cuda.set_device(args.local_rank)
    # dist.init_process_group(
    #     backend='nccl',
    #     init_method='tcp://127.0.0.1:33271',
    #     world_size=2,  # world_size = torch.cuda.device_count(),
    #     rank=args.local_rank
    # )
    setup_logger(respth)

    ## dataset
    n_classes = 19
    n_img_per_gpu = 8
    n_workers = 4
    cropsize = [1024, 1024]
    ds = CityScapes('/dataset/cityscapes/leftImg8bit_trainvaltest',
                    cropsize=cropsize, mode='train')
    sampler = torch.utils.data.distributed.DistributedSampler(
        ds, num_replicas=hvd.size(), rank=hvd.rank())
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    ## model
    ignore_idx = 255
    net = BiSeNet(n_classes=n_classes)
    net.cuda()
    # 5. Broadcast the parameters at initialization so that all GPUs start
    #    from the same weights.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    net.train()
    # net = nn.parallel.DistributedDataParallel(net,
    #                                           device_ids=[args.local_rank, ],
    #                                           output_device=args.local_rank)
    score_thres = 0.7
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    criteria_p = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_16 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)
    criteria_32 = OhemCELoss(thresh=score_thres, n_min=n_min, ignore_lb=ignore_idx)

    ## optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)
    hvd.broadcast_optimizer_state(optim.optim, root_rank=0)
    optim = hvd.DistributedOptimizer(optim.optim,
                                     named_parameters=net.named_parameters())

    ## train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    epoch = 0
    for it in range(max_iter):
        try:
            im, lb = next(diter)
            if not im.size()[0] == n_img_per_gpu:
                raise StopIteration
        except StopIteration:
            epoch += 1
            sampler.set_epoch(epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = criteria_p(out, lb)
        loss2 = criteria_16(out16, lb)
        loss3 = criteria_32(out32, lb)
        loss = lossp + loss2 + loss3
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())
        ## print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1, max_it=max_iter, lr=lr,
                      loss=loss_avg, time=t_intv, eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

    ## dump the final model
    save_pth = osp.join(respth, 'model_final.pth')
    net.cpu()
    state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    # Use the Horovod rank here; torch.distributed was never initialized.
    if hvd.rank() == 0:
        torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=42, metavar='S',
                        help='random seed (default: 42)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                        help='use fp16 compression during allreduce')
    parser.add_argument('--results_path', type=str,
                        help='Path to store results')
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_dataset = \
        datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

    test_dataset = \
        datasets.MNIST('data-%d' % hvd.rank(), train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs)

    model = Net()

    if args.cuda:
        # Move model to GPU.
        model.cuda()

    # Horovod: broadcast parameters.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    global optimizer
    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),
                          momentum=args.momentum)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters(),
                                         compression=compression)

    # for epoch in range(1, args.epochs + 1):
    #     train(epoch, model, optimizer, train_sampler, train_loader, args)
    #     test(model, test_sampler, test_loader, args)

    # The checkpointing below requires scikit-optimize:
    #   from skopt import gp_minimize, load
    #   from skopt.space import Space
    #   from skopt.callbacks import CheckpointSaver
    checkpoint_file = os.path.join(args.results_path, 'skopt_torch_results')
    checkpoint_saver = CheckpointSaver(checkpoint_file, compress=9)
    space = Space([(2, 8)])
    try:
        res = load(checkpoint_file)
        x0 = res.x_iters
        y0 = res.func_vals
    except FileNotFoundError:
        print('No previous save point.')
        # Need to randomly sample the bounds to prime the optimization.
        x0 = space.rvs(1)
        y0 = None

    gp_minimize(
        lambda x: objective(x, model, train_sampler, train_loader, args),
        space,              # the bounds on each dimension of x
        x0=x0,              # already examined values for x
        y0=y0,              # observed values for x0
        acq_func="LCB",     # the acquisition function (optional)
        n_calls=20,         # the number of evaluations of f including at x0
        n_random_starts=0,  # the number of random initialization points
        callback=[checkpoint_saver],
        random_state=777)
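# The `objective` passed to gp_minimize above is not defined in this snippet.
# A hypothetical sketch follows, assuming x[0] is the single integer
# hyperparameter from the (2, 8) dimension (interpreted here as a number of
# training epochs) and that the value to minimize is the final training loss.
# It reads the module-level `optimizer` set via `global optimizer` above.
import torch.nn.functional as F

def objective(x, model, train_sampler, train_loader, args):
    epochs = int(x[0])
    last_loss = 0.0
    for epoch in range(1, epochs + 1):
        train_sampler.set_epoch(epoch)
        model.train()
        for data, target in train_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            # Assumes Net() outputs log-probabilities, as in the MNIST example.
            loss = F.nll_loss(model(data), target)
            loss.backward()
            optimizer.step()
            last_loss = loss.item()
    return last_loss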
def main(args):
    hvd.init()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())

    device = 'GPU' if args.cuda else 'CPU'
    if hvd.rank() == 0:
        log('Using PyTorch version: %s, Device: %s' % (torch.__version__, device))
        log('Horovod version: %s, CUDA: %s, ROCM: %s, NCCL: %s, MPI: %s' %
            (horovod.__version__, hvd.cuda_built(), hvd.rocm_built(),
             hvd.nccl_built(), hvd.mpi_built()))
        log(torch.__config__.show())

    cudnn.benchmark = True

    # Set up standard model.
    log('Initializing %s model...' % args.model)
    model = getattr(models, args.model)()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    if args.fixed_data:
        data, target = generate_data(args)

    def benchmark_step():
        nonlocal data, target
        if not args.fixed_data:
            data, target = generate_data(args)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

    log('Model: %s' % args.model)
    log('Batch size: %d' % args.batch_size)
    log('Number of %ss: %d' % (device, hvd.size()))

    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))
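# `generate_data` is not defined in this snippet. A hypothetical stand-in
# follows, assuming the ImageNet-shaped synthetic inputs typical of Horovod's
# benchmark scripts (3x224x224 images, 1000 classes).
import torch

def generate_data(args):
    data = torch.randn(args.batch_size, 3, 224, 224)
    target = torch.randint(0, 1000, (args.batch_size,), dtype=torch.long)
    if args.cuda:
        data, target = data.cuda(), target.cuda()
    return data, target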
def on_state_reset():
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr * hvd.size()
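# on_state_reset follows the Horovod Elastic reset-callback pattern: when the
# worker set changes, the learning rate is re-scaled to the new world size.
# A sketch of the standard hvd.elastic registration; `epochs` and
# `train_one_epoch` are hypothetical placeholders for the surrounding script.
state = hvd.elastic.TorchState(model=model, optimizer=optimizer, epoch=0)
state.register_reset_callbacks([on_state_reset])

@hvd.elastic.run
def elastic_train(state):
    for state.epoch in range(state.epoch, epochs):
        train_one_epoch(state.model, state.optimizer)
        # Commit after each epoch so a failed worker can rejoin from here.
        state.commit()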
def run_test_from_config(trainer_options, on_gpu, check_size):
    """Trains the default model with the given config."""
    set_random_main_port()
    reset_seed()

    ckpt_path = trainer_options["default_root_dir"]
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            expected_device = (torch.device("cuda", self.trainer.local_rank)
                               if on_gpu else torch.device("cpu"))
            assert self.device == expected_device

        def training_epoch_end(self, outputs) -> None:
            res = self.trainer.strategy.reduce(
                torch.tensor(1.0, device=self.device), reduce_op="sum")
            assert res.sum() == self.trainer.strategy.world_size

    model = TestModel()
    trainer = Trainer(**trainer_options)

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    trainer.test(model)

    assert model.device == torch.device("cpu")

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        batch = next(iter(dataloader))
        pretrained_model(batch)

    # test HPC saving
    # save logger to make sure we get all the metrics
    if trainer.logger:
        trainer.logger.finalize("finished")
    hpc_save_path = trainer._checkpoint_connector.hpc_save_path(ckpt_path)
    trainer.save_checkpoint(hpc_save_path)

    # test HPC loading
    checkpoint_path = trainer._checkpoint_connector._CheckpointConnector__get_max_ckpt_path_from_folder(ckpt_path)
    trainer._checkpoint_connector.restore(checkpoint_path)

    if on_gpu:
        trainer = Trainer(accelerator="gpu", devices=1, strategy="horovod", max_epochs=1)
        # test root gpu index
        assert trainer.strategy.root_device.index == hvd.local_rank()
def test_horovod_size(self):
    """Test that the size returned by hvd.size() is correct."""
    _, true_size = mpi_env_rank_and_size()
    hvd.init()
    size = hvd.size()
    assert true_size == size
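# The mpi_env_rank_and_size helper used above is not shown. A sketch follows,
# assuming it reads the rank/size environment variables set by common MPI
# launchers (OpenMPI, MVAPICH2, MPICH/PMI) and falls back to a single process.
import os

def mpi_env_rank_and_size():
    for rank_var, size_var in [('OMPI_COMM_WORLD_RANK', 'OMPI_COMM_WORLD_SIZE'),
                               ('MV2_COMM_WORLD_RANK', 'MV2_COMM_WORLD_SIZE'),
                               ('PMI_RANK', 'PMI_SIZE')]:
        rank, size = os.environ.get(rank_var), os.environ.get(size_var)
        if rank is not None and size is not None:
            return int(rank), int(size)
    # Default: not launched via an MPI runner.
    return 0, 1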
def hook(*ignore):
    assert p not in self._handles
    assert not p.grad.requires_grad
    name = self._parameter_names.get(p)
    p_size = np.prod(p.size())
    torch.cuda.synchronize()
    begin_time = time.time()

    if self._use_allgather and p_size > self._plan1:
        torch.cuda.synchronize()
        begin_mom_time = time.time()

        weight_decay = self._weight_decay  # group['weight_decay']
        momentum = self._momentum          # group['momentum']
        dampening = 0.0                    # group['dampening']
        d_p = p.grad.data
        d_p.div_(hvd.size())
        if weight_decay != 0:
            d_p.add_(weight_decay, p.data)
        if momentum != 0:
            param_state = self.state[p]
            if 'momentum_buffer' not in param_state:
                buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                buf.mul_(momentum).add_(d_p)
            else:
                buf = param_state['momentum_buffer']
                buf.mul_(momentum).add_(1 - dampening, d_p)
            # TODO
            if 'residue_buffer' not in param_state:
                rsd = param_state['residue_buffer'] = torch.zeros_like(p.data)
                rsd.add_(param_state['momentum_buffer'])
                if self._use_nesterov:
                    rsd = rsd.add(momentum, d_p)
            else:
                rsd = param_state['residue_buffer']
                rsd.add_(param_state['momentum_buffer'])
                if self._use_nesterov:
                    rsd = rsd.add(momentum, d_p)
        torch.cuda.synchronize()
        self.mom_time += time.time() - begin_mom_time

        compressed_val = []
        compressed_idx = []

        torch.cuda.synchronize()
        begin_select_time = time.time()
        if 'mid_store' not in param_state:
            param_state['mid_store'] = 0.0
        if 'interval' not in param_state:
            param_state['interval'] = 10
        it = 0
        sparsity = 0.0
        if p_size > self._plan3:
            compressed_val, compressed_idx, it, _, sparsity = \
                select_top_k_thdv3(param_state['residue_buffer'], 0.001)
        elif p_size > self._plan2:
            compressed_val, compressed_idx = \
                select_trim_topk(param_state['residue_buffer'], 0.001)
        else:
            compressed_val, compressed_idx = \
                select_topk(param_state['residue_buffer'], 0.001)
        assert (len(compressed_idx) > 0)
        torch.cuda.synchronize()
        end_select_time = time.time()
        self.select_time += end_select_time - begin_select_time

        # if param_state['interval'] == 10:
        #     compressed_val, compressed_idx, it, param_state['mid_store'], sparsity = \
        #         select_top_k_thdv3(param_state['residue_buffer'], 0.001)
        #     param_state['interval'] = 0
        # else:
        #     compressed_val, compressed_idx, sparsity = \
        #         select_top_k_fixthd(param_state['residue_buffer'], param_state['mid_store'])
        #     param_state['interval'] += 1
        # if hvd.rank() == 0:
        #     print(name, p.size())
        # if name == "features.27.weight":
        #     torch.save(compressed_val, 'compressed_val' + str(local_rank()))
        #     torch.save(compressed_idx, 'compressed_idx' + str(local_rank()))
        # if hvd.rank() == 0 and name == "features.27.weight":
        #     self._it = it
        #     self._mid = param_state['mid_store']
        #     self._sparsity = sparsity
        # tmp_t = torch.tensor([local_len], dtype=torch.long)
        # print("len list, ", global_len_list)
        # local_len = torch.min(global_len_list)
        # print("local_len, ", local_len)
        # compressed_val = compressed_val[0:local_len]
        # compressed_idx = compressed_idx[0:local_len]

        torch.cuda.synchronize()
        begin_mask_time = time.time()
        masks_size = self._masks[name].size()
        self._masks[name].zero_()
        self._masks[name] = self._masks[name].view(-1)
        self._masks[name][compressed_idx] = 1.0
        self._masks[name] = 1.0 - self._masks[name]
        self._masks[name] = self._masks[name].view(masks_size)

        if self._debug:
            self._v_ref[name] = param_state['residue_buffer'] * (1.0 - self._masks[name])
            allreduce_(self._v_ref[name], average=False)

        if hvd.size() == 1:
            p.grad.data = param_state['residue_buffer'] * (1.0 - self._masks[name])
        param_state['residue_buffer'].mul_(self._masks[name])
        param_state['momentum_buffer'].mul_(self._masks[name])
        end_mask_time = time.time()
        self.mask_time += end_mask_time - begin_mask_time

        torch.cuda.synchronize()
        begin_pack_time = time.time()

        if hvd.size() > 1:
            if self._use_gpu:
                if p_size > self._plan3:
                    compressed_msg = torch.cat([
                        torch.tensor([len(compressed_idx)]).type('torch.cuda.FloatTensor'),
                        compressed_idx.type('torch.cuda.FloatTensor'),
                        compressed_val])
                    handle = _allgather_async(compressed_msg,
                                              self._compressed_msg[name],
                                              name=name)
                else:
                    self._compressed_msg_size[name] = len(compressed_idx)
                    compressed_msg = torch.cat([
                        compressed_idx.type('torch.cuda.FloatTensor'),
                        compressed_val])
                    handle = _allgather_async(compressed_msg,
                                              self._compressed_msg[name],
                                              name=name)
                self._handles[p] = handle
        torch.cuda.synchronize()
        self.pack_time += time.time() - begin_pack_time
    else:
        # compressed_msg = torch.randn(100).cuda()
        # handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
        torch.cuda.synchronize()
        begin_allreduce_time = time.time()
        p.grad.data.div_(hvd.size())
        p.grad.data.add_(torch.mul(p.data, self._weight_decay))
        param_state = self.state[p]
        if 'momentum_buffer' not in param_state:
            buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
        else:
            buf = param_state['momentum_buffer']
        if self._use_nesterov:
            buf = torch.mul(torch.add(buf, p.grad.data), self._momentum)
            p.grad.data.add_(buf)
            # param_state['momentum_buffer'] = torch.mul(
            #     torch.add(param_state['momentum_buffer'], p.grad.data), self._momentum)
            # p.grad.data.add_(param_state['momentum_buffer'])
        else:
            param_state['momentum_buffer'] = (
                self._momentum * param_state['momentum_buffer'] + p.grad.data)
            p.grad.data = param_state['momentum_buffer']
        if hvd.size() > 1:
            handle = allreduce_async_(p.grad.data, average=False, name=name)
            self._handles[p] = handle
        torch.cuda.synchronize()
        self.allreduce_time += time.time() - begin_allreduce_time

    torch.cuda.synchronize()
    end_time = time.time()
    self.pruning_time += end_time - begin_time
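# The matching decompression step (run later, e.g. when the async handles are
# synchronized) is not shown above. A hypothetical sketch of unpacking the
# [count, indices..., values...] layout built in the variable-length branch,
# where everything was cast to float so it could travel in one allgather:
def unpack_compressed(msg):
    n = int(msg[0].item())
    idx = msg[1:1 + n].long()       # recover integer indices
    val = msg[1 + n:1 + 2 * n]      # the corresponding top-k values
    return idx, val

def apply_compressed(dense_grad, msg):
    # Accumulate one worker's sparse contribution into a dense gradient.
    idx, val = unpack_compressed(msg)
    flat = dense_grad.view(-1)
    flat[idx] += val
    return dense_grad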