def _test_reduce_helper(self, group, group_id, rank, op, master_value,
                        worker_value, expected_value):
    for src in group:
        if rank == src:
            tensor = _build_tensor(src + 1).fill_(master_value)
            dist.reduce(tensor, src, op, group_id)
            self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
        else:
            tensor = _build_tensor(src + 1).fill_(worker_value)
            dist.reduce(tensor, src, op, group_id)
    self._barrier()

def _test_reduce_helper(self, group, group_id, rank, op, master_value,
                        worker_value, expected_value, cuda=False, rank_to_GPU=None):
    for src in group:
        if rank == src:
            tensor = _build_tensor(src + 1).fill_(master_value)
            if cuda:
                tensor = tensor.cuda(rank_to_GPU[rank][0])
            dist.reduce(tensor, src, op, group_id)
            self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
        else:
            tensor = _build_tensor(src + 1).fill_(worker_value)
            if cuda:
                tensor = tensor.cuda(rank_to_GPU[rank][0])
            dist.reduce(tensor, src, op, group_id)
    self._barrier()

def reduce_loss_dict(loss_dict):
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        keys = []
        losses = []
        for k in sorted(loss_dict.keys()):
            keys.append(k)
            losses.append(loss_dict[k])
        losses = torch.stack(losses, 0)
        dist.reduce(losses, dst=0)
        if dist.get_rank() == 0:
            losses /= world_size
        reduced_losses = {k: v for k, v in zip(keys, losses)}
    return reduced_losses

def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that the process with
    rank 0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only the main process holds the accumulated sum, so only divide
            # by world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses

def reduce_dict(input_dict, average=True):
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        keys = []
        values = []
        for k in sorted(input_dict.keys()):
            keys.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, 0)
        dist.reduce(values, dst=0)
        if dist.get_rank() == 0 and average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(keys, values)}
    return reduced_dict

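# Hedged usage sketch for reduce_dict above (the helper name `log_reduced_metrics`
# and the `metrics`/`logger` arguments are illustrative assumptions, not from the
# original source). Every rank must enter the collective; only rank 0 ends up
# holding the averaged values.
def log_reduced_metrics(metrics, logger):
    reduced = reduce_dict(metrics)  # collective: must be called on all ranks
    if dist.get_rank() == 0:
        # values are only the global average on the destination rank (0)
        logger.info({k: v.item() for k, v in reduced.items()})
    return reduced
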
def reduce(*_: Any) -> None:
    # Skip gradient reduction, do not alter status flags
    if not self.should_accumulate_grads and self._grad_to_be_reduced[index]:
        assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

        if not self._bucket_flush_callback_set:
            Variable._execution_engine.queue_callback(self._flush_buckets)
            self._bucket_flush_callback_set = True

        # Make sure that this is not fired twice
        self._grad_to_be_reduced[index] = False
        param.grad.mul_(self.world_size_scaling)

        if self.reduce_fp16:
            param.grad.data = param.grad.data.half()

        # Future work includes clearing up the buffer if possible
        def cleanup() -> None:
            if dst_rank != self.global_rank:
                param.grad = None
            else:
                assert param.grad is not None
                param.grad.data = param.grad.data.to(dtype=param.dtype)

        # Async reduce for this buffer, log the future
        dst_global_rank = OSS.get_global_rank(self.process_group, dst_rank)
        self._work_handles.append(
            Workhandle(
                handle=dist.reduce(
                    tensor=param.grad.data,
                    dst=dst_global_rank,
                    group=self.process_group,
                    async_op=True,
                ),
                callback=cleanup,
            )
        )
        self._reduced_grads += 1

        # Opportunistically try to empty the queue
        self._try_consume_work_handle()

        # If all the reduce operations have been called,
        # make sure that all the asynchronous calls have concluded before moving on
        # and execute the delayed actions (release gradients, unroll the buckets)
        if self._reduced_grads == self._reduced_grads_max:
            self._consume_work_handles()

def main():
    args = parser.parse_args()
    if not args.cuda:
        args.dist_backend = 'gloo'  # nccl doesn't work on CPUs

    dist.init_process_group(backend=args.dist_backend, init_method='env://')

    model = Model()
    if args.cuda:
        print_status("Using GPU")
        torch.cuda.set_device(args.local_rank)
        model.cuda()
    else:
        print_status("Using CPU")

    print_status("initialising DDP model")
    if args.cuda:
        ddp_model = DDP(model, device_ids=[torch.cuda.current_device()])
    else:
        ddp_model = DDP(model)

    num_batches = args.batches
    if not args.weak_scale:
        print_status("Strong scaling")
        num_batches = num_batches // dist.get_world_size()
    batch_size = args.batch_size

    start_time = time.time()
    for _ in range(num_batches):
        # create random batch
        x = torch.randn(batch_size, 1, 100, 100)
        if args.cuda:
            x = x.cuda()  # .cuda() is not in-place, so the result must be reassigned
        y = ddp_model(x)
        rand_grad = torch.randn_like(y)
        y.backward(rand_grad)
    end_time = time.time()

    avg_time_tensor = torch.FloatTensor([end_time - start_time])
    min_time_tensor = torch.FloatTensor([end_time - start_time])
    max_time_tensor = torch.FloatTensor([end_time - start_time])
    if args.cuda:
        avg_time_tensor = avg_time_tensor.cuda()
        min_time_tensor = min_time_tensor.cuda()
        max_time_tensor = max_time_tensor.cuda()

    dist.reduce(avg_time_tensor, 0, dist.ReduceOp.SUM)
    dist.reduce(min_time_tensor, 0, dist.ReduceOp.MIN)
    dist.reduce(max_time_tensor, 0, dist.ReduceOp.MAX)
    avg_time_tensor /= dist.get_world_size()

    time_min, time_avg, time_max = (min_time_tensor.item(),
                                    avg_time_tensor.item(),
                                    max_time_tensor.item())
    if dist.get_rank() == 0:
        print_status("Time : Min {} Avg {} Max {}".format(time_min, time_avg, time_max))

def reduce_scatter(tensor, tensor_list, op=ReduceOp.SUM, group=dist.group.WORLD, async_op=False):
    rank = dist.get_rank(group)
    if tensor is None:
        tensor = tensor_list[rank]
    if tensor.dim() == 0:
        tensor = tensor.view(-1)
    tensor[:] = tensor_list[rank]

    ops = []
    for i in range(dist.get_world_size(group)):
        if i == rank:
            tmp = dist.reduce(tensor, rank, op, group, async_op=True)
        else:
            tmp = dist.reduce(tensor_list[i], i, op, group, async_op=True)
        ops.append(tmp)

    oplist = AsyncOpList(ops)
    if async_op:
        return oplist
    else:
        oplist.wait()

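# Hedged usage sketch for the reduce_scatter emulation above (the function name
# `reduce_scatter_demo` and the tensor sizes are illustrative assumptions). Each
# rank contributes world_size chunks and receives the element-wise sum of the
# chunk at its own index.
def reduce_scatter_demo(group=dist.group.WORLD):
    world_size = dist.get_world_size(group)
    rank = dist.get_rank(group)
    inputs = [torch.full((4,), float(rank)) for _ in range(world_size)]
    out = torch.empty(4)
    reduce_scatter(out, inputs, group=group)
    # out now holds the sum of inputs[rank] taken over all ranks,
    # i.e. 0 + 1 + ... + (world_size - 1) in every element
    return out
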
def _test_reduce_helper(self, group, group_id, rank, op, master_value,
                        worker_value, expected_value, cuda=False):
    for src in group:
        if rank == src:
            tensor = _build_tensor(src + 1).fill_(master_value)
            if cuda:
                tensor = tensor.cuda()
            dist.reduce(tensor, src, op, group_id)
            self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
        else:
            tensor = _build_tensor(src + 1).fill_(worker_value)
            if cuda:
                tensor = tensor.cuda()
            dist.reduce(tensor, src, op, group_id)
    self._barrier()

def reduce(self, input, dst, op=ReduceOp.SUM, batched=False):
    """Reduces the input data across all parties."""
    assert dist.is_initialized(), "initialize the communicator first"

    if batched:
        assert isinstance(input, list), "batched reduce input must be a list"
        reqs = []
        result = [x.clone().data for x in input]
        for tensor in result:
            reqs.append(
                dist.reduce(
                    tensor.data, dst, op=op, group=self.main_group, async_op=True
                )
            )
        for req in reqs:
            req.wait()
    else:
        assert torch.is_tensor(
            input.data
        ), "unbatched input for reduce must be a torch tensor"
        result = input.clone()
        dist.reduce(result.data, dst, op=op, group=self.main_group)
    return result if dst == self.get_rank() else None

def test(model, criterion, epoch, test_loader, device):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    loss_acc1 = torch.zeros(2).to(device)
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    loss_acc1[0] = test_loss / len(test_loader)
    loss_acc1[1] = 100.0 * correct / total
    # print('rank ', dist.get_rank(), ' test loss ', loss_acc1[0].item(),
    #       ' test acc1 ', loss_acc1[1].item())
    dist.reduce(tensor=loss_acc1, dst=0, op=dist.ReduceOp.SUM)
    # dist.reduce only aggregates on dst=0, so the averaged values below are
    # meaningful on rank 0 only
    loss_acc1.div_(dist.get_world_size() * 1.0)
    return loss_acc1[0].item(), loss_acc1[1].item()

def run():
    modell = model.CNN()
    # modell = model.AlexNet()
    size = dist.get_world_size()
    rank = dist.get_rank()
    group_list = []
    for i in range(size):
        group_list.append(i)
    group = dist.new_group(group_list)
    while True:
        for param in modell.parameters():
            # for dst in range(1, size):
            #     dist.send(param.data, dst=dst)
            dist.broadcast(param.data, src=0, group=group)
        for param in modell.parameters():
            tensor_temp = torch.zeros_like(param.data)
            dist.reduce(tensor_temp, dst=0, op=dist.ReduceOp.SUM, group=group)
            # this rank contributes zeros, so the reduced sum presumably comes
            # from the (size - 1) workers
            param.data = tensor_temp / (size - 1)

def allreduce_instrumented(timer, tensor, group):
    with timer('reduce'):
        rank = dist.get_rank()
        chunks = list(tensor.view(dist.get_world_size(), -1))
        reqs = [
            dist.reduce(chunk, i, op=dist.ReduceOp.SUM, group=group, async_op=True)
            for i, chunk in enumerate(chunks)
        ]
        [req.wait() for req in reqs]
    with timer('all_gather'):
        chunk = chunks[rank]
        dist.all_gather(chunks, chunk, group=group)

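# Hedged sanity-check sketch for allreduce_instrumented above: with a no-op
# stand-in for the timer (the original timer object is not shown in this section),
# the chunked reduce + all_gather should match a plain all_reduce. Assumes the
# group spans all ranks and tensor.numel() is divisible by the world size, as the
# view() above requires.
from contextlib import contextmanager

@contextmanager
def _noop_timer(name):
    yield

def check_allreduce_instrumented(group):
    world_size = dist.get_world_size()
    tensor = torch.arange(world_size * 4, dtype=torch.float32)
    reference = tensor.clone()
    allreduce_instrumented(_noop_timer, tensor, group)
    dist.all_reduce(reference, group=group)
    assert torch.allclose(tensor, reference)
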
def valid(args, encoder, fc, validloader, logger):
    with torch.no_grad():
        encoder.eval()
        fc.eval()
        correct = torch.tensor([0.0]).cuda()
        total = torch.tensor([0.0]).cuda()
        for data in validloader:
            img, label = data[:2]
            img, label = img.cuda(), label.cuda()
            feature = encoder(img)
            s = fc(feature)
            # acc
            _, predicted = torch.max(s.data, 1)
            correct += predicted.eq(label.data).sum()
            total += label.size(0)
        dist.reduce(correct, dst=0, op=dist.ReduceOp.SUM)
        dist.reduce(total, dst=0, op=dist.ReduceOp.SUM)
        if args.local_rank == 0:
            logger.info('valid-acc:{:.2%}'.format(correct.cpu().item() / total.cpu().item()))
            logger.info('--------------------------')

def pretrain_validation(args, index, model):
    if args.validation_data_path_prefix is None:
        return

    config = args.config
    logger = args.logger
    logger.info(
        f"Validation micro batch size: {args.train_micro_batch_size_per_gpu}")
    model.eval()
    dataset = PreTrainingDataset(
        args.tokenizer,
        os.path.join(args.validation_data_path_prefix,
                     config['validation']['path']),
        args.logger, args.max_seq_length, index,
        PretrainDataType.VALIDATION, args.max_predictions_per_seq)
    data_batches = get_dataloader(args, dataset, eval_set=True)
    eval_loss = 0
    nb_eval_steps = 0
    for batch in tqdm(data_batches):
        batch = tuple(t.to(args.device) for t in batch)
        tmp_eval_loss = model.network(batch, log=False)
        dist.reduce(tmp_eval_loss, 0)  # Reduce to get the loss from all the GPUs
        tmp_eval_loss = tmp_eval_loss / dist.get_world_size()
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    logger.info(f"Validation Loss for epoch {index + 1} is: {eval_loss}")
    if (not args.no_cuda and dist.get_rank() == 0) or (args.no_cuda and args.local_rank == -1):
        args.summary_writer.add_scalar(f'Validation/Loss', eval_loss, index + 1)
    return

def run(args, encoder, fc, criterion, optimizer, scheduler, trainloader, validloader, logger):
    # train
    for i_epoch in range(args.max_epoch):
        encoder.train()
        fc.train()
        trainloader.sampler.set_epoch(i_epoch)  # must not be omitted
        rank = dist.get_rank()
        correct = torch.tensor(0.0).cuda(rank)
        total = torch.tensor(0.0).cuda(rank)
        start_time = torch.tensor(time.time()).cuda(rank)
        for i_iter, data in enumerate(trainloader):
            img, label = data[:2]
            img, label = img.cuda(rank), label.cuda(rank)
            f = encoder(img)
            s = fc(f)
            loss = criterion(s, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            # acc
            _, predicted = torch.max(s.data, 1)
            correct += predicted.eq(label.data).sum()
            total += s.shape[0]
        eta = (time.time() - start_time) / (i_iter + 1) * \
            (len(trainloader) * (args.max_epoch - i_epoch) - i_iter) / 3600
        # print
        dist.reduce(loss, dst=0, op=dist.ReduceOp.SUM)
        dist.reduce(correct, dst=0, op=dist.ReduceOp.SUM)
        dist.reduce(total, dst=0, op=dist.ReduceOp.SUM)
        dist.reduce(eta, dst=0, op=dist.ReduceOp.SUM)
        if rank == 0:
            logger.info('loss:{:.4f} '
                        'acc:{:.2%} '
                        'ETA:{:.2f}h'.format(
                            loss.cpu().item() / args.world_size,
                            correct.cpu().item() / total.cpu().item(),
                            eta.cpu().item() / args.world_size))
        valid(args, encoder, fc, validloader)

def compute_train_stats_slave(x, y, Z_new, Lambda, cols, num_classes, group):
    train_values, train_indices = torch.max(predict(x, y, Z_new), 1)
    total_accu = accuracy(
        y.view(y.shape[0]).data.cpu().numpy(),
        train_indices.data.cpu().numpy())
    total_cost = Softmax_Fx(x, y, Z_new, Lambda, cols, num_classes)
    total_cost_tensor = total_cost.data.cpu()
    total_accu_tensor = torch.DoubleTensor([total_accu])
    total_len_tensor = torch.DoubleTensor([x.size()[0]])
    dist.reduce(total_cost_tensor, 0, dist.ReduceOp.SUM, group)
    dist.reduce(total_accu_tensor, 0, dist.ReduceOp.SUM, group)
    dist.reduce(total_len_tensor, 0, dist.ReduceOp.SUM, group)

def bucket_flush(*_: Any) -> None:
    assert self._bucket_list is not None

    handle = None
    for bucket in self._bucket_list:
        if not bucket.sent:
            # Reduce the bucket. Some parameters went unused and this bucket was not flushed
            bucket.buffer.mul_(self.world_size_scaling)
            bucket.sent = True
            handle = dist.reduce(
                tensor=bucket.buffer,
                dst=bucket.destination,
                group=self.process_group,
                async_op=True,
            )

    # Only wait on the last handle
    if handle:
        handle.wait()

def _flush_buckets(self) -> None:
    if self._bucket_list is not None:
        last_handle = None

        for bucket in self._bucket_list:
            if not bucket.sent:
                # Normalize the bucket in one go
                bucket.buffer.mul_(self.world_size_scaling)

                # Reduce the bucket
                last_handle = dist.reduce(
                    tensor=bucket.buffer,
                    dst=bucket.destination,
                    group=self.process_group,
                    async_op=True,
                )
                bucket.sent = True

        if last_handle is not None:
            last_handle.wait()

def _flush_reduce_calls(self) -> None:
    if self._bucket_list is not None:
        for bucket in self._bucket_list:
            if not bucket.sent:
                # Normalize the bucket in one go
                bucket.buffer.mul_(self.world_size_scaling)

                # Reduce the bucket
                self._work_handles.append(
                    Workhandle(
                        handle=dist.reduce(
                            tensor=bucket.buffer,
                            dst=bucket.destination,
                            group=self.process_group,
                            async_op=True,
                        ),
                        callback=None,
                    )
                )
                bucket.sent = True

        self._consume_work_handles()

def get_train_stats_master(Lambda, Z_term, cols, num_classes, size, group):
    total_cost_tensor = torch.DoubleTensor([0.0])
    total_accu_tensor = torch.DoubleTensor([0.0])
    total_len_tensor = torch.DoubleTensor([0.0])
    # the master contributes zeros; compute_train_stats_slave on the workers
    # supplies the values being summed here
    dist.reduce(total_cost_tensor, 0, dist.ReduceOp.SUM, group)
    dist.reduce(total_accu_tensor, 0, dist.ReduceOp.SUM, group)
    dist.reduce(total_len_tensor, 0, dist.ReduceOp.SUM, group)
    total_cost = total_cost_tensor
    train_cost = total_cost + (
        (Lambda / Variable(torch.DoubleTensor([2.0]).cuda())) *
        torch.dot(Z_term.view(cols * num_classes),
                  Z_term.view(cols * num_classes))).data.cpu()
    train_accu = total_accu_tensor / (size - 1)
    return train_accu, train_cost / total_len_tensor.numpy()[0]

def reduce(*_: Any) -> None:
    # Skip gradient reduction, do not alter status flags
    if not self.should_accumulate_grads and self._grad_to_be_reduced[index]:
        assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

        if not self._bucket_flush_callback_set:
            Variable._execution_engine.queue_callback(self._flush_buckets)
            self._bucket_flush_callback_set = True

        # Make sure that this is not fired twice
        self._grad_to_be_reduced[index] = False
        bucket = self.buckets[param.device][dst_rank]
        bucket.params_checked_in += 1

        if bucket.full():
            # Normalize the bucket in one go
            bucket.buffer.mul_(self.world_size_scaling)

            # Reduce the bucket
            bucket.sent = True
            self._work_handles.append(
                Workhandle(
                    handle=dist.reduce(
                        tensor=bucket.buffer,
                        dst=bucket.destination,
                        group=self.process_group,
                        async_op=True,
                    ),
                    callback=None,
                )
            )
            self._reduced_grads += 1

            # Opportunistically try to empty the queue
            self._try_consume_work_handle()

            # If all the reduce operations have been called,
            # make sure that all the asynchronous calls have concluded before moving on
            # and execute the delayed actions (release gradients, unroll the buckets)
            if self._reduced_grads == self._reduced_grads_max:
                self._consume_work_handles()

def reduce(*_: Any) -> None:
    # Skip gradient reduction, do not alter status flags
    if not self.should_accumulate_grads and self._grad_to_be_reduced[index]:
        assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

        if not self._bucket_flush_callback_set:
            Variable._execution_engine.queue_callback(self._flush_reduce_calls)
            self._bucket_flush_callback_set = True

        # Make sure that this is not fired twice
        self._grad_to_be_reduced[index] = False
        param.grad.mul_(self.world_size_scaling)

        if self.reduce_fp16:
            param.grad.data = param.grad.data.half()

        # Future work includes clearing up the buffer if possible
        def cleanup() -> None:
            if dst_rank != self.global_rank:
                param.grad = None
            else:
                assert param.grad is not None
                param.grad.data = param.grad.data.to(dtype=param.dtype)

        # Async reduce for this buffer, log the future
        self._work_handles.append(
            Workhandle(
                handle=dist.reduce(
                    tensor=param.grad.data,
                    dst=self._local_to_global_rank[dst_rank],
                    group=self.process_group,
                    async_op=True,
                ),
                callback=cleanup,
            )
        )

        # Opportunistically try to empty the queue, free memory
        self._try_consume_work_handle()

def reduce(self, collectiveArgs, retFlag=False):
    if collectiveArgs.reduce_qcomm != 32:
        quantized = _downcast(
            collectiveArgs.ipTensor, collectiveArgs.allreduce_qcomm
        )
    else:
        quantized = collectiveArgs.ipTensor

    retObj = dist.reduce(
        quantized,
        dst=collectiveArgs.dst,
        op=collectiveArgs.op,
        group=collectiveArgs.group,
        async_op=collectiveArgs.asyncOp,
    )  # synchronicity is maintained in runColl

    if collectiveArgs.asyncOp:
        retObj = retObj.get_future().then(_dequantize)
    else:
        retObj = _dequantize(quantized)

    if retFlag:
        return retObj

def reduce_bucket(*_: Any) -> None:
    # Skip gradient reduction, do not alter status flags
    if not self.should_accumulate_grads and self._grad_to_be_reduced[index]:
        assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

        # Make sure that this is not fired twice
        self._grad_to_be_reduced[index] = False

        # Copy to the flat buffer, update the buffer state
        bucket = optimizer.buckets[param.device][dst_rank]

        assert bucket.append(
            param, use_gradient=True
        ), "Bucket overflow: max %s - current %s - adding %s" % (
            bucket.max_size,
            bucket.current_offset,
            param.grad.numel(),
        )

        if bucket.full():
            bucket.buffer /= self.world_size

            optimizer.work_handles.append(
                Workhandle(
                    handle=dist.reduce(
                        tensor=bucket.buffer,
                        dst=dst_rank,
                        group=self.process_group,
                        async_op=True,
                    ),
                    callback=bucket.unroll,
                )
            )

            # If all the reduce operations have been called, add the gatekeeper
            if len(optimizer.work_handles) == optimizer._max_work_handles:
                gatekeeper()

def reduce(*_: Any) -> None:
    # Skip gradient reduction, do not alter status flags
    if not self.should_accumulate_grads and self._grad_to_be_reduced[index]:
        assert param.grad is not None, "Reducing gradients during backward pass, cannot be None"

        if not self._bucket_flush_callback_set:
            Variable._execution_engine.queue_callback(self._flush_reduce_calls)
            self._bucket_flush_callback_set = True

        # Make sure that this is not fired twice
        self._grad_to_be_reduced[index] = False
        bucket = self.buckets[param.device][dst_rank]
        bucket.params_checked_in += 1

        if bucket.all_checked_in:
            assert bucket.buffer is not None

            # Normalize the bucket in one go
            bucket.buffer.mul_(self.world_size_scaling)

            # Reduce the bucket
            bucket.sent = True
            self._work_handles.append(
                Workhandle(
                    handle=dist.reduce(
                        tensor=bucket.buffer,
                        dst=bucket.destination,
                        group=self.process_group,
                        async_op=True,
                    ),
                    callback=None,
                )
            )

            # Opportunistically try to empty the queue
            self._try_consume_work_handle()

def _handle_trailing_buckets(self, flush_type: BucketFlush) -> None:
    """
    Go through the buckets, flush them if not already empty

    .. warning: Could be that a bucket flush was already requested, needs to be handled carefully
    """
    for bucket_list in self.buckets.values():
        for bucket in bucket_list:
            if bucket.current_offset > 0:
                self.work_handles.append(
                    Workhandle(
                        handle=dist.broadcast(
                            tensor=bucket.buffer,
                            src=bucket.global_ref_rank,
                            group=self.group,
                            async_op=True,
                        )
                        if flush_type == BucketFlush.Broadcast
                        else dist.reduce(
                            tensor=bucket.buffer,
                            dst=bucket.global_ref_rank,
                            group=self.group,
                            async_op=True,
                        ),
                        callback=bucket.unroll,
                    )
                )

    self._consume_work_handles()

def reduce(self, collectiveArgs, retFlag=False, pair=False):
    # pair=True mode does not support quantization
    if collectiveArgs.reduce_qcomm != 32 and not pair:
        assert collectiveArgs.ipTensor.dtype == torch.float32
        with paramProfile(
            timer=collectiveArgs.quant_time,
            description="# PARAM: Reduce quantization #",
        ):
            quantized = _downcast(
                collectiveArgs.ipTensor, collectiveArgs.allreduce_qcomm
            )
    else:
        quantized = (
            collectiveArgs.ipTensor if not pair else collectiveArgs.ipTensor_pair
        )

    retObj = dist.reduce(
        quantized,
        dst=collectiveArgs.srcOrDst,
        op=collectiveArgs.op,
        group=collectiveArgs.group,
        async_op=collectiveArgs.asyncOp,
    )  # synchronicity is maintained in runColl

    if collectiveArgs.reduce_qcomm != 32 and not pair:
        if collectiveArgs.asyncOp:
            retObj = retObj.get_future().then(_dequantize)
        else:
            with paramProfile(
                timer=collectiveArgs.dequant_time,
                description="# PARAM: Reduce de-quantization #",
            ):
                retObj = _dequantize(quantized)

    if collectiveArgs.asyncOp:
        collectiveArgs.waitObj.append(retObj)

    if retFlag:
        return retObj

def run():
    size = dist.get_world_size()
    rank = dist.get_rank()

    model = Model()
    optimizer = optim.SGD(model.parameters(), lr=LR, momentum=MOMENTUM)
    model.train()

    train_set, samples = create_data_loader()
    start = monotonic()
    for epoch in range(EPOCHS):
        epoch_loss = 0.0
        # for i, (data, target) in enumerate(train_set):
        for data, target in train_set:
            # if rank == MASTER:
            #     print('Epoch: {}, Minibatch: {}'.format(epoch + 1, i + 1), end='\r')
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            criterion = nn.CrossEntropyLoss()
            loss = criterion(output, target)
            epoch_loss += loss
            loss.backward()
            average_gradients(model)
            optimizer.step()
        # if rank == MASTER:
        #     print()
    end = monotonic()

    avg_time = torch.Tensor([end - start])
    epoch_loss *= samples / (math.ceil(samples / BATCH_SIZE))
    samples = torch.Tensor([samples])
    dist.reduce(epoch_loss, MASTER, op=dist.ReduceOp.SUM)
    dist.reduce(samples, MASTER, op=dist.ReduceOp.SUM)
    dist.reduce(avg_time, MASTER, op=dist.ReduceOp.SUM)
    if rank == MASTER:
        # print(float(samples))
        # print(float(epoch_loss))
        epoch_loss = float(epoch_loss) / float(samples)
        avg_time = float(avg_time) / (EPOCHS * size)
        print('{:.4f}, {:.4f}'.format(avg_time, epoch_loss))

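# `average_gradients(model)` is called in run() above but not defined in this
# section; a minimal sketch of the usual all-reduce based implementation
# (an assumption, not necessarily the original helper):
def average_gradients(model):
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            # sum the gradient across ranks, then divide to obtain the mean
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size
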
def accumulate_metric(self, prediction, gt, accumulator, distributed=False):
    hist, correct_pixels, valid_pixels = compute_hist(prediction, gt, self.cfg.MODEL.NET1_CLASSES, 255)
    if distributed:
        # gather metric results
        hist = torch.tensor(hist).cuda()
        correct_pixels = torch.tensor(correct_pixels).cuda()
        valid_pixels = torch.tensor(valid_pixels).cuda()
        # aggregate result to rank 0
        dist.reduce(hist, 0, dist.ReduceOp.SUM)
        dist.reduce(correct_pixels, 0, dist.ReduceOp.SUM)
        dist.reduce(valid_pixels, 0, dist.ReduceOp.SUM)
        hist = hist.cpu().numpy()
        correct_pixels = correct_pixels.cpu().item()
        valid_pixels = valid_pixels.cpu().item()
    accumulator['total_hist'] = accumulator.get('total_hist', 0.) + hist
    accumulator['total_correct_pixels'] = accumulator.get('total_correct_pixels', 0.) + correct_pixels
    accumulator['total_valid_pixels'] = accumulator.get('total_valid_pixels', 0.) + valid_pixels
    return accumulator

def main(args):
    args = options.set_default_args(args)

    if args.ddp_backend == 'apex':
        from apex.parallel import DistributedDataParallel as DDP
    else:
        from torch.nn.parallel import DistributedDataParallel as DDP

    ############################################################################
    # Random seed
    ############################################################################
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    ############################################################################
    # Experiment & Logging
    ############################################################################
    if is_master(args):
        if args.resume:
            # rank-0 device creates experiment dir and log to the file
            logging = get_logger(os.path.join(args.expname, 'log.txt'),
                                 log_=not args.debug)
        else:
            # rank-0 device creates experiment dir and log to the file
            logging = create_exp_dir(args.expname, debug=args.debug)
    else:
        # other devices only log to console (print) but not the file
        logging = get_logger(log_path=None, log_=False)

    args.model_path = os.path.join(args.expname, 'model.pt')
    args.var_path = os.path.join(args.expname, 'var.pt')

    ############################################################################
    # Load data
    ############################################################################
    logging('Loading data..')
    tr_data, va_data = options.load_data(args)

    train_step = 0
    best_eval_ll = -float('inf')

    if args.resume:
        logging('Resuming from {}...'.format(args.resume))
        model, opt = torch.load(args.model_path, map_location='cpu')
        model = model.to(args.device)
        for state in opt.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(args.device)
        best_eval_ll, train_step = torch.load(args.var_path)
    else:
        logging('Building model..')
        if args.model_name in ['srnn', 'srnn_zforce', 'srnn_hier']:
            model = eval(args.model_name).Model(args.n_mix, args.d_data,
                                                args.d_emb, args.d_mlp,
                                                args.d_rnn, args.d_lat,
                                                dropout=args.dropout,
                                                n_layer=args.n_layer)
        elif args.model_name in ['rnn', 'rnn_hier']:
            model = eval(args.model_name).Model(args.n_mix, args.d_data,
                                                args.d_emb, args.d_rnn,
                                                dropout=args.dropout,
                                                n_layer=args.n_layer)
        else:
            raise ValueError('unsupported model type {}'.format(args.model_name))
        model = model.to(args.device)

        # create new optimizer
        opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    if not args.test_only:
        # criterion params and model params
        crit_params, model_params = [], []
        for n, p in model.named_parameters():
            if 'crit' in n:
                crit_params.append(p)
            else:
                model_params.append(p)

        ############################################################################
        # Distributed Data Parallel
        ############################################################################
        if args.distributed:
            if args.ddp_backend == 'apex':
                torch.cuda.set_device(args.distributed_rank)
                para_model = DDP(model)
            else:
                para_model = DDP(model, device_ids=[args.device_id],
                                 output_device=args.device_id)
        else:
            para_model = model

        ############################################################################
        # Log args
        ############################################################################
        args.n_crit_param = sum([p.nelement() for p in crit_params])
        args.n_model_param = sum([p.nelement() for p in model_params])
        args.n_param = args.n_crit_param + args.n_model_param
        if is_master(args):
            logging('=' * 100)
            for k, v in args.__dict__.items():
                logging(' - {} : {}'.format(k, v))
            logging('=' * 100)

        ############################################################################
        # Training
        ############################################################################
        # linear cosine annealing
        kld_weight = min(1., args.init_kld + train_step * args.kld_incr)

        loss_sum = torch.Tensor([0]).to(args.device)
        kld_sum = torch.Tensor([0]).to(args.device)
        nll_sum = torch.Tensor([0]).to(args.device)
        gnorm_sum = 0

        t = timeit.default_timer()
        for epoch in range(args.num_epochs):
            model.train()

            # make sure all data iterators use the same seed to shuffle data
            if args.distributed:
                np.random.seed(args.seed + epoch)

            # initialize the hidden state
            if args.pass_h:
                hidden = model.init_hidden(args.batch_size)
            else:
                hidden = None

            for x, y, mask in tr_data.get_masked_iter(shuffle=True):
                opt.zero_grad()
                ratio = 1. / torch.sum(mask)

                if args.kld:
                    nll_loss, kld_loss, hidden = para_model(x, y, mask=mask,
                                                            hidden=hidden)
                    nll_loss = nll_loss.sum() * ratio
                    kld_loss = kld_loss.sum() * ratio
                    train_loss = nll_loss - kld_loss * kld_weight
                    train_loss.backward()

                    total_loss = nll_loss.detach() - kld_loss.detach()
                    kld_sum += -kld_loss.detach()
                    nll_sum += nll_loss.detach()
                else:
                    nll_loss, hidden = para_model(x, y, mask=mask, hidden=hidden)
                    train_loss = nll_loss.sum() * ratio
                    train_loss.backward()
                    total_loss = train_loss.detach()

                if args.clip > 0:
                    gnorm = nn.utils.clip_grad_norm_(model.parameters(), args.clip)
                else:
                    gnorm = 0
                    for n, p in model.named_parameters():
                        param_gnorm = p.grad.data.norm(2)
                        gnorm += param_gnorm.item() ** 2
                    gnorm = gnorm ** (1. / 2)

                opt.step()

                gnorm_sum += gnorm
                loss_sum += total_loss
                train_step += 1

                # lr & kl annealing
                kld_weight = min(1., kld_weight + args.kld_incr)
                adjust_lr(opt, train_step, args.max_step, args.lr, args.end_lr)

                # log training
                if train_step % args.log_interval == 0:
                    if args.distributed:
                        dist.reduce(loss_sum, dst=0, op=dist.ReduceOp.SUM)
                        loss_sum = loss_sum.div_(args.distributed_world_size)
                        dist.reduce(nll_sum, dst=0, op=dist.ReduceOp.SUM)
                        nll_sum = nll_sum.div_(args.distributed_world_size)
                        dist.reduce(kld_sum, dst=0, op=dist.ReduceOp.SUM)
                        kld_sum = kld_sum.div_(args.distributed_world_size)

                    if is_master(args):
                        cur_loss = loss_sum.item() / args.log_interval
                        cur_nll = nll_sum.item() / args.log_interval
                        cur_kld = kld_sum.item() / args.log_interval
                        elapsed = (timeit.default_timer() - t) / 3600
                        logging('| total hrs [{:.2f}] | epoch {} step {} '
                                '| lr {:8.6f}, klw {:7.5f} | LL {:>9.4f} '
                                '| nll_loss {:>7.4f}, kld_loss {:>8.4f} '
                                '| gnorm {:.4f}'.format(
                                    elapsed, epoch, train_step,
                                    opt.param_groups[0]['lr'], kld_weight,
                                    -cur_loss, cur_nll, cur_kld,
                                    gnorm_sum / args.log_interval))

                    loss_sum = torch.Tensor([0]).to(args.device)
                    kld_sum = torch.Tensor([0]).to(args.device)
                    nll_sum = torch.Tensor([0]).to(args.device)
                    gnorm_sum = 0

                # validation
                if train_step % args.eval_interval == 0:
                    eval_ll = evaluate(va_data, model, args)
                    if is_master(args):
                        logging('-' * 120)
                        logging('Eval [{}] at step: {} | valid LL: {:>8.4f}'.format(
                            train_step // args.eval_interval, train_step, eval_ll))
                        if eval_ll > best_eval_ll:
                            best_eval_ll = eval_ll
                            if not args.debug:
                                logging('Save checkpoint. '
                                        'Best valid LL {:>9.4f}'.format(eval_ll))
                                torch.save([model, opt], args.model_path)
                                torch.save([best_eval_ll, train_step], args.var_path)
                        logging('-' * 120)

                # Reach maximum training step
                if train_step == args.max_step:
                    break

            if train_step == args.max_step:
                break

        eval_ll = evaluate(va_data, model, args)
        if is_master(args):
            logging('-' * 120)
            logging('Eval [{}] | step: {}, LL: {:>8.4f}'.format(
                train_step // args.eval_interval, train_step, eval_ll))
            logging('-' * 120)

    # evaluate the current model
    test_loss = evaluate(te_data, model, args)
    if is_master(args):
        logging('Test -- LL: {:>8.4f}'.format(test_loss))

def train():
    # Launch recv td
    print("worker_id(rank)", worker_id, " size:", str(worker_num), " batch_size=", batch_size)
    init_processes(worker_id, worker_num, 'gloo')
    print("Worker End Connection Initialized")
    global sub_net, sub_optimizer, device
    is_cpu_mode = False

    sub_net.train()
    inputs = None
    outputs = None
    train_loss = 0
    correct = 0
    total = 0
    iteration_num = 100
    iter_n = 0
    loss = None
    sub_optimizer.zero_grad()
    sta = time.time()
    # with torch.autograd.profiler.emit_nvtx():
    while iter_n <= iteration_num:
        inputs = fake_input.to(device)
        targets = fake_target.to(device)
        outputs = sub_net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()

        comm_time_sta = time.time()
        para_num = 0
        ps_id = 0
        for name, parameters in sub_net.named_parameters():
            if parameters.grad is not None:
                grad_content = parameters.grad.to("cpu")
                para_num += grad_content.numel()
                dist.reduce(tensor=grad_content, dst=ps_id, op=dist.ReduceOp.SUM)
                if worker_id == ps_id:
                    grad_content = grad_content / worker_num
                dist.broadcast(tensor=grad_content, src=ps_id)
                parameters.grad = grad_content.to(device)
                ps_id = (ps_id + 1) % worker_num
        comm_time_ed = time.time()

        sub_optimizer.step()
        sub_optimizer.zero_grad()
        # print("iter=", iter_n, " comm_time=", str(comm_time_ed - comm_time_sta))
        iter_n = iter_n + 1
        if iter_n % 10 == 0:
            ed = time.time()
            print("iter_n=", iter_n, " time=", (ed - sta), "comm_num=", para_num)

        if iter_n > 0 and iter_n % 10 == 0:
            cpu_node_idx = (iter_n / 10) % worker_num
            if worker_id == cpu_node_idx and is_cpu_mode == False:
                print("switch to cpu")
                os.environ["CUDA_VISIBLE_DEVICES"] = ''
                device = 'cpu'
                sub_net = sub_net.to(device)
                sub_optimizer = optim.SGD(sub_net.parameters(), lr=args.lr,
                                          momentum=0.9, weight_decay=5e-4)
                is_cpu_mode = True
            elif (not worker_id == cpu_node_idx) and is_cpu_mode == True:
                print("switch to cuda")
                os.environ["CUDA_VISIBLE_DEVICES"] = '1'
                device = 'cuda'
                sub_net = sub_net.to(device)
                sub_optimizer = optim.SGD(sub_net.parameters(), lr=args.lr,
                                          momentum=0.9, weight_decay=5e-4)
                is_cpu_mode = False
        if iter_n == iteration_num:
            exit(0)

elif rank == 1:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.recv(tensor, 0)

dist.barrier()

if rank == 0:
    print_header("reduce")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.reduce(tensor, 0)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.reduce(tensor, 0)

dist.barrier()

if rank == 0:
    print_header("all reduce")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)