def run_epoch(self) -> float:
    logger.info("Computing reward")
    rewards = self.individual_pool.compute_all_local_rewards()
    logger.info("Pushing reward")
    # Sum the rewards across all machines
    c10d.all_reduce(rewards, self.process_group)
    # Divide the rewards by the number of machines. We do this because
    # there is no "average" all_reduce operator.
    rewards /= self.num_nodes
    self.iteration += 1
    self.individual_pool.apply_global_reward(rewards, self.iteration)
    most_recent_avg_rewards = float(torch.mean(rewards))
    new_parent_reward = self.individual_pool.compute_local_reward(
        self.individual_pool.parent_tensors
    )
    logger.info(
        "ITERATION: {0} MEAN REWARD: {1}, NEW PARENT REWARD: {2}".format(
            self.iteration, most_recent_avg_rewards, new_parent_reward
        )
    )
    return new_parent_reward
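
# A minimal, self-contained sketch of the sum-then-divide averaging pattern
# used in run_epoch above, written against the public torch.distributed API
# rather than the surrounding class. The process-group setup and the helper
# name average_across_ranks are assumptions for illustration only.
import torch
import torch.distributed as dist

def average_across_ranks(values: torch.Tensor) -> torch.Tensor:
    # all_reduce sums in place across all ranks; dividing by the world size
    # afterwards yields the mean, since the classic all_reduce API exposes
    # no built-in "average" reduction.
    dist.all_reduce(values, op=dist.ReduceOp.SUM)
    values /= dist.get_world_size()
    return values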
def _test_all_reduce_helper(
    self,
    group,
    group_id,
    rank,
    op,
    master_value,
    worker_value,
    expected_value,
    cuda=False,
    rank_to_GPU=None,
):
    for src in group:
        if rank == src:
            # The source rank contributes master_value; every other rank
            # contributes worker_value. After all_reduce, all ranks must
            # hold the same expected_value.
            tensor = _build_tensor(src + 1).fill_(master_value)
            if cuda:
                tensor = tensor.cuda(rank_to_GPU[rank][0])
            dist.all_reduce(tensor, op, group_id)
            self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
        else:
            tensor = _build_tensor(src + 1).fill_(worker_value)
            if cuda:
                tensor = tensor.cuda(rank_to_GPU[rank][0])
            dist.all_reduce(tensor, op, group_id)
            self.assertEqual(tensor, _build_tensor(src + 1, expected_value))

    self._barrier()
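
# _build_tensor is not defined in this excerpt. A plausible definition,
# consistent with how the helper calls it (size-only, and size plus an
# explicit fill value), is a cubic float tensor filled with the given value;
# treat this as an assumption, not the test suite's actual helper.
def _build_tensor(size, value=None):
    if value is None:
        value = size
    return torch.empty(size, size, size).fill_(value)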
def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        # Group gradients by tensor type so each group can be flattened
        # into a single contiguous buffer.
        buckets = defaultdict(list)
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                buckets[tp].append(param)

        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            # One all_reduce per bucket on the flattened gradients, then
            # average by the world size and copy the results back in place.
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(
                    grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
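
# A small, standalone illustration of the coalescing round trip used above:
# _flatten_dense_tensors packs same-typed gradients into one contiguous
# buffer (so a bucket needs only a single all_reduce), and
# _unflatten_dense_tensors recovers views with the original shapes so the
# results can be copied back. Runs without a process group; the scaling by
# 0.5 is a stand-in for the all_reduce-plus-divide step.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

grads = [torch.ones(2, 3), torch.ones(5)]
flat = _flatten_dense_tensors(grads)      # single 1-D buffer of length 11
flat *= 0.5                               # stand-in for all_reduce + divide
for buf, synced in zip(grads, _unflatten_dense_tensors(flat, grads)):
    buf.copy_(synced)                     # write averaged values back in place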
def _queue_reduction(self, bucket_idx):
    grads_batch = self.buckets[bucket_idx]
    grads_batch_coalesced = []

    # coalesce the bucket
    for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch):
        with torch.cuda.device(dev_id):
            dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
            grads_batch_coalesced.append(dev_grads_batch_coalesced)

    # reduce to the first GPU in self.device_ids
    if len(self.device_ids) > 1:
        nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams)

    # now work on the first gpu
    reduction_work = c10d.all_reduce(grads_batch_coalesced[0], self.process_group)
    self.reduction_works[bucket_idx] = reduction_work
    self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0]
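
# Queuing a reduction stores an asynchronous work handle that is completed
# later, outside this method. A minimal sketch of that queue-then-wait
# pattern with the public torch.distributed API, assuming async_op=True
# returns a Work object; queue_then_wait is a made-up name for illustration.
import torch
import torch.distributed as dist

def queue_then_wait(bucket: torch.Tensor, group=None) -> torch.Tensor:
    work = dist.all_reduce(bucket, op=dist.ReduceOp.SUM, group=group, async_op=True)
    # ...other buckets could be queued here to overlap communication...
    work.wait()  # block until this bucket's all_reduce has finished
    return bucket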