Code Example #1
    def run_epoch(self) -> float:
        logger.info("Computing reward")
        rewards = self.individual_pool.compute_all_local_rewards()
        logger.info("Pushing reward")

        # Sum the rewards across all machines
        c10d.all_reduce(rewards, self.process_group)

        # Divide the rewards by the number of machines.  We do this because
        # there is no "average" all_reduce operator.
        rewards /= self.num_nodes

        self.iteration += 1
        self.individual_pool.apply_global_reward(rewards, self.iteration)
        most_recent_avg_rewards = float(torch.mean(rewards))
        new_parent_reward = self.individual_pool.compute_local_reward(
            self.individual_pool.parent_tensors
        )
        logger.info(
            "ITERATION: {0} MEAN REWARD: {1}, NEW PARENT REWARD: {2}".format(
                self.iteration, most_recent_avg_rewards, new_parent_reward
            )
        )

        return new_parent_reward
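
A note on the pattern above: all_reduce only sums (there is no built-in "average" reduction), so dividing by the number of participants afterward is the standard way to compute a mean. Below is a minimal self-contained sketch of that averaging step using the public torch.distributed API; the gloo backend, launch via torchrun, and tensor values are illustrative assumptions, not part of the original code.

    # avg_allreduce_sketch.py -- run with: torchrun --nproc_per_node=4 avg_allreduce_sketch.py
    import torch
    import torch.distributed as dist

    def main():
        dist.init_process_group("gloo")
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        # Each rank contributes its own value; all_reduce sums in place.
        rewards = torch.tensor([float(rank)])
        dist.all_reduce(rewards, op=dist.ReduceOp.SUM)

        # No "average" reduce op exists, so divide by the number of ranks.
        rewards /= world_size
        print(f"rank {rank}: mean reward = {rewards.item()}")

    if __name__ == "__main__":
        main()
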
Code Example #2
    def _test_all_reduce_helper(
        self,
        group,
        group_id,
        rank,
        op,
        master_value,
        worker_value,
        expected_value,
        cuda=False,
        rank_to_GPU=None,
    ):
        for src in group:
            # Every rank participates in the reduction: the source rank
            # contributes master_value, all other ranks worker_value.
            value = master_value if rank == src else worker_value
            tensor = _build_tensor(src + 1).fill_(value)
            if cuda:
                tensor = tensor.cuda(rank_to_GPU[rank][0])
            dist.all_reduce(tensor, op, group_id)
            # After the collective, every rank must hold the same result.
            self.assertEqual(tensor, _build_tensor(src + 1, expected_value))

        self._barrier()
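
What the helper checks, in plain terms: for each choice of src, exactly one rank fills its tensor with master_value and the remaining world_size - 1 ranks use worker_value, so after a SUM all_reduce every rank should hold master_value + (world_size - 1) * worker_value. Here is a standalone sketch of that same check outside the test harness; the gloo backend, the rank-0-as-master convention, the tensor shape, and the values are illustrative assumptions.

    # all_reduce_check_sketch.py -- run with torchrun across several processes.
    import torch
    import torch.distributed as dist

    dist.init_process_group("gloo")
    rank, world = dist.get_rank(), dist.get_world_size()

    master_value, worker_value = 2.0, 10.0
    # Rank 0 plays the "master" role here; everyone else is a "worker".
    value = master_value if rank == 0 else worker_value
    tensor = torch.full((3, 3), value)

    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

    # One master contribution plus world - 1 worker contributions.
    expected = master_value + (world - 1) * worker_value
    assert torch.equal(tensor, torch.full((3, 3), expected))
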
Code Example #3
        def allreduce_params():
            # Needs: from collections import defaultdict
            # Needs: from torch._utils import _flatten_dense_tensors, \
            #        _unflatten_dense_tensors
            if self.needs_reduction:
                self.needs_reduction = False
                # Bucket parameters by tensor type so that each bucket can
                # be flattened into one contiguous buffer.
                buckets = defaultdict(list)
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        buckets[tp].append(param)

                for bucket in buckets.values():
                    grads = [param.grad.data for param in bucket]
                    # One all_reduce per bucket instead of one per gradient.
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    # all_reduce sums, so divide to get the average.
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(
                            grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
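
The coalescing trick is worth isolating: flattening all gradients in a bucket into one contiguous buffer means a single collective call instead of one per tensor, and the unflatten step restores the original shapes afterward. A minimal single-process sketch of that flatten/unflatten round trip follows, using the same torch._utils helpers as the snippet above (note these are private APIs and may change between releases); the shapes and the divide-by-4 stand-in for averaging are illustrative.

    import torch
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

    # Two gradients of different shapes, as they might sit in one bucket.
    grads = [torch.ones(2, 3), torch.ones(5)]

    # Flatten into one contiguous buffer; one collective could now cover both.
    coalesced = _flatten_dense_tensors(grads)
    coalesced /= 4  # stand-in for the post-all_reduce averaging step

    # Unflatten back to the original shapes and copy into the live tensors.
    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
        buf.copy_(synced)

    print(grads[0])  # every element is now 0.25
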
Code Example #4
    def _queue_reduction(self, bucket_idx):
        grads_batch = self.buckets[bucket_idx]
        grads_batch_coalesced = []

        # coalesce the bucket
        for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch):
            with torch.cuda.device(dev_id):
                dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                grads_batch_coalesced.append(dev_grads_batch_coalesced)

        # reduce to the first GPU in self.device_ids
        if len(self.device_ids) > 1:
            nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams)

        # queue the asynchronous all_reduce on the first GPU's buffer and
        # keep the work handle so the reduction can finish in the background
        reduction_work = c10d.all_reduce(grads_batch_coalesced[0], self.process_group)
        self.reduction_works[bucket_idx] = reduction_work
        self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0]
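
The key design choice in this last example is that the reduction is queued, not awaited: the work handle is stored in self.reduction_works so the collective can overlap with the rest of the backward pass and be waited on later. A minimal sketch of the same async pattern with the public torch.distributed API follows; the gloo backend and tensor contents are illustrative assumptions.

    import torch
    import torch.distributed as dist

    dist.init_process_group("gloo")
    tensor = torch.ones(4) * dist.get_rank()

    # async_op=True queues the reduction and returns a work handle
    # instead of blocking, so other computation can proceed meanwhile.
    work = dist.all_reduce(tensor, op=dist.ReduceOp.SUM, async_op=True)

    # ... overlap other work here ...

    work.wait()  # block only when the reduced result is actually needed
    print(tensor)
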