Example #1
    def _get_loss_inputs_dict(self, batch, shuffle):
        """Return a feed dict from a batch.

        Arguments:
            batch (SampleBatch): batch of data to derive inputs from
            shuffle (bool): whether to shuffle batch sequences. Shuffle may
                be done in-place. This only makes sense if you're further
                applying minibatch SGD after getting the outputs.

        Returns:
            feed dict of data
        """

        # Get batch ready for RNNs, if applicable.
        pad_batch_to_sequences_of_same_size(
            batch,
            shuffle=shuffle,
            max_seq_len=self._max_seq_len,
            batch_divisibility_req=self._batch_divisibility_req,
            feature_keys=[k for k, v in self._loss_inputs])

        # Build the feed dict from the batch.
        feed_dict = {}
        for k, ph in self._loss_inputs:
            feed_dict[ph] = batch[k]

        state_keys = [
            "state_in_{}".format(i) for i in range(len(self._state_inputs))
        ]
        for k in state_keys:
            feed_dict[self._loss_input_dict[k]] = batch[k]
        if state_keys:
            feed_dict[self._seq_lens] = batch["seq_lens"]

        return feed_dict
Example #2
    def test_pad_batch_fixed_max(self):
        """Test pad_batch_to_sequences_of_same_size when dynamic_max = False"""
        view_requirements = {
            "state_in_0":
            ViewRequirement(
                "state_out_0",
                shift="-3:-1",
                used_for_training=False,
                used_for_compute_actions=True,
                batch_repeat_value=1,
            )
        }
        max_seq_len = 20
        num_seqs = np.random.randint(1, 20)
        seq_lens = np.random.randint(1, max_seq_len, size=(num_seqs,))
        sum_seq_lens = np.sum(seq_lens)
        s1 = SampleBatch(
            {
                "a": np.arange(sum_seq_lens),
                "b": np.arange(sum_seq_lens),
                "seq_lens": seq_lens,
                "state_in_0": [[0]] * num_seqs,
            },
            _max_seq_len=max_seq_len,
        )

        pad_batch_to_sequences_of_same_size(
            s1,
            max_seq_len=max_seq_len,
            feature_keys=["a", "b"],
            view_requirements=view_requirements,
        )
        check(s1.max_seq_len, max_seq_len)
        check(s1["a"].shape[0], max_seq_len * num_seqs)
        check(s1["b"].shape[0], max_seq_len * num_seqs)
Example #3
    def learn_on_batch(self, postprocessed_batch):
        # Callback handling.
        learn_stats = {}
        self.callbacks.on_learn_on_batch(
            policy=self, train_batch=postprocessed_batch, result=learn_stats
        )

        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self._max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )

        self._is_training = True
        postprocessed_batch = self._lazy_tensor_dict(postprocessed_batch)
        postprocessed_batch.set_training(True)
        stats = self._learn_on_batch_helper(postprocessed_batch)
        stats.update(
            {
                "custom_metrics": learn_stats,
                NUM_AGENT_STEPS_TRAINED: postprocessed_batch.count,
            }
        )
        return convert_to_numpy(stats)
Example #4
    def learn_on_batch(self, postprocessed_batch):
        # Set Model to train mode.
        if self.model:
            self.model.train()

        for k, v in postprocessed_batch.items():
            if k.startswith('state_in'):
                # Assume all trajectories have the same length.
                postprocessed_batch[k] = np.tile(
                    v, (postprocessed_batch.count // v.shape[0], 1))
        postprocessed_batch.seq_lens = None  # drop seq_lens so .copy() can be used

        c = 0
        for ep in range(self.ppo_epochs):
            for mb in minibatches(postprocessed_batch, self.minibatch_size):
                c += 1
                # pad batch for rnn
                pad_batch_to_sequences_of_same_size(
                    mb,
                    max_seq_len=self.max_seq_len,
                    shuffle=False,
                    batch_divisibility_req=self.batch_divisibility_req,
                    view_requirements=self.view_requirements,
                )
                mb["is_training"] = True
                # minibatch = mb.copy()
                mb['advantages'] = standardize(mb['advantages'])
                minibatch = self._lazy_tensor_dict(mb)
                # compute the loss
                loss = ppo_surrogate_loss(self, self.model, self.dist_class,
                                          minibatch)
                # compute gradient
                self.optimizer.zero_grad()
                loss.backward()
                # grad norm
                # apply_grad_clipping(self, self.optimizer, loss)
                if self.config['grad_clip']:
                    grad_norm = nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.config['grad_clip'])
                self.optimizer.step()
        # TODO: move stats computation into the inner loop and use an average.

        stats = kl_and_loss_stats(self, postprocessed_batch)
        return {LEARNER_STATS_KEY: stats}
Example #5
    def compute_gradients(self,
                          postprocessed_batch: SampleBatch) -> ModelGradients:

        assert len(self.devices) == 1

        # If not done yet, see whether we have to zero-pad this batch.
        if not postprocessed_batch.zero_padded:
            pad_batch_to_sequences_of_same_size(
                batch=postprocessed_batch,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )

        postprocessed_batch.set_training(True)
        self._lazy_tensor_dict(postprocessed_batch, device=self.devices[0])

        # Do the (maybe parallelized) gradient calculation step.
        tower_outputs = self._multi_gpu_parallel_grad_calc(
            [postprocessed_batch])

        all_grads, grad_info = tower_outputs[0]

        grad_info["allreduce_latency"] /= len(self._optimizers)
        grad_info.update(self.stats_fn(postprocessed_batch))

        fetches = self.extra_compute_grad_fetches()

        return all_grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info})
Example #6
        def learn_on_batch(self, postprocessed_batch):
            # Callback handling.
            learn_stats = {}
            self.callbacks.on_learn_on_batch(
                policy=self,
                train_batch=postprocessed_batch,
                result=learn_stats)

            if not isinstance(postprocessed_batch, SampleBatch) or \
                    not postprocessed_batch.zero_padded:
                pad_batch_to_sequences_of_same_size(
                    postprocessed_batch,
                    max_seq_len=self._max_seq_len,
                    shuffle=False,
                    batch_divisibility_req=self.batch_divisibility_req,
                    view_requirements=self.view_requirements,
                )
            else:
                postprocessed_batch["seq_lens"] = postprocessed_batch.seq_lens

            self._is_training = True
            postprocessed_batch["is_training"] = True
            stats = self._learn_on_batch_eager(postprocessed_batch)
            stats.update({"custom_metrics": learn_stats})
            return stats
Example #7
    def compute_gradients(self, samples):
        # Get batch ready for RNNs, if applicable.
        pad_batch_to_sequences_of_same_size(
            samples,
            shuffle=False,
            max_seq_len=self._max_seq_len,
            batch_divisibility_req=self.batch_divisibility_req)
        return self._compute_gradients_eager(samples)
Example #8
    def learn_on_batch(
            self, postprocessed_batch: SampleBatch) -> Dict[str, TensorType]:
        # Get batch ready for RNNs, if applicable.
        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req)

        train_batch = self._lazy_tensor_dict(postprocessed_batch)
        loss_out = force_list(
            self._loss(self, self.model, self.dist_class, train_batch))
        # Call Model's custom-loss with Policy loss outputs and train_batch.
        if self.model:
            loss_out = self.model.custom_loss(loss_out, train_batch)
        assert len(loss_out) == len(self._optimizers)
        # assert not any(torch.isnan(l) for l in loss_out)
        fetches = self.extra_compute_grad_fetches()

        # Loop through all optimizers.
        grad_info = {"allreduce_latency": 0.0}
        for i, opt in enumerate(self._optimizers):
            # Erase gradients in all vars of this optimizer.
            opt.zero_grad()
            # Recompute gradients of loss over all variables.
            loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1))
            grad_info.update(self.extra_grad_process(opt, loss_out[i]))

            if self.distributed_world_size:
                grads = []
                for param_group in opt.param_groups:
                    for p in param_group["params"]:
                        if p.grad is not None:
                            grads.append(p.grad)

                start = time.time()
                if torch.cuda.is_available():
                    # Sadly, allreduce_coalesced does not work with CUDA yet.
                    for g in grads:
                        torch.distributed.all_reduce(
                            g, op=torch.distributed.ReduceOp.SUM)
                else:
                    torch.distributed.all_reduce_coalesced(
                        grads, op=torch.distributed.ReduceOp.SUM)

                for param_group in opt.param_groups:
                    for p in param_group["params"]:
                        if p.grad is not None:
                            p.grad /= self.distributed_world_size

                grad_info["allreduce_latency"] += time.time() - start

            # Step the optimizer.
            opt.step()

        grad_info["allreduce_latency"] /= len(self._optimizers)
        grad_info.update(self.extra_grad_info(train_batch))
        return dict(fetches, **{LEARNER_STATS_KEY: grad_info})
Example #9
        def compute_gradients(self, samples):
            pad_batch_to_sequences_of_same_size(
                samples,
                shuffle=False,
                max_seq_len=self._max_seq_len,
                batch_divisibility_req=self.batch_divisibility_req)

            self._is_training = True
            samples["is_training"] = True
            return self._compute_gradients_eager(samples)
Example #10
        def learn_on_batch(self, postprocessed_batch):
            # Callback handling.
            self.callbacks.on_learn_on_batch(policy=self,
                                             train_batch=postprocessed_batch)

            # Get batch ready for RNNs, if applicable.
            pad_batch_to_sequences_of_same_size(
                postprocessed_batch,
                shuffle=False,
                max_seq_len=self._max_seq_len,
                batch_divisibility_req=self.batch_divisibility_req)
            return self._learn_on_batch_eager(postprocessed_batch)
Example #11
    def load_batch_into_buffer(
        self,
        batch: SampleBatch,
        buffer_index: int = 0,
    ) -> int:
        # Set the is_training flag of the batch.
        batch.set_training(True)

        # Shortcut for 1 CPU only: Store batch in `self._loaded_batches`.
        if len(self.devices) == 1 and self.devices[0].type == "cpu":
            assert buffer_index == 0
            pad_batch_to_sequences_of_same_size(
                batch=batch,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )
            self._lazy_tensor_dict(batch)
            self._loaded_batches[0] = [batch]
            return len(batch)

        # Batch (len=28, seq-lens=[4, 7, 4, 10, 3]):
        # 0123 0123456 0123 0123456789ABC

        # 1) split into n per-GPU sub batches (n=2).
        # [0123 0123456] [012] [3 0123456789 ABC]
        # (len=14, 14 seq-lens=[4, 7, 3] [1, 10, 3])
        slices = batch.timeslices(num_slices=len(self.devices))

        # 2) zero-padding (max-seq-len=10).
        # - [0123000000 0123456000 0120000000]
        # - [3000000000 0123456789 ABC0000000]
        for slice in slices:
            pad_batch_to_sequences_of_same_size(
                batch=slice,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )

        # 3) Load splits into the given buffer (consisting of n GPUs).
        slices = [
            slice.to_device(self.devices[i]) for i, slice in enumerate(slices)
        ]
        self._loaded_batches[buffer_index] = slices

        # Return loaded samples per-device.
        return len(slices[0])
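
The step-1) splitting in the comments above can be reproduced in isolation. A rough sketch, assuming SampleBatch.timeslices accepts num_slices in this RLlib version and that the batch carries seq_lens:

import numpy as np

from ray.rllib.policy.sample_batch import SampleBatch

batch = SampleBatch({
    "obs": np.arange(28),
    "seq_lens": np.array([4, 7, 4, 10, 3]),
})
slices = batch.timeslices(num_slices=2)
# Per the comments above, this yields two time-contiguous slices of length 14,
# with seq_lens [4, 7, 3] and [1, 10, 3] (one sequence is split at the border).
print([len(s) for s in slices])
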
Example #12
    def _get_loss_inputs_dict(self, train_batch: SampleBatch, shuffle: bool):
        """Return a feed dict from a batch.

        Args:
            train_batch: batch of data to derive inputs from.
            shuffle: whether to shuffle batch sequences. Shuffle may
                be done in-place. This only makes sense if you're further
                applying minibatch SGD after getting the outputs.

        Returns:
            Feed dict of data.
        """

        # Get batch ready for RNNs, if applicable.
        if not isinstance(train_batch,
                          SampleBatch) or not train_batch.zero_padded:
            pad_batch_to_sequences_of_same_size(
                train_batch,
                max_seq_len=self._max_seq_len,
                shuffle=shuffle,
                batch_divisibility_req=self._batch_divisibility_req,
                feature_keys=list(self._loss_input_dict_no_rnn.keys()),
                view_requirements=self.view_requirements,
            )

        # Mark the batch as "is_training" so the Model can use this
        # information.
        train_batch.set_training(True)

        # Build the feed dict from the batch.
        feed_dict = {}
        for key, placeholders in self._loss_input_dict.items():
            a = tree.map_structure(
                lambda ph, v: feed_dict.__setitem__(ph, v),
                placeholders,
                train_batch[key],
            )
            del a

        state_keys = [
            "state_in_{}".format(i) for i in range(len(self._state_inputs))
        ]
        for key in state_keys:
            feed_dict[self._loss_input_dict[key]] = train_batch[key]
        if state_keys:
            feed_dict[self._seq_lens] = train_batch[SampleBatch.SEQ_LENS]

        return feed_dict
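
The feed-dict construction above leans on dm-tree to handle (possibly nested) placeholder structures. A minimal, standalone illustration of that trick, with plain strings standing in for TF placeholders (all names below are illustrative only):

import tree  # dm-tree

feed_dict = {}
placeholders = {"obs": "obs_ph", "actions": "actions_ph"}
values = {"obs": [1, 2, 3], "actions": [0, 1, 1]}

# map_structure visits matching leaves of both structures; the lambda's return
# value is discarded, only the side effect of filling feed_dict matters.
tree.map_structure(
    lambda ph, v: feed_dict.__setitem__(ph, v), placeholders, values)
print(feed_dict["obs_ph"])  # [1, 2, 3]
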
Example #13
        def learn_on_batch(self, postprocessed_batch):
            # Callback handling.
            self.callbacks.on_learn_on_batch(policy=self,
                                             train_batch=postprocessed_batch)

            pad_batch_to_sequences_of_same_size(
                postprocessed_batch,
                shuffle=False,
                max_seq_len=self._max_seq_len,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )

            self._is_training = True
            postprocessed_batch["is_training"] = True
            return self._learn_on_batch_eager(postprocessed_batch)
Example #14
        def compute_gradients(self, postprocessed_batch: SampleBatch) -> \
                Tuple[ModelGradients, Dict[str, TensorType]]:

            pad_batch_to_sequences_of_same_size(
                postprocessed_batch,
                shuffle=False,
                max_seq_len=self._max_seq_len,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )

            self._is_training = True
            self._lazy_tensor_dict(postprocessed_batch)
            postprocessed_batch.set_training(True)
            grads_and_vars, grads, stats = self._compute_gradients_helper(
                postprocessed_batch)
            return convert_to_numpy((grads, stats))
Example #15
    def learn_on_batch(self, postprocessed_batch):
        # Get batch ready for RNNs, if applicable.
        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req)

        train_batch = self._lazy_tensor_dict(postprocessed_batch)
        loss_out = self._loss(self, self.model, self.dist_class, train_batch)
        self._optimizer.zero_grad()
        loss_out.backward()

        info = {}
        info.update(self.extra_grad_process())

        if self.distributed_world_size:
            grads = []
            for p in self.model.parameters():
                if p.grad is not None:
                    grads.append(p.grad)
            start = time.time()
            if torch.cuda.is_available():
                # Sadly, allreduce_coalesced does not work with CUDA yet.
                for g in grads:
                    torch.distributed.all_reduce(
                        g, op=torch.distributed.ReduceOp.SUM)
            else:
                torch.distributed.all_reduce_coalesced(
                    grads, op=torch.distributed.ReduceOp.SUM)
            for p in self.model.parameters():
                if p.grad is not None:
                    p.grad /= self.distributed_world_size
            info["allreduce_latency"] = time.time() - start

        self._optimizer.step()

        info.update(self.extra_grad_info(train_batch))
        return {
            LEARNER_STATS_KEY: info
        }
Example #16
    def _get_loss_inputs_dict(self, train_batch, shuffle):
        """Return a feed dict from a batch.

        Args:
            train_batch (SampleBatch): batch of data to derive inputs from.
            shuffle (bool): whether to shuffle batch sequences. Shuffle may
                be done in-place. This only makes sense if you're further
                applying minibatch SGD after getting the outputs.

        Returns:
            feed dict of data
        """

        # Get batch ready for RNNs, if applicable.
        pad_batch_to_sequences_of_same_size(
            train_batch,
            shuffle=shuffle,
            max_seq_len=self._max_seq_len,
            batch_divisibility_req=self._batch_divisibility_req,
            feature_keys=list(self._loss_input_dict_no_rnn.keys()),
            view_requirements=self.view_requirements,
        )

        # Mark the batch as "is_training" so the Model can use this
        # information.
        train_batch["is_training"] = True

        # Build the feed dict from the batch.
        feed_dict = {}
        for key, placeholder in self._loss_input_dict.items():
            feed_dict[placeholder] = train_batch[key]

        state_keys = [
            "state_in_{}".format(i) for i in range(len(self._state_inputs))
        ]
        for key in state_keys:
            feed_dict[self._loss_input_dict[key]] = train_batch[key]
        if state_keys:
            feed_dict[self._seq_lens] = train_batch["seq_lens"]

        return feed_dict
Example #17
def do_minibatch_sgd(samples, policies, local_worker, num_sgd_iter,
                     sgd_minibatch_size, standardize_fields):
    """Execute minibatch SGD.

    Arguments:
        samples (SampleBatch): batch of samples to optimize.
        policies (dict): dictionary of policies to optimize.
        local_worker (RolloutWorker): master rollout worker instance.
        num_sgd_iter (int): number of epochs of optimization to take.
        sgd_minibatch_size (int): size of minibatches to use for optimization.
        standardize_fields (list): list of sample field names that should be
            normalized prior to optimization.

    Returns:
        averaged info fetches over the last SGD epoch taken.
    """
    # Module-level state: nepochs counts optimization passes, seg_buf buffers
    # batches for the auxiliary (replay) training phase.
    global nepochs
    global seg_buf
    if isinstance(samples, SampleBatch):
        samples = MultiAgentBatch({DEFAULT_POLICY_ID: samples}, samples.count)

    fetches = {}
    for policy_id, policy in policies.items():
        model = policy.model

        dist_class = policy.dist_class

        if policy_id not in samples.policy_batches:
            continue

        batch = samples.policy_batches[policy_id]
        for field in standardize_fields:
            batch[field] = standardized(batch[field])

        seg_buf.append(batch)

        for i in range(num_sgd_iter):
            iter_extra_fetches = defaultdict(list)
            # Pass the whole batch to the worker and let it break it down into
            # minibatches, so that policy and value function training can be
            # handled separately.
            batch_fetches = (local_worker.learn_on_batch(
                MultiAgentBatch({policy_id: batch}, batch.count)))[policy_id]

            for k, v in batch_fetches.get(LEARNER_STATS_KEY, {}).items():
                iter_extra_fetches[k].append(v)

            logger.debug("{} {}".format(i, averaged(iter_extra_fetches)))

        fetches[policy_id] = averaged(iter_extra_fetches)

        nepochs += 1
        if nepochs % 16 == 0:

            def forward(seg):
                logits, state = model.from_batch(seg)
                return logits, state

            REPLAY_MB_SIZE = 512
            # Compute the probability distributions on the replay buffer
            # before replay-buffer training.
            for seg in seg_buf:
                np_data = {}
                np_data["obs"] = th.from_numpy(seg.data["obs"]).to(
                    th.cuda.current_device())
                logits, state = tu.minibatched_call(forward,
                                                    REPLAY_MB_SIZE,
                                                    seg=np_data)
                seg.data["oldpd"] = logits.cpu().numpy()

            replay_batch = SampleBatch.concat_samples(seg_buf)
            # Train on the replay buffer.
            for i in range(3):
                for mb in minibatches(replay_batch, REPLAY_MB_SIZE):
                    pad_batch_to_sequences_of_same_size(
                        mb,
                        max_seq_len=20,
                        shuffle=False,
                        batch_divisibility_req=1)
                    mb["obs"] = th.from_numpy(mb["obs"]).to(
                        th.cuda.current_device())

                    logits, vpredaux = model.forward_aux(mb)

                    oldpd = dist_class(
                        th.from_numpy(mb['oldpd']).to(
                            th.cuda.current_device()))
                    pd = dist_class(logits, model)
                    pol_distance = oldpd.kl(pd).mean()

                    vpredtrue = model.value_function()
                    vtarg = th.from_numpy(mb[Postprocessing.VALUE_TARGETS]).to(
                        th.cuda.current_device())
                    vf_aux = 0.5 * th.mean(th.pow(vpredaux - vtarg, 2.0))
                    vf_true = 0.5 * th.mean(th.pow(vpredtrue - vtarg, 2.0))
                    loss = pol_distance + vf_aux + vf_true

                    policy.aux_learn(loss)
            seg_buf.clear()
    return fetches
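
Two helpers used above, standardized and minibatches, can be exercised on their own. A small sketch, assuming both live in ray.rllib.utils.sgd in this RLlib version (exact chunk ordering may vary):

import numpy as np

from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.sgd import minibatches, standardized

# standardized() z-scores an array (used above on the fields to standardize).
advantages = np.array([1.0, 2.0, 3.0, 4.0])
print(standardized(advantages))  # zero mean, roughly unit std

# minibatches() yields SampleBatch chunks of the requested size.
samples = SampleBatch({"obs": np.arange(10), "advantages": np.ones(10)})
for mb in minibatches(samples, 4):
    print(mb.count)  # 4, 4 and 2 rows (order of the chunks may vary)
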
Example #18
    def learn_on_batch(self, postprocessed_batch):
        # Set Model to train mode.
        if self.model:
            self.model.train()
        # Accumulate per-partner returns and trajectories.
        rew = defaultdict(float)
        traj = {}
        for k, v in postprocessed_batch.items():
            if k.startswith('state_in'):
                # Assume all trajectories have the same length.
                postprocessed_batch[k] = np.tile(
                    v, (postprocessed_batch.count // v.shape[0], 1))
        postprocessed_batch.seq_lens = None  # drop seq_lens so split_by_episode() works
        # NOTE: Iterating postprocessed_batch.rows() to group rows by partner
        # is very slow; split_by_episode() is used below instead.

        for ep in postprocessed_batch.split_by_episode():
            # Assume a fixed set of partners within one episode.
            partner_id = tuple(ep['partner_id'][0])
            rew[partner_id] += sum(ep['rewards'])
            if partner_id not in traj:
                traj[partner_id] = ep
            else:
                traj[partner_id] = traj[partner_id].concat(ep)

        # Train on the trajectory of the partner with the lowest return.
        rew_list = sorted(rew.items(), key=lambda x: x[1])
        lowest_rew_partner = rew_list[0][0]
        train_traj = traj[lowest_rew_partner]
        stats = {'timesteps_used': train_traj.count}
        c = 0
        for ep in range(self.ppo_epochs):
            # batch.shuffle()
            for mb in minibatches(train_traj, self.minibatch_size):
                c += 1
                pad_batch_to_sequences_of_same_size(
                    mb,
                    max_seq_len=self.max_seq_len,
                    shuffle=False,
                    batch_divisibility_req=self.batch_divisibility_req,
                    view_requirements=self.view_requirements,
                )
                mb["is_training"] = True
                mb['advantages'] = standardize(mb['advantages'])
                minibatch = self._lazy_tensor_dict(mb)
                # compute the loss
                loss = ppo_surrogate_loss(self, self.model, self.dist_class,
                                          minibatch)
                # compute gradient
                self.optimizer.zero_grad()
                loss.backward()
                # grad norm
                # apply_grad_clipping(self, self.optimizer, loss)
                if self.config['grad_clip']:
                    grad_norm = nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.config['grad_clip'])
                self.optimizer.step()
        # TODO: move stats computation into the inner loop and use an average.

        stats.update(kl_and_loss_stats(self, postprocessed_batch))
        return {LEARNER_STATS_KEY: stats}
Example #19
    def learn_on_batch(
            self, postprocessed_batch: SampleBatch) -> Dict[str, TensorType]:
        # Get batch ready for RNNs, if applicable.
        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            _use_trajectory_view_api=self.config["_use_trajectory_view_api"],
        )

        train_batch = self._lazy_tensor_dict(postprocessed_batch)

        # Calculate the actual policy loss.
        loss_out = force_list(
            self._loss(self, self.model, self.dist_class, train_batch))

        # Call Model's custom-loss with Policy loss outputs and train_batch.
        if self.model:
            loss_out = self.model.custom_loss(loss_out, train_batch)

        # Give Exploration component that chance to modify the loss (or add
        # its own terms).
        if hasattr(self, "exploration"):
            loss_out = self.exploration.get_exploration_loss(
                loss_out, train_batch)

        assert len(loss_out) == len(self._optimizers)

        # assert not any(torch.isnan(l) for l in loss_out)
        fetches = self.extra_compute_grad_fetches()

        # Loop through all optimizers.
        grad_info = {"allreduce_latency": 0.0}

        for i, opt in enumerate(self._optimizers):
            # Erase gradients in all vars of this optimizer.
            opt.zero_grad()
            # Recompute gradients of loss over all variables.
            loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1))
            grad_info.update(self.extra_grad_process(opt, loss_out[i]))

            if self.distributed_world_size:
                grads = []
                for param_group in opt.param_groups:
                    for p in param_group["params"]:
                        if p.grad is not None:
                            grads.append(p.grad)

                start = time.time()
                if torch.cuda.is_available():
                    # Sadly, allreduce_coalesced does not work with CUDA yet.
                    for g in grads:
                        torch.distributed.all_reduce(
                            g, op=torch.distributed.ReduceOp.SUM)
                else:
                    torch.distributed.all_reduce_coalesced(
                        grads, op=torch.distributed.ReduceOp.SUM)

                for param_group in opt.param_groups:
                    for p in param_group["params"]:
                        if p.grad is not None:
                            p.grad /= self.distributed_world_size

                grad_info["allreduce_latency"] += time.time() - start

        # Step the optimizers. This is the TPU-specific difference:
        # xm.optimizer_step() is used instead of opt.step().
        for opt in self._optimizers:
            xm.optimizer_step(opt, barrier=True)

        grad_info["allreduce_latency"] /= len(self._optimizers)
        grad_info.update(self.extra_grad_info(train_batch))
        if self.model:
            grad_info["model"] = self.model.metrics()
        return dict(fetches, **{LEARNER_STATS_KEY: grad_info})
Example #20
    def learn_on_batch(self, postprocessed_batch):
        grad_info = {"allreduce_latency": 0.0}

        def minibatches(samples, sgd_minibatch_size):
            """Return a generator yielding minibatches from a sample batch.

            Arguments:
                samples (SampleBatch): batch of samples to split up.
                sgd_minibatch_size (int): size of minibatches to return.

            Returns:
                generator that returns mini-SampleBatches of size sgd_minibatch_size.
            """
            if not sgd_minibatch_size:
                yield samples
                return

            if isinstance(samples, MultiAgentBatch):
                raise NotImplementedError(
                    "Minibatching not implemented for multi-agent in simple mode"
                )

            samples.shuffle()

            i = 0
            slices = []
            while i < samples.count:
                slices.append((i, i + sgd_minibatch_size))
                i += sgd_minibatch_size
            random.shuffle(slices)

            for i, j in slices:
                yield samples.slice(i, j)

        # Train the policy function.
        train_batch = None
        for minibatch in minibatches(postprocessed_batch, 1024):
            # Get batch ready for RNNs, if applicable.
            pad_batch_to_sequences_of_same_size(
                minibatch,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req)
            train_batch = self._lazy_tensor_dict(minibatch)
            loss_out = force_list(
                self._loss(self, self.model, self.dist_class, train_batch,
                           True))

            for i, opt in enumerate(self._optimizers):
                opt.zero_grad()
                pi_loss = loss_out[i]
                self.backprop(grad_info, opt, pi_loss, False)
                opt.step()

        # Train the value function.
        for vtrain_i in range(3):
            for minibatch in minibatches(postprocessed_batch, 1024):
                # Get batch ready for RNNs, if applicable.
                pad_batch_to_sequences_of_same_size(
                    minibatch,
                    max_seq_len=self.max_seq_len,
                    shuffle=False,
                    batch_divisibility_req=self.batch_divisibility_req)

                train_batch = self._lazy_tensor_dict(minibatch)
                loss_out = force_list(
                    self._loss(self, self.model, self.dist_class, train_batch,
                               False))

                for i, opt in enumerate(self._optimizers):
                    opt.zero_grad()
                    vf_loss = loss_out[i]
                    self.backprop(grad_info, opt, vf_loss, False)
                    opt.step()

        grad_info["allreduce_latency"] /= len(self._optimizers)
        grad_info.update(
            self.extra_grad_info(train_batch)
        )  # Is it OK to update this using only the last minibatch?
        return {LEARNER_STATS_KEY: grad_info}
Example #21
def analyze_rnn_batch(batch, max_seq_len):
    count = batch.count

    # Check prev_reward/action, next_obs consistency.
    for idx in range(count):
        # If the timestep is tracked by the batch, use it directly.
        if "t" in batch:
            ts = batch["t"][idx]
        # Else, recover the timestep from index 3 of the observation.
        else:
            ts = batch["obs"][idx][3]
        obs_t = batch["obs"][idx]
        a_t = batch["actions"][idx]
        r_t = batch["rewards"][idx]
        state_in_0 = batch["state_in_0"][idx]
        state_in_1 = batch["state_in_1"][idx]

        # Check postprocessing outputs.
        if "2xobs" in batch:
            postprocessed_col_t = batch["2xobs"][idx]
            assert (obs_t == postprocessed_col_t / 2.0).all()

        # Check state-in/out and next-obs values.
        if idx > 0:
            next_obs_t_m_1 = batch["new_obs"][idx - 1]
            state_out_0_t_m_1 = batch["state_out_0"][idx - 1]
            state_out_1_t_m_1 = batch["state_out_1"][idx - 1]
            # Same trajectory as for t-1 -> Should be able to match.
            if (batch[SampleBatch.AGENT_INDEX][idx]
                    == batch[SampleBatch.AGENT_INDEX][idx - 1]
                    and batch[SampleBatch.EPS_ID][idx]
                    == batch[SampleBatch.EPS_ID][idx - 1]):
                assert batch["unroll_id"][idx - 1] == batch["unroll_id"][idx]
                assert (obs_t == next_obs_t_m_1).all()
                assert (state_in_0 == state_out_0_t_m_1).all()
                assert (state_in_1 == state_out_1_t_m_1).all()
            # Different trajectory.
            else:
                assert batch["unroll_id"][idx - 1] != batch["unroll_id"][idx]
                assert not (obs_t == next_obs_t_m_1).all()
                assert not (state_in_0 == state_out_0_t_m_1).all()
                assert not (state_in_1 == state_out_1_t_m_1).all()
                # Check initial 0-internal states.
                if ts == 0:
                    assert (state_in_0 == 0.0).all()
                    assert (state_in_1 == 0.0).all()

        # Check initial 0-internal states (at ts=0).
        if ts == 0:
            assert (state_in_0 == 0.0).all()
            assert (state_in_1 == 0.0).all()

        # Check prev. a/r values.
        if idx < count - 1:
            prev_actions_t_p_1 = batch["prev_actions"][idx + 1]
            prev_rewards_t_p_1 = batch["prev_rewards"][idx + 1]
            # Same trajectory as for t+1 -> Should be able to match.
            if batch[SampleBatch.AGENT_INDEX][idx] == \
                    batch[SampleBatch.AGENT_INDEX][idx + 1] and \
                    batch[SampleBatch.EPS_ID][idx] == \
                    batch[SampleBatch.EPS_ID][idx + 1]:
                assert (a_t == prev_actions_t_p_1).all()
                assert r_t == prev_rewards_t_p_1
            # Different (new) trajectory. Assume t-1 (prev-a/r) to be
            # always 0.0s. [3]=ts
            elif ts == 0:
                assert (prev_actions_t_p_1 == 0).all()
                assert prev_rewards_t_p_1 == 0.0

    pad_batch_to_sequences_of_same_size(batch,
                                        max_seq_len=max_seq_len,
                                        shuffle=False,
                                        batch_divisibility_req=1)

    # Check after seq-len 0-padding.
    cursor = 0
    for i, seq_len in enumerate(batch["seq_lens"]):
        state_in_0 = batch["state_in_0"][i]
        state_in_1 = batch["state_in_1"][i]
        for j in range(seq_len):
            k = cursor + j
            ts = batch["t"][k]
            obs_t = batch["obs"][k]
            a_t = batch["actions"][k]
            r_t = batch["rewards"][k]

            # Check postprocessing outputs.
            if "2xobs" in batch:
                postprocessed_col_t = batch["2xobs"][k]
                assert (obs_t == postprocessed_col_t / 2.0).all()

            # Check state-in/out and next-obs values.
            if j > 0:
                next_obs_t_m_1 = batch["new_obs"][k - 1]
                # state_out_0_t_m_1 = batch["state_out_0"][k - 1]
                # state_out_1_t_m_1 = batch["state_out_1"][k - 1]
                # Always same trajectory as for t-1.
                assert batch["unroll_id"][k - 1] == batch["unroll_id"][k]
                assert (obs_t == next_obs_t_m_1).all()
                # assert (state_in_0 == state_out_0_t_m_1).all()
                # assert (state_in_1 == state_out_1_t_m_1).all()
            # Check initial 0-internal states.
            elif ts == 0:
                assert (state_in_0 == 0.0).all()
                assert (state_in_1 == 0.0).all()

        for j in range(seq_len, max_seq_len):
            k = cursor + j
            obs_t = batch["obs"][k]
            a_t = batch["actions"][k]
            r_t = batch["rewards"][k]
            assert (obs_t == 0.0).all()
            assert (a_t == 0.0).all()
            assert (r_t == 0.0).all()

        cursor += max_seq_len
Example #22
    def compute_gradients(self,
                          postprocessed_batch: SampleBatch) -> ModelGradients:

        if not isinstance(postprocessed_batch, SampleBatch) or \
                not postprocessed_batch.zero_padded:
            pad_batch_to_sequences_of_same_size(
                postprocessed_batch,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )

        # Mark the batch as "is_training" so the Model can use this
        # information.
        postprocessed_batch.is_training = True

        # Single device case: Use batch as-is (no slicing).
        if len(self.devices) == 1:
            batches = [self._lazy_tensor_dict(postprocessed_batch)]
        # Multi-GPU case: Slice inputs into n (roughly) equal batches.
        else:
            len_ = len(postprocessed_batch)
            batches = []
            start = 0
            for i, device in enumerate(self.devices):
                shard_len = len_ // (len(self.devices) - i)
                batch = self._lazy_tensor_dict(postprocessed_batch.slice(
                    start, start + shard_len),
                                               device=device)
                batches.append(batch)
                len_ -= shard_len
                start += shard_len

            # Copy weights of main model to all towers.
            state_dict = self.model.state_dict()
            for tower in self.model_gpu_towers:
                tower.load_state_dict(state_dict)

        # Do the (maybe parallelized) gradient calculation step.
        tower_outputs = self._multi_gpu_parallel_grad_calc(batches)

        # Multi device (GPU) case.
        if len(self.devices) > 1:
            # Mean-reduce over GPU-towers.
            all_grads = []
            for i in range(len(tower_outputs[0][0])):
                if tower_outputs[0][0][i] is not None:
                    all_grads.append(
                        torch.mean(torch.stack(
                            [t[0][i].to(self.device) for t in tower_outputs]),
                                   dim=0))
                else:
                    all_grads.append(None)
            # Set main model's grads to mean-reduced values.
            for i, p in enumerate(self.model.parameters()):
                p.grad = all_grads[i]
            # Reduce stats over towers as well.
            from ray.rllib.execution.train_ops import all_tower_reduce
            grad_info = tree.map_structure_with_path(
                lambda p, *t: all_tower_reduce(p, *t),
                *[t[1] for t in tower_outputs])
        # Single device case.
        else:
            all_grads, grad_info = tower_outputs[0]

        grad_info["allreduce_latency"] /= len(self._optimizers)
        grad_info.update(self.extra_grad_info(postprocessed_batch))

        fetches = self.extra_compute_grad_fetches()

        return all_grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info})
Example #23
    def compute_gradients(self,
                          postprocessed_batch: SampleBatch) -> ModelGradients:

        if not isinstance(postprocessed_batch, SampleBatch) or \
                not postprocessed_batch.zero_padded:
            pad_batch_to_sequences_of_same_size(
                postprocessed_batch,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )
        else:
            postprocessed_batch["seq_lens"] = postprocessed_batch.seq_lens

        # Mark the batch as "is_training" so the Model can use this
        # information.
        postprocessed_batch.is_training = True
        train_batch = self._lazy_tensor_dict(postprocessed_batch)

        # Calculate the actual policy loss.
        loss_out = force_list(
            self._loss(self, self.model, self.dist_class, train_batch))

        # Call Model's custom-loss with Policy loss outputs and train_batch.
        if self.model:
            loss_out = self.model.custom_loss(loss_out, train_batch)

        # Give Exploration component that chance to modify the loss (or add
        # its own terms).
        if hasattr(self, "exploration"):
            loss_out = self.exploration.get_exploration_loss(
                loss_out, train_batch)

        assert len(loss_out) == len(self._optimizers)

        # assert not any(torch.isnan(l) for l in loss_out)
        fetches = self.extra_compute_grad_fetches()

        # Loop through all optimizers.
        grad_info = {"allreduce_latency": 0.0}

        all_grads = []
        for i, opt in enumerate(self._optimizers):
            # Erase gradients in all vars of this optimizer.
            opt.zero_grad()
            # Recompute gradients of loss over all variables.
            loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1))
            grad_info.update(self.extra_grad_process(opt, loss_out[i]))

            grads = []
            # Note that return values are just references;
            # Calling zero_grad would modify the values.
            for param_group in opt.param_groups:
                for p in param_group["params"]:
                    if p.grad is not None:
                        grads.append(p.grad)
                        all_grads.append(p.grad.data.cpu().numpy())
                    else:
                        all_grads.append(None)

            if self.distributed_world_size:
                start = time.time()
                if torch.cuda.is_available():
                    # Sadly, allreduce_coalesced does not work with CUDA yet.
                    for g in grads:
                        torch.distributed.all_reduce(
                            g, op=torch.distributed.ReduceOp.SUM)
                else:
                    torch.distributed.all_reduce_coalesced(
                        grads, op=torch.distributed.ReduceOp.SUM)

                for param_group in opt.param_groups:
                    for p in param_group["params"]:
                        if p.grad is not None:
                            p.grad /= self.distributed_world_size

                grad_info["allreduce_latency"] += time.time() - start

        grad_info["allreduce_latency"] /= len(self._optimizers)
        grad_info.update(self.extra_grad_info(train_batch))

        return all_grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info})
Example #24
    def compute_gradients(self,
                          postprocessed_batch: SampleBatch) -> ModelGradients:

        # For multi-GPU, split the batch into n slices (n=#GPUs).
        if len(self.devices) == 1:
            batches = [postprocessed_batch]
        else:
            from ray.rllib.utils.sgd import minibatches
            batches = list(
                minibatches(postprocessed_batch,
                            len(postprocessed_batch) // len(self.devices),
                            shuffle=False))

        if not isinstance(postprocessed_batch, SampleBatch) or \
                not postprocessed_batch.zero_padded:
            for b in batches:
                pad_batch_to_sequences_of_same_size(
                    b,
                    max_seq_len=self.max_seq_len,
                    shuffle=False,
                    batch_divisibility_req=self.batch_divisibility_req,
                    view_requirements=self.view_requirements,
                )

        for b, d in zip(batches, self.devices):
            b.is_training = True
            self._lazy_tensor_dict(b, device=d)

        # Multi-GPU case: Slice inputs into n (roughly) equal batches.
        if len(self.devices) > 1:
            # Copy weights of main model to all towers.
            state_dict = self.model.state_dict()
            for tower in self.model_gpu_towers:
                tower.load_state_dict(state_dict)

        # Do the (maybe parallelized) gradient calculation step.
        tower_outputs = self._multi_gpu_parallel_grad_calc(batches)

        # Multi device (GPU) case.
        if len(self.devices) > 1:
            # Mean-reduce over GPU-towers.
            all_grads = []
            for i in range(len(tower_outputs[0][0])):
                if tower_outputs[0][0][i] is not None:
                    all_grads.append(
                        torch.mean(torch.stack(
                            [t[0][i].to(self.device) for t in tower_outputs]),
                                   dim=0))
                else:
                    all_grads.append(None)
            # Set main model's grads to mean-reduced values.
            for i, p in enumerate(self.model.parameters()):
                p.grad = all_grads[i]
            # Reduce stats over towers as well.
            from ray.rllib.execution.train_ops import all_tower_reduce
            grad_info = tree.map_structure_with_path(
                lambda p, *t: all_tower_reduce(p, *t),
                *[t[1] for t in tower_outputs])
        # Single device case.
        else:
            all_grads, grad_info = tower_outputs[0]

        grad_info["allreduce_latency"] /= len(self._optimizers)
        grad_info.update(self.extra_grad_info(postprocessed_batch))

        fetches = self.extra_compute_grad_fetches()

        return all_grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info})