Example no. 1
    def run_batch(self, train_summary=False):
        """Collect trajectories for a single batch and train (if self.train).

        Args:
          train_summary: return a Summary of the training step (losses, etc.).

        Returns:
          result: None (if not self.train) or the return value of agent.train.
        """
        last_obs = self.last_obs
        shapes = (self.n_steps, self.envs.n_envs)
        values = np.zeros(shapes, dtype=np.float32)
        rewards = np.zeros(shapes, dtype=np.float32)
        dones = np.zeros(shapes, dtype=np.float32)
        all_obs, all_actions = [], []
        mb_states = self.states  # save the initial states at the beginning of each mb for later training.

        for n in range(self.n_steps):
            actions, values[n, :], states = self.agent.step(
                last_obs, self.states)
            actions = mask_unused_argument_samples(actions)

            all_obs.append(last_obs)
            all_actions.append(actions)

            pysc2_actions = actions_to_pysc2(
                actions, size=last_obs['screen'].shape[1:3])
            obs_raw = self.envs.step(pysc2_actions)
            last_obs = self.preproc.preprocess_obs(obs_raw)
            rewards[n, :], dones[n, :] = zip(*[(t.reward, t.last())
                                               for t in obs_raw])
            self.states = states

            for t in obs_raw:
                if t.last():
                    self.cumulative_score += self._summarize_episode(t)

        next_values = self.agent.get_value(last_obs, states)

        returns, advs = compute_returns_and_advs(rewards, dones, values,
                                                 next_values, self.discount)

        actions = stack_and_flatten_actions(all_actions)
        obs = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
        returns = flatten_first_dims(returns)
        advs = flatten_first_dims(advs)

        self.last_obs = last_obs

        if self.train:
            return self.agent.train(obs,
                                    mb_states,
                                    actions,
                                    returns,
                                    advs,
                                    summary=train_summary)
        else:
            return None
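The helper compute_returns_and_advs is called above but not shown in this section. Below is a minimal sketch of n-step bootstrapped returns and advantages consistent with that call site; the name and signature come from the call, while the body is an assumption and may differ from the actual implementation in the source repository.

import numpy as np

def compute_returns_and_advs(rewards, dones, values, next_values, discount):
    """Sketch: n-step bootstrapped returns and advantages.

    rewards, dones, values have shape (n_steps, n_envs); next_values has
    shape (n_envs,) and holds the value estimate of the observation that
    follows the last collected step.
    """
    returns = np.zeros_like(rewards)
    future = next_values
    # Walk the rollout backwards; a terminal step (done == 1) cuts the bootstrap.
    for t in reversed(range(rewards.shape[0])):
        future = rewards[t] + discount * (1.0 - dones[t]) * future
        returns[t] = future
    advs = returns - values
    return returns, advs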
Example no. 2
def stack_and_flatten_actions(lst, axis=0):
    """Stack per-step (fn_id, arg_dict) action tuples and flatten the
    leading (n_steps, n_envs) dims."""
    fn_id_list, arg_dict_list = zip(*lst)
    fn_id = np.stack(fn_id_list, axis=axis)
    fn_id = flatten_first_dims(fn_id)
    arg_ids = stack_ndarray_dicts(arg_dict_list, axis=axis)
    arg_ids = flatten_first_dims_dict(arg_ids)
    return (fn_id, arg_ids)
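The stacking and flattening utilities used throughout these examples (flatten_first_dims, flatten_first_dims_dict, stack_ndarray_dicts) are not reproduced here. The sketch below captures the behaviour implied by their call sites, assuming the first two axes are (n_steps, n_envs); the actual implementations may differ in detail.

import numpy as np

def flatten_first_dims(x):
    # Merge the leading (n_steps, n_envs) axes into a single batch axis.
    return x.reshape((x.shape[0] * x.shape[1],) + x.shape[2:])

def flatten_first_dims_dict(x):
    # Apply flatten_first_dims to every array in a dict of observations.
    return {k: flatten_first_dims(v) for k, v in x.items()}

def stack_ndarray_dicts(lst, axis=0):
    # Stack a list of dicts of arrays into one dict of stacked arrays.
    return {k: np.stack([d[k] for d in lst], axis=axis) for k in lst[0]}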
Example no. 3
    def run_batch(self, train_summary):
        """Collect trajectories for a single batch and train (if self.train).

        Variant with separate manager and worker value estimates, latent
        states s, and a goal horizon of self.c steps.

        Args:
          train_summary: return a Summary of the training step (losses, etc.).

        Returns:
          result: None (if not self.train) or the return value of agent.train.
        """
        last_obs = self.last_obs
        shapes   = (self.n_steps, self.envs.n_envs)
        values   = np.zeros(np.concatenate([[2], shapes]), dtype=np.float32)  # leading dim of size 2: [manager values, worker values]
        rewards  = np.zeros(shapes, dtype=np.float32)
        dones    = np.zeros(shapes, dtype=np.float32)
        all_obs, all_actions = [], []
        mb_states = self.states  # save the initial states at the beginning of each mb for later training.
        s = np.zeros((self.n_steps, self.envs.n_envs, self.d), dtype=np.float32)
        mb_last_c_goals = np.zeros((self.n_steps, self.envs.n_envs, self.c, self.d), dtype=np.float32)
        mb_last_mo = np.zeros((self.n_steps, self.envs.n_envs, self.c, self.d), dtype=np.float32)

        for n in range(self.n_steps):
            (actions, values[:,n,:], states, s[n,:,:],
             self.last_c_goals, self.lc_manager_outputs) = self.agent.step(
                last_obs, self.states, self.last_c_goals, self.lc_manager_outputs)
            actions = mask_unused_argument_samples(actions)

            all_obs.append(last_obs)
            all_actions.append(actions)
            mb_last_c_goals[n,:,:,:] = self.last_c_goals
            mb_last_mo[n,:,:,:] = self.lc_manager_outputs
            pysc2_actions = actions_to_pysc2(actions, size=last_obs['screen'].shape[1:3])
            obs_raw  = self.envs.step(pysc2_actions)
            last_obs = self.preproc.preprocess_obs(obs_raw)
            rewards[n,:], dones[n,:] = zip(*[(t.reward,t.last()) for t in obs_raw])
            self.states = states

            for t in obs_raw:
                if t.last():
                    self.cumulative_score += self._summarize_episode(t)

        returns, returns_intr, adv_m, adv_w = compute_returns_and_advantages(
            rewards, dones, values, s, mb_last_c_goals[:,:,-1,:], self.discount, self.T, self.envs.n_envs, self.c
        )
        s_diff = compute_sdiff(s, self.c, self.T, self.envs.n_envs, self.d)
        # last_c_goals = compute_last_c_goals(goals, self.envs.n_envs, self.T, self.c, self.d)
        actions = stack_and_flatten_actions(all_actions[self.c:self.c+self.T])
        obs = stack_ndarray_dicts(all_obs)
        obs = { k:obs[k][self.c:self.c+self.T] for k in obs }
        obs = flatten_first_dims_dict(obs)
        returns = flatten_first_dims(returns)
        returns_intr = flatten_first_dims(returns_intr)
        adv_m = flatten_first_dims(adv_m)
        adv_w = flatten_first_dims(adv_w)
        s_diff = flatten_first_dims(s_diff)
        mb_last_c_goals = flatten_first_dims(mb_last_c_goals[self.c:self.c+self.T])
        prep_lc_mo = flatten_first_dims(mb_last_mo[self.c:self.c+self.T])
        self.last_obs = last_obs

        if self.train:
            return self.agent.train(
                obs,
                mb_states,
                actions,
                returns, returns_intr,
                adv_m, adv_w,
                s_diff,
                mb_last_c_goals,
                prep_lc_mo,
                summary=train_summary
            )
        else:
            return None
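Example no. 3 also relies on compute_sdiff, which is not shown. Assuming s holds one latent state per step and the rollout covers c extra lookahead steps beyond the T training steps (n_steps >= 2*c + T), a plausible sketch of the state difference s_{t+c} - s_t used for the manager is given below; it is an illustration, not the source implementation.

import numpy as np

def compute_sdiff(s, c, T, n_envs, d):
    # Hypothetical sketch: difference between latent states c steps apart,
    # for every training step t in [c, c + T).
    s_diff = np.zeros((T, n_envs, d), dtype=np.float32)
    for i, t in enumerate(range(c, c + T)):
        s_diff[i] = s[t + c] - s[t]
    return s_diff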
Example no. 4
    def run_batch(self, train_summary=False):
        """Collect trajectories for a single batch and train (if self.train).

        Args:
          train_summary: return a Summary of the training step (losses, etc.).

        Returns:
          result: None (if not self.train) or the return value of agent.train.
        """
        nbatch = self.envs.n_envs*self.n_steps
        assert nbatch % self.nminibatches == 0
        nbatch_train = nbatch // self.nminibatches
        last_obs = self.last_obs
        shapes   = (self.n_steps, self.envs.n_envs)
        values   = np.zeros(shapes, dtype=np.float32)
        rewards  = np.zeros(shapes, dtype=np.float32)
        dones    = np.zeros(shapes, dtype=np.float32)
        all_actions, all_obs = [], []
        mb_states = self.states # save the initial states at the beginning of each mb for later training.

        for n in range(self.n_steps):
            # TODO: it would be better to get the log-probs here instead of having to call get_log_probs later.
            actions, values[n,:], states = self.agent.step(last_obs, self.states)
            actions = mask_unused_argument_samples(actions)

            all_obs.append(last_obs)
            all_actions.append(actions)

            pysc2_actions = actions_to_pysc2(actions, size=last_obs['screen'].shape[1:3])
            obs_raw  = self.envs.step(pysc2_actions)
            last_obs = self.preproc.preprocess_obs(obs_raw)
            rewards[n,:], dones[n,:] = zip(*[(t.reward,t.last()) for t in obs_raw])
            self.states = states

            for t in obs_raw:
                if t.last():
                    self.cumulative_score += self._summarize_episode(t)

        next_values = self.agent.get_value(last_obs, states)

        returns, advs = compute_returns_and_advs(rewards, dones, values, next_values, self.discount)

        actions = stack_and_flatten_actions(all_actions)
        obs     = flatten_first_dims_dict(stack_ndarray_dicts(all_obs))
        returns = flatten_first_dims(returns)
        advs    = flatten_first_dims(advs)
        values  = flatten_first_dims(values)
        
        self.last_obs = last_obs

        if self.train:
            mbloss = []
            old_log_probs = self.agent.get_log_probs(obs, self.states, actions)
            if self.states is None:
                inds = np.arange(nbatch)
                # PPO-style optimization: several epochs over shuffled
                # minibatches of the collected batch.
                for i in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, nbatch, nbatch_train):
                        end = start + nbatch_train
                        mbinds = inds[start:end]
                        mb_obs = {k: obs[k][mbinds] for k in obs}
                        mb_actions = (
                            actions[0][mbinds],
                            {arg_id: actions[1][arg_id][mbinds] for arg_id in actions[1]}
                        )
                        mbinputs = [a[mbinds] for a in (returns, advs, old_log_probs, values)]
                        _step, _loss, _summary = self.agent.train(
                            mb_obs, mb_states, mb_actions, *mbinputs, summary=train_summary)
                        mbloss.append(_loss)  # record the loss of every minibatch update
                return _step, np.mean(mbloss), _summary
            else:
                raise NotImplementedError('No recurrent policy for PPO yet.')
        else:
            return None
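For context, a run_batch variant like Example no. 4 is typically driven by a simple outer loop. The sketch below is only an illustration: the runner construction and the parameters n_updates and summary_every are assumptions, while the unpacking matches the (step, mean loss, summary) tuple returned above.

def train_loop(runner, n_updates, summary_every=100):
    # Hypothetical driver loop; names other than run_batch are assumptions.
    for update in range(n_updates):
        want_summary = (update % summary_every == 0)
        result = runner.run_batch(train_summary=want_summary)
        if result is not None and want_summary:
            step, mean_loss, summary = result
            print('update %d: step %d, mean loss %.4f' % (update, step, mean_loss))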