Code example #1
    def store_observation(self, obs):
        """Store a single observation in the buffer at the next available index.

        Overwrites old observations if necessary.
        Parameters
        ----------
        obs: nest of np.array

        Returns
        -------
        idx: int
            Index at which the obs is stored. To be used for `store_effect`
            later.

        """
        if self.obs is None:
            self.obs = nest.map_structure(self._init_obs_data, obs)

        def _store_ob(item):
            buffer, ob = item
            buffer[self.next_idx] = ob

        nest.map_structure(_store_ob, nest.zip_structure(self.obs, obs))

        ret = self.next_idx
        self.next_idx = (self.next_idx + 1) % self.size
        self.num_in_buffer = min(self.size, self.num_in_buffer + 1)

        return ret
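All of these examples lean on the `nest` utilities from the `dl` package: `map_structure` applies a function to every leaf of a nested structure, and `zip_structure` pairs up the leaves of structures that share a layout. A minimal sketch of the storage pattern used in `store_observation` above (the import path is an assumption; adjust it to wherever `nest` lives in your checkout):

import numpy as np
from dl import nest  # assumed import path

obs = {'pixels': np.zeros((84, 84)), 'state': np.zeros(10)}
# Allocate one storage array per leaf, with a leading buffer dimension.
buffers = nest.map_structure(lambda x: np.zeros((100,) + x.shape), obs)

def _store(item):
    buf, ob = item  # zip_structure leaves are (buffer_leaf, obs_leaf) pairs
    buf[0] = ob

nest.map_structure(_store, nest.zip_structure(buffers, obs))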
Code example #2
    def init_rollout_storage(self):
        """Initialize rollout storage."""
        def _to_torch(o):
            return torch.from_numpy(o).to(self.device)

        self._ob = nest.map_structure(_to_torch, self.env.reset())
        data = self.act(self._ob)
        if 'action' not in data:
            raise ValueError('the key "action" must be in the dict returned '
                             'by act_fn')
        if 'value' not in data:
            raise ValueError('the key "value" must be in the dict returned '
                             'by act_fn')
        state = None
        if 'state' in data:
            state = data['state']

        if state is None:
            self.init_state = None
            self.recurrent = False
        else:
            self.recurrent = True

            def _init_state(s):
                return torch.zeros(size=s.shape,
                                   device=self.device,
                                   dtype=s.dtype)

            self.init_state = nest.map_structure(_init_state, state)
            self._state = self.init_state

        self._initialized = True
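For reference, a hedged sketch of the contract `init_rollout_storage` expects from `self.act`: a dict containing 'action' and 'value', plus 'state' for recurrent models. The policy and value networks here are illustrative assumptions; example #24 below follows this shape:

    def act(self, ob, state_in=None):
        outs = self.pi(ob, state_in)         # assumed policy network
        data = {'action': outs.action,
                'value': self.vf(ob).value}  # assumed value network
        if outs.state_out is not None:
            data['state'] = outs.state_out   # enables the recurrent branch
        return data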
Code example #3
File: buffer.py Project: tienhoangvan/dl
    def store_effect(self, idx, step_data):
        """Store effects of action taken after obeserving obs stored at idx.

        The reason `store_observation` and `store_effect` is broken
        up into two functions is so that one can call
        `encode_recent_observation` in between.
        Paramters
        ---------
        idx: int
            Index in buffer of recent observation
            (returned by `store_observation`).
        data: dict
            The data to store in the buffer.
        """
        if self.data == {}:
            self._init_replay_data(step_data)
        if not nest.has_same_structure(self.data, step_data):
            raise ValueError("The data passed to ReplayBuffer must the same"
                             " at all time steps.")

        def _insert(item):
            buffer, x = item
            buffer[idx] = x

        nest.map_structure(_insert, nest.zip_structure(self.data, step_data))
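The intended calling order, mirroring `env_step_and_store_transition` in example #4 below (`policy` and `env` are illustrative stand-ins):

idx = buffer.store_observation(ob)            # raw observation, no batch dim
stacked = buffer.encode_recent_observation()  # history-stacked view for the policy
data = policy(stacked)                        # e.g. {'action': ...}
ob, data['reward'], data['done'], _ = env.step(data['action'])
buffer.store_effect(idx, data)                # effects recorded at the same idx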
Code example #4
    def env_step_and_store_transition(self):
        """Step env and store transition in replay buffer."""
        if self._ob is None:
            self.manual_reset()

        def _remove_batch_dim(ob):
            return ob[0]

        def _to_torch(ob):
            return torch.from_numpy(ob).to(self.device)[None]

        idx = self.buffer.store_observation(
            nest.map_structure(_remove_batch_dim, self._ob))
        ob = self.buffer.encode_recent_observation()
        with torch.no_grad():
            data = self.act(nest.map_structure(_to_torch, ob))
            for k in data:
                data[k] = data[k].cpu().numpy()
        self._ob, r, done, _ = self.env.step(data['action'])
        data['reward'] = r
        data['done'] = done
        # remove batch dimensions
        for k in data:
            data[k] = data[k][0]
        self.buffer.store_effect(idx, data)
        if done:
            self._ob = self.env.reset()
Code example #5
    def _state_reset(self, dones):
        if self.recurrent:

            def _state_item_reset(x):
                x[0, dones].zero_()

            nest.map_structure(_state_item_reset, self._state)
Code example #6
File: dummy_vec_env.py Project: takuma-ynd/dl
    def step_async(self, actions):
        def _numpy_check(ac):
            if not isinstance(ac, np.ndarray):
                raise ValueError("You must pass actions as nested numpy arrays"
                                 " to DummyVecEnv.")

        nest.map_structure(_numpy_check, actions)
        self.actions = actions
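Because the check runs on every leaf, actions must be numpy arrays with a leading env dimension at each leaf of the (possibly nested) action structure. A minimal sketch, given a DummyVecEnv `venv` (the 'torque' key is illustrative, not from this repo):

actions = {'torque': np.zeros((venv.num_envs, 3), dtype=np.float32)}
venv.step_async(actions)  # passes _numpy_check; a plain list of floats would not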
Code example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        type=int,
                        default=None,
                        help="checkpoint timestep")
    parser.add_argument('-n', type=int, default=1, help="number of episodes")
    parser.add_argument('--base',
                        default=False,
                        action='store_true',
                        help="visualize the base_policy")
    args = parser.parse_args()

    t = get_best_eval() if args.t is None else args.t
    env, pi, device, obs, ckpt = _load_env_and_policy('/logdir', t)

    def _to_torch(x):
        return torch.from_numpy(x).to(device)

    def _to_numpy(x):
        return x.cpu().numpy()

    video_dir = '/logdir/video'
    os.makedirs(video_dir, exist_ok=True)
    tmp_dir = os.path.join(video_dir, 'tmp')
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    if args.base:
        output_path = os.path.join(video_dir, 'base_policy.mp4')
    else:
        output_path = os.path.join(video_dir, f'{ckpt:09d}.mp4')

    if os.path.exists(output_path):
        return

    video_writer = VideoWriter(img_dir=tmp_dir)
    for i in range(args.n):
        obs = env.reset()
        video_writer.add_frame(env.render(mode='rgb_array'))

        done = False
        while not done:
            if args.base:
                action = np.zeros_like(obs['action']['torque'])
            else:
                obs = nest.map_structure(_to_torch, obs)
                with torch.no_grad():
                    action = pi(obs).action
                action = nest.map_structure(_to_numpy, action)
            obs, _, done, _ = env.step(action)
            video_writer.add_frame(env.render(mode='rgb_array'))

    video_writer.make_video(output_path)
Code example #8
File: buffer.py Project: tienhoangvan/dl
    def _encode_sample(self, idxes):
        batch = nest.map_structure(lambda x: x[idxes], self.data)

        def _batch(obs):
            return np.concatenate([ob[np.newaxis, :] for ob in obs], 0)

        obs = [self._encode_observation(idx) for idx in idxes]
        batch['obs'] = nest.map_structure(_batch, nest.zip_structure(*obs))
        next_obs = [self._encode_observation(idx + 1) for idx in idxes]
        batch['next_obs'] = nest.map_structure(_batch,
                                               nest.zip_structure(*next_obs))
        return batch
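The splatted `nest.zip_structure(*obs)` call turns a list of N nested observations into a single structure whose leaves are N-tuples, which `_batch` then stacks along a new leading axis. Schematically:

import numpy as np
from dl import nest  # assumed import path

obs = [{'a': np.zeros(3)}, {'a': np.ones(3)}]  # two encoded observations
zipped = nest.zip_structure(*obs)              # leaves become 2-tuples
batched = nest.map_structure(
    lambda xs: np.concatenate([x[np.newaxis] for x in xs], 0), zipped)
# batched['a'].shape == (2, 3)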
Code example #9
File: misc.py Project: takuma-ynd/dl
def _get_env_ob_norm(env, steps):
    ob = env.reset()
    obs = [ob]
    for _ in range(steps):
        ob, _, done, _ = env.step(env.action_space.sample())
        if done:
            ob = env.reset()
        obs.append(ob)
    obs = nest.map_structure(np.stack, nest.zip_structure(*obs))
    mean = nest.map_structure(lambda x: np.mean(x, axis=0), obs)
    std = nest.map_structure(lambda x: np.std(x, axis=0), obs)
    return mean, std
Code example #10
    def insert(self, step_data):
        """Insert new data into storage.

        Transfers to the correct device if needed.
        """
        if self.data is None:
            self.init_data(step_data)

        if self.rollout_complete:
            raise ValueError("Tried to insert data when the rollout is "
                             " complete. Call rollout.reset() to reset.")

        if self.step >= self.num_steps:
            self.extend_storage()

        if set(step_data.keys()) != self.keys:
            raise ValueError("The same data must be provided at every step.")

        def _copy_data(item):
            storage, step_data = item
            if step_data.device != self.device:
                storage[self.step].copy_(step_data.to(self.device))
            else:
                storage[self.step].copy_(step_data)

        def _check_shape(data, key):
            if data.shape[0] != self.num_processes:
                raise ValueError(f"data '{key}' is expected to have its "
                                 f"0th dimension equal to the number "
                                 f"of processes: {self.num_processes}")

        for k in self.keys:
            nest.map_structure(partial(_check_shape, key=k), step_data[k])
            nest.map_structure(_copy_data,
                               nest.zip_structure(self.data[k], step_data[k]))

        if self.step == 0:
            self.data['return'].fill_(0.)
            self.data['q_mc'].fill_(0.)
            done = torch.zeros_like(self.data['done'][0])
        else:
            done = self.data['done'][self.step - 1]
        if len(step_data['reward'].shape) == 2:
            r = torch.logical_not(done.unsqueeze(-1)) * step_data['reward'].to(
                self.device)
        else:
            r = torch.logical_not(done) * step_data['reward'].to(self.device)
        self.data['return'] += r

        self.sequence_lengths += torch.logical_not(step_data['done'].cpu())
        self.step = self.step + 1
        self.rollout_complete = bool(torch.all(step_data['done']))
Code example #11
File: misc.py Project: tienhoangvan/dl
def _get_env_ob_norm(env, steps, eps):
    ob = env.reset()
    obs = [ob]
    for _ in range(steps):
        ob, _, done, _ = env.step(env.action_space.sample())
        if done:
            ob = env.reset()
        obs.append(ob)
    obs = nest.map_structure(np.concatenate, nest.zip_structure(*obs))
    data = nest.zip_structure(obs, unpack_space(env.observation_space))
    mean = nest.map_structure(_compute_mean, data)
    std = nest.map_structure(_compute_std(eps), data)
    return mean, std
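`_compute_mean` and `_compute_std` are not shown. A hedged sketch consistent with how they are called: each receives a `(stacked_obs, space)` leaf pair, and the std is floored at `eps` (mirroring the clamp in the wrapper `__init__` of example #22). The real helpers presumably also use the space to special-case discrete observations, which this sketch ignores:

import numpy as np

def _compute_mean(item):
    x, space = item
    return np.mean(x, axis=0)

def _compute_std(eps):
    def _std(item):
        x, space = item
        # floor the std so later normalization never divides by ~0
        return np.maximum(np.std(x, axis=0), eps)
    return _std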
Code example #12
def _get_venv_ob_norm(env, steps):
    ob = env.reset()
    obs = [ob]
    for _ in range(steps // env.num_envs):
        ob, _, done, _ = env.step(
            np.array([env.action_space.sample() for _ in range(env.num_envs)]))
        if np.any(done):
            ob = env.reset(force=False)
        obs.append(ob)
    obs = nest.map_structure(np.concatenate, nest.zip_structure(*obs))
    mean = nest.map_structure(lambda x: np.mean(x, axis=0), obs)
    std = nest.map_structure(lambda x: np.std(x, axis=0), obs)
    return mean, std
Code example #13
    def update(self, mean, var, count):
        if self.count == 0:
            self.mean = mean
            self.var = var
            self.count = count

        else:
            self.batch_count = count
            self.new_count = count + self.count
            nest.map_structure(
                self._update, nest.zip_structure(self.mean, self.var, mean, var)
            )
            self.count = self.new_count
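The `_update` leaf function is not shown. A hedged sketch based on the standard parallel mean/variance combination (Chan et al.); the actual method in this repo may differ. It receives a `(mean, var, batch_mean, batch_var)` leaf tuple and updates the running stats in place:

    def _update(self, item):
        mean, var, batch_mean, batch_var = item
        delta = batch_mean - mean
        # combine second moments before touching the running mean
        m2 = (var * self.count + batch_var * self.batch_count
              + delta ** 2 * self.count * self.batch_count / self.new_count)
        mean += delta * self.batch_count / self.new_count
        var[...] = m2 / self.new_count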
Code example #14
File: subproc_vec_env.py Project: takuma-ynd/dl
    def step_async(self, actions):
        self._assert_not_closed()

        def _numpy_check(ac):
            if not isinstance(ac, np.ndarray):
                raise ValueError("You must pass actions as nested numpy arrays"
                                 " to SubprocVecEnv.")
        nest.map_structure(_numpy_check, actions)
        for i, remote in enumerate(self.remotes):
            if self._dones[i]:
                continue
            action = nest.map_structure(lambda ac: ac[i], actions)
            remote.send(('step', action))
        self.waiting = True
Code example #15
    def rollout_step(self):
        """Compute one environment step."""
        with torch.no_grad():
            if self.recurrent:
                outs = self.act(self._ob, state_in=self._state)
            else:
                outs = self.act(self._ob, state_in=None)
        cpu_action = nest.map_structure(lambda ac: ac.cpu().numpy(),
                                        outs['action'])
        ob, r, done, infos = self.env.step(cpu_action)
        data = {}
        data['obs'] = self._ob
        data['action'] = outs['action']
        data['reward'] = torch.from_numpy(r).float().to(self.device)
        data['done'] = torch.from_numpy(done).to(self.device)
        data['vpred'] = outs['value']
        for key in outs:
            if key not in ['action', 'value', 'state']:
                data[key] = outs[key]

        def _to_torch(o):
            return torch.from_numpy(o).to(self.device)

        self._ob = nest.map_structure(_to_torch, ob)
        if self.recurrent:
            self._state = outs['state']

        self._step += 1
        truncated = self._get_truncated_envs(infos)
        if self._dones is not None:
            prev_step_not_done = torch.logical_not(self._dones)
            truncated = truncated & prev_step_not_done
        at_end_of_rollout = (self.rollout_length
                             and self._step >= self.rollout_length)
        if at_end_of_rollout or torch.any(truncated):
            next_vpred = self._get_next_value()

        if self.rollout_length:
            assert self._step <= self.rollout_length
        if at_end_of_rollout:
            self._state_reset(data['done'])
            to_augment = torch.logical_not(data['done']) | truncated
            data['done'][:] = True
        else:
            to_augment = truncated
        if torch.any(to_augment):
            data['reward'][to_augment] += self.gamma * next_vpred[to_augment]

        self._dones = data['done']
        self.storage.insert(data)
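`_get_truncated_envs` is not shown; a hedged guess based on the standard gym `TimeLimit` convention, where the wrapper sets `info['TimeLimit.truncated']` when an episode ends by timeout rather than true termination (the actual helper in this repo may differ):

    def _get_truncated_envs(self, infos):
        # one bool per env: did this step end by time limit?
        truncated = [info.get('TimeLimit.truncated', False) for info in infos]
        return torch.tensor(truncated, dtype=torch.bool, device=self.device)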
Code example #16
    def _encode_sample(self, idxes):
        batch = {}

        def _batch(obs):
            return np.concatenate([ob[np.newaxis, :] for ob in obs], 0)

        obs = [self._encode_observation(idx) for idx in idxes]
        batch['obs'] = nest.map_structure(_batch, nest.zip_structure(*obs))
        for k in self.data.keys():
            batch[k] = self.data[k][idxes]
        next_obs = [self._encode_observation(idx + 1) for idx in idxes]
        batch['next_obs'] = nest.map_structure(_batch,
                                               nest.zip_structure(*next_obs))
        return batch
Code example #17
File: dummy_vec_env.py Project: takuma-ynd/dl
    def step_wait(self):
        active = [False for _ in range(self.num_envs)]

        for e in range(self.num_envs):
            # step this env only if its episode is not already over
            if self.transitions[e] is None or not self.transitions[e][2]:
                action = nest.map_structure(lambda ac: ac[e], self.actions)
                self.transitions[e] = self.envs[e].step(action)
                active[e] = True

        obs, rs, dones, infos = zip(*self.transitions)
        for e, info in enumerate(infos):
            info['active'] = active[e]
        obs = nest.map_structure(np.stack, nest.zip_structure(*obs))
        return obs, np.stack(rs), np.stack(dones), infos
Code example #18
File: misc.py Project: tienhoangvan/dl
def _get_venv_ob_norm(env, steps, eps):
    ob = env.reset()
    obs = [ob]
    for _ in range(steps // env.num_envs):
        ob, _, done, _ = env.step(
            np.array([env.action_space.sample() for _ in range(env.num_envs)]))
        if np.any(done):
            ob = env.reset(force=False)
        obs.append(ob)

    obs = nest.map_structure(np.concatenate, nest.zip_structure(*obs))
    data = nest.zip_structure(obs, unpack_space(env.observation_space))
    mean = nest.map_structure(_compute_mean, data)
    std = nest.map_structure(_compute_std(eps), data)
    return mean, std
Code example #19
File: eval.py Project: takuma-ynd/dl
    def __call__(self, ob):
        """__call__."""
        with torch.no_grad():

            def _to_torch(o):
                return torch.from_numpy(o).to(self.device)

            ob = nest.map_structure(_to_torch, ob)
            if self.state is None:
                out = self.net(ob)
            else:
                out = self.net(ob, self.state)
            if hasattr(out, 'state_out'):
                self.state = out.state_out
            return nest.map_structure(lambda x: x.cpu().numpy(), out.action)
Code example #20
File: misc.py Project: takuma-ynd/dl
def _get_venv_ob_norm(env, steps):
    # Only collect obs from the first environment. This is hacky and
    # inefficient, but it is the simplest solution given that the
    # environments sync their resets.
    ob = env.reset()
    obs = [nest.map_structure(lambda x: x[0], ob)]
    for _ in range(steps):
        ob, _, done, _ = env.step(
            np.array([env.action_space.sample() for _ in range(env.num_envs)]))
        if done[0]:
            ob = env.reset()
        obs.append(nest.map_structure(lambda x: x[0], ob))
    obs = nest.map_structure(np.stack, nest.zip_structure(*obs))
    mean = nest.map_structure(lambda x: np.mean(x, axis=0), obs)
    std = nest.map_structure(lambda x: np.std(x, axis=0), obs)
    return mean, std
Code example #21
File: obs_norm_wrappers.py Project: takuma-ynd/dl
    def _normalize(self, obs):
        if not self.should_norm:
            return obs
        if self.mean is None or self.std is None:
            self.find_norm_params()
        obs = nest.map_structure(np.asarray, obs)
        obs = nest.map_structure(np.float32, obs)
        if not nest.has_same_structure(self.mean, obs):
            raise ValueError("mean and obs do not have the same structure!")

        def norm(item):
            ob, mean, std = item
            return (ob - mean) / std

        return nest.map_structure(norm,
                                  nest.zip_structure(obs, self.mean, self.std))
Code example #22
File: obs_norm_wrappers.py Project: tienhoangvan/dl
    def __init__(self,
                 venv,
                 norm=True,
                 steps=10000,
                 mean=None,
                 std=None,
                 eps=1e-2,
                 log=True,
                 log_prob=0.01):
        """Init."""
        super().__init__(venv)
        self.steps = steps
        self.should_norm = norm
        self.eps = eps
        self.log = log
        self.log_prob = log_prob
        self.t = 0
        self._eval = False
        self.mean = None
        self.std = None
        self._dones = np.zeros(self.num_envs, dtype=bool)  # np.bool is removed in modern NumPy

        if mean is not None and std is not None:
            if not nest.has_same_structure(mean, std):
                raise ValueError("mean and std must have the same structure.")
            self.mean = mean
            self.std = nest.map_structure(lambda x: np.maximum(x, self.eps),
                                          std)
Code example #23
    def _encode_observation(self, idx):
        def _encode(ob, idx):
            end_idx = idx + 1  # make noninclusive
            start_idx = end_idx - self.obs_history_len
            # if the buffer has never filled up, there may not be enough
            # obs for context
            if start_idx < 0 and self.num_in_buffer != self.size:
                start_idx = 0
            for idx in range(start_idx, end_idx - 1):
                if self.data['done'][idx % self.size]:
                    start_idx = idx + 1
            missing_context = self.obs_history_len - (end_idx - start_idx)
            # if zero padding is needed for missing context
            # or we are on the boundary of the buffer
            if start_idx < 0 or missing_context > 0:
                obs = [np.zeros_like(ob[0]) for _ in range(missing_context)]
                for idx in range(start_idx, end_idx):
                    obs.append(ob[idx % self.size])
                return np.concatenate(obs, 0)
            else:
                # this optimization can save about 30% of compute time
                s = ob.shape[2:]
                return ob[start_idx:end_idx].reshape(-1, *s)

        return nest.map_structure(partial(_encode, idx=idx), self.obs)
Code example #24
    def __call__(self, ob, state_in=None):
        """Produce decision from model."""
        if self.t < self.policy_training_start:
            outs = self.pi(ob, state_in, deterministic=True)
        else:
            outs = self.pi(ob, state_in)

        def _res_norm(ac):
            return ac.abs().sum(dim=1).mean()
        residual_norm = nest.map_structure(_res_norm, outs.action)
        if isinstance(residual_norm, torch.Tensor):
            logger.add_scalar('actor/l1_residual_norm', residual_norm, self.t,
                              time.time())
            self.t += outs.action.shape[0]
        else:
            self.t += nest.flatten(outs.action)[0].shape[0]
            for k, v in residual_norm.items():
                logger.add_scalar(f'actor/{k}_residual_norm', v, self.t,
                                  time.time())
        data = {'action': outs.action,
                'value': self.vf(ob).value,
                'logp': outs.dist.log_prob(outs.action),
                'dist': outs.dist.to_tensors()}
        if outs.state_out:
            data['state'] = outs.state_out
        return data
Code example #25
def get_norm_params(n, difficulty, use_domain_rand):
    term_fn = 'position_close_to_goal' if difficulty < 4 else 'pos_and_rot_close_to_goal'
    env = make_training_env(32, MPPGStateMachine, difficulty, 'torque_and_position',
                            frameskip=3,
                            sim=True,
                            visualization=False,
                            reward_fn='competition_reward',
                            termination_fn=term_fn,
                            initializer='training_init',
                            episode_length=3750,
                            monitor=False,
                            seed=0,
                            norm_observations=True,
                            max_torque=0.0,
                            max_position=0.0,  # set all residual actions to 0
                            denylist_states=['FailureState'],
                            domain_randomization=use_domain_rand
                            )
    env.steps = n
    env.find_norm_params()

    def get_var(std):
        if std is not None:
            return std ** 2
    return env.mean, nest.map_structure(get_var, env.std), env
Code example #26
File: trainer.py Project: cbschaff/rsa
    def step(self):
        # Get batch.
        if self._diter is None:
            self._diter = self.dtrain.__iter__()
        try:
            batch = self._diter.__next__()
        except StopIteration:
            self.epochs += 1
            self._diter = None
            return self.epochs
        batch = nest.map_structure(lambda x: x.to(self.device), batch)

        # compute loss
        ob, ac = batch
        self.model.train()
        loss = -self.model(ob).log_prob(ac).mean()

        logger.add_scalar('train/loss',
                          loss.detach().cpu().numpy(), self.t, time.time())

        # update model
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

        # increment step
        self.t += min(
            len(self.data) - (self.t % len(self.data)), self.batch_size)
        return self.epochs
Code example #27
    def store_observation(self, obs):
        inds = []
        for i, buf in enumerate(self.buffers):
            inds.append(
                buf.store_observation(nest.map_structure(lambda x: x[i], obs)))
        self._update_num_in_buffer()
        return inds
Code example #28
File: alpha_zero.py Project: takuma-ynd/dl
    def sample(self, batch_size):
        """Sample a batch of self play data."""
        batch = self.buffer.sample(batch_size)

        def _to_torch(x):
            return torch.from_numpy(x).to(self.device)

        return nest.map_structure(_to_torch, batch)
Code example #29
File: dummy_vec_env.py Project: takuma-ynd/dl
    def _reset_done_envs(self):
        obs = []
        for e in range(self.num_envs):
            if self.transitions[e] is None or self.transitions[e][2]:
                self.transitions[e] = None
                obs.append(self.envs[e].reset())
            else:
                obs.append(self.transitions[e][0])
        return nest.map_structure(np.stack, nest.zip_structure(*obs))
Code example #30
File: distributions.py Project: takuma-ynd/dl
    def log_prob(self, ac):
        """Log prob."""
        def _log_prob(item):
            dist, action = item
            return dist.log_prob(action)

        log_probs = nest.map_structure(_log_prob,
                                       nest.zip_structure(self.dists, ac))
        return sum(nest.flatten(log_probs))
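A self-contained illustration of the same pattern using plain `torch.distributions` rather than this repo's distribution classes: a log prob is computed per leaf of a nested action, then the leaf results are summed into one tensor (the keys here are assumptions):

import torch
from torch.distributions import Normal

dists = {'torque': Normal(torch.zeros(3), torch.ones(3)),
         'position': Normal(torch.zeros(3), torch.ones(3))}
ac = {'torque': torch.zeros(3), 'position': torch.zeros(3)}
# elementwise sum of the per-leaf log probs, matching sum(nest.flatten(...))
logp = sum(dists[k].log_prob(ac[k]) for k in dists)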