def act(self, ac: Any) -> None:
    """
    Score the given action against the oldest queued target, then advance one step

    The oldest entry of ``self._q`` is popped and compared leaf-by-leaf with ``ac``;
    the per-leaf scores are averaged into the reward stored in ``self._rews[0]``.
    A fresh sample of the action space is queued afterwards, and ``self._reset()``
    is called once ``self._step`` reaches ``self._episode_len``.

    :param ac: action tree, compared against the queued target leaf-by-leaf
    :raises Exception: if a leaf of the action space has an eltype other than
        Discrete or Real
    """
    self._firsts[0] = False
    target = self._q.popleft()
    leaf_scores = []

    def score_leaf(subspace, expected, given):
        eltype = subspace.eltype
        if isinstance(eltype, types.Discrete):
            # all-or-nothing: 1 only when every element matches
            score = 1 if (expected == given).all() else 0
        elif isinstance(eltype, types.Real):
            # negative half squared distance between action and target
            delta = given - expected
            delta = delta[:]
            score = -0.5 * np.dot(delta, delta)
        else:
            raise Exception(
                f"unrecognized action space eltype {subspace.eltype}")
        leaf_scores.append(score)

    types.multimap(score_leaf, self.ac_space, target, ac)
    mean_score = sum(leaf_scores) / len(leaf_scores)
    # don't give any reward for guessing un-observed states
    still_delayed = self._step < self._delay_steps
    self._rews[0] = 0 if still_delayed else mean_score

    next_target = types_np.sample(
        self.ac_space, bshape=(self.num, ), rng=self._rng)
    self._q.append(next_target)

    self._step += 1
    if self._step >= self._episode_len:
        self._reset()
def act(self, ac: Any) -> None:
    """
    Record the current step for every sub-environment, then forward the action

    Before acting, the current observation, the action and the info entry are
    appended to each sub-environment's trajectory. After the parent class's act()
    has run, the resulting reward is appended and every trajectory whose `first`
    flag is set is written out through ``self._write_and_reset_trajectory``.

    :param ac: action tree, recorded and then passed unchanged to the parent act()
    """
    _, ob, _ = self.observe()
    info = self.get_info()

    # The _trajectories list is only initialized on the first call to act(), because
    # sometimes the environment returns observations with dtypes that do not match
    # self.env.ob_space, so the dtypes are read from the actual values instead.
    if self._trajectories is None:
        self._ob_actual_dtype = multimap(lambda leaf: leaf.dtype, ob)
        self._ac_actual_dtype = multimap(lambda leaf: leaf.dtype, ac)
        self._trajectories = [
            self._new_trajectory_dict() for _ in range(self.env.num)
        ]

    for idx in range(self.env.num):
        traj = self._trajectories[idx]

        # With non-dict spaces, `ob` and/or `ac` is a single numpy array of shape
        # [batch, obs_shape...], so row idx is simply appended to traj['ob'].
        #
        # With dict spaces, the returned ob is a nested dict
        # {
        #     'obs_key1': [batch, obs1_shape...],
        #     'obs_key2': [batch, obs2_shape...]
        # }
        # so row idx has to be taken from every leaf (ob['obs_key1'][idx], ...) and
        # appended to the matching leaf of traj['ob']; multimap does that per leaf.
        # The slice idx:idx + 1 keeps a leading axis of length 1 for concatenation.
        ob_row = multimap(lambda leaf: leaf[idx:idx + 1], ob)
        traj["ob"] = concat([traj["ob"], ob_row], axis=0)

        ac_row = multimap(lambda leaf: leaf[idx:idx + 1], ac)
        traj["act"] = concat([traj["act"], ac_row], axis=0)

        traj["info"].append(info[idx])

    super().act(ac)

    reward, _, first = self.observe()
    for idx in range(self.env.num):
        self._trajectories[idx]["reward"].append(reward[idx])

    # For each completed trajectory, write it out
    for idx in range(self.env.num):
        if not first[idx]:
            continue
        self._write_and_reset_trajectory(idx)
def step(self, ac):
    """
    Apply a single action to the wrapped environment and return a 4-tuple

    :param ac: one action; it is wrapped in a length-1 numpy array before being
        handed to ``self.env.act``
    :returns: ``(ob, rew, first, info)`` with the leading batch axis removed,
        i.e. element 0 of each value from the wrapped environment
    """
    env = self.env
    _, ob_before, _ = env.observe()
    env.act(np.array([ac]))
    rew, ob_after, first = env.observe()

    # When the `first` flag is set after acting, report the observation that
    # was seen before the action instead of the new one.
    reported_ob = ob_before if first[0] else ob_after

    return (
        multimap(lambda leaf: leaf[0], reported_ob),
        rew[0],
        first[0],
        env.get_info()[0],
    )
def _vt2space(vt: ValType):
    """
    Convert a ValType tree into the matching gym space

    :param vt: ValType whose TensorType leaves are converted one by one
    :returns: a gym space; every dict in the converted tree is wrapped in
        ``spaces.Dict``
    :raises NotImplementedError: if a leaf's eltype is neither Discrete nor Real
    """
    from gym import spaces

    def leaf_to_space(tt: TensorType):
        eltype = tt.eltype
        if isinstance(eltype, Discrete):
            if tt.ndim == 0:
                # scalar discrete value maps directly onto gym's Discrete
                return spaces.Discrete(eltype.n)
            # non-scalar discrete values become a Box bounded to [0, n - 1]
            return spaces.Box(
                low=0,
                high=eltype.n - 1,
                shape=tt.shape,
                dtype=types_np.dtype(tt),
            )
        if isinstance(eltype, Real):
            # real values become an unbounded Box
            return spaces.Box(
                shape=tt.shape,
                dtype=types_np.dtype(tt),
                low=float("-inf"),
                high=float("inf"),
            )
        raise NotImplementedError

    def wrap_dicts(node):
        if not isinstance(node, dict):
            return node
        return spaces.Dict(
            {key: wrap_dicts(child) for key, child in node.items()})

    converted = multimap(leaf_to_space, vt)
    return wrap_dicts(converted)
def observe(self) -> Tuple[Any, Any, Any]:
    """
    Return the stored reward, observation and first flag with a leading axis of size 1

    :returns: ``(rew, ob, first)`` where rew is a length-1 array built with dtype
        code "f", ob is the stored observation tree with every leaf converted to a
        numpy array and given a new axis 0, and first is a length-1 bool array
    """

    def add_batch_axis(val):
        return np.expand_dims(np.array(val), axis=0)

    rew = np.array([self.last_rew], "f")
    ob = multimap(add_batch_axis, self.last_ob)
    first = np.array([self.last_first], bool)
    return rew, ob, first
def concat(xs: Sequence[Any], axis: int = 0) -> Any:
    """
    Join the trees in xs by concatenating their corresponding leaf arrays

    :param xs: list of trees with the same shape, where the leaf values are numpy arrays
    :param axis: axis to concatenate along
    :returns: the result of multimap applied over all trees, each leaf being the
        np.concatenate of the matching leaves
    """

    def join_leaves(*leaves):
        return np.concatenate(leaves, axis=axis)

    return multimap(join_leaves, *xs)
def stack(xs: Sequence[Any], axis: int = 0) -> Any:
    """
    Join the trees in xs by stacking their corresponding leaf arrays

    :param xs: list of trees with the same shape, where the leaf values are numpy arrays
    :param axis: axis to stack along
    :returns: the result of multimap applied over all trees, each leaf being the
        np.stack of the matching leaves
    """

    def stack_leaves(*leaves):
        return np.stack(leaves, axis=axis)

    return multimap(stack_leaves, *xs)
def stack(xs: Sequence[Any], dim: int = 0) -> Any:
    """
    Join the trees in xs by stacking their corresponding leaf tensors

    :param xs: list of trees with the same shape, where the leaf values are torch tensors
    :param dim: dimension to stack along
    :returns: the result of multimap applied over all trees, each leaf being the
        th.stack of the matching leaves
    """

    def stack_leaves(*leaves):
        return th.stack(leaves, dim=dim)

    return multimap(stack_leaves, *xs)
def sample(vt: ValType, bshape: Tuple) -> Any:
    """
    Build a sample for every leaf of vt

    :param vt: ValType to create sample for
    :param bshape: batch shape to prepend to the shape of each torch tensor created by this function
    :returns: tree of torch tensors matching vt
    """

    def sample_leaf(tt):
        return _sample_tensor(tt, bshape=bshape)

    return multimap(sample_leaf, vt)
def zeros(vt: ValType, bshape: Tuple) -> Any:
    """
    Build an all-zero tensor for every leaf of vt

    :param vt: ValType to create zeros for
    :param bshape: batch shape to prepend to the shape of each tensor created by this function
    :returns: tree of torch tensors matching vt
    """

    def zero_leaf(tt):
        full_shape = bshape + tt.shape
        return th.zeros(full_shape, dtype=dtype(tt))

    return multimap(zero_leaf, vt)
def sample(
    vt: ValType, bshape: Tuple, rng: Optional[np.random.RandomState] = None
) -> Any:
    """
    Build a sample for every leaf of vt

    :param vt: ValType to create sample for
    :param bshape: batch shape to prepend to the shape of each numpy array created by this function
    :param rng: np.random.RandomState to use for sampling
    :returns: tree of numpy arrays matching vt
    """

    def sample_leaf(tt):
        return _sample_tensor(tt, bshape=bshape, rng=rng)

    return multimap(sample_leaf, vt)
def act(self, ac: Any) -> None:
    """
    Step the wrapped gym environment with the single action contained in ac

    The step results are stored on ``self.last_ob``, ``self.last_rew``,
    ``self.last_first`` and ``self.info``. When the step reports the flag stored in
    ``self.last_first``, the gym environment is reset right away and ``self.last_ob``
    is replaced by the observation returned from reset().

    :param ac: action tree with a leading batch axis; element 0 of every leaf is
        passed to the gym environment
    """
    # Check we got an action consistent with num_envs=1
    _assert_num_envs_1(ac)
    single_ac = multimap(lambda leaf: leaf[0], ac)

    step_result = self.gym_env.step(single_ac)
    self.last_ob, self.last_rew, self.last_first, self.info = step_result

    mode = self.render_mode
    if mode == "rgb_array":
        # expose the rendered frame through the info dict
        self.info["rgb"] = self.gym_env.render(mode="rgb_array")
    elif mode is not None:
        self.gym_env.render(mode=mode)

    if self.last_first:
        self.last_ob = self.gym_env.reset()
def _new_trajectory_dict(self):
    """
    Create the empty per-environment trajectory record

    :returns: dict with keys ``reward`` and ``info`` (empty lists) and ``ob`` and
        ``act`` (trees of length-0 arrays built from the env's spaces, cast to the
        dtypes stored in ``self._ob_actual_dtype`` / ``self._ac_actual_dtype``)
    """
    assert self._ob_actual_dtype is not None, (
        "Not supposed to happen; self._ob_actual_dtype should have been set"
        " in the first act() call before _new_trajectory_dict is called")

    def cast(arr, my_dtype):
        return arr.astype(my_dtype)

    empty_ob = zeros(self.env.ob_space, (0, ))
    empty_act = zeros(self.env.ac_space, (0, ))

    # Cast each leaf from the space's dtype to the dtype stored for it
    typed_ob = multimap(cast, empty_ob, self._ob_actual_dtype)
    typed_act = multimap(cast, empty_act, self._ac_actual_dtype)

    return {
        "reward": [],
        "ob": typed_ob,
        "info": [],
        "act": typed_act,
    }
def split(x: Any, sections: Sequence[int]) -> Sequence[Any]:
    """
    Cut every leaf array of the tree x into consecutive slices

    Each entry of `sections` is the exclusive end index of one slice; a slice
    starts where the previous one ended (the first starts at 0).

    Examples:

        split([1,2,3,4], [1,2,3,4]) => [[1], [2], [3], [4]]
        split([1,2,3,4], [1,3,4]) => [[1], [2, 3], [4]]

    :param x: a tree where the leaf values are numpy arrays
    :param sections: list of indices to split at (not sizes of each split)
    :returns: list of trees with length `len(sections)` with the same shape as x where each leaf
        is the corresponding section of the leaf in x
    """

    def window(lo, hi):
        return multimap(lambda leaf: leaf[lo:hi], x)

    pieces = []
    lo = 0
    for hi in sections:
        pieces.append(window(lo, hi))
        lo = hi
    return pieces
def reset(self):
    """
    Return the current observation of the wrapped environment without the batch axis

    This method only reads ``self.env.observe()``; it does not call anything that
    resets the environment. If the `first` flag is not set, a warning is printed.

    :returns: element 0 of every leaf of the current observation
    """
    _, batched_ob, first = self.env.observe()
    at_episode_start = bool(first[0])
    if not at_episode_start:
        print("Warning: early reset ignored")
    return multimap(lambda leaf: leaf[0], batched_ob)
def act(self, ac: Any) -> None:
    """
    Check the action against the action space, then forward it to the wrapped env

    :param ac: action tree; ``self._assert_val_matches_space`` is applied to it
        together with ``self.ac_space`` via multimap before ``self.env.act`` runs
    """
    check_leaf = self._assert_val_matches_space
    types.multimap(check_leaf, self.ac_space, ac)
    self.env.act(ac=ac)
def observe(self) -> Tuple[np.ndarray, Any, np.ndarray]:
    """
    Observe the wrapped environment and check the result before returning it

    The observation is checked against ``self.ob_space`` with
    ``self._assert_val_matches_space``; the reward must be a float32 array and the
    first flag a bool array, both of shape ``(self.num,)``.

    :returns: the ``(rew, ob, first)`` tuple from ``self.env.observe()``, unchanged
    :raises AssertionError: if the reward or first array has the wrong dtype or shape
    """
    rew, ob, first = self.env.observe()
    types.multimap(self._assert_val_matches_space, self.ob_space, ob)

    expected_shape = (self.num, )
    # Compare dtypes by equality rather than identity, and use np.bool_: the
    # bare `np.bool` alias was deprecated and then removed in NumPy 1.24, where
    # accessing it raises AttributeError.
    assert rew.dtype == np.dtype(np.float32) and rew.shape == expected_shape
    assert first.dtype == np.dtype(np.bool_) and first.shape == expected_shape
    return rew, ob, first