def _generate_time_step(batched,
                        observation,
                        step_type,
                        discount,
                        prev_action=None,
                        action_spec=None,
                        reward=None,
                        reward_spec=ts.TensorSpec(()),
                        env_id=None,
                        env_info={}):
    """Create a (possibly batched) ``TimeStep``, filling in defaults.

    Works with both NumPy arrays and torch Tensors; ``md`` below is the math
    module (``np`` or ``torch``) matching the observation's type.
    """
    flat_observation = nest.flatten(observation)

    if all(map(_is_numpy_array, flat_observation)):
        md = np
        if reward is not None:
            reward = np.float32(reward)
        discount = np.float32(discount)
    else:
        assert all(map(torch.is_tensor, flat_observation)), (
            "Elements in observation must be Tensor")
        md = torch
        if reward is not None:
            reward = to_tensor(reward, dtype=torch.float32)
        discount = to_tensor(discount, dtype=torch.float32)

    if batched:
        batch_size = flat_observation[0].shape[0]
        outer_dims = (batch_size, )
        if env_id is None:
            env_id = md.arange(batch_size, dtype=md.int32)
        if reward is not None:
            assert reward.shape[:1] == outer_dims
        if prev_action is not None:
            flat_action = nest.flatten(prev_action)
            assert flat_action[0].shape[:1] == outer_dims
    else:
        outer_dims = ()
        if env_id is None:
            env_id = md.zeros((), dtype=md.int32)

    step_type = md.full(outer_dims, step_type, dtype=md.int32)
    if reward is None:
        reward = md.zeros(outer_dims + reward_spec.shape, dtype=md.float32)
    discount = md.ones(outer_dims, dtype=md.float32) * discount
    if prev_action is None:
        # Default the previous action to zeros matching ``action_spec``.
        prev_action = nest.map_structure(
            lambda spec: md.zeros(
                outer_dims + spec.shape,
                dtype=getattr(md, ts.torch_dtype_to_str(spec.dtype))),
            action_spec)

    return TimeStep(
        step_type,
        reward,
        discount,
        observation,
        prev_action,
        env_id,
        env_info=env_info)
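# Illustrative usage sketch (not part of the library): building a batched MID
# step for two environments with _generate_time_step. The tensor shapes below
# are made up; `StepType` refers to the module's own import.
def _example_generate_time_step():
    observation = torch.zeros(2, 3)   # batch of 2 observations
    prev_action = torch.zeros(2, 2)   # batch of 2 previous actions
    step = _generate_time_step(
        batched=True,
        observation=observation,
        step_type=StepType.MID,
        discount=0.99,
        prev_action=prev_action,
        reward=torch.ones(2))
    # env_id defaults to arange(batch_size): tensor([0, 1], dtype=torch.int32)
    # discount is broadcast across the batch: tensor([0.9900, 0.9900])
    return step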
def _step(self, time_step: TimeStep, state, calc_rewards=True):
    """
    Args:
        time_step (TimeStep): input time step data, where the observation
            is the skill-augmented observation. The skill should be a
            one-hot vector.
        state (Tensor): state for DIAYN (the previous skill), which should
            be a one-hot vector.
        calc_rewards (bool): if False, only return the losses.

    Returns:
        AlgStep:
            output: empty tuple ()
            state: skill
            info (DIAYNInfo):
    """
    observations_aug = time_step.observation
    step_type = time_step.step_type
    observation, skill = observations_aug
    prev_skill = state.detach()

    # normalize observation for easier prediction
    if self._observation_normalizer is not None:
        observation = self._observation_normalizer.normalize(observation)

    # Fall back to the raw observation if no encoding network is provided.
    feature = observation
    if self._encoding_net is not None:
        feature, _ = self._encoding_net(observation)

    skill_pred, _ = self._discriminator_net(feature)

    if self._skill_spec.is_discrete:
        loss = torch.nn.CrossEntropyLoss(reduction='none')(
            input=skill_pred, target=torch.argmax(prev_skill, dim=-1))
    else:
        # nn.MSELoss doesn't support reducing along a dim
        loss = torch.sum(math_ops.square(skill_pred - prev_skill), dim=-1)

    # Mask out the first step of each episode, where no skill transition
    # has happened yet.
    valid_masks = (step_type != to_tensor(StepType.FIRST)).to(torch.float32)
    loss *= valid_masks

    intrinsic_reward = ()
    if calc_rewards:
        intrinsic_reward = -loss.detach()
        intrinsic_reward = self._reward_normalizer.normalize(intrinsic_reward)

    return AlgStep(
        output=(),
        state=skill,
        info=DIAYNInfo(reward=intrinsic_reward, loss=loss))
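# Minimal standalone sketch (assumed shapes, not the ALF code path) of the
# discrete-skill branch above: the discriminator's per-sample cross-entropy
# loss is negated to form the intrinsic reward, so skills that are easy to
# infer from the visited states are rewarded.
def _example_diayn_intrinsic_reward():
    import torch.nn.functional as F
    skill_pred = torch.randn(3, 4)  # [B, num_skills] discriminator logits
    prev_skill = F.one_hot(
        torch.tensor([0, 2, 1]), num_classes=4).float()  # one-hot skills
    loss = F.cross_entropy(
        skill_pred, prev_skill.argmax(dim=-1), reduction='none')
    return -loss.detach()  # per-sample intrinsic reward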
def __init__(self, env, reward_weights=None):
    """
    Args:
        env (AlfEnvironment): An AlfEnvironment instance to be wrapped.
        reward_weights (list[float] | tuple[float]): a list/tuple of weights
            for the rewards; if None, the first dimension will be weighted
            1 and the other dimensions 0.
    """
    super(ScalarRewardWrapper, self).__init__(env)
    reward_spec = env.reward_spec()
    assert reward_spec.ndim == 1, (
        "This wrapper only supports vector rewards! Reward tensor rank: %d" %
        reward_spec.ndim)

    rewards_n = reward_spec.shape[0]
    if reward_weights is None:
        reward_weights = [1.] + [0.] * (rewards_n - 1)
    assert (isinstance(reward_weights, (list, tuple))
            and len(reward_weights) == rewards_n)

    self._np_reward_weights = np.array(reward_weights)
    self._tensor_reward_weights = to_tensor(reward_weights)
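# Illustrative sketch of how the stored weights are meant to be applied (an
# assumption; the actual reduction happens elsewhere in the wrapper): the
# vector reward collapses to a scalar via a dot product with the weights.
def _example_scalarize_reward():
    weights = np.array([1., 0., 0.])          # default: keep only the first dim
    vector_reward = np.array([0.5, -1., 2.])  # e.g. [task, penalty, bonus]
    return np.dot(vector_reward, weights)     # -> 0.5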
def termination(observation, prev_action, reward, env_id=None, env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set to ``StepType.LAST``.

    Called by ``env.step()`` if 'Done'. ``discount`` should not be sent in
    and will be set as 0.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the
            environment ID(s).
        env_info (dict): extra info returned by the environment.

    Returns:
        TimeStep:

    Raises:
        ValueError: If observations are tensors but reward's statically
            known rank is not 0 or 1.
    """
    flat_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, flat_observation)):
        reward = np.float32(reward)
        if env_id is None:
            env_id = np.int32(0)
        step_type = StepType.LAST
        discount = np.float32(0.0)
        return TimeStep(step_type, reward, discount, observation,
                        prev_action, env_id, env_info=env_info)
    else:
        assert all(map(torch.is_tensor, flat_observation)), (
            "Elements in observation must be Tensor")
        reward = to_tensor(reward, dtype=torch.float32)
        assert reward.dim() <= 1, "Expected reward to be a scalar or vector."
        if reward.dim() == 0:
            shape = []
            if env_id is None:
                env_id = torch.tensor(0, dtype=torch.int32)
        else:
            flat_action = nest.flatten(prev_action)
            assert flat_observation[0].shape[:1] == reward.shape
            assert flat_action[0].shape[:1] == reward.shape
            shape = reward.shape
            if env_id is None:
                # Only fill in a default when the caller didn't supply one.
                env_id = torch.arange(shape[0], dtype=torch.int32)
        step_type = torch.full(shape, StepType.LAST, dtype=torch.int32)
        discount = torch.full(shape, 0.0, dtype=torch.float32)
        return TimeStep(step_type, reward, discount, observation,
                        prev_action, env_id, env_info=env_info)
def transition(observation,
               prev_action,
               reward,
               discount=1.0,
               env_id=None,
               env_info={}):
    """Returns a ``TimeStep`` with ``step_type`` set equal to ``StepType.MID``.

    Called by ``env.step()`` if not 'Done'. The batch size is inferred from
    the shape of ``reward``. If ``discount`` is a scalar, and ``observation``
    contains tensors, then ``discount`` will be broadcast to match
    ``reward.shape``.

    Args:
        observation (nested tensors): current observations of the env.
        prev_action (nested tensors): previous actions to the env.
        reward (float): A scalar, or 1D NumPy array, or tensor.
        discount (float): (optional) A scalar, or 1D NumPy array, or tensor.
        env_id (torch.int32): (optional) A scalar or 1D tensor of the
            environment ID(s).
        env_info (dict): extra info returned by the environment.

    Returns:
        TimeStep:

    Raises:
        ValueError: If observations are tensors but reward's rank is not 0
            or 1.
    """
    flat_observation = nest.flatten(observation)
    if all(map(_is_numpy_array, flat_observation)):
        reward = np.float32(reward)
        if env_id is None:
            env_id = np.int32(0)
        step_type = StepType.MID
        discount = np.float32(discount)
        return TimeStep(step_type, reward, discount, observation,
                        prev_action, env_id, env_info=env_info)
    else:
        assert all(map(torch.is_tensor, flat_observation)), (
            "Elements in observation must be Tensor")
        # TODO: If reward.shape.rank == 2, and static batch sizes are
        # available for both flat_observation and reward, check that these
        # match.
        reward = to_tensor(reward, dtype=torch.float32)
        assert reward.dim() <= 1, "Expected reward to be a scalar or vector."
        if reward.dim() == 0:
            shape = []
            if env_id is None:
                env_id = torch.tensor(0, dtype=torch.int32)
        else:
            flat_action = nest.flatten(prev_action)
            assert flat_observation[0].shape[:1] == reward.shape
            assert flat_action[0].shape[:1] == reward.shape
            shape = reward.shape
            if env_id is None:
                # Only fill in a default when the caller didn't supply one.
                env_id = torch.arange(shape[0], dtype=torch.int32)
        step_type = torch.full(shape, StepType.MID, dtype=torch.int32)
        discount = to_tensor(discount, dtype=torch.float32)
        if discount.dim() == 0:
            discount = torch.full(shape, discount, dtype=torch.float32)
        else:
            assert reward.shape == discount.shape
        return TimeStep(step_type, reward, discount, observation,
                        prev_action, env_id, env_info=env_info)
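# Hypothetical usage of `transition` and `termination` above (tensor shapes
# are made up for illustration): the batch size is inferred from `reward`.
def _example_transition_and_termination():
    observation = torch.zeros(4, 3)  # batch of 4 observations
    prev_action = torch.zeros(4, 2)  # batch of 4 previous actions
    reward = torch.ones(4)
    mid = transition(observation, prev_action, reward, discount=0.99)
    last = termination(observation, prev_action, reward)  # discount forced to 0
    return mid, last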
def _create_trajectories(self):
    # Order of args for the timestep_* helpers: reward, env_id, env_info
    ts0 = timestep_first([0, 0], [1, 2],
                         dict(x=to_tensor([1, 0]), y=to_tensor([1, 1])))
    ts1 = timestep_mid([1, 2], [1, 2],
                       dict(x=to_tensor([1, 2]), y=to_tensor([0, 3])))
    ts2 = timestep_last([3, 4], [1, 2],
                        dict(x=to_tensor([-1, -2]), y=to_tensor([1, -1])))
    ts3 = timestep_first([0, 0], [1, 2],
                         dict(x=to_tensor([1, 1]), y=to_tensor([1, 1])))
    ts4 = timestep_mid([5, 6], [1, 2],
                       dict(x=to_tensor([2, -2]), y=to_tensor([-1, -6])))
    ts5 = timestep_last([7, 8], [1, 2],
                        dict(x=to_tensor([10, 10]), y=to_tensor([5, 5])))
    return [ts0, ts1, ts2, ts3, ts4, ts5]
def _create_timestep(reward, env_id, step_type, env_info):
    return TimeStep(
        step_type=to_tensor(step_type),
        reward=to_tensor(reward),
        env_info=env_info,
        env_id=to_tensor(env_id))
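# Hypothetical definitions of the timestep_* helpers used by
# _create_trajectories above (names guessed from the fixture, not confirmed
# by this file): each fixes step_type for a batch of two environments and
# forwards the remaining arguments to _create_timestep.
def timestep_first(reward, env_id, env_info):
    return _create_timestep(reward, env_id, [StepType.FIRST] * 2, env_info)


def timestep_mid(reward, env_id, env_info):
    return _create_timestep(reward, env_id, [StepType.MID] * 2, env_info)


def timestep_last(reward, env_id, env_info):
    return _create_timestep(reward, env_id, [StepType.LAST] * 2, env_info)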