def _set_default_specs(self):
    self.observation_spec = ts.TensorSpec((3, 3), torch.float32)
    self.action_spec = ts.BoundedTensorSpec([7],
                                            dtype=torch.float32,
                                            minimum=-1.0,
                                            maximum=1.0)
    self.time_step_spec = ds.time_step_spec(self.observation_spec,
                                            self.action_spec,
                                            ts.TensorSpec(()))
def time_step_spec(self):
    """Describes the ``TimeStep`` fields returned by ``step()``.

    Override this method to define an environment that uses non-standard
    values for any of the items returned by ``step()``. For example, an
    environment with tensor-valued rewards.

    Returns:
        A ``TimeStep`` namedtuple containing (possibly nested)
        ``TensorSpec``s defining the step_type, reward, discount,
        observation, prev_action, and env_id.
    """
    return time_step_spec(self.observation_spec(), self.action_spec(),
                          self.reward_spec())
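# A minimal sketch of the override described in the docstring above, assuming
# this method lives on an ``AlfEnvironment``-style base class; the subclass
# name and reward shape here are illustrative. Since the default
# ``time_step_spec`` is assembled from ``reward_spec()``, a tensor-valued
# reward only requires overriding ``reward_spec``:
class VectorRewardEnvironment(AlfEnvironment):
    def reward_spec(self):
        # A 2-dimensional reward vector instead of the default scalar spec.
        return ts.TensorSpec((2, ), torch.float32)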
def __init__(self,
             gym_env,
             env_id=None,
             discount=1.0,
             auto_reset=True,
             simplify_box_bounds=True):
    """
    Args:
        gym_env (gym.Env): An instance of an OpenAI Gym environment.
        env_id (int): (optional) ID of the environment.
        discount (float): Discount to use for the environment.
        auto_reset (bool): whether or not to reset the environment when
            done.
        simplify_box_bounds (bool): whether or not to simplify redundant
            bound arrays to scalar values for spec bounds.
    """
    super(AlfGymWrapper, self).__init__()
    self._gym_env = gym_env
    self._discount = discount
    if env_id is None:
        env_id = 0
    self._env_id = np.int32(env_id)
    self._action_is_discrete = isinstance(self._gym_env.action_space,
                                          gym.spaces.Discrete)
    # TODO: Add test for auto_reset param.
    self._auto_reset = auto_reset
    self._observation_spec = tensor_spec_from_gym_space(
        self._gym_env.observation_space, simplify_box_bounds)
    self._action_spec = tensor_spec_from_gym_space(
        self._gym_env.action_space, simplify_box_bounds)
    if hasattr(self._gym_env, "reward_space"):
        self._reward_spec = tensor_spec_from_gym_space(
            self._gym_env.reward_space, simplify_box_bounds)
    else:
        self._reward_spec = TensorSpec(())
    self._time_step_spec = ds.time_step_spec(
        self._observation_spec, self._action_spec, self._reward_spec)
    self._info = None
    self._done = True
    self._zero_info = self._obtain_zero_info()
    self._env_info_spec = nest.map_structure(TensorSpec.from_array,
                                             self._zero_info)
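# A minimal usage sketch of the wrapper above, assuming a standard Gym
# environment is installed and the usual ALF spec accessors are available;
# ``CartPole-v1`` is illustrative only.
import gym

env = AlfGymWrapper(gym.make('CartPole-v1'), env_id=0, discount=0.99)
print(env.observation_spec())  # TensorSpec derived from the Box observation space
print(env.action_spec())       # BoundedTensorSpec derived from the Discrete action space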
def __init__(self,
             observation_spec,
             action_spec,
             env_id=None,
             episode_end_probability=0.1,
             discount=1.0,
             reward_fn=None,
             batch_size=None,
             seed=42,
             render_size=(2, 2, 3),
             min_duration=0,
             max_duration=None):
    """Initializes the environment.

    Args:
        observation_spec (nested TensorSpec): tensor spec for observations.
        action_spec (nested TensorSpec): tensor spec for actions.
        env_id (int): (optional) ID of the environment.
        episode_end_probability (float): Probability an episode will end
            when the environment is stepped.
        discount (float): Discount to set in time_steps.
        reward_fn (Callable): Callable that takes in step_type, action,
            observation(s), and returns a tensor of rewards.
        batch_size (int): (Optional) Number of observations generated per
            call. If this value is not ``None``, then all actions are
            expected to have an additional major axis of size
            ``batch_size``, and all outputs will have an additional major
            axis of size ``batch_size``.
        seed (int): Seed for the rng used in observation generation.
        render_size (tuple of ints): Size of the random render image to
            return when calling render.
        min_duration (int): Number of steps at the beginning of the
            episode during which the episode can not terminate.
        max_duration (int): Optional number of steps after which the
            episode terminates regardless of the termination probability.

    Raises:
        ValueError: If the batch_size argument is not None and does not
            match the shapes of discount or reward.
    """
    self._batch_size = batch_size
    self._observation_spec = observation_spec
    self._action_spec = action_spec
    self._time_step_spec = ds.time_step_spec(self._observation_spec,
                                             action_spec, ts.TensorSpec(()))
    self._episode_end_probability = episode_end_probability
    discount = np.asarray(discount, dtype=np.float32)
    if env_id is None:
        self._env_id = np.int32(0)
    else:
        self._env_id = np.int32(env_id)
    if self._batch_size:
        if not discount.shape:
            discount = np.tile(discount, self._batch_size)
        if self._batch_size != len(discount):
            raise ValueError('Size of discounts must equal the batch size.')
    self._discount = discount
    if reward_fn is None:
        # Return a reward whose size matches the batch size.
        if self._batch_size is None:
            self._reward_fn = lambda *_: np.float32(0)
        else:
            self._reward_fn = (
                lambda *_: np.zeros(self._batch_size, dtype=np.float32))
    else:
        self._reward_fn = reward_fn
    self._done = True
    self._num_steps = 0
    self._min_duration = min_duration
    self._max_duration = max_duration
    self._rng = np.random.RandomState(seed)
    self._render_size = render_size
    super(RandomAlfEnvironment, self).__init__()
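# A short usage sketch under assumed specs (the shapes are illustrative): the
# environment emits random observations, ends episodes with probability
# ``episode_end_probability`` per step once ``min_duration`` steps have
# elapsed, and always terminates by ``max_duration``.
observation_spec = ts.TensorSpec((2, ), torch.float32)
action_spec = ts.BoundedTensorSpec((), dtype=torch.int64, minimum=0, maximum=1)
random_env = RandomAlfEnvironment(
    observation_spec,
    action_spec,
    episode_end_probability=0.2,
    min_duration=2,
    max_duration=100)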
def _test_preprocess_experience(self, train_reward_function, td_steps,
                                reanalyze_ratio, expected):
    """
    The following summarizes how the data is generated:

    .. code-block:: python

        # position:   01234567890123
        step_type0 = 'FMMMLFMMLFMMMM'
        step_type1 = 'FMMMMMLFMMMMLF'
        scale = 1. for the current model
                2. for the target model
        observation = [position] * 3
        reward = (position if train_reward_function and td_steps != -1
                  else position * (step_type == LAST))
        value = 0.5 * position * scale
        action_probs = scale * [position, position + 1, position] for env 0
                       scale * [position + 1, position, position] for env 1
        action = 1 for env 0
                 0 for env 1
    """
    reanalyze_td_steps = 2
    num_unroll_steps = 4
    batch_size = 2
    obs_dim = 3
    observation_spec = alf.TensorSpec([obs_dim])
    action_spec = alf.BoundedTensorSpec((),
                                        minimum=0,
                                        maximum=1,
                                        dtype=torch.int32)
    reward_spec = alf.TensorSpec(())
    time_step_spec = ds.time_step_spec(observation_spec, action_spec,
                                       reward_spec)

    global _mcts_model_id
    _mcts_model_id = 0
    muzero = MuzeroAlgorithm(
        observation_spec,
        action_spec,
        model_ctor=_create_mcts_model,
        mcts_algorithm_ctor=MockMCTSAlgorithm,
        num_unroll_steps=num_unroll_steps,
        td_steps=td_steps,
        train_game_over_function=True,
        train_reward_function=train_reward_function,
        reanalyze_ratio=reanalyze_ratio,
        reanalyze_td_steps=reanalyze_td_steps,
        data_transformer_ctor=partial(FrameStacker, stack_size=2))

    data_transformer = FrameStacker(observation_spec, stack_size=2)
    time_step = common.zero_tensor_from_nested_spec(time_step_spec,
                                                    batch_size)
    dt_state = common.zero_tensor_from_nested_spec(
        data_transformer.state_spec, batch_size)
    state = muzero.get_initial_predict_state(batch_size)
    transformed_time_step, dt_state = data_transformer.transform_timestep(
        time_step, dt_state)
    alg_step = muzero.rollout_step(transformed_time_step, state)
    alg_step_spec = dist_utils.extract_spec(alg_step)

    experience = ds.make_experience(time_step, alg_step, state)
    experience_spec = ds.make_experience(time_step_spec, alg_step_spec,
                                         muzero.train_state_spec)
    replay_buffer = ReplayBuffer(
        data_spec=experience_spec,
        num_environments=batch_size,
        max_length=16,
        keep_episodic_info=True)

    #             01234567890123
    step_type0 = 'FMMMLFMMLFMMMM'
    step_type1 = 'FMMMMMLFMMMMLF'
    dt_state = common.zero_tensor_from_nested_spec(
        data_transformer.state_spec, batch_size)
    for i in range(len(step_type0)):
        step_type = [step_type0[i], step_type1[i]]
        step_type = [
            ds.StepType.MID if c == 'M' else
            (ds.StepType.FIRST if c == 'F' else ds.StepType.LAST)
            for c in step_type
        ]
        step_type = torch.tensor(step_type, dtype=torch.int32)
        reward = torch.full([batch_size], float(i))
        if not train_reward_function or td_steps == -1:
            reward = reward * (step_type == ds.StepType.LAST).to(
                torch.float32)
        time_step = time_step._replace(
            discount=(step_type != ds.StepType.LAST).to(torch.float32),
            step_type=step_type,
            observation=torch.tensor([[i, i + 1, i], [i + 1, i, i]],
                                     dtype=torch.float32),
            reward=reward,
            env_id=torch.arange(batch_size, dtype=torch.int32))
        transformed_time_step, dt_state = data_transformer.transform_timestep(
            time_step, dt_state)
        alg_step = muzero.rollout_step(transformed_time_step, state)
        experience = ds.make_experience(time_step, alg_step, state)
        replay_buffer.add_batch(experience)
        state = alg_step.state

    env_ids = torch.tensor([0] * 14 + [1] * 14, dtype=torch.int64)
    positions = torch.tensor(
        list(range(14)) + list(range(14)), dtype=torch.int64)
    experience = replay_buffer.get_field(None,
                                         env_ids.unsqueeze(-1).cpu(),
                                         positions.unsqueeze(-1).cpu())
    experience = experience._replace(
        replay_buffer=replay_buffer,
        batch_info=BatchInfo(env_ids=env_ids, positions=positions),
        rollout_info_field='rollout_info')
    processed_experience = muzero.preprocess_experience(experience)

    import pprint
    pprint.pprint(processed_experience.rollout_info)
    alf.nest.map_structure(lambda x, y: self.assertEqual(x, y),
                           processed_experience.rollout_info, expected)
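# A standalone restatement (pure torch; the helper name is hypothetical) of
# the reward rule quoted in the docstring above: when a reward function is
# trained and ``td_steps != -1``, every step carries its position as reward;
# otherwise only LAST steps do.
def _reward_at(position, step_type, train_reward_function, td_steps):
    reward = torch.full([2], float(position))  # batch_size == 2 in this test
    if not train_reward_function or td_steps == -1:
        # Only terminal steps carry reward in this regime.
        reward = reward * (step_type == ds.StepType.LAST).to(torch.float32)
    return reward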
def test_preprocess_experience(self):
    """
    The following summarizes how the data is generated:

    .. code-block:: python

        # position:   01234567890123
        step_type0 = 'FMMMLFMMLFMMMM'
        step_type1 = 'FMMMMMLFMMMMLF'
        reward = position
        action = t + 1 for env 0
                 t for env 1
    """
    num_unroll_steps = 4
    batch_size = 2
    obs_dim = 3
    observation_spec = alf.TensorSpec([obs_dim])
    action_spec = alf.BoundedTensorSpec((1, ),
                                        minimum=0,
                                        maximum=1,
                                        dtype=torch.float32)
    reward_spec = alf.TensorSpec(())
    time_step_spec = ds.time_step_spec(observation_spec, action_spec,
                                       reward_spec)

    repr_learner = PredictiveRepresentationLearner(
        observation_spec,
        action_spec,
        num_unroll_steps=num_unroll_steps,
        decoder_ctor=partial(
            SimpleDecoder,
            target_field='reward',
            decoder_net_ctor=partial(
                EncodingNetwork, fc_layer_params=(4, ))),
        encoding_net_ctor=LSTMEncodingNetwork,
        dynamics_net_ctor=LSTMEncodingNetwork)

    time_step = common.zero_tensor_from_nested_spec(time_step_spec,
                                                    batch_size)
    state = repr_learner.get_initial_predict_state(batch_size)
    alg_step = repr_learner.rollout_step(time_step, state)
    alg_step = alg_step._replace(output=torch.tensor([[1.], [0.]]))
    alg_step_spec = dist_utils.extract_spec(alg_step)

    experience = ds.make_experience(time_step, alg_step, state)
    experience_spec = ds.make_experience(time_step_spec, alg_step_spec,
                                         repr_learner.train_state_spec)
    replay_buffer = ReplayBuffer(
        data_spec=experience_spec,
        num_environments=batch_size,
        max_length=16,
        keep_episodic_info=True)

    #             01234567890123
    step_type0 = 'FMMMLFMMLFMMMM'
    step_type1 = 'FMMMMMLFMMMMLF'
    for i in range(len(step_type0)):
        step_type = [step_type0[i], step_type1[i]]
        step_type = [
            ds.StepType.MID if c == 'M' else
            (ds.StepType.FIRST if c == 'F' else ds.StepType.LAST)
            for c in step_type
        ]
        step_type = torch.tensor(step_type, dtype=torch.int32)
        reward = torch.full([batch_size], float(i))
        time_step = time_step._replace(
            discount=(step_type != ds.StepType.LAST).to(torch.float32),
            step_type=step_type,
            observation=torch.tensor([[i, i + 1, i], [i + 1, i, i]],
                                     dtype=torch.float32),
            reward=reward,
            env_id=torch.arange(batch_size, dtype=torch.int32))
        alg_step = repr_learner.rollout_step(time_step, state)
        alg_step = alg_step._replace(output=i + torch.tensor([[1.], [0.]]))
        experience = ds.make_experience(time_step, alg_step, state)
        replay_buffer.add_batch(experience)
        state = alg_step.state

    env_ids = torch.tensor([0] * 14 + [1] * 14, dtype=torch.int64)
    positions = torch.tensor(
        list(range(14)) + list(range(14)), dtype=torch.int64)
    experience = replay_buffer.get_field(None,
                                         env_ids.unsqueeze(-1).cpu(),
                                         positions.unsqueeze(-1).cpu())
    experience = experience._replace(
        replay_buffer=replay_buffer,
        batch_info=BatchInfo(env_ids=env_ids, positions=positions),
        rollout_info_field='rollout_info')
    processed_experience = repr_learner.preprocess_experience(experience)
    pprint.pprint(processed_experience.rollout_info)

    # yapf: disable
    expected = PredictiveRepresentationLearnerInfo(
        action=torch.tensor(
            [[[ 1.,  2.,  3.,  4.,  5.]],
             [[ 2.,  3.,  4.,  5.,  5.]],
             [[ 3.,  4.,  5.,  5.,  5.]],
             [[ 4.,  5.,  5.,  5.,  5.]],
             [[ 5.,  5.,  5.,  5.,  5.]],
             [[ 6.,  7.,  8.,  9.,  9.]],
             [[ 7.,  8.,  9.,  9.,  9.]],
             [[ 8.,  9.,  9.,  9.,  9.]],
             [[ 9.,  9.,  9.,  9.,  9.]],
             [[10., 11., 12., 13., 14.]],
             [[11., 12., 13., 14., 14.]],
             [[12., 13., 14., 14., 14.]],
             [[13., 14., 14., 14., 14.]],
             [[14., 14., 14., 14., 14.]],
             [[ 0.,  1.,  2.,  3.,  4.]],
             [[ 1.,  2.,  3.,  4.,  5.]],
             [[ 2.,  3.,  4.,  5.,  6.]],
             [[ 3.,  4.,  5.,  6.,  6.]],
             [[ 4.,  5.,  6.,  6.,  6.]],
             [[ 5.,  6.,  6.,  6.,  6.]],
             [[ 6.,  6.,  6.,  6.,  6.]],
             [[ 7.,  8.,  9., 10., 11.]],
             [[ 8.,  9., 10., 11., 12.]],
             [[ 9., 10., 11., 12., 12.]],
             [[10., 11., 12., 12., 12.]],
             [[11., 12., 12., 12., 12.]],
             [[12., 12., 12., 12., 12.]],
             [[13., 13., 13., 13., 13.]]]).unsqueeze(-1),
        mask=torch.tensor(
            [[[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True,  True]],
             [[ True,  True,  True,  True, False]],
             [[ True,  True,  True, False, False]],
             [[ True,  True, False, False, False]],
             [[ True, False, False, False, False]],
             [[ True, False, False, False, False]]]),
        target=torch.tensor(
            [[[ 0.,  1.,  2.,  3.,  4.]],
             [[ 1.,  2.,  3.,  4.,  4.]],
             [[ 2.,  3.,  4.,  4.,  4.]],
             [[ 3.,  4.,  4.,  4.,  4.]],
             [[ 4.,  4.,  4.,  4.,  4.]],
             [[ 5.,  6.,  7.,  8.,  8.]],
             [[ 6.,  7.,  8.,  8.,  8.]],
             [[ 7.,  8.,  8.,  8.,  8.]],
             [[ 8.,  8.,  8.,  8.,  8.]],
             [[ 9., 10., 11., 12., 13.]],
             [[10., 11., 12., 13., 13.]],
             [[11., 12., 13., 13., 13.]],
             [[12., 13., 13., 13., 13.]],
             [[13., 13., 13., 13., 13.]],
             [[ 0.,  1.,  2.,  3.,  4.]],
             [[ 1.,  2.,  3.,  4.,  5.]],
             [[ 2.,  3.,  4.,  5.,  6.]],
             [[ 3.,  4.,  5.,  6.,  6.]],
             [[ 4.,  5.,  6.,  6.,  6.]],
             [[ 5.,  6.,  6.,  6.,  6.]],
             [[ 6.,  6.,  6.,  6.,  6.]],
             [[ 7.,  8.,  9., 10., 11.]],
             [[ 8.,  9., 10., 11., 12.]],
             [[ 9., 10., 11., 12., 12.]],
             [[10., 11., 12., 12., 12.]],
             [[11., 12., 12., 12., 12.]],
             [[12., 12., 12., 12., 12.]],
             [[13., 13., 13., 13., 13.]]]))
    # yapf: enable

    alf.nest.map_structure(lambda x, y: self.assertEqual(x, y),
                           processed_experience.rollout_info, expected)
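# A self-contained sketch (hypothetical helper, pure Python) of where the
# expected ``target`` rows above come from: each position is unrolled
# ``num_unroll_steps`` times, clamping at the episode's LAST step (or at the
# end of the buffer) so clamped entries repeat; the ``mask`` rows flag the
# entries that were not clamped.
def unrolled_positions(step_types, start, num_unroll_steps):
    # Find the boundary of the episode containing ``start``: the next 'L',
    # or the final buffer position if the episode is truncated.
    end = start
    while end < len(step_types) - 1 and step_types[end] != 'L':
        end += 1
    return [min(start + k, end) for k in range(num_unroll_steps + 1)]

# Env 1 unrolled from position 4 hits positions [4, 5, 6, 6, 6]; since
# target == reward == position here, this matches the corresponding
# ``target`` row above.
assert unrolled_positions('FMMMMMLFMMMMLF', 4, 4) == [4, 5, 6, 6, 6]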