def _train(self, experience, weights=None):
  del weights  # unused
  experience = self._as_trajectory(experience)

  reward, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.reward, self._time_step_spec.reward)
  action, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.action, self._action_spec)
  observation, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.observation, self._time_step_spec.observation)
  policy_choice, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.policy_info[mixture_policy.MIXTURE_AGENT_ID],
      self._time_step_spec.reward)
  original_infos, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.policy_info[mixture_policy.SUBPOLICY_INFO],
      self._original_info_spec)

  partitioned_nested_infos = nest_utils.batch_nested_tensors(
      _dynamic_partition_of_nested_tensors(original_infos, policy_choice,
                                           self._num_agents))

  partitioned_nested_rewards = [
      nest_utils.batch_nested_tensors(t)
      for t in _dynamic_partition_of_nested_tensors(reward, policy_choice,
                                                    self._num_agents)
  ]
  partitioned_nested_actions = [
      nest_utils.batch_nested_tensors(t)
      for t in _dynamic_partition_of_nested_tensors(action, policy_choice,
                                                    self._num_agents)
  ]
  partitioned_nested_observations = [
      nest_utils.batch_nested_tensors(t)
      for t in _dynamic_partition_of_nested_tensors(observation, policy_choice,
                                                    self._num_agents)
  ]

  loss = 0
  for k in range(self._num_agents):
    per_policy_experience = trajectory.single_step(
        observation=partitioned_nested_observations[k],
        action=partitioned_nested_actions[k],
        policy_info=partitioned_nested_infos[k],
        reward=partitioned_nested_rewards[k],
        discount=tf.zeros_like(partitioned_nested_rewards[k]))
    loss_info = self._agents[k].train(per_policy_experience)
    loss += loss_info.loss
  common.function_in_tf1()(self._update_mixture_distribution)(experience)
  return tf_agent.LossInfo(loss=(loss), extra=())
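# A minimal sketch of the per-agent partitioning that the `_train` method above
# relies on. The helper name `_dynamic_partition_of_nested_tensors` and the
# surrounding specs are taken from that snippet; this standalone version only
# illustrates the underlying tf.dynamic_partition call that routes each
# flattened reward/action/observation to the sub-agent recorded in the policy
# info. It is an assumption-labelled illustration, not the library's helper.
import tensorflow as tf


def partition_by_agent(values, policy_choice, num_agents):
  # tf.dynamic_partition splits `values` along axis 0 into `num_agents`
  # groups, where row i goes to partition policy_choice[i].
  return tf.dynamic_partition(values, tf.cast(policy_choice, tf.int32),
                              num_agents)


rewards = tf.constant([1.0, 0.0, 2.0, 3.0])
policy_choice = tf.constant([0, 1, 0, 1])  # which sub-agent produced each step
per_agent_rewards = partition_by_agent(rewards, policy_choice, num_agents=2)
# per_agent_rewards[0] == [1.0, 2.0], per_agent_rewards[1] == [0.0, 3.0]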
def testTrainMaskingRewardMultipleEpisodesRewardOnLast(self):
  # Test that train reacts correctly to experience when there are:
  #   * Multiple MDP episodes
  #   * Rewards on the tf.StepType.LAST transitions
  #
  # F, L, M = ts.StepType.{FIRST, MID, LAST} in the chart below.
  #
  # Experience looks like this:
  # Trajectories: (F, L) -> (L, F) -> (F, L) -> (L, F)
  # observation : [1, 2]    [1, 2]    [1, 2]    [1, 2]
  # action      : [0]       [1]       [2]       [3]
  # reward      : 0         3         0         4
  # ~is_boundary: 1         0         1         0
  # is_last     : 1         0         1         0
  # valid reward: 0*1       3*0       0*1       4*0
  #
  # The second & fourth action & reward should be masked out due to being on a
  # boundary (step_type=(L, F)) transition.
  #
  # The expected_loss is 0.0 in this case.
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=True),
      optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      use_advantage_loss=False,
      normalize_returns=False,
  )

  step_type = tf.constant([
      ts.StepType.FIRST, ts.StepType.LAST, ts.StepType.FIRST, ts.StepType.LAST
  ])
  next_step_type = tf.constant([
      ts.StepType.LAST, ts.StepType.FIRST, ts.StepType.LAST, ts.StepType.FIRST
  ])
  reward = tf.constant([0, 3, 0, 4], dtype=tf.float32)
  discount = tf.constant([1, 0, 1, 0], dtype=tf.float32)
  observations = tf.constant([[1, 2], [1, 2], [1, 2], [1, 2]],
                             dtype=tf.float32)
  actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)

  experience = nest_utils.batch_nested_tensors(
      trajectory.Trajectory(step_type, observations, actions, (),
                            next_step_type, reward, discount))

  # Rewards on the StepType.LAST should be counted.
  expected_loss = 0.0

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_info = self.evaluate(loss)
  self.assertAllClose(loss_info.loss, expected_loss)
def _distribution(self, time_step, policy_state):
  batched = nest_utils.is_batched_nested_tensors(time_step,
                                                 self._time_step_spec)
  if not batched:
    time_step = nest_utils.batch_nested_tensors(time_step)

  policy_dist_step = self._wrapped_policy.distribution(time_step, policy_state)
  policy_state = policy_dist_step.state
  policy_info = policy_dist_step.info
  policy_logits = policy_dist_step.action.logits_parameter()
  action_size = tf.shape(policy_logits)[-1]

  greedy_probs = tf.one_hot(tf.argmax(policy_logits, -1), action_size)
  uniform_probs = (
      tf.ones(tf.shape(policy_logits)) / tf.cast(action_size, tf.float32))
  epsilon = self._get_epsilon()
  mixed_probs = (1 - epsilon) * greedy_probs + epsilon * uniform_probs

  if not batched:
    mixed_probs = tf.squeeze(mixed_probs, 0)
    policy_state = nest_utils.unbatch_nested_tensors(policy_state)
    policy_info = nest_utils.unbatch_nested_tensors(policy_info)

  mixed_dist = tfp.distributions.Categorical(
      probs=mixed_probs, dtype=policy_dist_step.action.dtype)
  return policy_step.PolicyStep(mixed_dist, policy_state, policy_info)
def _action(self, time_step, policy_state, seed):
  del seed

  def _mode(dist, spec):
    action = dist.mode()
    return tf.reshape(action, [-1] + spec.shape.as_list())

  # TODO(oars): Remove batched data checks when tf_env is batched.
  time_step_batched = nest_utils.is_batched_nested_tensors(
      time_step, self._time_step_spec)
  if not time_step_batched:
    time_step = nest_utils.batch_nested_tensors(time_step,
                                                self._time_step_spec)

  distribution_step = self._wrapped_policy.distribution(time_step, policy_state)
  actions = nest.map_structure(_mode, distribution_step.action,
                               self._action_spec)

  if not time_step_batched:
    actions = nest_utils.unbatch_nested_tensors(actions, self._action_spec)

  return policy_step.PolicyStep(actions, distribution_step.state,
                                distribution_step.info)
def testBatchedSingleTensor(self):
  tensor = tf.zeros([5, 2, 3], dtype=tf.float32)
  spec = tensor_spec.TensorSpec([2, 3], dtype=tf.float32)

  batched_tensor = nest_utils.batch_nested_tensors(tensor, spec)

  self.assertEqual(batched_tensor.shape.as_list(), [5, 2, 3])
def get_passable(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_tensors(
        tf.cast(self._envs[0].passable, tf.float32))
  else:
    # tf.stack expects a list of tensors, so build one entry per environment.
    return tf.stack(
        [tf.cast(env.passable, tf.float32) for env in self._envs])
def _action(self, time_step, policy_state, seed):
  if seed is not None:
    raise NotImplementedError(
        'seed is not supported; but saw seed: {}'.format(seed))

  def _action_fn(*flattened_time_step_and_policy_state):
    packed_py_time_step, packed_py_policy_state = tf.nest.pack_sequence_as(
        structure=(self._py_policy.time_step_spec,
                   self._py_policy.policy_state_spec),
        flat_sequence=flattened_time_step_and_policy_state)
    py_action_step = self._py_policy.action(
        time_step=packed_py_time_step, policy_state=packed_py_policy_state)
    return tf.nest.flatten(py_action_step)

  with tf.name_scope('action'):
    flattened_input_tensors = tf.nest.flatten(
        (nest_utils.unbatch_nested_tensors(time_step), policy_state))
    flat_action_step = tf.compat.v1.py_func(
        _action_fn,
        flattened_input_tensors,
        self._policy_step_dtypes,
        stateful=True,
        name='action_py_func')
    action_step = tf.nest.pack_sequence_as(
        structure=self.policy_step_spec, flat_sequence=flat_action_step)
    return action_step._replace(
        action=nest_utils.batch_nested_tensors(action_step.action))
def get_distance_to_goal(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_tensors(
        tf.cast(self._envs[0].distance_to_goal, tf.float32))
  else:
    # tf.stack expects a list of tensors, so build one entry per environment.
    return tf.stack(
        [tf.cast(env.distance_to_goal, tf.float32) for env in self._envs])
def get_num_blocks(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_tensors(
        tf.cast(self._envs[0].n_clutter_placed, tf.float32))
  else:
    # tf.stack expects a list of tensors, so build one entry per environment.
    return tf.stack(
        [tf.cast(env.n_clutter_placed, tf.float32) for env in self._envs])
def testTrainMaskingPartialEpisodeMultipleEpisodesRewardOnFirst(self):
  # Test that train reacts correctly to experience when there are:
  #   * Multiple MDP episodes
  #   * Rewards on the tf.StepType.FIRST transitions
  #   * Partial episode at end of experience
  #
  # F, L, M = ts.StepType.{FIRST, MID, LAST} in the chart below.
  #
  # Experience looks like this:
  # Trajectories: (F, L) -> (L, F) -> (F, M) -> (M, M)
  # observation : [1, 2]    [1, 2]    [1, 2]    [1, 2]
  # action      : [0]       [1]       [2]       [3]
  # reward      : 3         0         4         0
  # ~is_boundary: 1         0         1         1
  # is_last     : 1         0         0         0
  # valid reward: 3*1       0*0       4*0       0*0
  #
  # The second action & reward should be masked out due to being on a
  # boundary (step_type=(L, F)) transition. The third & fourth transitions
  # should get masked out for everything due to it being an incomplete episode
  # (notice there is no trailing step_type=(F, L)).
  #
  # The expected_loss is > 0.0 in this case, matching the expected_loss of the
  # testMaskingRewardSingleEpisodeRewardOnFirst policy_gradient_loss test,
  # because the partial second episode should be masked out.
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=True),
      optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      use_advantage_loss=False,
      normalize_returns=False,
  )

  step_type = tf.constant([
      ts.StepType.FIRST, ts.StepType.LAST, ts.StepType.FIRST, ts.StepType.MID
  ])
  next_step_type = tf.constant([
      ts.StepType.LAST, ts.StepType.FIRST, ts.StepType.MID, ts.StepType.MID
  ])
  reward = tf.constant([3, 0, 4, 0], dtype=tf.float32)
  discount = tf.constant([1, 0, 1, 0], dtype=tf.float32)
  observations = tf.constant([[1, 2], [1, 2], [1, 2], [1, 2]],
                             dtype=tf.float32)
  actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)

  experience = nest_utils.batch_nested_tensors(
      trajectory.Trajectory(step_type, observations, actions, (),
                            next_step_type, reward, discount))

  # Rewards on the StepType.FIRST should be counted.
  expected_loss = 10.8935775757

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_info = self.evaluate(loss)
  self.assertAllClose(loss_info.loss, expected_loss)
def get_deliberate_placement(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_tensors(
        tf.cast(self._envs[0].deliberate_agent_placement, tf.float32))
  else:
    # tf.stack expects a list of tensors, so build one entry per environment.
    return tf.stack([
        tf.cast(env.deliberate_agent_placement, tf.float32)
        for env in self._envs
    ])
def get_shortest_path_length(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_tensors(
        tf.cast(self._envs[0].shortest_path_length, tf.float32))
  else:
    # tf.stack expects a list of tensors, so build one entry per environment.
    return tf.stack(
        [tf.cast(env.shortest_path_length, tf.float32) for env in self._envs])
def testTrainMaskingRewardMultipleBanditEpisodes(self):
  # Test that train reacts correctly to experience when there are multiple
  # Bandit episodes. Bandit episodes are encoded differently than
  # MDP episodes. They (each) have only a single transition with
  # step_type=StepType.FIRST and next_step_type=StepType.LAST. This test
  # helps ensure that LAST->FIRST->LAST transitions are handled correctly.
  #
  # F, L, M = ts.StepType.{FIRST, MID, LAST} in the chart below.
  #
  # Experience looks like this:
  # Trajectories: (F, L) -> (F, L)
  # observation : [1, 2]    [1, 2]
  # action      : [0]       [2]
  # reward      : 3         4
  # ~is_boundary: 0         0
  # is_last     : 1         1
  # valid reward: 3*1       4*1
  #
  # All bandit transitions are valid and none are masked.
  #
  # The expected_loss is > 0.0 in this case, matching the expected_loss of the
  # testMaskingRewardMultipleEpisodesRewardOnFirst policy_gradient_loss test.
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=True),
      optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      use_advantage_loss=False,
      normalize_returns=False,
  )

  step_type = tf.constant([ts.StepType.FIRST, ts.StepType.FIRST])
  next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.LAST])
  reward = tf.constant([3, 4], dtype=tf.float32)
  discount = tf.constant([0, 0], dtype=tf.float32)
  observations = tf.constant([[1, 2], [1, 2]], dtype=tf.float32)
  actions = tf.constant([[0], [2]], dtype=tf.float32)

  experience = nest_utils.batch_nested_tensors(
      trajectory.Trajectory(step_type, observations, actions, (),
                            next_step_type, reward, discount))

  # Rewards on the StepType.FIRST should be counted.
  expected_loss = 12.2091741562

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_info = self.evaluate(loss)
  self.assertAllClose(loss_info.loss, expected_loss)
def testBatchNestedTensors(self):
  shape = [2, 3]
  batch_shape = [1] + shape

  specs = self.nest_spec(shape)
  tensors = self.zeros_from_spec(specs)
  tf.nest.assert_same_structure(tensors, specs)

  batched_tensors = nest_utils.batch_nested_tensors(tensors, specs)
  tf.nest.assert_same_structure(specs, batched_tensors)

  assert_shapes = lambda t: self.assertEqual(t.shape.as_list(), batch_shape)
  tf.nest.map_structure(assert_shapes, batched_tensors)
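# A small standalone sketch of what the test above exercises (outside the test
# class, with a made-up dict spec): batch_nested_tensors adds a leading batch
# dimension of 1 to every tensor in a nest, and unbatch_nested_tensors removes
# it again.
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.utils import nest_utils

specs = {
    'obs': tensor_spec.TensorSpec([2, 3], tf.float32),
    'mask': tensor_spec.TensorSpec([4], tf.int32),
}
tensors = tf.nest.map_structure(
    lambda s: tf.zeros(s.shape, dtype=s.dtype), specs)

batched = nest_utils.batch_nested_tensors(tensors, specs)
# batched['obs'].shape == [1, 2, 3], batched['mask'].shape == [1, 4]
unbatched = nest_utils.unbatch_nested_tensors(batched, specs)
# unbatched['obs'].shape == [2, 3] again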
def testRandomPyPolicyGeneratesActionTensors(self):
  array_action_spec = array_spec.BoundedArraySpec((7,), np.int32, -10, 10)

  observation = tf.ones([3], tf.float32)
  time_step = ts.restart(observation)
  observation_spec = tensor_spec.TensorSpec.from_tensor(observation)
  time_step_spec = ts.time_step_spec(observation_spec)

  tf_py_random_policy = tf_py_policy.TFPyPolicy(
      random_py_policy.RandomPyPolicy(
          time_step_spec=time_step_spec, action_spec=array_action_spec))

  batched_time_step = nest_utils.batch_nested_tensors(time_step)
  action_step = tf_py_random_policy.action(time_step=batched_time_step)
  action, new_policy_state = self.evaluate(
      [action_step.action, action_step.state])

  self.assertEqual((1,) + array_action_spec.shape, action.shape)
  self.assertTrue(np.all(action >= array_action_spec.minimum))
  self.assertTrue(np.all(action <= array_action_spec.maximum))
  self.assertEqual(new_policy_state, ())
def _apply_actor_network(self, time_step, step_type, policy_state, mask=None):
  observation = time_step
  if self._observation_normalizer:
    observation = self._observation_normalizer.normalize(observation)
  if tf.is_tensor(observation):
    if not nest_utils.is_batched_nested_tensors(
        observation, self.time_step_spec.observation):
      observation = nest_utils.batch_nested_tensors(observation)
  else:
    if not nest_utils.get_outer_array_shape(
        observation, self.time_step_spec.observation):
      observation = nest_utils.batch_nested_array(observation)
  alpha = np.array([self.alpha])[None]
  return self._actor_network((observation, alpha),
                             step_type,
                             policy_state,
                             training=self._training)
def _distribution(self, time_step, policy_state):
  batched = nest_utils.is_batched_nested_tensors(time_step,
                                                 self._time_step_spec)
  if not batched:
    time_step = nest_utils.batch_nested_tensors(time_step)

  policy_dist_step = self._wrapped_policy.distribution(time_step, policy_state)
  policy_state = policy_dist_step.state
  policy_mean_action = policy_dist_step.action.mean()
  policy_info = policy_dist_step.info

  if not batched:
    policy_state = nest_utils.unbatch_nested_tensors(policy_state)
    policy_mean_action = nest_utils.unbatch_nested_tensors(policy_mean_action)
    policy_info = nest_utils.unbatch_nested_tensors(policy_info)

  gaussian_dist = tfp.distributions.MultivariateNormalDiag(
      loc=policy_mean_action,
      scale_diag=tf.ones_like(policy_mean_action) * self._scale)

  return policy_step.PolicyStep(gaussian_dist, policy_state, policy_info)
def relabel_function(cur_episode, last_step, reward_fn, full_buffer):
  all_data = cur_episode.gather_all()

  # add all actual interaction to the replay buffer
  all_data = nest_utils.unbatch_nested_tensors(all_data)
  for cur_trajectory in nest_utils.unstack_nested_tensors(
      all_data, full_buffer.data_spec):
    # was already added by previous iteration
    if cur_trajectory.step_type.numpy() != 2:
      full_buffer.add_batch(nest_utils.batch_nested_tensors(cur_trajectory))

  last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
      step_type=tf.constant(2),
      observation=last_step.observation[0],
      next_step_type=tf.constant(0),
      reward=tf.constant(0.0),
      discount=tf.constant(1., dtype=tf.float32))
  full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))

  def _relabel_given_goal(relabel_goal):
    obs_dim = relabel_goal.shape[0]
    all_trajectories = nest_utils.unstack_nested_tensors(
        all_data, full_buffer.data_spec)
    last_traj_idx = len(all_trajectories)
    for traj_idx, cur_trajectory in enumerate(all_trajectories):
      if cur_trajectory.step_type.numpy() != 2:
        new_obs = tf.concat(
            [cur_trajectory.observation[:obs_dim], relabel_goal], axis=0)
        if traj_idx == len(all_trajectories) - 1:
          next_obs = tf.concat(
              [last_step.observation[0, :obs_dim], relabel_goal], axis=0)
        else:
          next_obs = tf.concat([
              all_trajectories[traj_idx + 1].observation[:obs_dim],
              relabel_goal
          ], axis=0)
        new_reward = tf.constant(reward_fn(obs=next_obs))

        # terminate episode
        if new_reward.numpy() > 0.0:
          new_traj = cur_trajectory._replace(
              observation=new_obs,
              next_step_type=tf.constant(2),
              reward=new_reward,
              discount=tf.constant(0., dtype=tf.float32))
          last_traj_idx = traj_idx + 1
          full_buffer.add_batch(nest_utils.batch_nested_tensors(new_traj))
          break
        else:
          new_traj = cur_trajectory._replace(
              observation=new_obs,
              reward=new_reward,
          )
          full_buffer.add_batch(nest_utils.batch_nested_tensors(new_traj))

    if last_traj_idx == len(all_trajectories):
      last_observation = tf.concat(
          [last_step.observation[0, :obs_dim], relabel_goal], axis=0)
    else:
      last_observation = tf.concat([
          all_trajectories[last_traj_idx].observation[:obs_dim], relabel_goal
      ], axis=0)

    last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
        step_type=tf.constant(2),
        observation=last_observation,
        next_step_type=tf.constant(0),
        reward=tf.constant(0.0),
        discount=tf.constant(1., dtype=tf.float32))
    full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))

  # relabel with last time step achieved in the episode
  if FLAGS.goal_relabel_type == 0 or (FLAGS.goal_relabel_type == 1 and
                                      last_step.reward.numpy()[0] <= 0.):
    obs_dim = last_step.observation.shape[1] // 2
    _relabel_given_goal(last_step.observation[0, :obs_dim])
  elif FLAGS.goal_relabel_type == 2 and last_step.reward.numpy()[0] <= 0.:
    goals = [
        [1.2, 0., 2.5, 0., -1., -1.],
        [2., 0., 2.4, 0., 0., 0.],
        [0.8, 0., 1.2, 0., 0., 0.],
        [-0.1, -0.3, 0.3, -0.3, 0., 0.],
        [-0.6, -1., -0.2, -1., 0., 0.],
        [-1.8, -1., -1.4, -1., 0., 0.],
        [-2.8, -0.8, -2.4, -1., -1., -1.],
        [-2.4, 0., -2.4, -1., -1., -1.],
        [-1.2, 0., -2.4, -1., -1., -1.],
        [0.0, 0.0, -2.5, -1, -1., -1.],
    ]
    goals = np.stack(goals).astype('float32')
    print('unrelabelled goal:', last_step.observation[0, 6:].numpy())
    relabel_goal_idxs = np.arange(goals.shape[0])
    np.random.shuffle(relabel_goal_idxs)
    obs_dim = last_step.observation.shape[1] // 2

    relabel_count = 0
    for goal_idx in relabel_goal_idxs:
      chosen_goal = goals[goal_idx]
      if (chosen_goal == last_step.observation[0, obs_dim:].numpy()).all():
        continue
      print('goal for relabelling:', chosen_goal)
      _relabel_given_goal(relabel_goal=tf.constant(chosen_goal))
      relabel_count += 1
      if relabel_count >= FLAGS.num_relabelled_goals:
        break
  else:
    print('not adding relabelled trajectories')
def decode_and_batch_fn(proto):
  """Decodes a proto object and batches the output tensors."""
  sample = decoder(proto)
  return nest_utils.batch_nested_tensors(sample)
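# A hedged usage sketch for decode_and_batch_fn: mapping it over a dataset of
# serialized protos so every decoded sample comes out with a leading batch
# dimension of 1. The feature layout, the toy `decoder`, and the record path
# are assumptions for illustration, not part of the original snippet; the
# function is repeated here so the sketch is self-contained.
import tensorflow as tf
from tf_agents.utils import nest_utils

features = {'reward': tf.io.FixedLenFeature([1], tf.float32)}


def decoder(proto):
  # Stand-in for whatever spec-based decoder the surrounding code builds.
  return tf.io.parse_single_example(proto, features)['reward']


def decode_and_batch_fn(proto):
  """Decodes a proto object and batches the output tensors."""
  sample = decoder(proto)
  return nest_utils.batch_nested_tensors(sample)


dataset = tf.data.TFRecordDataset('episodes.tfrecord')  # hypothetical path
dataset = dataset.map(decode_and_batch_fn)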
def testWrongShapeRaisesValueError(self):
  tensor = tf.zeros([3, 3], dtype=tf.float32)
  spec = tensor_spec.TensorSpec([2, 3], dtype=tf.float32)

  with self.assertRaises(ValueError):
    nest_utils.batch_nested_tensors(tensor, spec)
def data_multiplier(offline_data, reward_fn):

  def _custom_print(some_traj):  # pylint: disable=unused-variable
    np.set_printoptions(precision=2, suppress=True)
    print('step', some_traj.step_type.numpy(), 'obs',
          some_traj.observation.numpy(), 'action', some_traj.action.numpy(),
          'reward', some_traj.reward.numpy(), 'next_step',
          some_traj.next_step_type.numpy(), 'discount',
          some_traj.discount.numpy())

  all_data = nest_utils.unbatch_nested_tensors(offline_data.gather_all())
  all_trajs = nest_utils.unstack_nested_tensors(all_data,
                                                offline_data.data_spec)
  for idx, traj in enumerate(all_trajs):
    # print('index:', idx)
    if traj.step_type.numpy() == 0:
      ep_start_idx = idx
      # print('new start index:', ep_start_idx)
    # TODO(architsh): remove this and change to else:
    # elif idx in [12, 24, 36, 48, 60, 72, 84, 96, 108]:
    else:
      # print('adding new trajectory')
      obs_dim = traj.observation.shape[0] // 2
      relabel_goal = traj.observation[:obs_dim]
      # print('new goal:', relabel_goal)
      last_traj_idx = len(all_trajs[ep_start_idx:idx + 1])
      for traj_idx, cur_trajectory in enumerate(
          all_trajs[ep_start_idx:idx + 1]):
        if cur_trajectory.step_type.numpy() != 2:
          new_obs = tf.concat(
              [cur_trajectory.observation[:obs_dim], relabel_goal], axis=0)
          next_obs = tf.concat([
              all_trajs[ep_start_idx + traj_idx + 1].observation[:obs_dim],
              relabel_goal
          ], axis=0)
          new_reward = tf.constant(reward_fn(obs=next_obs))

          # terminate episode
          if new_reward.numpy() > 0.0:
            new_traj = cur_trajectory._replace(
                observation=new_obs,
                next_step_type=tf.constant(2),
                reward=new_reward,
                discount=tf.constant(0., dtype=tf.float32))
            last_traj_idx = ep_start_idx + traj_idx + 1
            # _custom_print(new_traj)
            offline_data.add_batch(nest_utils.batch_nested_tensors(new_traj))
            break
          else:
            new_traj = cur_trajectory._replace(
                observation=new_obs,
                reward=new_reward,
            )
            # _custom_print(new_traj)
            offline_data.add_batch(nest_utils.batch_nested_tensors(new_traj))

      last_observation = tf.concat(
          [all_trajs[last_traj_idx].observation[:obs_dim], relabel_goal],
          axis=0)
      last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
          step_type=tf.constant(2),
          observation=last_observation,
          next_step_type=tf.constant(0),
          reward=tf.constant(0.0),
          discount=tf.constant(1., dtype=tf.float32))
      # _custom_print(last_traj)
      offline_data.add_batch(nest_utils.batch_nested_tensors(last_traj))
def copy_replay_buffer(small_buffer, big_buffer):
  """Copy small buffer into the big buffer."""
  all_data = nest_utils.unbatch_nested_tensors(small_buffer.gather_all())
  for trajectory in nest_utils.unstack_nested_tensors(
      all_data, big_buffer.data_spec):
    big_buffer.add_batch(nest_utils.batch_nested_tensors(trajectory))
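# A hedged usage sketch for copy_replay_buffer, assuming both buffers are
# TFUniformReplayBuffers with batch_size=1 and a matching data_spec. The spec,
# capacities, and the dummy item are made up for illustration; the original
# code's buffers may differ.
import tensorflow as tf
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec

data_spec = tensor_spec.TensorSpec([3], tf.float32)
episode_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec, batch_size=1, max_length=100)      # short-lived, per-episode
full_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec, batch_size=1, max_length=100000)   # long-lived, accumulates data

episode_buffer.add_batch(tf.ones([1, 3]))          # one batched item
copy_replay_buffer(episode_buffer, full_buffer)    # unpacks, re-batches, adds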
def testTrainMaskingRewardSingleEpisodeRewardOnFirst(self):
  # Test that train reacts correctly to experience when there are:
  #   * A single MDP episode
  #   * Rewards on the tf.StepType.FIRST transitions
  #
  # F, L, M = ts.StepType.{FIRST, MID, LAST} in the chart below.
  #
  # Experience looks like this:
  # Trajectories: (F, L) -> (L, F)
  # observation : [1, 2]    [1, 2]
  # action      : [0]       [1]
  # reward      : 3         4
  # ~is_boundary: 1         0
  # is_last     : 1         0
  # valid reward: 3*1       4*0
  #
  # The second action & reward should be masked out due to being on a
  # boundary (step_type=(L, F)) transition.
  #
  # The expected_loss is > 0.0 in this case, matching the expected_loss of the
  # testMaskingRewardSingleEpisodeRewardOnFirst policy_gradient_loss test.
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=True),
      optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      use_advantage_loss=False,
      normalize_returns=False,
  )

  step_type = tf.constant([ts.StepType.FIRST, ts.StepType.LAST])
  next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.FIRST])
  reward = tf.constant([3, 4], dtype=tf.float32)
  discount = tf.constant([1, 0], dtype=tf.float32)
  observations = tf.constant([[1, 2], [1, 2]], dtype=tf.float32)
  actions = tf.constant([[0], [1]], dtype=tf.float32)

  experience = nest_utils.batch_nested_tensors(
      trajectory.Trajectory(step_type, observations, actions, (),
                            next_step_type, reward, discount))

  # Rewards on the StepType.FIRST should be counted.
  expected_loss = 10.8935775757
  expected_policy_gradient_loss = 10.8935775757
  expected_policy_network_regularization_loss = 0
  expected_entropy_regularization_loss = 0
  expected_value_estimation_loss = 0
  expected_value_network_regularization_loss = 0

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_info = self.evaluate(loss)
  self.assertAllClose(loss_info.loss, expected_loss)
  self.assertAllClose(loss_info.extra.policy_gradient_loss,
                      expected_policy_gradient_loss)
  self.assertAllClose(loss_info.extra.policy_network_regularization_loss,
                      expected_policy_network_regularization_loss)
  self.assertAllClose(loss_info.extra.entropy_regularization_loss,
                      expected_entropy_regularization_loss)
  self.assertAllClose(loss_info.extra.value_estimation_loss,
                      expected_value_estimation_loss)
  self.assertAllClose(loss_info.extra.value_network_regularization_loss,
                      expected_value_network_regularization_loss)
def _get_step(self) -> EnvStep:
  if self._start_on_next_step:
    self._start_new_episode()

  if StepType.is_last(self._step_type):
    # This is the last (terminating) observation of the environment.
    self._start_on_next_step = True
    self._num_total_steps += 1
    self._num_episodes += 1
    # The policy is not run on the terminal step, so we just carry over the
    # reward, action, and policy_info from the previous step.
    return EnvStep(self._step_type,
                   tf.cast(self._cur_step_num, dtype=tf.int64),
                   self._time_step.observation, self._action,
                   self._time_step.reward, self._time_step.discount,
                   self._policy_info, {}, {})

  self._action, self._policy_state, self._policy_info = self._policy.action(
      self._time_step, self._policy_state)
  # Update type of log-probs to tf.float32... a bit of a bug in TF-Agents.
  if hasattr(self._policy_info, 'log_probability'):
    self._policy_info = policy_step.set_log_probability(
        self._policy_info,
        tf.cast(self._policy_info.log_probability, tf.float32))

  # Sample action from policy.
  env_action = self._action
  if self._env.batch_size is not None:
    env_action = nest_utils.batch_nested_tensors(env_action)

  # Sample next step from environment.
  self._next_time_step = self._env.step(env_action)
  if self._env.batch_size is not None:
    self._next_time_step = nest_utils.unbatch_nested_tensors(
        self._next_time_step)
  self._next_step_type = self._next_time_step.step_type
  self._cur_step_num += 1
  if (self._episode_step_limit and
      self._cur_step_num >= self._episode_step_limit):
    self._next_step_type = tf.convert_to_tensor(  # Overwrite step type.
        value=StepType.LAST, dtype=self._first_step_type.dtype)
    self._next_step_type = tf.reshape(self._next_step_type,
                                      tf.shape(self._first_step_type))

  step = EnvStep(
      self._step_type,
      tf.cast(self._cur_step_num - 1, tf.int64),
      self._time_step.observation,
      self._action,
      # Immediate reward given by next time step.
      self._next_time_step.reward,
      self._time_step.discount,
      self._policy_info,
      {},
      {})

  self._num_steps += 1
  self._num_total_steps += 1
  if StepType.is_first(self._step_type):
    self._num_total_episodes += 1

  self._time_step = self._next_time_step
  self._step_type = self._next_step_type

  return step