Example #1
    def _relabel_given_goal(relabel_goal):
        obs_dim = relabel_goal.shape[0]
        all_trajectories = nest_utils.unstack_nested_tensors(
            all_data, full_buffer.data_spec)
        last_traj_idx = len(all_trajectories)
        for traj_idx, cur_trajectory in enumerate(all_trajectories):
            if cur_trajectory.step_type.numpy() != 2:
                new_obs = tf.concat(
                    [cur_trajectory.observation[:obs_dim], relabel_goal],
                    axis=0)

                if traj_idx == len(all_trajectories) - 1:
                    next_obs = tf.concat(
                        [last_step.observation[0, :obs_dim], relabel_goal],
                        axis=0)
                else:
                    next_obs = tf.concat(
                        [all_trajectories[traj_idx + 1].observation[:obs_dim],
                         relabel_goal],
                        axis=0)

                new_reward = tf.constant(reward_fn(obs=next_obs))

                # terminate episode
                if new_reward.numpy() > 0.0:
                    new_traj = cur_trajectory._replace(
                        observation=new_obs,
                        next_step_type=tf.constant(2),
                        reward=new_reward,
                        discount=tf.constant(0., dtype=tf.float32))
                    last_traj_idx = traj_idx + 1
                    full_buffer.add_batch(
                        nest_utils.batch_nested_tensors(new_traj))
                    break
                else:
                    new_traj = cur_trajectory._replace(
                        observation=new_obs,
                        reward=new_reward,
                    )
                    full_buffer.add_batch(
                        nest_utils.batch_nested_tensors(new_traj))

        if last_traj_idx == len(all_trajectories):
            last_observation = tf.concat(
                [last_step.observation[0, :obs_dim], relabel_goal], axis=0)
        else:
            last_observation = tf.concat(
                [all_trajectories[last_traj_idx].observation[:obs_dim],
                 relabel_goal],
                axis=0)

        last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
            step_type=tf.constant(2),
            observation=last_observation,
            next_step_type=tf.constant(0),
            reward=tf.constant(0.0),
            discount=tf.constant(1., dtype=tf.float32))
        full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))
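Example #1 assumes an enclosing scope that provides all_data, full_buffer, last_step and a goal-conditioned reward_fn (the full context appears in Example #6 below). The reward function itself is not shown anywhere on this page; the following is only a hypothetical sketch of a reward_fn that would be compatible with the calls above, assuming observations are a concatenation of [state, goal] halves of equal size:

import tensorflow as tf

def reward_fn(obs, threshold=0.1):
    """Returns 1.0 when the state half of `obs` is within `threshold` of the goal half."""
    obs_dim = obs.shape[0] // 2
    dist = tf.norm(obs[:obs_dim] - obs[obs_dim:])
    # Return a plain Python float so the caller's tf.constant(reward_fn(obs=...)) works.
    return 1.0 if float(dist) < threshold else 0.0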
Example #2
    def testUnstackNestedTensors(self):
        shape = [5, 8]
        batch_size = 7

        specs = self.nest_spec(shape, include_sparse=False)
        batched_tensors = self.zeros_from_spec(specs, batch_size=batch_size)
        tf.nest.assert_same_structure(batched_tensors, specs)

        tensors = nest_utils.unstack_nested_tensors(batched_tensors, specs)
        self.assertEqual(batch_size, len(tensors))

        for t in tensors:
            tf.nest.assert_same_structure(specs, t)
        assert_shapes = lambda t: self.assertEqual(t.shape.as_list(), shape)
        tf.nest.map_structure(assert_shapes, tensors)
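The test above only checks structure and shapes. As a quick illustration of the same round trip, here is a minimal standalone sketch (not taken from any of these examples; the spec and tensor values are assumptions) showing how unstack_nested_tensors splits a batched nest into per-item nests, and how batch_nested_tensors restores an outer dimension of 1 before an item is written back to a buffer:

import tensorflow as tf
from tf_agents.utils import nest_utils

spec = {
    'obs': tf.TensorSpec([3], tf.float32),
    'reward': tf.TensorSpec([], tf.float32),
}
batched = {
    'obs': tf.zeros([4, 3], tf.float32),    # batch of 4 observations
    'reward': tf.zeros([4], tf.float32),    # batch of 4 rewards
}

items = nest_utils.unstack_nested_tensors(batched, spec)
assert len(items) == 4                      # one unbatched nest per batch entry
rebatched = nest_utils.batch_nested_tensors(items[0])
assert rebatched['obs'].shape == (1, 3)     # outer batch dim of 1 restored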
Example #3
        def loop_body(counter, time_step, policy_state):
            """Runs a step in environment.

            The while loop will call this multiple times.

            Args:
              counter: Episode counters per batch index. Shape [batch_size].
              time_step: TimeStep tuple with elements shape [batch_size, ...].
              policy_state: Policy state tensor shape [batch_size, policy_state_dim].
                Pass empty tuple for non-recurrent policies.

            Returns:
              loop_vars for next iteration of tf.while_loop.
            """
            action_step = self.policy.action(time_step, policy_state)

            # TODO: TF2 while_loop seems to either ignore
            # parallel_iterations or doesn't properly propagate control dependencies
            # from one step to the next. Without this dep, self.env.step() is called
            # in parallel.
            with tf.control_dependencies(tf.nest.flatten([time_step])):
                next_time_step = self.env.step(action_step.action)

            policy_state = action_step.state

            if self._is_bandit_env:
                # For Bandits we create episodes of length 1.
                # Since the `next_time_step` is always of type LAST we need to replace
                # the step type of the current `time_step` to FIRST.
                batch_size = tf.shape(input=time_step.discount)
                time_step = time_step._replace(
                    step_type=tf.fill(batch_size, ts.StepType.FIRST))

            traj = trajectory.from_transition(time_step, action_step,
                                              next_time_step)

            observer_ops = [observer(traj) for observer in self._observers]
            transition_observer_ops = [
                observer((time_step, action_step, next_time_step))
                for observer in self._transition_observers
            ]
            with tf.control_dependencies(
                [tf.group(observer_ops + transition_observer_ops)]):
                time_step, next_time_step, policy_state = tf.nest.map_structure(
                    tf.identity, (time_step, next_time_step, policy_state))

            # The while-loop counter is only incremented at episode resets:
            # for Bandits this is every trajectory, for MDPs this is at boundaries.
            if self._is_bandit_env:
                counter += tf.ones(batch_size, dtype=tf.int32)
            else:
                counter += tf.cast(traj.is_boundary(), dtype=tf.int32)

            if not tf.reduce_any(tf.less(counter, 1)):
                # all episodes have finished:
                for ep_id in range(self._num_episodes):
                    episode = self._temp_rb._get_episode(ep_id)
                    if episode.observation['task_agn_rew'][-1] == 1:
                        rew_type = episode.observation['task_agn_rew'].dtype
                        ep_len = episode.observation['task_agn_rew'].shape[0]
                        start = max(-self._ep_history_unsafe, -ep_len)
                        if self._unsafe_label == 'constant':
                            discount = tf.ones((-start, ), dtype=rew_type)
                        elif self._unsafe_label == 'exp':
                            discount = 0.99 ** tf.reverse(
                                tf.range(-start, dtype=rew_type), axis=[0])
                        elif self._unsafe_label == 'linear':
                            discount = (tf.range(-start, dtype=rew_type) + 1) / -start
                        discount = tf.pad(discount, [[ep_len + start, 0]])
                        obs = episode.observation
                        obs['task_agn_rew'] = discount
                        episode = episode._replace(observation=obs)
                    trajs = nest_utils.unstack_nested_tensors(
                        episode, self._final_rb.data_spec)
                    for traj in trajs:
                        self._final_rb.add_batch(traj)
                self._temp_rb.clear()

            return [counter, next_time_step, policy_state]
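The task-agnostic reward relabelling in the block above backfills a label over the last few steps before an unsafe terminal step. As a small standalone sketch of just that labelling logic (the function name and arguments are my own, not from the example), the three schemes produce a tail of constant values, values ramping exponentially toward the unsafe step, or values ramping linearly toward it, left-padded with zeros to the episode length:

import tensorflow as tf

def unsafe_labels(ep_len, history, scheme='exp'):
    """Backfill labels over the last `history` steps of an episode of length `ep_len`."""
    start = max(-history, -ep_len)  # same clipping as in loop_body above
    if scheme == 'constant':
        labels = tf.ones((-start,), dtype=tf.float32)
    elif scheme == 'exp':
        labels = 0.99 ** tf.reverse(tf.range(-start, dtype=tf.float32), axis=[0])
    else:  # 'linear'
        labels = (tf.range(-start, dtype=tf.float32) + 1) / -start
    # Zero out everything before the labelled tail.
    return tf.pad(labels, [[ep_len + start, 0]])

print(unsafe_labels(ep_len=6, history=3, scheme='linear').numpy())
# -> approximately [0., 0., 0., 0.33, 0.67, 1.]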
Example #4
def copy_replay_buffer(small_buffer, big_buffer):
    """Copy small buffer into the big buffer."""
    all_data = nest_utils.unbatch_nested_tensors(small_buffer.gather_all())
    for trajectory in nest_utils.unstack_nested_tensors(
            all_data, big_buffer.data_spec):
        big_buffer.add_batch(nest_utils.batch_nested_tensors(trajectory))
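A hypothetical driver for the helper above (buffer sizes, data_spec, and contents are assumptions, not from the source): with TF-Agents' TFUniformReplayBuffer and a batch size of 1, gather_all() returns tensors with a leading batch dimension of 1, so unbatching and then unstacking yields one item per stored step:

import tensorflow as tf
from tf_agents.replay_buffers import tf_uniform_replay_buffer

data_spec = tf.TensorSpec([3], tf.float32)
small = tf_uniform_replay_buffer.TFUniformReplayBuffer(data_spec, batch_size=1, max_length=8)
big = tf_uniform_replay_buffer.TFUniformReplayBuffer(data_spec, batch_size=1, max_length=64)

small.add_batch(tf.ones([1, 3], tf.float32))        # add_batch expects a leading batch dim
small.add_batch(2.0 * tf.ones([1, 3], tf.float32))

copy_replay_buffer(small, big)                      # uses the definition in Example #4 above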
Example #5
def data_multiplier(offline_data, reward_fn):
    def _custom_print(some_traj):  # pylint: disable=unused-variable
        np.set_printoptions(precision=2, suppress=True)
        print('step', some_traj.step_type.numpy(), 'obs',
              some_traj.observation.numpy(),
              'action', some_traj.action.numpy(), 'reward',
              some_traj.reward.numpy(), 'next_step',
              some_traj.next_step_type.numpy(), 'discount',
              some_traj.discount.numpy())

    all_data = nest_utils.unbatch_nested_tensors(offline_data.gather_all())
    all_trajs = nest_utils.unstack_nested_tensors(all_data,
                                                  offline_data.data_spec)

    for idx, traj in enumerate(all_trajs):
        # print('index:', idx)
        if traj.step_type.numpy() == 0:
            ep_start_idx = idx
            # print('new start index:', ep_start_idx)
        # TODO(architsh): remove this and change to else:
        # elif idx in [12, 24, 36, 48, 60, 72, 84, 96, 108]:
        else:
            # print('adding new trajectory')
            obs_dim = traj.observation.shape[0] // 2
            relabel_goal = traj.observation[:obs_dim]
            # print('new goal:', relabel_goal)

            last_traj_idx = len(all_trajs[ep_start_idx:idx + 1])
            for traj_idx, cur_trajectory in enumerate(
                    all_trajs[ep_start_idx:idx + 1]):
                if cur_trajectory.step_type.numpy() != 2:
                    new_obs = tf.concat(
                        [cur_trajectory.observation[:obs_dim], relabel_goal],
                        axis=0)

                    next_obs = tf.concat(
                        [all_trajs[ep_start_idx + traj_idx + 1].observation[:obs_dim],
                         relabel_goal],
                        axis=0)

                    new_reward = tf.constant(reward_fn(obs=next_obs))
                    # terminate episode
                    if new_reward.numpy() > 0.0:
                        new_traj = cur_trajectory._replace(
                            observation=new_obs,
                            next_step_type=tf.constant(2),
                            reward=new_reward,
                            discount=tf.constant(0., dtype=tf.float32))
                        last_traj_idx = ep_start_idx + traj_idx + 1
                        # _custom_print(new_traj)
                        offline_data.add_batch(
                            nest_utils.batch_nested_tensors(new_traj))
                        break
                    else:
                        new_traj = cur_trajectory._replace(
                            observation=new_obs,
                            reward=new_reward,
                        )
                        # _custom_print(new_traj)
                        offline_data.add_batch(
                            nest_utils.batch_nested_tensors(new_traj))

            last_observation = tf.concat(
                [all_trajs[last_traj_idx].observation[:obs_dim], relabel_goal],
                axis=0)
            last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
                step_type=tf.constant(2),
                observation=last_observation,
                next_step_type=tf.constant(0),
                reward=tf.constant(0.0),
                discount=tf.constant(1., dtype=tf.float32))
            # _custom_print(last_traj)
            offline_data.add_batch(nest_utils.batch_nested_tensors(last_traj))
Example #6
def relabel_function(cur_episode, last_step, reward_fn, full_buffer):
    all_data = cur_episode.gather_all()

    # add all actual interaction to the replay buffer
    all_data = nest_utils.unbatch_nested_tensors(all_data)
    for cur_trajectory in nest_utils.unstack_nested_tensors(
            all_data, full_buffer.data_spec):
        # was already added by previous iteration
        if cur_trajectory.step_type.numpy() != 2:
            full_buffer.add_batch(
                nest_utils.batch_nested_tensors(cur_trajectory))

    last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
        step_type=tf.constant(2),
        observation=last_step.observation[0],
        next_step_type=tf.constant(0),
        reward=tf.constant(0.0),
        discount=tf.constant(1., dtype=tf.float32))
    full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))

    def _relabel_given_goal(relabel_goal):
        obs_dim = relabel_goal.shape[0]
        all_trajectories = nest_utils.unstack_nested_tensors(
            all_data, full_buffer.data_spec)
        last_traj_idx = len(all_trajectories)
        for traj_idx, cur_trajectory in enumerate(all_trajectories):
            if cur_trajectory.step_type.numpy() != 2:
                new_obs = tf.concat(
                    [cur_trajectory.observation[:obs_dim], relabel_goal],
                    axis=0)

                if traj_idx == len(all_trajectories) - 1:
                    next_obs = tf.concat(
                        [last_step.observation[0, :obs_dim], relabel_goal],
                        axis=0)
                else:
                    next_obs = tf.concat(
                        [all_trajectories[traj_idx + 1].observation[:obs_dim],
                         relabel_goal],
                        axis=0)

                new_reward = tf.constant(reward_fn(obs=next_obs))

                # terminate episode
                if new_reward.numpy() > 0.0:
                    new_traj = cur_trajectory._replace(
                        observation=new_obs,
                        next_step_type=tf.constant(2),
                        reward=new_reward,
                        discount=tf.constant(0., dtype=tf.float32))
                    last_traj_idx = traj_idx + 1
                    full_buffer.add_batch(
                        nest_utils.batch_nested_tensors(new_traj))
                    break
                else:
                    new_traj = cur_trajectory._replace(
                        observation=new_obs,
                        reward=new_reward,
                    )
                    full_buffer.add_batch(
                        nest_utils.batch_nested_tensors(new_traj))

        if last_traj_idx == len(all_trajectories):
            last_observation = tf.concat(
                [last_step.observation[0, :obs_dim], relabel_goal], axis=0)
        else:
            last_observation = tf.concat(
                [all_trajectories[last_traj_idx].observation[:obs_dim],
                 relabel_goal],
                axis=0)

        last_traj = cur_trajectory._replace(  # pylint: disable=undefined-loop-variable
            step_type=tf.constant(2),
            observation=last_observation,
            next_step_type=tf.constant(0),
            reward=tf.constant(0.0),
            discount=tf.constant(1., dtype=tf.float32))
        full_buffer.add_batch(nest_utils.batch_nested_tensors(last_traj))

    # relabel with last time step achieved in the episode
    if FLAGS.goal_relabel_type == 0 or (FLAGS.goal_relabel_type == 1
                                        and last_step.reward.numpy()[0] <= 0.):
        obs_dim = last_step.observation.shape[1] // 2
        _relabel_given_goal(last_step.observation[0, :obs_dim])

    elif FLAGS.goal_relabel_type == 2 and last_step.reward.numpy()[0] <= 0.:
        goals = [
            [1.2, 0., 2.5, 0., -1., -1.],
            [2., 0., 2.4, 0., 0., 0.],
            [0.8, 0., 1.2, 0., 0., 0.],
            [-0.1, -0.3, 0.3, -0.3, 0., 0.],
            [-0.6, -1., -0.2, -1., 0., 0.],
            [-1.8, -1., -1.4, -1., 0., 0.],
            [-2.8, -0.8, -2.4, -1., -1., -1.],
            [-2.4, 0., -2.4, -1., -1., -1.],
            [-1.2, 0., -2.4, -1., -1., -1.],
            [0.0, 0.0, -2.5, -1, -1., -1.],
        ]
        goals = np.stack(goals).astype('float32')
        print('unrelabelled goal:', last_step.observation[0, 6:].numpy())
        relabel_goal_idxs = np.arange(goals.shape[0])
        np.random.shuffle(relabel_goal_idxs)
        obs_dim = last_step.observation.shape[1] // 2

        relabel_count = 0
        for goal_idx in relabel_goal_idxs:
            chosen_goal = goals[goal_idx]
            if (chosen_goal == last_step.observation[0, obs_dim:].numpy()).all():
                continue
            print('goal for relabelling:', chosen_goal)
            _relabel_given_goal(relabel_goal=tf.constant(chosen_goal))

            relabel_count += 1
            if relabel_count >= FLAGS.num_relabelled_goals:
                break

    else:
        print('not adding relabelled trajectories')
Example #7
def copy_replay_buffer(small_buffer, big_buffer):
  """Copy small buffer into the big buffer."""
  all_data = nest_utils.unbatch_nested_tensors(small_buffer.gather_all())
  for trajectory in nest_utils.unstack_nested_tensors(  # pylint: disable=redefined-outer-name
      all_data, big_buffer.data_spec):
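    # NOTE: unlike Example #4, the unbatched trajectory is added directly here;
    # depending on the buffer's expected outer batch dimension, re-wrapping with
    # nest_utils.batch_nested_tensors (as in Example #4) may be required.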
    big_buffer.add_batch(trajectory)