Example #1
 def _setup_specs(self):
   self._policy_step_spec = policy_step.PolicyStep(
       action=self._action_spec,
       state=self._policy_state_spec,
       info=self._info_spec)
   self._trajectory_spec = trajectory.from_transition(self._time_step_spec,
                                                      self._policy_step_spec,
                                                      self._time_step_spec)
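The spec-building pattern in Example #1 can be reproduced outside of any class. The sketch below is a minimal, self-contained illustration assuming TF-Agents is installed; the observation and action specs are illustrative placeholders, not values taken from the example.

import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import policy_step, time_step as ts, trajectory

# Illustrative specs: a 4-float observation and a binary discrete action.
observation_spec = tf.TensorSpec(shape=(4,), dtype=tf.float32, name='observation')
action_spec = tensor_spec.BoundedTensorSpec(shape=(), dtype=tf.int32, minimum=0, maximum=1)

time_step_spec = ts.time_step_spec(observation_spec)
policy_step_spec = policy_step.PolicyStep(action=action_spec, state=(), info=())

# Same call as in Example #1: a Trajectory spec assembled from
# (time_step_spec, policy_step_spec, time_step_spec).
trajectory_spec = trajectory.from_transition(time_step_spec, policy_step_spec, time_step_spec)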
Example #2
 def update(self, observation, reward, is_terminal, action_step):
     time_step = self.env.current_time_step()
     self.train_py_env.set_next(observation, reward, is_terminal)
     next_time_step = self.env.step(action_step.action)
     traj = trajectory.from_transition(time_step, action_step,
                                       next_time_step)
     self.replay_buffer.add_batch(traj)
     self.update_network(traj)
Example #3
def collect_step(environment, policy, replay_buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # batch = tf.nest.map_structure(lambda t: tf.expand_dims(t, 0), traj)
    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)
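A collect step like the one above is normally driven by a short outer loop. The following is a hedged, self-contained sketch of that setup; the environment name ('CartPole-v1'), the random collection policy, and the replay-buffer sizes are illustrative assumptions, not part of the original snippet.

from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import policy_step, trajectory

# Illustrative setup: CartPole wrapped as a TF environment, random collection policy.
env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v1'))
policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(), env.action_spec())

# Data spec matching the Trajectory produced by trajectory.from_transition() in collect_step().
data_spec = trajectory.from_transition(
    env.time_step_spec(),
    policy_step.PolicyStep(action=env.action_spec(), state=(), info=()),
    env.time_step_spec())
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=data_spec, batch_size=env.batch_size, max_length=1000)

env.reset()
for _ in range(100):
    collect_step(env, policy, replay_buffer)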
Example #4
 def collect(self, env, policy):
     time_step = env.current_time_step()
     action_step = policy.action(time_step)
     next_time_step = env.step(action_step.action)
     traj = trajectory.from_transition(time_step, action_step,
                                       next_time_step)
     self.buffer.add_batch(traj)
     return traj
Example #5
def collect_step(env, time_step, py_policy, replay_buffer):
  """Steps the environment and collects experience into the replay buffer."""
  action_step = py_policy.action(time_step)
  next_time_step = env.step(action_step.action)
  if not time_step.is_last():
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
  return next_time_step
Example #6
def collect_step(environment, policy, buffer):
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add trajectory to the replay buffer
  buffer.add_batch(traj)
Example #7
    def timestamp_data(self, environment, policy):
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        timestamp_trajectory = trajectory.from_transition(
            time_step, action_step, next_time_step)

        self._replay_buffer.add_batch(timestamp_trajectory)
Example #8
    def collect_data(env, policy, buffer, steps):
        for _ in range(steps):
            time_step = env.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = env.step(action_step.action)
            traject = trajectory.from_transition(time_step, action_step,
                                                 next_time_step)

            buffer.add_batch(traject)
Example #9
 def collect_step(self, env: tf_py_environment.TFPyEnvironment,
                  policy: tf_policy.Base,
                  replay_buffer: TFUniformReplayBuffer):
     time_step = env.current_time_step()
     action_step = policy.action(time_step)
     next_time_step = env.step(action_step.action)
     traj = trajectory.from_transition(time_step, action_step,
                                       next_time_step)
     replay_buffer.add_batch(traj)
Example #10
 def predict_and_fine_tune(self):
     prev_step = self.time_step
     action_step = q_agent.agent.policy.action(self.time_step)
     self.time_step = fine_tune_env.step(action_step.action)
     traj = trajectory.from_transition(prev_step, action_step, self.time_step)
     replay_buffer.add_batch(traj)
     experience, _ = next(iterator)
     train_loss = q_agent.agent.train(experience)
     return action_step.action
Example #11
def collect_step(environment, policySteps, buffer, alg):
    time_step, total_agents_action, next_time_step = one_step(
        environment, policySteps, alg)
    traj = trajectory.from_transition(time_step,
                                      total_agents_action,
                                      next_time_step,
                                      alg=alg,
                                      joint_action=True)
    buffer.add_batch(traj)
Example #12
    def collect_step(environment, policy, buffer, drone):
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        options.drones_locations[drone] = next_time_step.observation
        traj = trajectory.from_transition(
            time_step, action_step, next_time_step)

        buffer.add_batch(traj)
Example #13
    def collect_step(self):
        time_step = self.env.current_time_step()
        action_step = self.agent.policy.action(time_step)
        next_time_step = self.env.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step,
                                          next_time_step)

        # Add trajectory to the replay buffer
        self.replay_buffer.add_batch(traj)
Example #14
def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add the trajectory to the replay buffer, with one extra copy per 0.001 of
    # reward (a jury-rigged form of experience prioritisation weighting).
    for i in range(int((traj.reward // 0.001) + 1)):
        buffer.add_batch(traj)
Example #15
def step(
    environment: TFPyEnvironment, policy: tf_policy.TFPolicy, replay_buffer: ReplayBuffer
) -> typing.Tuple[float, bool]:
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
    return next_time_step.reward.numpy()[0], next_time_step.is_last()
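Because this step helper returns the scalar reward and a termination flag, the caller can accumulate an episode return with a plain loop. A hedged usage sketch, assuming environment, policy, and replay_buffer are already constructed as in the other examples:

environment.reset()
episode_return = 0.0
while True:
    reward, done = step(environment, policy, replay_buffer)
    episode_return += reward
    # is_last() yields a batched boolean tensor; index out its single element.
    if bool(done.numpy()[0]):
        break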
Example #16
    def run_agent(self,
                  policy,
                  recorder,
                  env_name,
                  py_env,
                  tf_env,
                  encoded_images=None):
        """Run an agent's policy in a particular environment. Possibly record."""
        if self.save_matrices and encoded_images is None:
            encoded_images = []

        # Add blank frames to make it easier to distinguish between runs/agents
        for _ in range(self.num_blank_frames):
            if recorder:
                recorder.add_frame(self.blank_frame)
            if self.save_matrices:
                encoded_images.append(self.blank_frame_encoding)

        rewards = 0
        policy_state = policy.get_initial_state(1)

        if 'domain_randomization' in self.name and env_name == self.adv_env_name:
            time_step = tf_env.reset_random()
        elif 'Adversarial' in env_name:
            time_step = tf_env.reset_agent()
        else:
            time_step = tf_env.reset()

        if recorder:
            recorder.add_frame(py_env.render())
        if self.save_matrices:
            encoded_images.append(self.py_env._gym_env.grid.encode())  # pylint:disable=protected-access

        num_steps = tf.constant(0.0)
        while True:
            policy_step = policy.action(time_step, policy_state=policy_state)

            policy_state = policy_step.state
            next_time_step = tf_env.step(policy_step.action)

            traj = trajectory.from_transition(time_step, policy_step,
                                              next_time_step)
            time_step = next_time_step

            num_steps += tf.math.reduce_sum(
                tf.cast(~traj.is_boundary(), tf.float32))

            rewards += time_step.reward
            if recorder:
                recorder.add_frame(py_env.render())
            if self.save_matrices:
                encoded_images.append(self.py_env._gym_env.grid.encode())  # pylint:disable=protected-access

            if traj.is_last():
                break

        return rewards.numpy().sum(), encoded_images
Example #17
        def loop_body(counter, time_step, policy_state):
            """Runs a step in the environment.

            The while loop will call this multiple times.

            Args:
              counter: Episode counters per batch index. Shape [batch_size].
              time_step: TimeStep tuple with elements shape [batch_size, ...].
              policy_state: Policy state tensor shape [batch_size, policy_state_dim].
                Pass empty tuple for non-recurrent policies.

            Returns:
              loop_vars for next iteration of tf.while_loop.
            """
            action_step = self.policy.action(time_step, policy_state)

            # TODO(b/134487572): TF2 while_loop seems to either ignore
            # parallel_iterations or doesn't properly propagate control dependencies
            # from one step to the next. Without this dep, self.env.step() is called
            # in parallel.
            with tf.control_dependencies(tf.nest.flatten([time_step])):
                next_time_step = self.env.step(action_step.action)

            policy_state = action_step.state

            if self._is_bandit_env:
                # For Bandits we create episodes of length 1.
                # Since the `next_time_step` is always of type LAST we need to replace
                # the step type of the current `time_step` to FIRST.
                batch_size = tf.shape(input=time_step.discount)
                time_step = time_step._replace(
                    step_type=tf.fill(batch_size, ts.StepType.FIRST))

            traj = trajectory.from_transition(time_step, action_step,
                                              next_time_step)

            #### Save trajectory into the server's table ####
            print('Trajectory: {}'.format(traj))
            self.sampling_client.insert(traj, {'my_table': 1.0})
            ################################################

            observer_ops = [observer(traj) for observer in self._observers]
            transition_observer_ops = [
                observer((time_step, action_step, next_time_step))
                for observer in self._transition_observers
            ]
            with tf.control_dependencies(
                [tf.group(observer_ops + transition_observer_ops)]):
                time_step, next_time_step, policy_state = tf.nest.map_structure(
                    tf.identity, (time_step, next_time_step, policy_state))

            # While loop counter is only incremented for episode reset episodes.
            # For Bandits, this is every trajectory, for MDPs, this is at boundaries.
            if self._is_bandit_env:
                counter += tf.ones(batch_size, dtype=tf.int32)
            else:
                counter += tf.cast(traj.is_boundary(), dtype=tf.int32)

            return [counter, next_time_step, policy_state]
Example #18
 def sample(self, batch_size):
     dummy_action_step = policy_step.PolicyStep(
         action=tf.constant([tf.int32.min]))
     dummy_time_step = ts.TimeStep(step_type=tf.constant([tf.int32.min]),
                                   reward=(np.nan * tf.ones(1)),
                                   discount=(np.nan * tf.ones(1)),
                                   observation=None)
     trajs = []
     for transition in random.sample(self.buffer, batch_size):
         traj1 = trajectory.from_transition(transition.time_step,
                                            transition.action_step,
                                            transition.next_time_step)
         traj2 = trajectory.from_transition(transition.next_time_step,
                                            dummy_action_step,
                                            dummy_time_step)
         trajs.append(
             nest_utils.unbatch_nested_tensors(
                 nest_utils.stack_nested_tensors([traj1, traj2], axis=1)))
     return nest_utils.stack_nested_tensors(trajs)
Example #19
 def _data_spec(self):
     return trajectory.from_transition(
         self.time_step_spec,
         policy_step.PolicyStep(
             action=self.action_spec,
             state=self.policy_state_spec,
             info=()
         ),
         self.time_step_spec,
     )
Example #20
    def collect_step(self, env, policy, buffer, mac):
        """Collects the current time step of the environment and maps the
        current time_step to an action in the Q-table.
        """
        if self.initial_step[mac]:
            self.initial_step[mac] = False
            time_step = env.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = env.step(action_step.action)
            traj = trajectory.from_transition(time_step, action_step,
                                              next_time_step)
            buffer.add_batch(traj)

        time_step = env.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = env.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step,
                                          next_time_step)
        buffer.add_batch(traj)
Example #21
def collect_step(environment, policy, buffer):
  """Returns reward and termination."""
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add trajectory to the replay buffer
  buffer.add_batch(traj)
  return next_time_step.reward[0], next_time_step.is_last()
Example #22
def collect_step(env: tf_py_environment.TFPyEnvironment, policy, buffer):
    time_step_ = env.current_time_step()
    action_step = policy.action(time_step_)
    next_time_step = env.step(action=action_step.action)
    traj = trajectory.from_transition(time_step=time_step_,
                                      action_step=action_step,
                                      next_time_step=next_time_step)

    # add trajectory to the replay buffer
    buffer.add_batch(traj)
Example #23
        def loop_body(counter, time_step, policy_state):
            """Runs a step in the environment.

            The while loop will call this multiple times.

            Args:
              counter: Episode counters per batch index. Shape [batch_size].
              time_step: TimeStep tuple with elements shape [batch_size, ...].
              policy_state: Policy state tensor shape [batch_size, policy_state_dim].
                Pass empty tuple for non-recurrent policies.

            Returns:
              loop_vars for next iteration of tf.while_loop.
            """
            action_step = self.policy.action(time_step, policy_state)

            # TODO(b/134487572): TF2 while_loop seems to either ignore
            # parallel_iterations or doesn't properly propagate control dependencies
            # from one step to the next. Without this dep, self.env.step() is called
            # in parallel.

            with tf.control_dependencies(tf.nest.flatten([time_step])):
                if isinstance(
                        self.policy,
                    (discrete_boltzmann_policy.DiscreteBoltzmannPolicy,
                     greedy_policy.GreedyPolicy,
                     epsilon_discrete_boltzmann_policy.
                     EpsilonDiscreteBoltzmannPolicy)):
                    next_time_step = self.env.step(
                        tf.cast(action_step.action[:, 0], tf.int64))
                else:
                    next_time_step = self.env.step(action_step.action)


#      with tf.control_dependencies(tf.nest.flatten([time_step])):
#        next_time_step = self.env.step(action_step.action)

            policy_state = action_step.state

            traj = trajectory.from_transition(time_step, action_step,
                                              next_time_step)
            observer_ops = [observer(traj) for observer in self._observers]
            transition_observer_ops = [
                observer((time_step, action_step, next_time_step))
                for observer in self._transition_observers
            ]
            with tf.control_dependencies(
                [tf.group(observer_ops + transition_observer_ops)]):
                time_step, next_time_step, policy_state = tf.nest.map_structure(
                    tf.identity, (time_step, next_time_step, policy_state))

            # While loop counter is only incremented for episode reset episodes.
            counter += tf.cast(traj.is_boundary(), dtype=tf.int32)

            return [counter, next_time_step, policy_state]
Example #24
def get_tf_buffers(c, max_length=270):
    obs_spec, ac_spec = get_env_specs(c)
    time_step_spec = ts.time_step_spec(obs_spec)
    action_spec = policy_step.PolicyStep(ac_spec)
    trajectory_spec = trajectory.from_transition(
        time_step_spec, action_spec, time_step_spec)
    the_replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=trajectory_spec,
        batch_size=1,
        max_length=max_length)
    return the_replay_buffer
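A hedged sketch of reading training batches back out of the buffer returned by get_tf_buffers; the as_dataset parameters are illustrative, and c stands for whatever config object get_env_specs expects. Sampling only makes sense once some trajectories have been added.

the_replay_buffer = get_tf_buffers(c)
# Sample mini-batches of overlapping two-step windows, as is typical for one-step TD training.
dataset = the_replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=64, num_steps=2).prefetch(3)
iterator = iter(dataset)
experience, unused_info = next(iterator)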
Example #25
def collect_step(environment, policy, buffer):
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  action_num = action_step.action.numpy()[0]
  # print(env.map_action_to_coordinate(action_num - 3136 * 4))
  # print(floor(action_num/3136))
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add trajectory to the replay buffer
  buffer.add_batch(traj)
Example #26
def collect_step(environment, policy):
  """
  Executes one action on the current game state in `environment` (chosen by `policy`)
  and stores the result as a trajectory in 'replay_buffer'.
  """
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add trajectory to the replay buffer
  replay_buffer.add_batch(traj)
Example #27
    def _make_replay_buffer(self, tf_env):
        """Default replay buffer factory."""

        time_step_spec = tf_env.time_step_spec()
        action_spec = tf_env.action_spec()
        action_step_spec = policy_step.PolicyStep(
            action_spec, (), tensor_spec.TensorSpec((), tf.int32))
        trajectory_spec = trajectory.from_transition(time_step_spec,
                                                     action_step_spec,
                                                     time_step_spec)
        return episodic_replay_buffer.EpisodicReplayBuffer(
            trajectory_spec, end_episode_fn=lambda _: False)
Example #28
    def add_to_replay_buffer(transition):
        time_step, _, next_time_step = transition
        time_step = prepare(time_step)
        next_time_step = prepare(next_time_step)
        action_step = policy.action(time_step)

        traj = trajectory.from_transition(time_step, action_step,
                                          next_time_step)

        traj_batched = tf.nest.map_structure(
            lambda t: tf.stack([t] * batch_size), traj)
        replay_buffer.add_batch(traj_batched)
Example #29
def collect_step(environment, policy, buffer):
    """Execute one step in the environment and add the resulting trajectory to the buffer.

    Args:
        environment (object): Environment of the game.
        policy (object): Policy network that selects the action.
        buffer (object): Replay buffer holding data for training.
    """

    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    buffer.add_batch(traj)
Example #30
def collect_data(environment, policy, num_episodes):
    episode_counter = 0
    environment.reset()
    while episode_counter < num_episodes:
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step,
                                          next_time_step)
        # Add trajectory to the replay buffer
        replay_buffer.add_batch(traj)
        if traj.is_boundary():
            episode_counter += 1
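Once episodes have been collected this way, the buffer is typically read back through as_dataset and the sampled experience fed to an agent's train method. The sketch below is only a hedged illustration: agent, the sample batch size, and num_steps are assumptions, not part of Example #30.

# Collect a few episodes with the agent's collection policy (agent is assumed to exist).
collect_data(environment, agent.collect_policy, num_episodes=10)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=64, num_steps=2).prefetch(3)
iterator = iter(dataset)

experience, unused_info = next(iterator)
train_loss = agent.train(experience).loss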