def _prepare_networks(self, hparams, sess):
    """Builds the simulated environment, policy outputs and reset/step tensors."""
    self.action = tf.placeholder(shape=(1,), dtype=tf.int32)
    batch_env = SimulatedBatchEnv(hparams.environment_spec, hparams.num_agents)
    self.reward, self.done = batch_env.simulate(self.action)
    self.observation = batch_env.observ
    self.reset_op = batch_env.reset(tf.constant([0], dtype=tf.int32))

    environment_wrappers = hparams.environment_spec.wrappers
    wrappers = copy.copy(environment_wrappers) if environment_wrappers else []

    to_initialize = [batch_env]
    for w in wrappers:
      batch_env = w[0](batch_env, **w[1])
      to_initialize.append(batch_env)

    def initialization_lambda():
      for batch_env in to_initialize:
        batch_env.initialize(sess)

    self.initialize = initialization_lambda

    # Adding 0 is a simple op that forces a copy of the observation tensor
    # (see the analogous comment in define_collect further below).
    obs_copy = batch_env.observ + 0

    actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
    self.policy_probs = actor_critic.policy.probs[0, 0, :]
    self.value = actor_critic.value[0, :]
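
A minimal sketch, not from the original source, of how the tensors prepared above might be driven from a session. It assumes numpy is imported as np, that agent is the object _prepare_networks was called on, and that greedy action selection is used purely for illustration.

def drive_simulated_env(agent, sess, num_steps=10):
  # Hypothetical helper; agent exposes the attributes set in _prepare_networks.
  agent.initialize()                   # runs initialize(sess) on every wrapped env
  sess.run(agent.reset_op)             # reset environment 0
  for _ in range(num_steps):
    probs = sess.run(agent.policy_probs)
    action = int(np.argmax(probs))     # greedy choice, for illustration only
    reward, done = sess.run([agent.reward, agent.done],
                            feed_dict={agent.action: [action]})
    obs = sess.run(agent.observation)  # read the observation after the step
    if done[0]:
      sess.run(agent.reset_op)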
Example #2
class SimulatedBatchGymEnv(Env):
    """SimulatedBatchEnv in a Gym-like interface, environments are  batched."""
    def __init__(self, *args, **kwargs):
        with tf.Graph().as_default():
            self._batch_env = SimulatedBatchEnv(*args, **kwargs)

            self._actions_t = tf.placeholder(shape=(self.batch_size, ),
                                             dtype=tf.int32)
            self._rewards_t, self._dones_t = self._batch_env.simulate(
                self._actions_t)
            # Read the observation only after the simulate step has run.
            with tf.control_dependencies([self._rewards_t]):
                self._obs_t = self._batch_env.observ
            self._indices_t = tf.placeholder(shape=(self.batch_size, ),
                                             dtype=tf.int32)
            self._reset_op = self._batch_env.reset(
                tf.range(self.batch_size, dtype=tf.int32))

            self._sess = tf.Session()
            self._sess.run(tf.global_variables_initializer())
            self._batch_env.initialize(self._sess)

    @property
    def batch_size(self):
        return self._batch_env.batch_size

    @property
    def observation_space(self):
        return self._batch_env.observ_space

    @property
    def action_space(self):
        return self._batch_env.action_space

    def render(self, mode="human"):
        raise NotImplementedError()

    def reset(self, indices=None):
        if indices is None:
            indices = np.array(range(self.batch_size))
        obs = self._sess.run(self._reset_op,
                             feed_dict={self._indices_t: indices})
        # TODO(pmilos): remove if possible
        # obs[:, 0, 0, 0] = 0
        # obs[:, 0, 0, 1] = 255
        return obs

    def step(self, actions):
        obs, rewards, dones = self._sess.run(
            [self._obs_t, self._rewards_t, self._dones_t],
            feed_dict={self._actions_t: actions})
        return obs, rewards, dones

    def close(self):
        self._sess.close()
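
A short usage sketch for the class above; it is not part of the original listing. It assumes environment_spec comes from a tensor2tensor world-model setup, that SimulatedBatchEnv accepts (environment_spec, batch_size) as in the later examples, that np is imported, and that action_space is a Gym space providing sample().

env = SimulatedBatchGymEnv(environment_spec, 2)   # hypothetical spec, batch of 2
obs = env.reset()                                 # resets every environment
for _ in range(100):
  actions = np.array([env.action_space.sample() for _ in range(env.batch_size)])
  obs, rewards, dones = env.step(actions)
  if dones.any():                                 # start fresh episodes when any env finishes
    obs = env.reset()
env.close()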
Example #3
class SimulatedBatchGymEnv(Env):
  """SimulatedBatchEnv in a Gym-like interface, environments are  batched."""

  def __init__(self, *args, **kwargs):
    with tf.Graph().as_default():
      self._batch_env = SimulatedBatchEnv(*args, **kwargs)

      self._actions_t = tf.placeholder(shape=(self.batch_size,), dtype=tf.int32)
      self._rewards_t, self._dones_t = self._batch_env.simulate(self._actions_t)
      self._obs_t = self._batch_env.observ
      self._reset_op = self._batch_env.reset(
          tf.range(self.batch_size, dtype=tf.int32)
      )

      self._sess = tf.Session()
      self._sess.run(tf.global_variables_initializer())
      self._batch_env.initialize(self._sess)

  @property
  def batch_size(self):
    return self._batch_env.batch_size

  @property
  def observation_space(self):
    return self._batch_env.observ_space

  @property
  def action_space(self):
    return self._batch_env.action_space

  def render(self, mode="human"):
    raise NotImplementedError()

  def reset(self, indices=None):
    if indices is not None:
      # Per-index reset is not supported in this variant.
      raise NotImplementedError()
    obs = self._sess.run(self._reset_op)
    # TODO(pmilos): remove if possible
    # obs[:, 0, 0, 0] = 0
    # obs[:, 0, 0, 1] = 255
    return obs

  def step(self, actions):
    obs, rewards, dones = self._sess.run(
        [self._obs_t, self._rewards_t, self._dones_t],
        feed_dict={self._actions_t: actions})
    return obs, rewards, dones

  def close(self):
    self._sess.close()
Example #4
    def __init__(self, *args, **kwargs):
        with tf.Graph().as_default():
            self._batch_env = SimulatedBatchEnv(*args, **kwargs)

            self._actions_t = tf.placeholder(shape=(self.batch_size, ),
                                             dtype=tf.int32)
            self._rewards_t, self._dones_t = self._batch_env.simulate(
                self._actions_t)
            self._obs_t = self._batch_env.observ
            self._reset_op = self._batch_env.reset(
                tf.range(self.batch_size, dtype=tf.int32))

            self._sess = tf.Session()
            self._sess.run(tf.global_variables_initializer())
            self._batch_env.initialize(self._sess)
Example #5
    def __init__(self,
                 environment_spec,
                 batch_size,
                 model_dir=None,
                 sess=None):
        self.batch_size = batch_size

        with tf.Graph().as_default():
            self._batch_env = SimulatedBatchEnv(environment_spec,
                                                self.batch_size)

            self.action_space = self._batch_env.action_space
            # TODO(kc): check for the stack wrapper and correct number of channels in
            # observation_space
            self.observation_space = self._batch_env.observ_space
            self._sess = sess if sess is not None else tf.Session()
            self._to_initialize = [self._batch_env]

            environment_wrappers = environment_spec.wrappers
            wrappers = copy.copy(
                environment_wrappers) if environment_wrappers else []

            for w in wrappers:
                self._batch_env = w[0](self._batch_env, **w[1])
                self._to_initialize.append(self._batch_env)

            self._sess.run(tf.global_variables_initializer())
            for wrapped_env in self._to_initialize:
                wrapped_env.initialize(self._sess)

            self._actions_t = tf.placeholder(shape=(batch_size, ),
                                             dtype=tf.int32)
            self._rewards_t, self._dones_t = self._batch_env.simulate(
                self._actions_t)
            self._obs_t = self._batch_env.observ
            self._reset_op = self._batch_env.reset(
                tf.range(batch_size, dtype=tf.int32))

            env_model_loader = tf.train.Saver(
                var_list=tf.global_variables(scope="next_frame*"))  # pylint:disable=unexpected-keyword-arg
            trainer_lib.restore_checkpoint(model_dir,
                                           saver=env_model_loader,
                                           sess=self._sess,
                                           must_restore=True)
Example #7
def define_collect(hparams, scope):
  """Collect trajectories.

  Args:
    hparams: HParams.
    scope: var scope.

  Returns:
    Returns memory (observations, rewards, dones, actions,
    pdfs, value functions) containing a rollout of the environment
    from the nested wrapped structure.
  """

  to_initialize = []
  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    environment_spec = hparams.environment_spec
    num_agents = hparams.num_agents
    if environment_spec.simulated_env:
      batch_env = SimulatedBatchEnv(environment_spec, num_agents)
    else:
      batch_env = PyFuncBatchEnv(environment_spec.env)

    to_initialize.append(batch_env)
    environment_wrappers = environment_spec.wrappers
    wrappers = copy.copy(environment_wrappers) if environment_wrappers else []
    wrappers.append((_MemoryWrapper, {}))
    rollout_metadata = None
    speculum = None
    for w in wrappers:
      tf.logging.info("Applying wrapper %s(%s) to env %s."
                      % (str(w[0]), str(w[1]), str(batch_env)))
      batch_env = w[0](batch_env, **w[1])
      to_initialize.append(batch_env)

    rollout_metadata = _rollout_metadata(batch_env)
    speculum = batch_env.speculum

    def initialization_lambda(sess):
      for batch_env in to_initialize:
        batch_env.initialize(sess)

    memory = [
        tf.get_variable("collect_memory_%d_%s" % (hparams.epoch_length, name),
                        shape=[hparams.epoch_length] + shape,
                        dtype=dtype,
                        initializer=tf.zeros_initializer(),
                        trainable=False)
        for (shape, dtype, name) in rollout_metadata]

    cumulative_rewards = tf.get_variable("cumulative_rewards", len(batch_env),
                                         trainable=False)

    eval_phase_t = tf.convert_to_tensor(hparams.eval_phase)
    should_reset_var = tf.Variable(True, trainable=False)
    zeros_tensor = tf.zeros(len(batch_env))

  force_beginning_resets = tf.convert_to_tensor(
      environment_spec.force_beginning_resets
  )

  def reset_ops_group():
    return tf.group(batch_env.reset(tf.range(len(batch_env))),
                    tf.assign(cumulative_rewards, zeros_tensor))

  reset_op = tf.cond(
      tf.logical_or(should_reset_var.read_value(), force_beginning_resets),
      reset_ops_group, tf.no_op)

  with tf.control_dependencies([reset_op]):
    reset_once_op = tf.assign(should_reset_var, False)

  with tf.control_dependencies([reset_once_op]):

    def step(index, scores_sum, scores_num):
      """Single step."""
      index %= hparams.epoch_length  # Only needed in eval runs.
      # Note - the only way to ensure making a copy of tensor is to run simple
      # operation. We are waiting for tf.copy:
      # https://github.com/tensorflow/tensorflow/issues/11186
      obs_copy = batch_env.observ + 0

      def env_step(arg1, arg2, arg3):  # pylint: disable=unused-argument
        """Step of the environment."""
        actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
        policy = actor_critic.policy
        action = hparams.policy_to_actions_lambda(policy)

        postprocessed_action = actor_critic.action_postprocessing(action)
        reward, done = batch_env.simulate(postprocessed_action[0, ...])

        pdf = policy.prob(action)[0]
        value_function = actor_critic.value[0]
        pdf = tf.reshape(pdf, shape=(num_agents,))
        value_function = tf.reshape(value_function, shape=(num_agents,))
        done = tf.reshape(done, shape=(num_agents,))

        with tf.control_dependencies([reward, done]):
          return tf.identity(pdf), tf.identity(value_function), \
                 tf.identity(done)

      # TODO(piotrmilos): while_body is executed at most once,
      # thus should be replaced with tf.cond
      pdf, value_function, top_level_done = tf.while_loop(
          lambda _1, _2, _3: tf.equal(speculum.size(), 0),
          env_step,
          [
              tf.constant(0.0, shape=(num_agents,)),
              tf.constant(0.0, shape=(num_agents,)),
              tf.constant(False, shape=(num_agents,))
          ],
          parallel_iterations=1,
          back_prop=False,
      )

      with tf.control_dependencies([pdf, value_function]):
        obs, reward, done, action = speculum.dequeue()

        to_save = [obs, reward, done, action,
                   pdf, value_function]
        save_ops = [tf.scatter_update(memory_slot, index, value)
                    for memory_slot, value in zip(memory, to_save)]
        cumulate_rewards_op = cumulative_rewards.assign_add(reward)

        agent_indices_to_reset = tf.where(top_level_done)[:, 0]
      with tf.control_dependencies([cumulate_rewards_op]):
        # TODO(piotrmilos): possibly we need cumulative_rewards.read_value()
        scores_sum_delta = tf.reduce_sum(
            tf.gather(cumulative_rewards.read_value(), agent_indices_to_reset))
        scores_num_delta = tf.count_nonzero(done, dtype=tf.int32)
      with tf.control_dependencies(save_ops + [scores_sum_delta,
                                               scores_num_delta]):
        reset_env_op = batch_env.reset(agent_indices_to_reset)
        reset_cumulative_rewards_op = tf.scatter_update(
            cumulative_rewards, agent_indices_to_reset,
            tf.gather(zeros_tensor, agent_indices_to_reset))
      with tf.control_dependencies([reset_env_op,
                                    reset_cumulative_rewards_op]):
        return [index + 1, scores_sum + scores_sum_delta,
                scores_num + scores_num_delta]

    def stop_condition(i, _, resets):
      return tf.cond(eval_phase_t,
                     lambda: resets < num_agents,
                     lambda: i < hparams.epoch_length)

    init = [tf.constant(0), tf.constant(0.0), tf.constant(0)]
    index, scores_sum, scores_num = tf.while_loop(
        stop_condition,
        step,
        init,
        parallel_iterations=1,
        back_prop=False)

  # We handle force_beginning_resets differently. We assume that all envs are
  # reset at the end of an episode (though it actually happens at the beginning
  # of the next one).
  scores_num = tf.cond(force_beginning_resets,
                       lambda: scores_num + len(batch_env), lambda: scores_num)

  with tf.control_dependencies([scores_sum]):
    scores_sum = tf.cond(
        force_beginning_resets,
        lambda: scores_sum + tf.reduce_sum(cumulative_rewards.read_value()),
        lambda: scores_sum)

  mean_score = tf.cond(tf.greater(scores_num, 0),
                       lambda: scores_sum / tf.cast(scores_num, tf.float32),
                       lambda: 0.)
  printing = tf.Print(0, [mean_score, scores_sum, scores_num], "mean_score: ")
  with tf.control_dependencies([index, printing]):
    memory = [mem.read_value() for mem in memory]
    # When generating real data together with PPO training we must use a
    # single agent. For PPO to work we reshape the history as if it had been
    # generated by real_ppo_effective_num_agents agents.
    if (getattr(hparams, "effective_num_agents", None) and
        not hparams.eval_phase):
      new_memory = []
      effective_num_agents = hparams.effective_num_agents
      assert hparams.epoch_length % effective_num_agents == 0, (
          "The rollout of hparams.epoch_length will be distributed amongst"
          "effective_num_agents of agents")
      new_epoch_length = int(hparams.epoch_length / effective_num_agents)
      for mem, info in zip(memory, rollout_metadata):
        shape, _, name = info
        new_shape = [effective_num_agents, new_epoch_length] + shape[1:]
        perm = list(range(len(shape)+1))
        perm[0] = 1
        perm[1] = 0
        mem = tf.transpose(mem, perm=perm)
        mem = tf.reshape(mem, shape=new_shape)
        mem = tf.transpose(mem, perm=perm,
                           name="collect_memory_%d_%s"
                           % (new_epoch_length, name))
        new_memory.append(mem)
      memory = new_memory

    mean_score_summary = tf.cond(
        tf.greater(scores_num, 0),
        lambda: tf.summary.scalar("mean_score_this_iter", mean_score),
        str)
    summaries = tf.summary.merge(
        [mean_score_summary,
         tf.summary.scalar("episodes_finished_this_iter", scores_num)])
    return memory, summaries, initialization_lambda
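
A minimal sketch, not from the original source, of how the three values returned by define_collect might be consumed. It assumes hparams carries the fields used above (environment_spec, num_agents, epoch_length, eval_phase and the policy settings).

with tf.Graph().as_default():
  memory, summaries, initialize = define_collect(hparams, scope="collect")
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    initialize(sess)                 # initializes every wrapped batch env
    # Fetching the memory tensors runs the collection while-loop once.
    rollout, summary_str = sess.run([memory, summaries])
    # rollout follows the order written into memory above:
    # observations, rewards, dones, actions, pdfs, value functions.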
Example #8
class SimulatedBatchGymEnv(Env):
    """SimulatedBatchEnv in a Gym-like interface, environments are  batched."""
    def __init__(self,
                 environment_spec,
                 batch_size,
                 model_dir=None,
                 sess=None):
        self.batch_size = batch_size

        with tf.Graph().as_default():
            self._batch_env = SimulatedBatchEnv(environment_spec,
                                                self.batch_size)

            self.action_space = self._batch_env.action_space
            # TODO(kc): check for the stack wrapper and correct number of channels in
            # observation_space
            self.observation_space = self._batch_env.observ_space
            self._sess = sess if sess is not None else tf.Session()
            self._to_initialize = [self._batch_env]

            environment_wrappers = environment_spec.wrappers
            wrappers = copy.copy(
                environment_wrappers) if environment_wrappers else []

            for w in wrappers:
                self._batch_env = w[0](self._batch_env, **w[1])
                self._to_initialize.append(self._batch_env)

            self._sess.run(tf.global_variables_initializer())
            for wrapped_env in self._to_initialize:
                wrapped_env.initialize(self._sess)

            self._actions_t = tf.placeholder(shape=(batch_size, ),
                                             dtype=tf.int32)
            self._rewards_t, self._dones_t = self._batch_env.simulate(
                self._actions_t)
            self._obs_t = self._batch_env.observ
            self._reset_op = self._batch_env.reset(
                tf.range(batch_size, dtype=tf.int32))

            env_model_loader = tf.train.Saver(
                var_list=tf.global_variables(scope="next_frame*"))  # pylint:disable=unexpected-keyword-arg
            trainer_lib.restore_checkpoint(model_dir,
                                           saver=env_model_loader,
                                           sess=self._sess,
                                           must_restore=True)

    def render(self, mode="human"):
        raise NotImplementedError()

    def reset(self, indices=None):
        if indices is not None:
            # Per-index reset is not supported in this variant.
            raise NotImplementedError()
        obs = self._sess.run(self._reset_op)
        # TODO(pmilos): remove if possible
        # obs[:, 0, 0, 0] = 0
        # obs[:, 0, 0, 1] = 255
        return obs

    def step(self, actions):
        obs, rewards, dones = self._sess.run(
            [self._obs_t, self._rewards_t, self._dones_t],
            feed_dict={self._actions_t: actions})
        return obs, rewards, dones
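
A brief usage sketch for this checkpoint-restoring variant; it is not part of the original listing. environment_spec and model_dir are assumed to point at a trained world model whose variables live under the next_frame scope, and np is assumed to be imported.

env = SimulatedBatchGymEnv(environment_spec, batch_size=4, model_dir=model_dir)
obs = env.reset()                      # per-index reset is not supported here
for _ in range(50):
  actions = np.array([env.action_space.sample() for _ in range(env.batch_size)])
  obs, rewards, dones = env.step(actions)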