Example 1
def define_train(hparams, event_dir):
  """Define the training setup."""
  del event_dir
  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    memory, collect_summary, initialization\
      = collect.define_collect(
          hparams, "ppo_train", eval_phase=False)
    ppo_summary = ppo.define_ppo_epoch(memory, hparams)
    summary = tf.summary.merge([collect_summary, ppo_summary])

  return summary, None, initialization
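
A minimal sketch of how the ops returned by this define_train might be driven, assuming TensorFlow 1.x and that initialization is a callable taking a session (as in the later examples); hparams, the log directory and the epochs_num field are assumptions, not part of the original snippet.

import tensorflow as tf

summary_op, _, initialization = define_train(hparams, event_dir=None)
writer = tf.summary.FileWriter("/tmp/ppo_train")  # hypothetical log dir

with tf.Session() as sess:
  initialization(sess)  # set up the in-graph collect state, as in Examples 3-5
  sess.run(tf.global_variables_initializer())
  for epoch in range(hparams.epochs_num):  # epochs_num is an assumed hparam
    summary = sess.run(summary_op)  # one collect phase followed by a PPO epoch
    writer.add_summary(summary, epoch)
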
Example 2
def define_train(hparams, environment_spec, event_dir):
  """Define the training setup."""
  if isinstance(environment_spec, str):
    env_lambda = lambda: gym.make(environment_spec)
  else:
    env_lambda = environment_spec
  policy_lambda = hparams.network
  env = env_lambda()
  action_space = env.action_space

  batch_env = utils.define_batch_env(env_lambda, hparams.num_agents)

  policy_factory = tf.make_template(
      "network",
      functools.partial(policy_lambda, action_space, hparams))

  with tf.variable_scope("train"):
    memory, collect_summary = collect.define_collect(
        policy_factory, batch_env, hparams, eval_phase=False)
  ppo_summary = ppo.define_ppo_epoch(memory, policy_factory, hparams)
  summary = tf.summary.merge([collect_summary, ppo_summary])

  with tf.variable_scope("eval"):
    eval_env_lambda = env_lambda
    if event_dir and hparams.video_during_eval:
      # Some environments reset automatically when the done state is reached.
      # For those we record only every second episode.
      d = 2 if env_lambda().metadata.get("semantics.autoreset") else 1
      eval_env_lambda = lambda: gym.wrappers.Monitor(  # pylint: disable=g-long-lambda
          env_lambda(), event_dir, video_callable=lambda i: i % d == 0)
    wrapped_eval_env_lambda = lambda: utils.EvalVideoWrapper(eval_env_lambda())
    _, eval_summary = collect.define_collect(
        policy_factory,
        utils.define_batch_env(wrapped_eval_env_lambda, hparams.num_eval_agents,
                               xvfb=hparams.video_during_eval),
        hparams, eval_phase=True)
  return summary, eval_summary
Example 3
  def _setup(self):
    collect_hparams = rl.ppo_pong_base()
    collect_hparams.add_hparam("environment_spec", self.environment_spec)
    collect_hparams.add_hparam("force_beginning_resets",
                               self._internal_memory_force_beginning_resets)
    collect_hparams.epoch_length = self._internal_memory_size
    collect_hparams.num_agents = 1

    if not FLAGS.agent_policy_path:
      collect_hparams.policy_network = rl.random_policy_fun

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      self.collect_memory, self.collect_trigger_op, collect_init \
        = collect.define_collect(collect_hparams, scope="gym_problems",
                                 collect_level=0, eval_phase=self.eval_phase)

    self._session = tf.Session()
    collect_init(self._session)
    self._session.run(tf.global_variables_initializer())
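
A hypothetical companion method for the same class, showing how the ops created above might be used to produce a rollout; the method name and the exact contents of collect_memory are assumptions.

  def _generate_rollout(self):
    """Hypothetical helper: run one in-graph collect epoch and fetch it."""
    self._session.run(self.collect_trigger_op)     # fill the internal memory
    return self._session.run(self.collect_memory)  # numpy copies of the rollout
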
Example 4
    def _setup(self, data_dir):
        # TODO(piotrmilos): this should be consistent with
        # ppo_params in model_rl_experiment
        dumper_path = os.path.join(data_dir, "dumper")
        if os.path.isdir(dumper_path):
            self._use_dumper_data = True
            self._dumper_data_index = 0
            self._dumper_path = dumper_path
        else:
            collect_hparams = rl.ppo_pong_base()
            collect_hparams.add_hparam("environment_spec",
                                       self.environment_spec)
            collect_hparams.add_hparam(
                "force_beginning_resets",
                self._internal_memory_force_beginning_resets)
            collect_hparams.epoch_length = self._internal_memory_size
            collect_hparams.num_agents = 1

            if not FLAGS.agent_policy_path:
                collect_hparams.policy_network = rl.random_policy_fun

            policy_to_actions_lambda = None
            if self.settable_eval_phase:
                # During eval take the most likely action instead of sampling.
                policy_to_actions_lambda = lambda policy: policy.mode()

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                self.collect_memory, self.collect_trigger_op, collect_init = (
                    collect.define_collect(
                        collect_hparams,
                        scope="gym_problems",
                        eval_phase=False,
                        collect_level=0,
                        policy_to_actions_lambda=policy_to_actions_lambda))

            self._session = tf.Session()
            collect_init(self._session)
            self._session.run(tf.global_variables_initializer())
            self.restore_networks(self._session)
            self.memory_index = 0
            self.memory = None
Example 5
    def _setup(self):
        collect_hparams = rl.ppo_pong_base()
        collect_hparams.add_hparam("environment_spec", self.environment_spec)
        collect_hparams.add_hparam(
            "force_beginning_resets",
            self._internal_memory_force_beginning_resets)
        collect_hparams.epoch_length = self._internal_memory_size
        collect_hparams.num_agents = 1

        if not FLAGS.agent_policy_path:
            collect_hparams.policy_network = rl.random_policy_fun

        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            self.collect_memory, self.collect_trigger_op, collect_init \
              = collect.define_collect(collect_hparams, scope="gym_problems",
                                       collect_level=0, eval_phase=self.eval_phase)

        self._session = tf.Session()
        collect_init(self._session)
        self._session.run(tf.global_variables_initializer())
Example 6
    def _setup(self):
        in_graph_wrappers = [(atari.ShiftRewardWrapper, {
            "add_value": 2
        }), (atari.MemoryWrapper, {})] + self.in_graph_wrappers
        env_hparams = tf.contrib.training.HParams(
            in_graph_wrappers=in_graph_wrappers,
            simulated_environment=self.simulated_environment)

        generator_batch_env = batch_env_factory(self.environment_spec,
                                                env_hparams,
                                                num_agents=1,
                                                xvfb=False)

        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            if FLAGS.agent_policy_path:
                policy_lambda = self.collect_hparams.network
            else:
                # When no agent_policy_path is set, just generate random samples.
                policy_lambda = rl.random_policy_fun
            policy_factory = tf.make_template(
                "network",
                functools.partial(policy_lambda,
                                  self.environment_spec().action_space,
                                  self.collect_hparams),
                create_scope_now_=True,
                unique_name_="network")

        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            self.collect_hparams.epoch_length = 10
            _, self.collect_trigger_op = collect.define_collect(
                policy_factory,
                generator_batch_env,
                self.collect_hparams,
                eval_phase=False,
                scope="define_collect")

        self.avilable_data_size_op = (
            atari.MemoryWrapper.singleton.speculum.size())
        self.data_get_op = atari.MemoryWrapper.singleton.speculum.dequeue()
        self.history_buffer = deque(maxlen=self.history_size + 1)
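
A hedged sketch of a generator that could sit next to this _setup, draining the speculum queue exposed by the two ops above; the self._session attribute and the layout of the dequeued tuple are assumptions.

    def _frame_generator(self):
        """Hypothetical driver: refill the in-graph queue when it runs dry."""
        while True:
            if self._session.run(self.avilable_data_size_op) < 1:
                self._session.run(self.collect_trigger_op)  # collect more steps
                continue
            # The structure of the dequeued tuple is an assumption.
            yield self._session.run(self.data_get_op)
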
Example 7
    def _setup(self):
        # TODO: remove PongT2TGeneratorHackWrapper by writing a modality

        in_graph_wrappers = [(PongT2TGeneratorHackWrapper, {
            "add_value": 2
        }), (MemoryWrapper, {})] + self.in_graph_wrappers
        env_hparams = HParams(in_graph_wrappers=in_graph_wrappers,
                              simulated_environment=self.simulated_environment)

        generator_batch_env = batch_env_factory(
            self.environment_spec, env_hparams, num_agents=1, xvfb=False)

        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            policy_lambda = self.collect_hparams.network
            policy_factory = tf.make_template(
                "network",
                functools.partial(policy_lambda,
                                  self.environment_spec().action_space,
                                  self.collect_hparams),
                create_scope_now_=True,
                unique_name_="network")

        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            # Always emit action 0 while still evaluating policy.sample().
            sample_policy = lambda policy: 0 * policy.sample()
            # sample_policy = lambda policy: 0

            self.collect_hparams.epoch_length = 10
            _, self.collect_trigger_op = collect.define_collect(
                policy_factory,
                generator_batch_env,
                self.collect_hparams,
                eval_phase=False,
                policy_to_actions_lambda=sample_policy,
                scope="define_collect")

        self.avilable_data_size_op = MemoryWrapper.singleton._speculum.size()
        self.data_get_op = MemoryWrapper.singleton._speculum.dequeue()
        self.history_buffer = deque(maxlen=self.history_size + 1)
Example 8
    def _setup(self):
        if self.make_extra_debug_info:
            self.report_reward_statistics_every = 10
            self.dones = 0
            self.real_reward = 0
            # Slight weirdness to make sim env and real env aligned
            if self.simulated_environment:
                self.real_env.reset()
                for _ in range(self.num_input_frames):
                    self.real_ob, _, _, _ = self.real_env.step(0)
            self.total_sim_reward, self.total_real_reward = 0.0, 0.0
            self.sum_of_rewards = 0.0
            self.successful_episode_reward_predictions = 0

        in_graph_wrappers = self.in_graph_wrappers + [
            (atari.MemoryWrapper, {}), (StackAndSkipWrapper, {
                "skip": 4
            })
        ]
        env_hparams = tf.contrib.training.HParams(
            in_graph_wrappers=in_graph_wrappers,
            problem=self.real_env_problem if self.real_env_problem else self,
            simulated_environment=self.simulated_environment)
        if self.simulated_environment:
            env_hparams.add_hparam("simulation_random_starts",
                                   self.simulation_random_starts)
            env_hparams.add_hparam("intrinsic_reward_scale",
                                   self.intrinsic_reward_scale)

        generator_batch_env = batch_env_factory(self.environment_spec,
                                                env_hparams,
                                                num_agents=1,
                                                xvfb=False)

        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            if FLAGS.agent_policy_path:
                policy_lambda = self.collect_hparams.network
            else:
                # When no agent_policy_path is set, just generate random samples.
                policy_lambda = rl.random_policy_fun

        if FLAGS.autoencoder_path:
            # TODO(lukaszkaiser): remove hard-coded autoencoder params.
            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                self.setup_autoencoder()
                autoencoder_model = self.autoencoder_model
                # Feeds for autoencoding.
                shape = [
                    self.raw_frame_height, self.raw_frame_width,
                    self.num_channels
                ]
                self.autoencoder_feed = tf.placeholder(tf.int32, shape=shape)
                self.autoencoder_result = self.autoencode_tensor(
                    self.autoencoder_feed)
                # Now for autodecoding.
                shape = self.frame_shape
                self.autodecoder_feed = tf.placeholder(tf.int32, shape=shape)
                bottleneck = tf.reshape(
                    discretization.int_to_bit(self.autodecoder_feed, 8), [
                        1, 1, self.frame_height, self.frame_width,
                        self.num_channels * 8
                    ])
                autoencoder_model.set_mode(tf.estimator.ModeKeys.PREDICT)
                self.autodecoder_result = autoencoder_model.decode(bottleneck)

        def preprocess_fn(x):
            shape = [
                self.raw_frame_height, self.raw_frame_width, self.num_channels
            ]
            # TODO(lukaszkaiser): we assume x comes from StackAndSkipWrapper skip=4.
            xs = [tf.reshape(t, [1] + shape) for t in tf.split(x, 4, axis=-1)]
            autoencoded = self.autoencode_tensor(tf.concat(xs, axis=0),
                                                 batch_size=4)
            encs = [
                tf.squeeze(t, axis=[0])
                for t in tf.split(autoencoded, 4, axis=0)
            ]
            res = tf.to_float(tf.concat(encs, axis=-1))
            return tf.expand_dims(res, axis=0)

        # TODO(lukaszkaiser): x is from StackAndSkipWrapper thus 4*num_channels.
        shape = [1, self.frame_height, self.frame_width, 4 * self.num_channels]
        do_preprocess = (self.autoencoder_model is not None
                         and not self.simulated_environment)
        preprocess = (preprocess_fn, shape) if do_preprocess else None

        def policy(x):
            return policy_lambda(self.environment_spec().action_space,
                                 self.collect_hparams, x)

        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            self.collect_hparams.epoch_length = 10
            _, self.collect_trigger_op = collect.define_collect(
                policy,
                generator_batch_env,
                self.collect_hparams,
                eval_phase=self.eval_phase,
                scope="define_collect",
                preprocess=preprocess)

        self.avilable_data_size_op = (
            atari.MemoryWrapper.singleton.speculum.size())
        self.data_get_op = atari.MemoryWrapper.singleton.speculum.dequeue()
Example 9
    def _setup(self,
               data_dir,
               extra_collect_hparams=None,
               override_collect_hparams=None):
        dumper_path = os.path.join(data_dir, "dumper")
        dumper_exists = tf.gfile.Exists(dumper_path)
        tf.logging.info("Dumper path %s." % dumper_path)
        if dumper_exists and not self.settable_eval_phase:
            tf.logging.info("Using dumper data.")
            self._use_dumper_data = True
            self._dumper_data_index = 0
            self._dumper_path = dumper_path
        else:
            # TODO(piotrmilos): this should be consistent with
            # ppo_params in model_rl_experiment
            collect_hparams = rl.ppo_pong_base()
            collect_hparams.add_hparam("environment_spec",
                                       self.environment_spec)
            collect_hparams.add_hparam(
                "force_beginning_resets",
                self._internal_memory_force_beginning_resets)
            collect_hparams.epoch_length = self._internal_memory_size
            collect_hparams.num_agents = 1

            if not FLAGS.agent_policy_path:
                collect_hparams.policy_network = rl.random_policy_fun

            if extra_collect_hparams is not None:
                for (key, value) in six.iteritems(extra_collect_hparams):
                    collect_hparams.add_hparam(key, value)

            if override_collect_hparams is not None:
                # Override hparams manually - HParams.override_from_dict does not work
                # with functions.
                for (key, value) in six.iteritems(override_collect_hparams):
                    setattr(collect_hparams, key, value)

            policy_to_actions_lambda = None
            if self.settable_eval_phase:
                # During eval take the most likely action instead of sampling.
                policy_to_actions_lambda = lambda policy: policy.mode()

            collect_level = 2  # After Resize and RewardClipping.
            if collect_hparams.environment_spec.simulated_env:
                collect_level = 1  # We still have reward clipping.
            if self._forced_collect_level is not None:  # For autoencoders.
                collect_level = self._forced_collect_level

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                self.collect_memory, self.collect_trigger_op, collect_init = (
                    collect.define_collect(
                        collect_hparams,
                        scope="gym_problems",
                        eval_phase=False,
                        collect_level=collect_level,
                        policy_to_actions_lambda=policy_to_actions_lambda))

            self._session = tf.Session()
            collect_init(self._session)
            self._session.run(tf.global_variables_initializer())
            self.restore_networks(self._session)
            self.memory_index = 0
            self.memory = None
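
A hypothetical call to this _setup, illustrating the two hparams hooks; problem, the data directory and the "frame_stack_size" key are assumptions, while the override mirrors the function-valued hparam that the loop above handles manually.

problem._setup(
    data_dir="/tmp/gym_problem",
    extra_collect_hparams={"frame_stack_size": 4},
    override_collect_hparams={"policy_network": rl.random_policy_fun})
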
Example 10
def define_train(hparams, environment_spec, event_dir):
    """Define the training setup."""
    policy_lambda = hparams.network

    if environment_spec == "stacked_pong":
        environment_spec = lambda: gym.make("PongNoFrameskip-v4")
        wrappers = hparams.in_graph_wrappers if hasattr(
            hparams, "in_graph_wrappers") else []
        wrappers.append((tf_atari_wrappers.MaxAndSkipWrapper, {"skip": 4}))
        hparams.in_graph_wrappers = wrappers
    if isinstance(environment_spec, str):
        env_lambda = lambda: gym.make(environment_spec)
    else:
        env_lambda = environment_spec

    batch_env = utils.batch_env_factory(env_lambda,
                                        hparams,
                                        num_agents=hparams.num_agents)

    policy_factory = tf.make_template(
        "network",
        functools.partial(policy_lambda, batch_env.action_space, hparams))

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        memory, collect_summary = collect.define_collect(policy_factory,
                                                         batch_env,
                                                         hparams,
                                                         eval_phase=False)
        ppo_summary = ppo.define_ppo_epoch(memory, policy_factory, hparams)
        summary = tf.summary.merge([collect_summary, ppo_summary])

    with tf.variable_scope("eval", reuse=tf.AUTO_REUSE):
        eval_env_lambda = env_lambda
        if event_dir and hparams.video_during_eval:
            # Some environments reset automatically when the done state is reached.
            # For those we record only every second episode.
            d = 2 if env_lambda().metadata.get("semantics.autoreset") else 1
            eval_env_lambda = lambda: gym.wrappers.Monitor(  # pylint: disable=g-long-lambda
                env_lambda(),
                event_dir,
                video_callable=lambda i: i % d == 0)
        wrapped_eval_env_lambda = lambda: utils.EvalVideoWrapper(
            eval_env_lambda())
        # eval_batch_env = utils.define_batch_env(
        #     wrapped_eval_env_lambda, hparams.num_eval_agents,
        #     xvfb=hparams.video_during_eval)
        eval_batch_env = utils.batch_env_factory(
            wrapped_eval_env_lambda,
            hparams,
            num_agents=hparams.num_eval_agents,
            xvfb=hparams.video_during_eval)

        # TODO(blazej0): correct to the version below.
        corrected = False
        eval_summary = tf.no_op()
        if corrected:
            _, eval_summary = collect.define_collect(policy_factory,
                                                     eval_batch_env,
                                                     hparams,
                                                     eval_phase=True)
    return summary, eval_summary
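
Two hypothetical call sites for this version of define_train, showing the string and callable forms of environment_spec; the hparams object, event directory and environment id are assumptions.

# A registered shorthand handled specially above.
train_summary, eval_summary = define_train(
    hparams, environment_spec="stacked_pong", event_dir="/tmp/events")

# An explicit constructor: any callable returning a gym environment works.
train_summary, eval_summary = define_train(
    hparams, environment_spec=lambda: gym.make("BreakoutNoFrameskip-v4"),
    event_dir=None)
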
Example 11
  def _setup(self):
    if self.make_extra_debug_info:
      self.report_reward_statistics_every = 10
      self.dones = 0
      self.real_reward = 0
      self.real_env.reset()
      # Slight weirdness to make sim env and real env aligned
      for _ in range(self.num_input_frames):
        self.real_ob, _, _, _ = self.real_env.step(0)
      self.total_sim_reward, self.total_real_reward = 0.0, 0.0
      self.sum_of_rewards = 0.0
      self.successful_episode_reward_predictions = 0

    in_graph_wrappers = self.in_graph_wrappers + [(atari.MemoryWrapper, {})]
    env_hparams = tf.contrib.training.HParams(
        in_graph_wrappers=in_graph_wrappers,
        problem=self,
        simulated_environment=self.simulated_environment)

    generator_batch_env = batch_env_factory(
        self.environment_spec, env_hparams, num_agents=1, xvfb=False)

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      if FLAGS.agent_policy_path:
        policy_lambda = self.collect_hparams.network
      else:
        # When no agent_policy_path is set, just generate random samples.
        policy_lambda = rl.random_policy_fun
      policy_factory = tf.make_template(
          "network",
          functools.partial(policy_lambda, self.environment_spec().action_space,
                            self.collect_hparams),
          create_scope_now_=True,
          unique_name_="network")

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      self.collect_hparams.epoch_length = 10
      _, self.collect_trigger_op = collect.define_collect(
          policy_factory, generator_batch_env, self.collect_hparams,
          eval_phase=False, scope="define_collect")

    if FLAGS.autoencoder_path:
      # TODO(lukaszkaiser): remove hard-coded autoencoder params.
      with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        self.setup_autoencoder()
        autoencoder_model = self.autoencoder_model
        # Feeds for autoencoding.
        shape = [self.raw_frame_height, self.raw_frame_width, self.num_channels]
        self.autoencoder_feed = tf.placeholder(tf.int32, shape=shape)
        autoencoded = autoencoder_model.encode(
            tf.reshape(self.autoencoder_feed, [1, 1] + shape))
        autoencoded = tf.reshape(
            autoencoded, [self.frame_height, self.frame_width,
                          self.num_channels, 8])  # 8-bit groups.
        self.autoencoder_result = discretization.bit_to_int(autoencoded, 8)
        # Now for autodecoding.
        shape = [self.frame_height, self.frame_width, self.num_channels]
        self.autodecoder_feed = tf.placeholder(tf.int32, shape=shape)
        bottleneck = tf.reshape(
            discretization.int_to_bit(self.autodecoder_feed, 8),
            [1, 1, self.frame_height, self.frame_width, self.num_channels * 8])
        autoencoder_model.set_mode(tf.estimator.ModeKeys.PREDICT)
        self.autodecoder_result = autoencoder_model.decode(bottleneck)

    self.avilable_data_size_op = atari.MemoryWrapper.singleton.speculum.size()
    self.data_get_op = atari.MemoryWrapper.singleton.speculum.dequeue()
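
A hedged sketch of how the autoencoder feeds built above might be exercised; sess, problem and raw_frame (an int array of shape [raw_frame_height, raw_frame_width, num_channels]) are assumptions, and problem._setup() is presumed to have already run.

encoded_frame = sess.run(problem.autoencoder_result,
                         feed_dict={problem.autoencoder_feed: raw_frame})
reconstructed = sess.run(problem.autodecoder_result,
                         feed_dict={problem.autodecoder_feed: encoded_frame})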