Example #1
  def __init__(self, name, env, policy_net, value_net, global_counter, discount_factor=0.99, summary_writer=None, max_global_steps=None, log_frequency=1000):
    self.name = name
    self.discount_factor = discount_factor
    self.max_global_steps = max_global_steps
    self.global_step = tf.contrib.framework.get_global_step()
    self.global_policy_net = policy_net
    self.global_value_net = value_net
    self.global_counter = global_counter
    self.local_counter = itertools.count()
    self.sp = StateProcessor()
    self.summary_writer = summary_writer
    self.env = env
    self.log_frequency = log_frequency

    # Create local policy/value nets that are not updated asynchronously
    with tf.variable_scope(name):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)
      self.value_net = ValueEstimator(reuse=True)

    # Op to copy params from global policy/value nets
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope=self.name+'/', collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
    self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

    self.state = None
Example #2
    def __init__(self, name, env, policy_net, value_net, global_counter,
                 discount_factor=0.99, summary_writer=None,
                 max_global_steps=None):
        self.name = name
        self.discount_factor = discount_factor
        self.max_global_steps = max_global_steps
        self.global_step = tf.contrib.framework.get_global_step()
        self.global_policy_net = policy_net
        self.global_value_net = value_net
        self.global_counter = global_counter
        self.local_counter = itertools.count()
        self.sp = StateProcessor()
        self.summary_writer = summary_writer
        self.env = env

        with tf.variable_scope(name):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)
            self.value_net = ValueEstimator(reuse=True)

        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(
                scope="global",
                collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(
                scope=self.name+'/',
                collection=tf.GraphKeys.TRAINABLE_VARIABLES)
        )

        self.vnet_train_op = make_train_op(
            self.value_net, self.global_value_net)
        self.pnet_train_op = make_train_op(
            self.policy_net, self.global_policy_net)

        self.state = None
Example #3
  def __init__(self, name, env, model_net, global_counter, discount_factor=0.99, summary_writer=None, max_global_steps=None):
    self.name = name
    self.discount_factor = discount_factor
    self.max_global_steps = max_global_steps
    self.global_step = tf.contrib.framework.get_global_step()
    self.global_model_net = model_net
    self.global_counter = global_counter
    self.local_counter = itertools.count()
    self.sp = StateProcessor()
    self.summary_writer = summary_writer
    self.env = env
    self.total_reward = 0
    self.episode_length = 0

    # Create local policy/value nets that are not updated asynchronously
    with tf.variable_scope(name):
      self.model_net = Model(model_net.num_outputs)

    # Op to copy params from global policy/value nets
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    self.mnet_train_op = make_train_op(self.model_net, self.global_model_net)

    self.state = None
  def __init__(self, env, policy_net, summary_writer, saver=None):

    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)

    self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
    self.global_policy_net = policy_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
      os.makedirs(self.video_dir)
    except FileExistsError:
      pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))
    def testPredict(self):
        env = make_env()
        sp = StateProcessor()
        estimator = PolicyEstimator(len(VALID_ACTIONS))

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())

            # Generate a state
            state = sp.process(env.reset())
            processed_state = atari_helpers.atari_make_initial_state(state)
            processed_states = np.array([processed_state])

            # Run feeds
            feed_dict = {
                estimator.states: processed_states,
                estimator.targets: [1.0],
                estimator.actions: [1]
            }

            loss = sess.run(estimator.loss, feed_dict)
            pred = sess.run(estimator.predictions, feed_dict)

            # Assertions
            self.assertTrue(loss != 0.0)
            self.assertEqual(pred["probs"].shape, (1, len(VALID_ACTIONS)))
            self.assertEqual(pred["logits"].shape, (1, len(VALID_ACTIONS)))
    def testGradient(self):
        env = make_env()
        sp = StateProcessor()
        estimator = PolicyEstimator(len(VALID_ACTIONS))
        grads = [g for g, _ in estimator.grads_and_vars]

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())

            # Generate a state
            state = sp.process(env.reset())
            processed_state = atari_helpers.atari_make_initial_state(state)
            processed_states = np.array([processed_state])

            # Run feeds to get gradients
            feed_dict = {
                estimator.states: processed_states,
                estimator.targets: [1.0],
                estimator.actions: [1]
            }

            grads_ = sess.run(grads, feed_dict)

            # Apply calculated gradients
            grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
            _ = sess.run(estimator.train_op, grad_feed_dict)
Example #7
    def __init__(self, env, policy_net, summary_writer, saver=None):
        self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
        self.video_dir = os.path.abspath(self.video_dir)

        self.env = Monitor(env,
                           directory=self.video_dir,
                           video_callable=lambda x: True,
                           resume=True)
        self.global_policy_net = policy_net
        self.summary_writer = summary_writer
        self.saver = saver
        self.sp = StateProcessor()

        self.checkpoint_path = os.path.abspath(
            os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

        try:
            os.makedirs(self.video_dir)
        except FileExistsError:
            pass

        # Local policy net
        with tf.variable_scope("policy_eval"):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)

        # Op to copy params from the global policy net
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(
                scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(
                scope="policy_eval",
                collection=tf.GraphKeys.TRAINABLE_VARIABLES))
Example #8
    def setUp(self):
        super(WorkerTest, self).setUp()

        self.env = make_env()
        self.discount_factor = 0.99
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.global_counter = itertools.count()
        self.sp = StateProcessor()

        with tf.variable_scope("global") as vs:
            self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS))
            self.global_value_net = ValueEstimator(reuse=True)
Example #9
class WorkerTest(tf.test.TestCase):
    def setUp(self):
        super(WorkerTest, self).setUp()

        self.env = make_env()
        self.discount_factor = 0.99
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.global_counter = itertools.count()
        self.sp = StateProcessor()

        with tf.variable_scope("global") as vs:
            print("length of the actions: {}".format(len(VALID_ACTIONS)))
            self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS))
            self.global_value_net = ValueEstimator(reuse=True)

    def testPolicyNetPredict(self):
        w = Worker(name="test",
                   env=make_env(),
                   policy_net=self.global_policy_net,
                   value_net=self.global_value_net,
                   global_counter=self.global_counter,
                   discount_factor=self.discount_factor)

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())
            state = self.sp.process(self.env.reset())
            processed_state = atari_helpers.atari_make_initial_state(state)
            action_values = w._policy_net_predict(processed_state, sess)
            self.assertEqual(action_values.shape, (4, ))

    def testValueNetPredict(self):
        w = Worker(name="test",
                   env=make_env(),
                   policy_net=self.global_policy_net,
                   value_net=self.global_value_net,
                   global_counter=self.global_counter,
                   discount_factor=self.discount_factor)

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())
            state = self.sp.process(self.env.reset())
            processed_state = atari_helpers.atari_make_initial_state(state)
            w.state = processed_state
            transitions, local_t, global_t = w.run_n_steps(10, sess)
            policy_net_loss, value_net_loss, policy_net_summaries, value_net_summaries = w.update(
                transitions, sess)

        self.assertEqual(len(transitions), 10)
        self.assertIsNotNone(policy_net_loss)
        self.assertIsNotNone(value_net_loss)
        self.assertIsNotNone(policy_net_summaries)
        self.assertIsNotNone(value_net_summaries)
  def __init__(self, env, policy_net, summary_writer, saver=None):

    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)

    self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
    self.global_policy_net = policy_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
      os.makedirs(self.video_dir)
    except FileExistsError:
      pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))
Example #11
  def __init__(self, name, env, policy_net, value_net, global_counter, discount_factor=0.99, summary_writer=None, max_global_steps=None):
    self.name = name
    self.discount_factor = discount_factor
    self.max_global_steps = max_global_steps
    self.global_step = tf.contrib.framework.get_global_step()
    self.global_policy_net = policy_net
    self.global_value_net = value_net
    self.global_counter = global_counter
    self.local_counter = itertools.count()
    self.sp = StateProcessor()
    self.summary_writer = summary_writer
    self.env = env

    # Create local policy/value nets that are not updated asynchronously
    with tf.variable_scope(name):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)
      self.value_net = ValueEstimator(reuse=True)

    # Op to copy params from global policy/value nets
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
    self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

    self.state = None
Example #12
    def __init__(self, env, global_net, summary_writer, saver=None):
        self.env = env
        self.global_net = global_net
        self.summary_writer = summary_writer
        self.saver = saver
        self.sp = StateProcessor()

        self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
        self.video_dir = os.path.abspath(self.video_dir)
        self.checkpoint_path = os.path.abspath(
            os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

        try:
            os.makedirs(self.video_dir)
        except FileExistsError:
            pass

        # Local policy net
        with tf.variable_scope("policy_eval"):
            self.local_net = PolicyValueEstimator()

        # Op to copy params from global policy/value net parameters
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(
                scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(
                scope="policy_eval",
                collection=tf.GraphKeys.TRAINABLE_VARIABLES))
  def setUp(self):
    super(WorkerTest, self).setUp()

    self.env = make_env()
    self.discount_factor = 0.99
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.global_counter = itertools.count()
    self.sp = StateProcessor()

    with tf.variable_scope("global") as vs:
      self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS))
      self.global_value_net = ValueEstimator(reuse=True)
class PolicyMonitor(object):
  """
  Helps evaluate a policy by running an episode in an environment,
  saving a video, and plotting summaries to Tensorboard.

  Args:
    env: environment to run in
    policy_net: A policy estimator
    summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries
  """
  def __init__(self, env, policy_net, summary_writer, saver=None):

    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)

    self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
    self.global_policy_net = policy_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
      os.makedirs(self.video_dir)
    except FileExistsError:
      pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

  def _policy_net_predict(self, state, sess):
    feed_dict = { self.policy_net.states: [state] }
    preds = sess.run(self.policy_net.predictions, feed_dict)
    return preds["probs"][0]

  def eval_once(self, sess):
    with sess.as_default(), sess.graph.as_default():
      # Copy params to local model
      global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op])

      # Run an episode
      done = False
      state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
      total_reward = 0.0
      episode_length = 0
      while not done:
        action_probs = self._policy_net_predict(state, sess)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = self.env.step(action)
        next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state))
        total_reward += reward
        episode_length += 1
        state = next_state

      # Add summaries
      episode_summary = tf.Summary()
      episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
      episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
      self.summary_writer.add_summary(episode_summary, global_step)
      self.summary_writer.flush()

      if self.saver is not None:
        self.saver.save(sess, self.checkpoint_path)

      tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(global_step, total_reward, episode_length))

      f_reward.write(str(global_step) + " " + str(total_reward) + " " + str(episode_length) + "\n")

      return total_reward, episode_length

  def continuous_eval(self, eval_every, sess, coord):
    """
    Continuously evaluates the policy every [eval_every] seconds.
    """
    try:
      while not coord.should_stop():
        self.eval_once(sess)
        # Sleep until next evaluation cycle
        time.sleep(eval_every)
    except tf.errors.CancelledError:
      return
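
A `PolicyMonitor` like the one above is normally driven from its own thread next to the training workers, with `continuous_eval` looping until the coordinator stops. The snippet below is a minimal sketch of that wiring under stated assumptions: `make_env`, the shared `global_policy_net`, and `summary_writer` are taken as already built, and the 60-second interval and thread name are purely illustrative.

import threading

import tensorflow as tf

# Hypothetical wiring: the monitor evaluates the current global policy every
# 60 seconds while training threads run under the same coordinator.
pe = PolicyMonitor(
  env=make_env(),
  policy_net=global_policy_net,
  summary_writer=summary_writer,
  saver=tf.train.Saver())

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  coord = tf.train.Coordinator()

  monitor_thread = threading.Thread(
    target=lambda: pe.continuous_eval(60, sess, coord))
  monitor_thread.daemon = True  # do not block process exit
  monitor_thread.start()

  # ... worker threads would be started here ...

  coord.join([monitor_thread])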
Example #15
class Worker(object):
    """
  An A3C worker thread. Runs episodes locally and updates global shared value and policy nets.

  Args:
    name: A unique name for this worker
    env: The Gym environment used by this worker
    policy_net: Instance of the globally shared policy net
    value_net: Instance of the globally shared value net
    global_counter: Iterator that holds the global step
    discount_factor: Reward discount factor
    summary_writer: A tf.train.SummaryWriter for Tensorboard summaries
    max_global_steps: If set, stop coordinator when global_counter > max_global_steps
  """
    def __init__(self,
                 name,
                 env,
                 policy_net,
                 value_net,
                 global_counter,
                 discount_factor=0.99,
                 summary_writer=None,
                 max_global_steps=None):
        self.name = name
        self.discount_factor = discount_factor
        self.max_global_steps = max_global_steps
        self.global_step = tf.contrib.framework.get_global_step()
        self.global_policy_net = policy_net
        self.global_value_net = value_net
        self.global_counter = global_counter
        self.local_counter = itertools.count()
        self.sp = StateProcessor()
        self.summary_writer = summary_writer
        self.env = env

        # Create local policy/value nets that are not updated asynchronously
        with tf.variable_scope(name):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)
            self.value_net = ValueEstimator(reuse=True)

        # Op to copy params from global policy/value nets
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(
                scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(
                scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES))

        self.vnet_train_op = make_train_op(self.value_net,
                                           self.global_value_net)
        self.pnet_train_op = make_train_op(self.policy_net,
                                           self.global_policy_net)

        self.state = None

    def run(self, sess, coord, t_max):
        with sess.as_default(), sess.graph.as_default():
            # Initial state
            self.state = atari_helpers.atari_make_initial_state(
                self.sp.process(self.env.reset()))
            try:
                while not coord.should_stop():
                    # Copy Parameters from the global networks
                    sess.run(self.copy_params_op)

                    # Collect some experience
                    transitions, local_t, global_t = self.run_n_steps(
                        t_max, sess)

                    if self.max_global_steps is not None and global_t >= self.max_global_steps:
                        tf.logging.info(
                            "Reached global step {}. Stopping.".format(
                                global_t))
                        coord.request_stop()
                        return

                    # Update the global networks
                    self.update(transitions, sess)

            except tf.errors.CancelledError:
                return

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def _value_net_predict(self, state, sess):
        feed_dict = {self.value_net.states: [state]}
        preds = sess.run(self.value_net.predictions, feed_dict)
        return preds["logits"][0]

    def run_n_steps(self, n, sess):
        transitions = []
        for _ in range(n):
            # Take a step
            action_probs = self._policy_net_predict(self.state, sess)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = self.env.step(action)
            next_state = atari_helpers.atari_make_next_state(
                self.state, self.sp.process(next_state))

            # Store transition
            transitions.append(
                Transition(state=self.state,
                           action=action,
                           reward=reward,
                           next_state=next_state,
                           done=done))

            # Increase local and global counters
            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            if local_t % 100 == 0:
                tf.logging.info("{}: local Step {}, global step {}".format(
                    self.name, local_t, global_t))

            if done:
                self.state = atari_helpers.atari_make_initial_state(
                    self.sp.process(self.env.reset()))
                break
            else:
                self.state = next_state
        return transitions, local_t, global_t

    def update(self, transitions, sess):
        """
    Updates global policy and value networks based on collected experience

    Args:
      transitions: A list of experience transitions
      sess: A Tensorflow session
    """

        # If the episode was not done, we bootstrap the value from the last state
        reward = 0.0
        if not transitions[-1].done:
            reward = self._value_net_predict(transitions[-1].next_state, sess)

        # Accumulate minibatch examples
        states = []
        policy_targets = []
        value_targets = []
        actions = []

        for transition in transitions[::-1]:
            reward = transition.reward + self.discount_factor * reward
            policy_target = (reward -
                             self._value_net_predict(transition.state, sess))
            # Accumulate updates
            states.append(transition.state)
            actions.append(transition.action)
            policy_targets.append(policy_target)
            value_targets.append(reward)

        feed_dict = {
            self.policy_net.states: np.array(states),
            self.policy_net.targets: policy_targets,
            self.policy_net.actions: actions,
            self.value_net.states: np.array(states),
            self.value_net.targets: value_targets,
        }

        # Train the global estimators using local gradients
        global_step, pnet_loss, vnet_loss, _, _, pnet_summaries, vnet_summaries = sess.run(
            [
                self.global_step, self.policy_net.loss, self.value_net.loss,
                self.pnet_train_op, self.vnet_train_op,
                self.policy_net.summaries, self.value_net.summaries
            ], feed_dict)

        # Write summaries
        if self.summary_writer is not None:
            self.summary_writer.add_summary(pnet_summaries, global_step)
            self.summary_writer.add_summary(vnet_summaries, global_step)
            self.summary_writer.flush()

        return pnet_loss, vnet_loss, pnet_summaries, vnet_summaries
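
The reversed loop in `update()` above is just an n-step discounted return with a bootstrapped tail: walk the transitions newest-to-oldest, fold each reward into the running return, and use the return minus the value estimate as the policy target. A small self-contained sketch of that arithmetic follows; the rewards, discount factor, and bootstrap value are made-up numbers, not taken from the example.

# Made-up inputs: three collected rewards (oldest -> newest), gamma = 0.99,
# and a bootstrap value of 5.0 predicted for the state after the last step.
discount_factor = 0.99
rewards = [1.0, 0.0, 2.0]
reward = 5.0  # would be 0.0 if the episode had ended

value_targets = []
for r in reversed(rewards):  # newest -> oldest, like transitions[::-1]
    reward = r + discount_factor * reward
    value_targets.append(reward)  # target for the value net
    # the policy target (advantage) would be: reward - V(state) for that state

print(value_targets)  # roughly [6.95, 6.88, 7.81]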
class PolicyMonitor(object):
  """
  Helps evaluate a policy by running an episode in an environment,
  saving a video, and plotting summaries to Tensorboard.

  Args:
    env: environment to run in
    policy_net: A policy estimator
    summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries
  """
  def __init__(self, env, policy_net, summary_writer, saver=None):

    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)

    self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
    self.global_policy_net = policy_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
      os.makedirs(self.video_dir)
    except FileExistsError:
      pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

  def _policy_net_predict(self, state, sess):
    feed_dict = { self.policy_net.states: [state] }
    preds = sess.run(self.policy_net.predictions, feed_dict)
    return preds["probs"][0]

  def eval_once(self, sess):
    with sess.as_default(), sess.graph.as_default():
      # Copy params to local model
      global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op])

      # Run an episode
      done = False
      state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
      total_reward = 0.0
      episode_length = 0
      while not done:
        action_probs = self._policy_net_predict(state, sess)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = self.env.step(action)
        next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state))
        total_reward += reward
        episode_length += 1
        state = next_state

      # Add summaries
      episode_summary = tf.Summary()
      episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
      episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
      self.summary_writer.add_summary(episode_summary, global_step)
      self.summary_writer.flush()

      if self.saver is not None:
        self.saver.save(sess, self.checkpoint_path)

      tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(global_step, total_reward, episode_length))

      return total_reward, episode_length

  def continuous_eval(self, eval_every, sess, coord):
    """
    Continuously evaluates the policy every [eval_every] seconds.
    """
    try:
      while not coord.should_stop():
        self.eval_once(sess)
        # Sleep until next evaluation cycle
        time.sleep(eval_every)
    except tf.errors.CancelledError:
      return
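
Every example in this listing calls `make_copy_params_op` (and the workers also call `make_train_op`), but neither helper is shown. As a point of reference only, here is a plausible sketch of what such a copy op could look like: pair the two variable lists by name order and assign the global values into the local copies. This is an assumption about the helper, not the examples' actual implementation.

import tensorflow as tf

def make_copy_params_op(v1_list, v2_list):
  """Hypothetical sketch: ops that copy each variable in v1_list into v2_list.

  Assumes both lists hold matching variables (same shapes) that line up once
  sorted by name, e.g. the global net's vars vs. a worker's local net vars.
  """
  v1_list = sorted(v1_list, key=lambda v: v.name)
  v2_list = sorted(v2_list, key=lambda v: v.name)

  update_ops = []
  for v1, v2 in zip(v1_list, v2_list):
    update_ops.append(v2.assign(v1))  # copy global value into local variable
  return update_ops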
Example #17
class Worker(object):
  """
  An A3C worker thread. Runs episodes locally and updates global shared value and policy nets.

  Args:
    name: A unique name for this worker
    env: The Gym environment used by this worker
    policy_net: Instance of the globally shared policy net
    value_net: Instance of the globally shared value net
    global_counter: Iterator that holds the global step
    discount_factor: Reward discount factor
    summary_writer: A tf.train.SummaryWriter for Tensorboard summaries
    max_global_steps: If set, stop coordinator when global_counter > max_global_steps
  """
  def __init__(self, name, env, policy_net, value_net, global_counter, discount_factor=0.99, summary_writer=None, max_global_steps=None):
    self.name = name
    self.discount_factor = discount_factor
    self.max_global_steps = max_global_steps
    self.global_step = tf.contrib.framework.get_global_step()
    self.global_policy_net = policy_net
    self.global_value_net = value_net
    self.global_counter = global_counter
    self.local_counter = itertools.count()
    self.sp = StateProcessor()
    self.summary_writer = summary_writer
    self.env = env

    # Create local policy/value nets that are not updated asynchronously
    with tf.variable_scope(name):
      self.policy_net = PolicyEstimator(policy_net.num_outputs)
      self.value_net = ValueEstimator(reuse=True)

    # Op to copy params from global policy/value nets
    self.copy_params_op = make_copy_params_op(
      tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
      tf.contrib.slim.get_variables(scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
    self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

    self.state = None

  def run(self, sess, coord, t_max):
    with sess.as_default(), sess.graph.as_default():
      # Initial state
      self.state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
      try:
        while not coord.should_stop():
          # Copy Parameters from the global networks
          sess.run(self.copy_params_op)

          # Collect some experience
          transitions, local_t, global_t = self.run_n_steps(t_max, sess)

          if self.max_global_steps is not None and global_t >= self.max_global_steps:
            tf.logging.info("Reached global step {}. Stopping.".format(global_t))
            coord.request_stop()
            return

          # Update the global networks
          self.update(transitions, sess)

      except tf.errors.CancelledError:
        return

  def _policy_net_predict(self, state, sess):
    feed_dict = { self.policy_net.states: [state] }
    preds = sess.run(self.policy_net.predictions, feed_dict)
    return preds["probs"][0]

  def _value_net_predict(self, state, sess):
    feed_dict = { self.value_net.states: [state] }
    preds = sess.run(self.value_net.predictions, feed_dict)
    return preds["logits"][0]

  def run_n_steps(self, n, sess):
    transitions = []
    for _ in range(n):
      # Take a step
      action_probs = self._policy_net_predict(self.state, sess)
      action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
      next_state, reward, done, _ = self.env.step(action)
      next_state = atari_helpers.atari_make_next_state(self.state, self.sp.process(next_state))

      # Store transition
      transitions.append(Transition(
        state=self.state, action=action, reward=reward, next_state=next_state, done=done))

      # Increase local and global counters
      local_t = next(self.local_counter)
      global_t = next(self.global_counter)

      if local_t % 100 == 0:
        tf.logging.info("{}: local Step {}, global step {}".format(self.name, local_t, global_t))

      if done:
        self.state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
        break
      else:
        self.state = next_state
    return transitions, local_t, global_t

  def update(self, transitions, sess):
    """
    Updates global policy and value networks based on collected experience

    Args:
      transitions: A list of experience transitions
      sess: A Tensorflow session
    """

    # If the episode was not done, we bootstrap the value from the last state
    reward = 0.0
    if not transitions[-1].done:
      reward = self._value_net_predict(transitions[-1].next_state, sess)

    # Accumulate minibatch examples
    states = []
    policy_targets = []
    value_targets = []
    actions = []

    for transition in transitions[::-1]:
      reward = transition.reward + self.discount_factor * reward
      policy_target = (reward - self._value_net_predict(transition.state, sess))
      # Accumulate updates
      states.append(transition.state)
      actions.append(transition.action)
      policy_targets.append(policy_target)
      value_targets.append(reward)

    feed_dict = {
      self.policy_net.states: np.array(states),
      self.policy_net.targets: policy_targets,
      self.policy_net.actions: actions,
      self.value_net.states: np.array(states),
      self.value_net.targets: value_targets,
    }

    # Train the global estimators using local gradients
    global_step, pnet_loss, vnet_loss, _, _, pnet_summaries, vnet_summaries = sess.run([
      self.global_step,
      self.policy_net.loss,
      self.value_net.loss,
      self.pnet_train_op,
      self.vnet_train_op,
      self.policy_net.summaries,
      self.value_net.summaries
    ], feed_dict)

    # Write summaries
    if self.summary_writer is not None:
      self.summary_writer.add_summary(pnet_summaries, global_step)
      self.summary_writer.add_summary(vnet_summaries, global_step)
      self.summary_writer.flush()

    return pnet_loss, vnet_loss, pnet_summaries, vnet_summaries
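
With a complete `Worker` like the one above, training typically amounts to building the global nets, creating one worker per thread, and letting a `tf.train.Coordinator` stop everything together. The sketch below shows that wiring under stated assumptions: `NUM_WORKERS`, `make_env`, the global estimators, `global_counter`, and a `global_step` variable are assumed to exist already, and `t_max=5` is just an illustrative rollout length.

import threading

import tensorflow as tf

# Hypothetical training harness for the Worker class above.
workers = []
for worker_id in range(NUM_WORKERS):
  workers.append(Worker(
    name="worker_{}".format(worker_id),
    env=make_env(),
    policy_net=global_policy_net,
    value_net=global_value_net,
    global_counter=global_counter,
    discount_factor=0.99))

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  coord = tf.train.Coordinator()

  worker_threads = []
  for worker in workers:
    t = threading.Thread(target=lambda w=worker: w.run(sess, coord, t_max=5))
    t.start()
    worker_threads.append(t)

  # Wait for all workers to finish (or for one of them to request a stop)
  coord.join(worker_threads)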
Example #18
class Worker(object):

    def __init__(self, name, env, policy_net, value_net, global_counter,
                 discount_factor=0.99, summary_writer=None,
                 max_global_steps=None):
        self.name = name
        self.discount_factor = discount_factor
        self.max_global_steps = max_global_steps
        self.global_step = tf.contrib.framework.get_global_step()
        self.global_policy_net = policy_net
        self.global_value_net = value_net
        self.global_counter = global_counter
        self.local_counter = itertools.count()
        self.sp = StateProcessor()
        self.summary_writer = summary_writer
        self.env = env

        with tf.variable_scope(name):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)
            self.value_net = ValueEstimator(reuse=True)

        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(
                scope="global",
                collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(
                scope=self.name+'/',
                collection=tf.GraphKeys.TRAINABLE_VARIABLES)
        )

        self.vnet_train_op = make_train_op(
            self.value_net, self.global_value_net)
        self.pnet_train_op = make_train_op(
            self.policy_net, self.global_policy_net)

        self.state = None

    def run(self, sess, coord, t_max):
        with sess.as_default(), sess.graph.as_default():
            self.state = atari_helpers.atari_make_initial_state(
                self.sp.process(self.env.reset()))
            try:
                while not coord.should_stop():
                    sess.run(self.copy_params_op)

                    transitions, local_t, global_t = self.run_n_steps(
                        t_max, sess)

                    if (self.max_global_steps is not None
                            and global_t >= self.max_global_steps):
                        tf.logging.info(
                            "Reached global step {}. Stopping."
                            .format(global_t))
                        coord.request_stop()
                        return

                    self.update(transitions, sess)

            except tf.errors.CancelledError:
                return

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def _value_net_predict(self, state, sess):
        feed_dict = {self.value_net.states: [state]}
        preds = sess.run(self.value_net.predictions, feed_dict)
        return preds["logits"][0]

    def run_n_steps(self, n, sess):
        transitions = []

        for _ in range(n):
            action_probs = self._policy_net_predict(self.state, sess)
            action = np.random.choice(
                np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = self.env.step(action)
            next_state = atari_helpers.atari_make_next_state(
                self.state, self.sp.process(next_state))

            transitions.append(Transition(state=self.state,
                                          action=action,
                                          reward=reward,
                                          next_state=next_state,
                                          done=done))

            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            if local_t % 100 == 0:
                tf.logging.info("{}: local step {}, global step {}".format(
                    self.name, local_t, global_t))

            if done:
                self.state = atari_helpers.atari_make_initial_state(
                    self.sp.process(self.env.reset()))
                break
            else:
                self.state = next_state

        return transitions, local_t, global_t
Example #19
class PolicyMonitor(object):
    def __init__(self, env, policy_net, summary_writer, saver=None):
        self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
        self.video_dir = os.path.abspath(self.video_dir)

        self.env = Monitor(env,
                           directory=self.video_dir,
                           video_callable=lambda x: True,
                           resume=True)
        self.global_policy_net = policy_net
        self.summary_writer = summary_writer
        self.saver = saver
        self.sp = StateProcessor()

        self.checkpoint_path = os.path.abspath(
            os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

        try:
            os.makedirs(self.video_dir)
        except FileExistsError:
            pass

        # Local policy net
        with tf.variable_scope("policy_eval"):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)

        # Op to copy params from the global policy net
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(
                scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(
                scope="policy_eval",
                collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def eval_once(self, sess):
        with sess.as_default(), sess.graph.as_default():
            global_step, _ = sess.run(
                [tf.contrib.framework.get_global_step(), self.copy_params_op])

            done = False
            state = atari_helpers.atari_make_initial_state(
                self.sp.process(self.env.reset()))
            total_reward = 0.0
            episode_length = 0

            while not done:
                action_probs = self._policy_net_predict(state, sess)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
                next_state, reward, done, _ = self.env.step(action)
                next_state = atari_helpers.atari_make_next_state(
                    state, self.sp.process(next_state))
                total_reward += reward
                episode_length += 1
                state = next_state

            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=total_reward,
                                      tag="eval/total_reward")
            episode_summary.value.add(simple_value=episode_length,
                                      tag="eval/episode_length")
            self.summary_writer.add_summary(episode_summary, global_step)
            self.summary_writer.flush()

            if self.saver is not None:
                self.saver.save(sess, self.checkpoint_path)

            tf.logging.info(
                "Eval results at step {}: total_reward {},episode_length {}".
                format(global_step, total_reward, episode_length))

            return total_reward, episode_length

    def continuous_eval(self, eval_every, sess, coord):
        try:
            while not coord.should_stop():
                self.eval_once(sess)
                time.sleep(eval_every)
        except tf.errors.CancelledError:
            return
class WorkerTest(tf.test.TestCase):
  def setUp(self):
    super(WorkerTest, self).setUp()

    self.env = make_env()
    self.discount_factor = 0.99
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.global_counter = itertools.count()
    self.sp = StateProcessor()

    with tf.variable_scope("global") as vs:
      self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS))
      self.global_value_net = ValueEstimator(reuse=True)

  def testPolicyNetPredict(self):
    w = Worker(
      name="test",
      env=make_env(),
      policy_net=self.global_policy_net,
      value_net=self.global_value_net,
      global_counter=self.global_counter,
      discount_factor=self.discount_factor)

    with self.test_session() as sess:
      sess.run(tf.initialize_all_variables())
      state = self.sp.process(self.env.reset())
      processed_state = atari_helpers.atari_make_initial_state(state)
      action_values = w._policy_net_predict(processed_state, sess)
      self.assertEqual(action_values.shape, (4,))


  def testValueNetPredict(self):
    w = Worker(
      name="test",
      env=make_env(),
      policy_net=self.global_policy_net,
      value_net=self.global_value_net,
      global_counter=self.global_counter,
      discount_factor=self.discount_factor)

    with self.test_session() as sess:
      sess.run(tf.initialize_all_variables())
      state = self.sp.process(self.env.reset())
      processed_state = atari_helpers.atari_make_initial_state(state)
      state_value = w._value_net_predict(processed_state, sess)
      self.assertEqual(state_value.shape, ())

  def testRunNStepsAndUpdate(self):
    w = Worker(
      name="test",
      env=make_env(),
      policy_net=self.global_policy_net,
      value_net=self.global_value_net,
      global_counter=self.global_counter,
      discount_factor=self.discount_factor)

    with self.test_session() as sess:
      sess.run(tf.initialize_all_variables())
      state = self.sp.process(self.env.reset())
      processed_state = atari_helpers.atari_make_initial_state(state)
      w.state = processed_state
      transitions, local_t, global_t = w.run_n_steps(10, sess)
      policy_net_loss, value_net_loss, policy_net_summaries, value_net_summaries = w.update(transitions, sess)

    self.assertEqual(len(transitions), 10)
    self.assertIsNotNone(policy_net_loss)
    self.assertIsNotNone(value_net_loss)
    self.assertIsNotNone(policy_net_summaries)
    self.assertIsNotNone(value_net_summaries)
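
These tests follow the standard `tf.test.TestCase` pattern, so the test module would normally end with the usual entry point (not shown in the extracted example):

if __name__ == "__main__":
  tf.test.main()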