Example #1
    def testAdaptiveKlLoss(self):
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self._time_step_spec.observation,
            self._action_spec,
            fc_layer_params=None)
        value_net = value_network.ValueNetwork(
            self._time_step_spec.observation, fc_layer_params=None)
        agent = ppo_agent.PPOAgent(
            self._time_step_spec,
            self._action_spec,
            tf.compat.v1.train.AdamOptimizer(),
            actor_net=actor_net,
            value_net=value_net,
            initial_adaptive_kl_beta=1.0,
            adaptive_kl_target=10.0,
            adaptive_kl_tolerance=0.5,
        )

        # Initialize variables
        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Loss should not change if data kl is target kl.
        loss_1 = agent.adaptive_kl_loss([10.0])
        loss_2 = agent.adaptive_kl_loss([10.0])
        self.assertEqual(self.evaluate(loss_1), self.evaluate(loss_2))

        # If data kl is low, kl penalty should decrease between calls.
        loss_1 = self.evaluate(agent.adaptive_kl_loss([1.0]))
        adaptive_kl_beta_update_fn = common.function(
            agent.update_adaptive_kl_beta)
        self.evaluate(adaptive_kl_beta_update_fn([1.0]))
        loss_2 = self.evaluate(agent.adaptive_kl_loss([1.0]))
        self.assertGreater(loss_1, loss_2)

        # If data kl is high, kl penalty should increase between calls.
        loss_1 = self.evaluate(agent.adaptive_kl_loss([100.0]))
        self.evaluate(adaptive_kl_beta_update_fn([100.0]))
        loss_2 = self.evaluate(agent.adaptive_kl_loss([100.0]))
        self.assertLess(loss_1, loss_2)
Example #2
    def testEntropyRegularizationLoss(self, not_zero):
        ent_reg = 0.1 * not_zero
        agent = ppo_agent.PPOAgent(
            self._time_step_spec,
            self._action_spec,
            tf.compat.v1.train.AdamOptimizer(),
            actor_net=DummyActorNet(self._obs_spec, self._action_spec),
            value_net=DummyValueNet(self._obs_spec),
            normalize_observations=False,
            entropy_regularization=ent_reg,
        )

        # Call other loss functions to make sure trainable variables are
        #   constructed.
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([[0], [1]], dtype=tf.float32)
        returns = tf.constant([1.9, 1.0], dtype=tf.float32)
        sample_action_log_probs = tf.constant([[0.9], [0.3]], dtype=tf.float32)
        advantages = tf.constant([1.9, 1.0], dtype=tf.float32)
        weights = tf.ones_like(advantages)
        current_policy_distribution, unused_network_state = DummyActorNet(
            self._obs_spec, self._action_spec)(time_steps.observation,
                                               time_steps.step_type, ())
        agent.policy_gradient_loss(time_steps, actions,
                                   sample_action_log_probs, advantages,
                                   current_policy_distribution, weights)
        agent.value_estimation_loss(time_steps, returns, weights)

        # Now request entropy regularization loss.
        # Action stdevs should be ~1.0, and mean entropy ~3.70111.
        expected_loss = -3.70111 * ent_reg
        loss = agent.entropy_regularization_loss(time_steps,
                                                 current_policy_distribution,
                                                 weights)

        self.evaluate(tf.compat.v1.initialize_all_variables())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
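
The expected value above follows the usual sign convention for an entropy bonus: higher policy entropy lowers the loss, scaled by the coefficient. A tiny self-contained sketch of that relationship (the distribution and coefficient below are hypothetical, not taken from the test):

import tensorflow as tf
import tensorflow_probability as tfp

# Hypothetical unit-variance Gaussian policy head over a 2-D action.
dist = tfp.distributions.Normal(loc=[0.0, 0.0], scale=[1.0, 1.0])
mean_entropy = tf.reduce_mean(dist.entropy())  # ~1.4189 nats per dimension
ent_reg = 0.1
# The entropy term enters the loss with a negative sign, as in expected_loss above.
entropy_regularization_loss = -ent_reg * mean_entropy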
Example #3
    def testValueEstimationLoss(self):
        agent = ppo_agent.PPOAgent(
            self._time_step_spec,
            self._action_spec,
            tf.compat.v1.train.AdamOptimizer(),
            actor_net=DummyActorNet(self._obs_spec, self._action_spec),
            value_net=DummyValueNet(self._obs_spec),
            value_pred_loss_coef=1.0,
            normalize_observations=False,
        )

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        returns = tf.constant([1.9, 1.0], dtype=tf.float32)
        weights = tf.ones_like(returns)

        expected_loss = 123.205
        loss = agent.value_estimation_loss(time_steps, returns, weights)

        self.evaluate(tf.compat.v1.initialize_all_variables())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
Example #4
  def get_agent(self, env, params):
    """Returns a TensorFlow PPO-Agent
    
    Arguments:
        env {TFAPyEnvironment} -- Tensorflow-Agents PyEnvironment
        params {ParameterServer} -- ParameterServer from BARK
    
    Returns:
        agent -- tf-agent
    """

    # actor network
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        env.observation_spec(),
        env.action_spec(),
        fc_layer_params=tuple(
          self._params["ML"]["Agent"]["actor_fc_layer_params"]))

    # critic network
    value_net = value_network.ValueNetwork(
      env.observation_spec(),
      fc_layer_params=tuple(
        self._params["ML"]["Agent"]["critic_fc_layer_params"]))
    
    # agent
    tf_agent = ppo_agent.PPOAgent(
      env.time_step_spec(),
      env.action_spec(),
      actor_net=actor_net,
      value_net=value_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(
          learning_rate=self._params["ML"]["Agent"]["learning_rate"]),
      train_step_counter=self._ckpt.step,
      num_epochs=self._params["ML"]["Agent"]["num_epochs"],
      name=self._params["ML"]["Agent"]["agent_name"],
      debug_summaries=self._params["ML"]["Agent"]["debug_summaries"])
    tf_agent.initialize()
    return tf_agent
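
For context, an agent built by a helper like this is typically wired to a replay buffer and a driver before training. A rough usage sketch under assumed names (the environment handle, iteration count, and buffer size are placeholders, not part of the BARK example above):

from tf_agents.drivers import dynamic_episode_driver
from tf_agents.replay_buffers import tf_uniform_replay_buffer

def run_training(env, tf_agent, num_iterations=10):
    # Collect whole episodes, hand everything gathered to PPO, then clear the
    # buffer so each iteration trains on fresh on-policy data.
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=env.batch_size,
        max_length=10000)
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        env,
        tf_agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_episodes=1)
    for _ in range(num_iterations):
        collect_driver.run()
        trajectories = replay_buffer.gather_all()
        tf_agent.train(experience=trajectories)
        replay_buffer.clear()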
Example #5
    def testAdaptiveKlLoss(self):
        if tf.executing_eagerly():
            self.skipTest('b/123777119')  # Secondary bug: ('b/123770194')
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self._time_step_spec.observation,
            self._action_spec,
            fc_layer_params=None)
        value_net = value_network.ValueNetwork(
            self._time_step_spec.observation, fc_layer_params=None)
        agent = ppo_agent.PPOAgent(
            self._time_step_spec,
            self._action_spec,
            tf.compat.v1.train.AdamOptimizer(),
            actor_net=actor_net,
            value_net=value_net,
            initial_adaptive_kl_beta=1.0,
            adaptive_kl_target=10.0,
            adaptive_kl_tolerance=0.5,
        )

        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Loss should not change if data kl is target kl.
        loss_1 = self.evaluate(agent.adaptive_kl_loss(10.0))
        loss_2 = self.evaluate(agent.adaptive_kl_loss(10.0))
        self.assertEqual(loss_1, loss_2)

        # If data kl is low, kl penalty should decrease between calls.
        loss_1 = self.evaluate(agent.adaptive_kl_loss(1.0))
        self.evaluate(agent.update_adaptive_kl_beta(1.0))
        loss_2 = self.evaluate(agent.adaptive_kl_loss(1.0))
        self.assertGreater(loss_1, loss_2)

        # If data kl is high, kl penalty should increase between calls.
        loss_1 = self.evaluate(agent.adaptive_kl_loss(100.0))
        self.evaluate(agent.update_adaptive_kl_beta(100.0))
        loss_2 = self.evaluate(agent.adaptive_kl_loss(100.0))
        self.assertLess(loss_1, loss_2)
Example #6
  def testComputeAdvantagesNoGae(self):
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(self._obs_spec, self._action_spec),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        use_gae=False)
    rewards = tf.constant([[1.0] * 9, [1.0] * 9])
    discounts = tf.constant([[1.0, 1.0, 1.0, 1.0, 0.0, 0.9, 0.9, 0.9, 0.0],
                             [1.0, 1.0, 1.0, 1.0, 0.0, 0.9, 0.9, 0.9, 0.0]])
    returns = tf.constant([[5.0, 4.0, 3.0, 2.0, 1.0, 3.439, 2.71, 1.9, 1.0],
                           [3.0, 4.0, 7.0, 2.0, -1.0, 5.439, 2.71, -2.9, 1.0]])
    value_preds = tf.constant([
        [3.0] * 10,
        [3.0] * 10,
    ])  # One extra for final time_step.

    expected_advantages = returns - value_preds[:, :-1]
    advantages = agent.compute_advantages(rewards, returns, discounts,
                                          value_preds)
    self.assertAllClose(expected_advantages, advantages)
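
With use_gae=False the expected advantages reduce to returns minus the value predictions, as computed above. For contrast, with use_gae=True the agent would use generalized advantage estimation; the following is an independent reference sketch of the standard GAE recursion (not code from this test or from the library):

import numpy as np

def gae_advantages(rewards, discounts, value_preds, lambda_=0.95):
    # delta_t = r_t + gamma_t * V(s_{t+1}) - V(s_t); advantages are the
    # (gamma * lambda)-discounted sums of the deltas, accumulated backwards.
    deltas = rewards + discounts * value_preds[:, 1:] - value_preds[:, :-1]
    advantages = np.zeros_like(deltas)
    acc = np.zeros(deltas.shape[0])
    for t in reversed(range(deltas.shape[1])):
        acc = deltas[:, t] + discounts[:, t] * lambda_ * acc
        advantages[:, t] = acc
    return advantages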
Example #7
  def testUpdateAdaptiveKlBeta(self, strategy_fn):
    with strategy_fn().scope():
      actor_net = actor_distribution_network.ActorDistributionNetwork(
          self._time_step_spec.observation,
          self._action_spec,
          fc_layer_params=None)
      value_net = value_network.ValueNetwork(
          self._time_step_spec.observation, fc_layer_params=None)
      agent = ppo_agent.PPOAgent(
          self._time_step_spec,
          self._action_spec,
          tf.compat.v1.train.AdamOptimizer(),
          actor_net=actor_net,
          value_net=value_net,
          initial_adaptive_kl_beta=1.0,
          adaptive_kl_target=10.0,
          adaptive_kl_tolerance=0.5,
      )
      agent.initialize()

    self.evaluate(tf.compat.v1.global_variables_initializer())

    # When KL is target kl, beta should not change.
    update_adaptive_kl_beta_fn = common.function(agent.update_adaptive_kl_beta)
    beta_0 = update_adaptive_kl_beta_fn([10.0])
    expected_beta_0 = 1.0
    self.assertEqual(expected_beta_0, self.evaluate(beta_0))

    # When KL is large, beta should increase.
    beta_1 = update_adaptive_kl_beta_fn([100.0])
    expected_beta_1 = 1.5
    self.assertEqual(expected_beta_1, self.evaluate(beta_1))

    # When KL is small, beta should decrease.
    beta_2 = update_adaptive_kl_beta_fn([1.0])
    expected_beta_2 = 1.0
    self.assertEqual(expected_beta_2, self.evaluate(beta_2))
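
The expected betas are consistent with the usual adaptive-KL heuristic: the penalty coefficient grows when the measured KL exceeds the target by more than the tolerance and shrinks when it falls below it. A small illustrative sketch of that rule, using the 1.5 growth/shrink factor implied by the expected values above (the exact constants inside TF-Agents are an assumption here):

def update_adaptive_kl_beta(beta, kl, target=10.0, tolerance=0.5, factor=1.5):
    # Reproduces the expectations above: beta stays 1.0 for kl=10.0,
    # grows to 1.5 for kl=100.0, and shrinks back to 1.0 for kl=1.0.
    if kl > target * (1.0 + tolerance):
        return beta * factor
    if kl < target * (1.0 - tolerance):
        return beta / factor
    return beta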
Example #8
  def testL2RegularizationLoss(self, not_zero):
    l2_reg = 1e-4 * not_zero
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.train.AdamOptimizer(),
        actor_net=DummyActorNet(self._action_spec),
        value_net=DummyValueNet(),
        normalize_observations=False,
        policy_l2_reg=l2_reg,
        value_function_l2_reg=l2_reg,
    )

    # Call other loss functions to make sure trainable variables are
    #   constructed.
    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([[0], [1]], dtype=tf.float32)
    returns = tf.constant([1.9, 1.0], dtype=tf.float32)
    sample_action_log_probs = tf.constant([[0.9], [0.3]], dtype=tf.float32)
    advantages = tf.constant([1.9, 1.0], dtype=tf.float32)
    current_policy_distribution, unused_network_state = DummyActorNet(
        self._action_spec)(time_steps.observation, time_steps.step_type, ())
    valid_mask = tf.ones_like(advantages)
    agent.policy_gradient_loss(time_steps, actions, sample_action_log_probs,
                               advantages, current_policy_distribution,
                               valid_mask)
    agent.value_estimation_loss(time_steps, returns, valid_mask)

    # Now request L2 regularization loss.
    # Value function weights are [2, 1], actor net weights are [2, 1, 1, 1].
    expected_loss = l2_reg * ((2**2 + 1) + (2**2 + 1 + 1 + 1))
    loss = agent.l2_regularization_loss()

    self.evaluate(tf.global_variables_initializer())
    loss_ = self.evaluate(loss)
    self.assertAllClose(loss_, expected_loss)
Example #9
    def test_multiple_tf_agents(self):
        env_name = "CartPole-v0"
        # DQN
        env = gym.make(env_name)
        train_env = environment_converter.gym_to_tf(env)
        fc_layer_params = (100, )
        q_net = q_network.QNetwork(
            input_tensor_spec=train_env.observation_spec(),
            action_spec=train_env.action_spec(),
            fc_layer_params=fc_layer_params,
        )
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
        dqn_tf_agent = dqn_agent.DqnAgent(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
        )
        dqn_tf_agent.initialize()

        # PPO
        env = gym.make(env_name)
        actor_fc_layers = (200, 100)
        value_fc_layers = (200, 100)
        learning_rate = 1e-3
        train_env = environment_converter.gym_to_tf(env)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.observation_spec(),
            train_env.action_spec(),
            fc_layer_params=actor_fc_layers,
        )
        value_net = value_network.ValueNetwork(train_env.observation_spec(),
                                               fc_layer_params=value_fc_layers)
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        ppo_tf_agent = ppo_agent.PPOAgent(
            train_env.time_step_spec(),
            train_env.action_spec(),
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
        )
        ppo_tf_agent.initialize()

        # REINFORCE:
        env = gym.make(env_name)
        train_env = environment_converter.gym_to_tf(env)
        learning_rate = 1e-3
        fc_layer_params = (100, )
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.observation_spec(),
            train_env.action_spec(),
            fc_layer_params=fc_layer_params,
        )
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)
        train_step_counter = tf.compat.v2.Variable(0)
        reinforce_tf_agent = reinforce_agent.ReinforceAgent(
            train_env.time_step_spec(),
            train_env.action_spec(),
            actor_network=actor_net,
            optimizer=optimizer,
            normalize_returns=True,
            train_step_counter=train_step_counter,
        )
        reinforce_tf_agent.initialize()

        agents = [dqn_tf_agent, ppo_tf_agent, reinforce_tf_agent]
        agent_names = ["dqn_agent", "ppo_agent", "reinforce_agent"]

        train_multiple(agents, env, 1470, 195, agent_names, 200)

        trained_env = get_saved_environments()[0]
        trained_models = get_trained_model_names(trained_env)
        model_saved = set(agent_names) == set(trained_models)
        shutil.rmtree(save_path)
        self.assertTrue(model_saved)
Example #10
    def __init__(
        self,
        landscape: flexs.Landscape,
        rounds: int,
        sequences_batch_size: int,
        model_queries_per_batch: int,
        starting_sequence: str,
        alphabet: str,
        log_file: Optional[str] = None,
        model: Optional[flexs.Model] = None,
        num_experiment_rounds: int = 10,
        num_model_rounds: int = 1,
    ):
        """
        Args:
            num_experiment_rounds: Number of experiment-based rounds to run. This
                defaults to 10, the same as the number of sequence-proposal rounds run.
            num_model_rounds: Number of model-based rounds to run.

        """
        tf.config.run_functions_eagerly(False)

        name = f"DynaPPO_Agent_{num_experiment_rounds}_{num_model_rounds}"

        if model is None:
            model = DynaPPOEnsemble(
                len(starting_sequence),
                alphabet,
            )
            model.train(
                s_utils.generate_random_sequences(len(starting_sequence), 10,
                                                  alphabet),
                [0] * 10,
            )

        super().__init__(
            model,
            name,
            rounds,
            sequences_batch_size,
            model_queries_per_batch,
            starting_sequence,
            log_file,
        )

        self.alphabet = alphabet
        self.num_experiment_rounds = num_experiment_rounds
        self.num_model_rounds = num_model_rounds

        env = DynaPPOEnvMut(
            alphabet=self.alphabet,
            starting_seq=starting_sequence,
            model=model,
            landscape=landscape,
            max_num_steps=model_queries_per_batch,
        )
        validate_py_environment(env, episodes=1)
        self.tf_env = tf_py_environment.TFPyEnvironment(env)

        encoder_layer = tf.keras.layers.Lambda(lambda obs: obs["sequence"])
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self.tf_env.observation_spec(),
            self.tf_env.action_spec(),
            preprocessing_combiner=encoder_layer,
            fc_layer_params=[128],
        )
        value_net = value_network.ValueNetwork(
            self.tf_env.observation_spec(),
            preprocessing_combiner=encoder_layer,
            fc_layer_params=[128],
        )

        self.agent = ppo_agent.PPOAgent(
            self.tf_env.time_step_spec(),
            self.tf_env.action_spec(),
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=10,
            summarize_grads_and_vars=False,
        )
        self.agent.initialize()
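
Both networks above share a Lambda preprocessing_combiner because the environment observation is a dict; the combiner keeps only the "sequence" entry. A tiny standalone illustration of what that layer does (the shapes and the extra "fitness" key are made up for this example):

import tensorflow as tf

combiner = tf.keras.layers.Lambda(lambda obs: obs["sequence"])
obs = {
    "sequence": tf.zeros([2, 8]),
    "fitness": tf.zeros([2, 1]),
}
features = combiner(obs)  # shape [2, 8]; only the sequence features pass through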
Example #11
def train_eval_doom_simple(
		# Params for collect
		num_environment_steps=100000,
		collect_episodes_per_iteration=32,
		num_parallel_environments=1,
		replay_buffer_capacity=301,  # Per-environment
		# Params for train
		num_epochs=25,
		learning_rate=4e-4,
		# Params for eval
		eval_interval=10,
		num_video_episodes=10,
		# Params for summaries and logging
		log_interval=10):
	"""A simple train and eval for PPO."""
	# if not os.path.exists(videos_dir):
	# 	os.makedirs(videos_dir)
	global terminate
	eval_py_env = CarlaEnv()
	tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

	actor_net, value_net = create_networks(tf_env.observation_spec(), tf_env.action_spec())

	global_step = tf.compat.v1.train.get_or_create_global_step()
	optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-5)

	tf_agent = ppo_agent.PPOAgent(
		tf_env.time_step_spec(),
		tf_env.action_spec(),
		optimizer,
		actor_net,
		value_net,
		num_epochs=num_epochs,
		train_step_counter=global_step,
		discount_factor=0.99,
		gradient_clipping=0.5,
		entropy_regularization=1e-2,
		importance_ratio_clipping=0.2,
		use_gae=True,
		use_td_lambda_return=True
	)
	tf_agent.initialize()

	environment_steps_metric = tf_metrics.EnvironmentSteps()
	step_metrics = [
		tf_metrics.NumberOfEpisodes(),
		environment_steps_metric,
	]

	replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(tf_agent.collect_data_spec, batch_size=num_parallel_environments, max_length=replay_buffer_capacity)
	train_replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(tf_agent.collect_data_spec, batch_size=num_parallel_environments, max_length=replay_buffer_capacity)
	collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(tf_env, tf_agent.collect_policy, observers=[replay_buffer.add_batch] + step_metrics, num_episodes=collect_episodes_per_iteration)

	collect_time = 0
	train_time = 0
	timed_at_step = global_step.numpy()
	
	my_policy = tf_agent.policy
	saver = PolicySaver(my_policy, batch_size=None)

	def train_step():
		trajectories = train_replay_buffer.gather_all()
		return tf_agent.train(experience=trajectories)
	
	def evaluate(policy, step_count):
		create_video(tf_env, policy, 10, f'agent/behave/imageio_{step_count}.mp4')


	print("collecting samples initial:")
	collect_driver.run()
	train_replay_buffer = copy.deepcopy(replay_buffer)
	replay_buffer.clear()
	print(f"train size {train_replay_buffer.num_frames()} buffer size{replay_buffer.num_frames()}")

	while environment_steps_metric.result() < num_environment_steps and not terminate:
		start_time = time.time()
		print("collecting samples")
		collector_thread = threading.Thread(target=collect_driver.run)
		collector_thread.start()

		start_time = time.time()
		count = 0
		# while collector_thread.is_alive() and not terminate:
		# 	count = count + 1
		print(f"Training agent {count}")
		total_loss, _ = train_step()
		print()
		print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
		print(f'step = {global_step.numpy()}, loss = {total_loss}, env_metric = {environment_steps_metric.result()}')
		print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
		print()
		train_replay_buffer.clear()
		print("Training agent Finshed")
		print("Waiting for collecting samples thread")
		collector_thread.join()
		print("collecting samples Finished")
		collect_time += time.time() - start_time
		train_replay_buffer = copy.deepcopy(replay_buffer)
		replay_buffer.clear()
		train_time += time.time() - start_time

		global_step_val = global_step.numpy()

		print(f"global_step_val:{global_step_val} % log_interval:{log_interval} = {global_step_val % log_interval}")

		# if global_step_val % log_interval == 0:
		print()
		print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
		print(f'step = {global_step_val}, loss = {total_loss}')
		steps_per_sec = ((global_step_val - timed_at_step) / (collect_time + train_time))
		print(f'{steps_per_sec} steps/sec')
		print(f'collect_time = {collect_time}, train_time = {train_time}')
		print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
		print()
		timed_at_step = global_step_val
		collect_time = 0
		train_time = 0

		if global_step_val % eval_interval == 0:
			print("Evaluating!!")
			saver.save(f'agent/saved/policy_ppo_simple_{global_step_val}')
			policy = tf_agent.policy
			evaluate(policy, global_step_val)

	print("Terminated")
	policy = tf_agent.policy
	evaluate(policy, global_step_val)
Example #12
  def testAgentDoesNotFailWhenNestedObservationActionAndDebugSummaries(self):
    summary_writer = tf.compat.v2.summary.create_file_writer(FLAGS.test_tmpdir,
                                                             flush_millis=10000)
    summary_writer.set_as_default()

    nested_obs_spec = (self._obs_spec, self._obs_spec, {
        'a': self._obs_spec,
        'b': self._obs_spec,
    })
    nested_time_spec = ts.time_step_spec(nested_obs_spec)

    nested_act_spec = (self._action_spec, {
        'c': self._action_spec,
        'd': self._action_spec
    })

    class NestedActorNet(network.DistributionNetwork):

      def __init__(self, dummy_model):
        output_spec = (dummy_model.output_spec, {
            'c': dummy_model.output_spec,
            'd': dummy_model.output_spec,
        })
        super(NestedActorNet, self).__init__(
            dummy_model.input_tensor_spec, (),
            output_spec=output_spec,
            name='NestedActorNet')
        self.dummy_model = dummy_model

      def call(self, *args, **kwargs):
        dummy_ans, _ = self.dummy_model(*args, **kwargs)
        return (dummy_ans, {'c': dummy_ans, 'd': dummy_ans}), ()

    dummy_model = DummyActorNet(nested_obs_spec, self._action_spec)
    agent = ppo_agent.PPOAgent(
        nested_time_spec,
        nested_act_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=NestedActorNet(dummy_model),
        value_net=DummyValueNet(nested_obs_spec),
        debug_summaries=True)

    observations = tf.constant([
        [[1, 2], [3, 4], [5, 6]],
        [[1, 2], [3, 4], [5, 6]],
    ], dtype=tf.float32)

    observations = (observations, observations, {
        'a': observations,
        'b': observations,
    })

    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

    actions = (actions, {
        'c': actions,
        'd': actions,
    })

    action_distribution_parameters = {
        'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
        'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
    }
    action_distribution_parameters = (action_distribution_parameters, {
        'c': action_distribution_parameters,
        'd': action_distribution_parameters,
    })

    policy_info = action_distribution_parameters

    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type, time_steps.reward,
                                       time_steps.discount)

    agent.train(experience)
Example #13
    def testTrain(self, num_epochs, use_td_lambda_return):
        # Create a counter used as the train step so the test can verify
        #   how many epochs of training ran.
        counter = common.create_variable('test_train_counter')
        agent = ppo_agent.PPOAgent(self._time_step_spec,
                                   self._action_spec,
                                   tf.compat.v1.train.AdamOptimizer(),
                                   actor_net=DummyActorNet(
                                       self._obs_spec,
                                       self._action_spec,
                                   ),
                                   value_net=DummyValueNet(self._obs_spec),
                                   normalize_observations=False,
                                   num_epochs=num_epochs,
                                   use_gae=use_td_lambda_return,
                                   use_td_lambda_return=use_td_lambda_return,
                                   train_step_counter=counter)
        observations = tf.constant([
            [[1, 2], [3, 4], [5, 6]],
            [[1, 2], [3, 4], [5, 6]],
        ],
                                   dtype=tf.float32)

        mid_time_step_val = ts.StepType.MID.tolist()
        time_steps = ts.TimeStep(step_type=tf.constant(
            [[mid_time_step_val] * 3] * 2, dtype=tf.int32),
                                 reward=tf.constant([[1] * 3] * 2,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 3] * 2,
                                                      dtype=tf.float32),
                                 observation=observations)
        actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                              dtype=tf.float32)

        action_distribution_parameters = {
            'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
            'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
        }

        policy_info = action_distribution_parameters

        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, policy_info,
                                           time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)

        if tf.executing_eagerly():
            loss = lambda: agent.train(experience)
        else:
            loss = agent.train(experience)

        # Assert that counter starts out at zero.
        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertEqual(0, self.evaluate(counter))
        loss_type = self.evaluate(loss)
        loss_numpy = loss_type.loss

        # Assert that loss is not zero as we are training in a non-episodic env.
        self.assertNotEqual(
            loss_numpy,
            0.0,
            msg=('Loss is exactly zero, looks like no training '
                 'was performed due to incomplete episodes.'))

        # Assert that training incremented the counter once per epoch.
        self.assertEqual(num_epochs, self.evaluate(counter))
Example #14
    def testTrain(self, num_epochs, use_td_lambda_return):
        agent = ppo_agent.PPOAgent(self._time_step_spec,
                                   self._action_spec,
                                   tf.train.AdamOptimizer(),
                                   actor_net=DummyActorNet(
                                       self._action_spec, ),
                                   value_net=DummyValueNet(outer_rank=2),
                                   normalize_observations=False,
                                   num_epochs=num_epochs,
                                   use_gae=use_td_lambda_return,
                                   use_td_lambda_return=use_td_lambda_return)
        observations = tf.constant([
            [[1, 2], [3, 4], [5, 6]],
            [[1, 2], [3, 4], [5, 6]],
        ],
                                   dtype=tf.float32)

        time_steps = ts.TimeStep(step_type=tf.constant([[1] * 3] * 2,
                                                       dtype=tf.int32),
                                 reward=tf.constant([[1] * 3] * 2,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 3] * 2,
                                                      dtype=tf.float32),
                                 observation=observations)
        actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                              dtype=tf.float32)

        action_distribution_parameters = {
            'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
            'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
        }

        policy_info = action_distribution_parameters

        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, policy_info,
                                           time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)

        # Mock the build_train_op to return an op for incrementing this counter.
        counter = tf.train.get_or_create_global_step()
        zero = tf.constant(0, dtype=tf.float32)
        agent.build_train_op = (
            lambda *_, **__: tf_agent.LossInfo(
                counter.assign_add(1),  # pylint: disable=g-long-lambda
                ppo_agent.PPOLossInfo(*[zero] * 5)))

        train_op = agent.train(experience)

        with self.cached_session() as sess:
            sess.run(tf.global_variables_initializer())

            # Assert that counter starts out at zero.
            counter_ = sess.run(counter)
            self.assertEqual(0, counter_)

            sess.run(train_op)

            # Assert that train_op ran increment_counter num_epochs times.
            counter_ = sess.run(counter)
            self.assertEqual(num_epochs, counter_)
Example #15
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        environment.observation_spec(),
        environment.action_spec(),
        fc_layer_params=(200, 100))

    value_net = value_network.ValueNetwork(environment.observation_spec(),
                                           fc_layer_params=(200, 100))

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
    train_step_counter = tf.compat.v2.Variable(0)

    tf_agent = ppo_agent.PPOAgent(time_step_spec=environment.time_step_spec(),
                                  action_spec=environment.action_spec(),
                                  actor_net=actor_net,
                                  value_net=value_net,
                                  optimizer=optimizer,
                                  train_step_counter=train_step_counter,
                                  discount_factor=0.995,
                                  gradient_clipping=0.5,
                                  entropy_regularization=1e-2,
                                  importance_ratio_clipping=0.2,
                                  use_gae=True,
                                  use_td_lambda_return=True)

    tf_agent.initialize()

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec, batch_size=1, max_length=20000)

    tf_agent.train = common.function(tf_agent.train)
        BombermanEnvironment(mode="no_bomb"))
    eval_tf_env = tf_py_environment.TFPyEnvironment(
        BombermanEnvironment(mode="no_bomb"))

    actor_net, value_net = create_networks(tf_env)

    train_step = tf.Variable(0)
    update_period = 4
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)  # todo fine tune

    agent = ppo_agent.PPOAgent(tf_env.time_step_spec(),
                               tf_env.action_spec(),
                               optimizer,
                               actor_net=actor_net,
                               value_net=value_net,
                               num_epochs=25,
                               gradient_clipping=0.5,
                               entropy_regularization=1e-2,
                               importance_ratio_clipping=0.2,
                               use_gae=True,
                               use_td_lambda_return=True)

    agent.initialize()

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=10000  # todo finetune
    )
    replay_buffer_observer = replay_buffer.add_batch
Example #17
def train():
    normalizer = Normalizer(0, 499)
    sae = StateAutoEncoder(1,
                           1,
                           num_state_bits,
                           normalize=True,
                           normalizer=normalizer)
    sae.use_checkpoints(encoder_path)

    train_env, _ = load_env(env_name, sae)

    master_action_spec = array_spec.BoundedArraySpec(shape=((num_options, )),
                                                     dtype=np.float32,
                                                     minimum=0,
                                                     maximum=1,
                                                     name='master_action')

    options_observation_spec = array_spec.BoundedArraySpec(
        shape=((num_options + num_state_bits), ),
        dtype=np.float32,
        minimum=0,
        maximum=1,
        name='option_observation')
    options_action_spec = array_spec.BoundedArraySpec(shape=(num_state_bits,
                                                             2),
                                                      dtype=np.float32,
                                                      minimum=0,
                                                      maximum=1,
                                                      name='option_action')
    options_time_step_spec = ts.TimeStep(
        step_type=train_env.time_step_spec().step_type,
        reward=train_env.time_step_spec().reward,
        discount=train_env.time_step_spec().discount,
        observation=options_observation_spec)

    num_actions = train_env.action_spec().maximum - train_env.action_spec(
    ).minimum + 1
    low_level_model, callbacks = setup_model(num_actions, num_state_bits, sae,
                                             low_level_model_path)

    low_level_env = LowLevelEnv(train_env, low_level_model)

    options_env = OptionsEnv(low_level_env, options_observation_spec,
                             options_action_spec)
    option_train_env = tf_py_environment.TFPyEnvironment(options_env)

    master_env = MasterEnv(low_level_env, master_action_spec)
    master_train_env = tf_py_environment.TFPyEnvironment(master_env)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    global_step = tf.compat.v1.train.get_or_create_global_step()

    master_value_network = value_network.ValueNetwork(
        master_train_env.time_step_spec().observation, fc_layer_params=(100, ))

    master_actor_network = actor_distribution_network.ActorDistributionNetwork(
        master_train_env.time_step_spec().observation,
        master_train_env.action_spec(),
        fc_layer_params=(100, ))

    master_agent = ppo_agent.PPOAgent(master_train_env.time_step_spec(),
                                      master_train_env.action_spec(),
                                      optimizer=optimizer,
                                      actor_net=master_actor_network,
                                      value_net=master_value_network,
                                      train_step_counter=tf.Variable(0))
    master_agent.initialize()
    master_agent.train = common.function(master_agent.train)
    options_env.set_master_policy(master_agent.policy)

    options_critic_net = critic_network.CriticNetwork(
        (option_train_env.observation_spec(), option_train_env.action_spec()),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=(100, ),
        kernel_initializer='glorot_uniform',
        last_kernel_initializer='glorot_uniform')

    options_actor_net = OptionsNetwork(option_train_env.observation_spec(),
                                       option_train_env.action_spec(), 4)

    options_agent = sac_agent.SacAgent(
        option_train_env.time_step_spec(),
        option_train_env.action_spec(),
        actor_network=options_actor_net,
        critic_network=options_critic_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate),
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        td_errors_loss_fn=tf.math.squared_difference,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        train_step_counter=tf.Variable(0))
    options_agent.initialize()
    options_agent.train = common.function(options_agent.train)
    master_env.set_options_policy(options_agent.policy)

    master_rb = create_replay_buffer(master_agent, batch_size,
                                     replay_buffer_max_length)
    options_rb = create_replay_buffer(options_agent, batch_size,
                                      replay_buffer_max_length)

    master_ds = master_rb.as_dataset(num_parallel_calls=3,
                                     sample_batch_size=batch_size,
                                     num_steps=2)
    master_iter = iter(master_ds)
    options_ds = options_rb.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2)
    options_iter = iter(options_ds)

    master_checkpointer = create_train_checkpointer(checkpoint_dir + "master/",
                                                    master_agent, master_rb,
                                                    global_step)
    options_checkpointer = create_train_checkpointer(
        checkpoint_dir + "options/", options_agent, options_rb, global_step)

    master_saver = policy_saver.PolicySaver(master_agent.policy)
    options_saver = policy_saver.PolicySaver(options_agent.policy)

    def check_interval(interval):
        return global_step % interval == 0

    while (global_step < num_iterations):
        populate_buffer(master_train_env, master_rb,
                        master_agent.collect_policy,
                        master_agent.time_step_spec, master_collect_steps,
                        batch_size)
        for _ in range(warmup_period):
            experience, unused_info = next(master_iter)
            master_loss = master_agent.train(experience)

        for _ in range(joint_update_period):
            populate_buffer(master_train_env, master_rb,
                            master_agent.collect_policy,
                            master_agent.time_step_spec, 2, batch_size)
            populate_buffer(option_train_env, options_rb,
                            options_agent.collect_policy,
                            options_agent.time_step_spec, 2, batch_size)
            option_exp, unused_info = next(options_iter)
            options_loss = options_agent.train(option_exp)
            master_exp, unused_info = next(master_iter)
            master_loss = master_agent.train(master_exp)

        global_step.assign_add(1)

        if check_interval(log_interval):
            print('step = {0}: master loss = {1}, options loss = {2}'.format(
                global_step.numpy(), master_loss, options_loss))

        if check_interval(checkpoint_interval):
            master_checkpointer.save(global_step)
            options_checkpointer.save(global_step)
            print('Checkpoint saved!')

        # Reset master here

    master_saver.save(save_dir + "master/")
    options_saver.save(save_dir + "options/")
    print("Policies Saved!")
Example #18
def train_eval(
        root_dir,
        summary_dir,
        game_config,
        tf_master='',
        env_load_fn=None,
        random_seed=0,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(150, 75),
        value_fc_layers=(150, 75),
        actor_fc_layers_rnn=(150, ),
        value_fc_layers_rnn=(150, ),
        use_rnns=True,
        # Params for collect
        num_environment_steps=int(3e08),
        collect_episodes_per_iteration=90,
        num_parallel_environments=30,
        replay_buffer_capacity=1001,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=1e-4,
        # Params for eval
        num_eval_episodes=30,
        eval_interval=5000,
        # Params for summaries and logging
        train_checkpoint_interval=2000,
        policy_checkpoint_interval=1000,
        rb_checkpoint_interval=4000,
        log_interval=500,
        summary_interval=500,
        summaries_flush_secs=1,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None,
        eval_py_env=None,
        tf_env=None):
    """A simple train and eval for PPO."""
    tf.reset_default_graph()
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    # ################################################ #
    # ------------ Create summary-writers ------------ #
    # ################################################ #
    root_dir = os.path.expanduser(root_dir)
    summary_dir = os.path.join(summary_dir, FOLDERNAME)
    train_dir = os.path.join(os.path.join(root_dir, 'train'), FOLDERNAME)
    eval_dir = os.path.join(os.path.join(root_dir, 'eval'), FOLDERNAME)

    train_summary_writer, eval_summary_writer = get_writers_train_eval(
        summary_dir, eval_dir, filename_suffix=FILENAME_SUFFIX)
    eval_metrics = get_metrics_eval(num_parallel_environments,
                                    num_eval_episodes)
    eval_summary_writer_flush_op = eval_summary_writer.flush()

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf.compat.v1.set_random_seed(random_seed)

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        # ################################################ #
        # ---------------- Create Networks --------------- #
        # ################################################ #
        if use_rnns:
            actor_net, value_net = get_networks(
                tf_env, {
                    "actor_net": actor_fc_layers_rnn,
                    "value_net": value_fc_layers_rnn
                })
        else:
            actor_net, value_net = get_networks(tf_env, {
                "actor_net": actor_fc_layers,
                "value_net": value_fc_layers
            })

        state_pred_net = custom_environment.predictive_models.StatePredictor(
            state_pred_l1, state_pred_l2, num_parallel_environments,
            curiosity_param)
        action_pred_net = custom_environment.predictive_models.ActionPredictor(
            action_pred_l1, action_pred_l2, action_pred_l3)

        #traj = trajectory.Trajectory(
        #                        step_type=[],
        #                        observation=[],
        #                        action=[],
        #                        policy_info=[],
        #                        next_step_type=[],
        #                        reward=[],
        #                        discount=[])

        # ################################################ #
        # ---------------- Create PPO Agent -------------- #
        # ################################################ #
        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer,
            entropy_regularization=0,  #0.1 up to 0.4
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step,
            normalize_observations=False
        )  # because the observations also include the 0-1 mask

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

        eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

        # ################################################ #
        # ---------------- Create Metrics ---------------- #
        # ################################################ #
        train_metrics, step_metrics, environment_steps_count = get_metrics_train_and_step(
            num_eval_episodes, num_parallel_environments)

        # Add to replay buffer and other agent specific observers.
        replay_buffer_observer = [replay_buffer.add_batch]

        # ################################################ #
        # ----------------- Trajectories ----------------- #
        # ################################################ #
        collect_policy = tf_agent.collect_policy

        collect_op = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=replay_buffer_observer + train_metrics,
            num_episodes=collect_episodes_per_iteration).run()

        trajectories = replay_buffer.gather_all()

        train_op, _ = tf_agent.train(trajectories)

        # Prediction Implementation OPs
        gather_op = replay_buffer.gather_all()

        clear_op = replay_buffer.clear()

        step_type = tf.placeholder("int32", None)
        state = tf.placeholder("uint8", [None, None])
        info = tf.placeholder("int64", None)
        mask = tf.placeholder("float32", [None, None])
        state2 = tf.placeholder("uint8", [None, None])
        action = tf.placeholder("int64", None)
        logits = tf.placeholder("float32", [None, None])
        next_step_type = tf.placeholder("int32", None)
        reward = tf.placeholder("float32", None)
        discount = tf.placeholder("float32", None)

        traj = trajectory.Trajectory(step_type=step_type,
                                     observation={
                                         'state': state,
                                         'mask': mask,
                                         'info': info,
                                         'state2': state2
                                     },
                                     action=action,
                                     policy_info={'logits': logits},
                                     next_step_type=next_step_type,
                                     reward=reward,
                                     discount=discount)

        add_batch_op = replay_buffer.add_batch(traj)

        # printing
        #print_op = tf.print(trajectories)
        #with tf.control_dependencies([print_op]):
        #    train_op, _ = tf_agent.train(trajectories)

        with tf.control_dependencies([train_op]):
            clear_replay_op = replay_buffer.clear()

        with tf.control_dependencies([clear_replay_op]):
            train_op = tf.identity(train_op)

    # ################################################ #
    # ------------ Create Checkpointers -------------- #
    # ################################################ #
        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            train_dir, 'policy'),
                                                  policy=tf_agent.policy,
                                                  global_step=global_step)
        rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            train_dir, 'replay_buffer'),
                                              max_to_keep=1,
                                              replay_buffer=replay_buffer)

        # ################################################ #
        # -------------- Create Summary Ops -------------- #
        # ################################################ #
        summary_ops = []
        for train_metric in train_metrics:
            summary_ops.append(
                train_metric.tf_summaries(train_step=global_step,
                                          step_metrics=step_metrics))

        with eval_summary_writer.as_default(), \
             tf.compat.v2.summary.record_if(True):
            for eval_metric in eval_metrics:
                eval_metric.tf_summaries(train_step=global_step,
                                         step_metrics=step_metrics)

        init_agent_op = tf_agent.initialize()

        # ################################################ #
        # --------------- Initialize Graph --------------- #
        # ################################################ #

        with tf.compat.v1.Session(tf_master) as sess:
            train_checkpointer.initialize_or_restore(sess)
            rb_checkpointer.initialize_or_restore(sess)
            common.initialize_uninitialized_variables(sess)

            sess.run(init_agent_op)
            sess.run(train_summary_writer.init())
            sess.run(eval_summary_writer.init())

            collect_time = 0
            train_time = 0
            timed_at_step = sess.run(global_step)
            steps_per_second_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(), name='steps_per_sec_ph')
            steps_per_second_summary = tf.compat.v2.summary.scalar(
                name='global_steps_per_sec',
                data=steps_per_second_ph,
                step=global_step)

            # ################################################ #
            # --------------------- Loop --------------------- #
            # ------------ Collect/Train/Write --------------- #
            # ################################################ #

            while sess.run(environment_steps_count) < num_environment_steps:
                global_step_val = sess.run(global_step)
                if global_step_val % eval_interval == 0:
                    metric_utils.compute_summaries(
                        eval_metrics,
                        eval_py_env,
                        eval_py_policy,
                        num_episodes=num_eval_episodes,
                        global_step=global_step_val,
                        callback=eval_metrics_callback,
                        log=True,
                    )
                    sess.run(eval_summary_writer_flush_op)

                start_time = time.time()
                sess.run(collect_op)
                collect_time += time.time() - start_time

                # ################################################ #
                # -------- Prediction-Implementation Start ------- #
                # ################################################ #

                if statePred or actionPred:

                    #get trajectory and clear Replay-Buffer
                    collectedTrajectory = sess.run(gather_op)

                    if curiosity:  #experimental
                        sess.run(clear_op)

                        #augment reward in trajectory
                        collectedTrajectory = state_pred_net.augmentReward(
                            collectedTrajectory)

                        #write augmented trajectory back into replay buffer
                        for j in range(len(collectedTrajectory[0][0])):
                            i = j - 1

                            sess.run(
                                add_batch_op,
                                feed_dict={
                                    step_type:
                                    collectedTrajectory[0][:, i],
                                    state:
                                    collectedTrajectory[1].get("state")[:,
                                                                        i, :],
                                    info:
                                    collectedTrajectory[1].get("info")[:, i],
                                    mask:
                                    collectedTrajectory[1].get("mask")[:,
                                                                       i, :],
                                    state2:
                                    collectedTrajectory[1].get("state2")[:,
                                                                         i, :],
                                    action:
                                    collectedTrajectory[2][:, i],
                                    logits:
                                    collectedTrajectory[3].get("logits")[:,
                                                                         i, :],
                                    next_step_type:
                                    collectedTrajectory[4][:, i],
                                    reward:
                                    collectedTrajectory[5][:, i],
                                    discount:
                                    collectedTrajectory[6][:, i]
                                })

                    #train prediction network
                    if statePred:
                        state_pred_net.train(collectedTrajectory, True)

                    if actionPred:
                        action_pred_net.train(collectedTrajectory, True)

                # ################################################ #
                # ------ Prediction-Implementation Stop ---------- #
                # ################################################ #

                train_time = 0
                total_loss = -1  #indicates that there was no training
                if train:
                    start_time = time.time()
                    total_loss, _ = sess.run([train_op, summary_ops])
                    train_time += time.time() - start_time

                # ################################################ #
                # ---------- Logging and Checkpointing ----------- #
                # ################################################ #
                if saveModel:
                    global_step_val = sess.run(global_step)
                    if global_step_val % log_interval == 0:
                        logging.info('step = %d, loss = %f', global_step_val,
                                     total_loss)
                        steps_per_sec = ((global_step_val - timed_at_step) /
                                         (collect_time + train_time))
                        logging.info('%.3f steps/sec', steps_per_sec)
                        sess.run(
                            steps_per_second_summary,
                            feed_dict={steps_per_second_ph: steps_per_sec})
                        logging.info(
                            '%s', 'collect_time = {}, train_time = {}'.format(
                                collect_time, train_time))
                        timed_at_step = global_step_val
                        collect_time = 0
                        train_time = 0

                    if global_step_val % train_checkpoint_interval == 0:
                        train_checkpointer.save(global_step=global_step_val)

                    if global_step_val % policy_checkpoint_interval == 0:
                        policy_checkpointer.save(global_step=global_step_val)

                    if global_step_val % rb_checkpoint_interval == 0:
                        rb_checkpointer.save(global_step=global_step_val)

            if saveModel:
                # One final eval before exiting.
                metric_utils.compute_summaries(
                    eval_metrics,
                    eval_py_env,
                    eval_py_policy,
                    num_episodes=num_eval_episodes,
                    global_step=global_step_val,
                    callback=eval_metrics_callback,
                    log=True,
                )
            sess.run(eval_summary_writer_flush_op)
        tf.reset_default_graph()
Example #19
0
  def testBuildTrainOp(self):
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.train.AdamOptimizer(),
        actor_net=DummyActorNet(self._action_spec,),
        value_net=DummyValueNet(),
        normalize_observations=False,
        normalize_rewards=False,
        value_pred_loss_coef=1.0,
        policy_l2_reg=1e-4,
        value_function_l2_reg=1e-4,
        entropy_regularization=0.1,
        importance_ratio_clipping=10,
    )
    observations = tf.constant([[1, 2], [3, 4], [1, 2], [3, 4]],
                               dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([[0], [1], [0], [1]], dtype=tf.float32)
    returns = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
    sample_action_log_probs = tf.constant([0.9, 0.3, 0.9, 0.3],
                                          dtype=tf.float32)
    advantages = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
    valid_mask = tf.constant([1.0, 1.0, 0.0, 0.0], dtype=tf.float32)
    sample_action_distribution_parameters = {
        'loc': tf.constant([[9.0], [15.0], [9.0], [15.0]], dtype=tf.float32),
        'scale': tf.constant([[8.0], [12.0], [8.0], [12.0]], dtype=tf.float32),
    }
    train_step = tf.train.get_or_create_global_step()

    (train_op, losses) = (
        agent.build_train_op(
            time_steps,
            actions,
            sample_action_log_probs,
            returns,
            advantages,
            sample_action_distribution_parameters,
            valid_mask,
            train_step,
            summarize_gradients=False,
            gradient_clipping=0.0,
            debug_summaries=False))
    (policy_gradient_loss, value_estimation_loss, l2_regularization_loss,
     entropy_reg_loss, kl_penalty_loss) = losses

    # Run train_op once.
    self.evaluate(tf.global_variables_initializer())
    total_loss_, pg_loss_, ve_loss_, l2_loss_, ent_loss_, kl_penalty_loss_ = (
        self.evaluate([
            train_op, policy_gradient_loss, value_estimation_loss,
            l2_regularization_loss, entropy_reg_loss, kl_penalty_loss
        ]))

    # Check loss values are as expected. Factor of 2/4 is because four timesteps
    # were included in the data, but two were masked out. Reduce_means in losses
    # will divide by 4, but computed loss values are for first 2 timesteps.
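    # In other words (sketch of that arithmetic): the reference values below are
    # means over the 2 valid timesteps, so summing those 2 contributions and
    # dividing by all 4 timesteps scales each reference value by 2 / 4.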
    expected_pg_loss = -0.0164646133 * 2 / 4
    expected_ve_loss = 123.205 * 2 / 4
    expected_l2_loss = 1e-4 * 12 * 2 / 4
    expected_ent_loss = -0.370111 * 2 / 4
    expected_kl_penalty_loss = 0.0
    self.assertAllClose(
        expected_pg_loss + expected_ve_loss + expected_l2_loss +
        expected_ent_loss + expected_kl_penalty_loss,
        total_loss_,
        atol=0.001,
        rtol=0.001)
    self.assertAllClose(expected_pg_loss, pg_loss_)
    self.assertAllClose(expected_ve_loss, ve_loss_)
    self.assertAllClose(expected_l2_loss, l2_loss_, atol=0.001, rtol=0.001)
    self.assertAllClose(expected_ent_loss, ent_loss_)
    self.assertAllClose(expected_kl_penalty_loss, kl_penalty_loss_)

    # Assert that train_step was incremented
    self.assertEqual(1, self.evaluate(train_step))
Example #20
0
def train_eval(
        root_dir,
        random_seed=0,
        num_epochs=1000000,
        # Params for train
        normalize_observations=True,
        normalize_rewards=True,
        discount_factor=1.0,
        lr=1e-5,
        lr_schedule=None,
        num_policy_updates=20,
        initial_adaptive_kl_beta=0.0,
        kl_cutoff_factor=0,
        importance_ratio_clipping=0.2,
        value_pred_loss_coef=0.5,
        gradient_clipping=None,
        entropy_regularization=0.0,
        log_prob_clipping=0.0,
        # Params for log, eval, save
        eval_interval=100,
        save_interval=1000,
        checkpoint_interval=None,
        summary_interval=100,
        do_evaluation=True,
        # Params for data collection
        train_batch_size=10,
        eval_batch_size=100,
        collect_driver=None,
        eval_driver=None,
        replay_buffer_capacity=20000,
        # Policy and value networks
        ActorNet=actor_distribution_network.ActorDistributionNetwork,
        zero_means_kernel_initializer=False,
        init_action_stddev=0.35,
        actor_fc_layers=(),
        value_fc_layers=(),
        use_rnn=True,
        actor_lstm_size=(12, ),
        value_lstm_size=(12, ),
        **kwargs):
    """ A simple train and eval for PPO agent. 
    
    Args:
        root_dir (str): directory for saving training and evalutaion data
        random_seed (int): seed for random number generator
        num_epochs (int): number of training epochs. At each epoch a batch
            of data is collected according to one stochastic policy, and then
            the policy is updated.
        normalize_observations (bool): flag for normalization of observations.
            Uses StreamingTensorNormalizer which normalizes based on the whole
            history of observations.
        normalize_rewards (bool): flag for normalization of rewards.
            Uses StreamingTensorNormalizer which normalizes based on the whole
            history of rewards.
        discount_factor (float): rewards discount factor, should be in (0, 1]
        lr (float): learning rate for Adam optimizer
        lr_schedule (callable: int -> float, optional): function to schedule
            the learning rate annealing. Takes the int epoch number as its
            argument and returns the float learning rate (see the sketch right
            after this docstring).
        num_policy_updates (int): number of policy gradient steps to do on each
            epoch of training. In PPO this is typically >1.
        initial_adaptive_kl_beta (float): see tf-agents PPO docs 
        kl_cutoff_factor (float): see tf-agents PPO docs 
        importance_ratio_clipping (float): clipping value for the importance
            ratio. Discourages updates that change the policy too drastically.
            Should be in (0, 1]
        value_pred_loss_coef (float): weight coefficient for quadratic value
            estimation loss.
        gradient_clipping (float): gradient clipping coefficient.
        entropy_regularization (float): entropy regularization loss coefficient.
        log_prob_clipping (float): +/- value for clipping log probs to prevent 
            inf / NaN values.  Default: no clipping.
        eval_interval (int): interval between evaluations, counted in epochs.
        save_interval (int): interval between savings, counted in epochs. It
            updates the log file and saves the deterministic policy.
        checkpoint_interval (int): interval between saving checkpoints, counted
            in epochs. Overwrites the previously saved checkpoint. Defaults to
            None, in which case checkpoints are not saved.
        summary_interval (int): interval between summary writing, counted in 
            epochs. tf-agents takes care of summary writing; results can be
            later displayed in tensorboard.
        do_evaluation (bool): flag to interleave training epochs with 
            evaluation epochs.
        train_batch_size (int): training batch size, collected in parallel.
        eval_batch_size (int): batch size for evaluation of the policy.
        collect_driver (Driver): driver for training data collection
        eval_driver (Driver): driver for evaluation data collection
        replay_buffer_capacity (int): How many transition tuples the buffer 
            can store. The buffer is emptied and re-populated at each epoch.
        ActorNet (network.DistributionNetwork): a distribution actor network 
            to use for training. The default is ActorDistributionNetwork from
            tf-agents, but this can also be customized.
        zero_means_kernel_initializer (bool): flag to initialize the means
            projection network with zeros. If this flag is not set, the
            default tf-agents random initializer is used.
        init_action_stddev (float): initial stddev of the normal action dist.
        actor_fc_layers (tuple): sizes of fully connected layers in actor net.
        value_fc_layers (tuple): sizes of fully connected layers in value net.
        use_rnn (bool): whether to use LSTM units in the neural net.
        actor_lstm_size (tuple): sizes of LSTM layers in actor net.
        value_lstm_size (tuple): sizes of LSTM layers in value net.
    """
    # --------------------------------------------------------------------
    # --------------------------------------------------------------------
    tf.compat.v1.set_random_seed(random_seed)

    # Setup directories within 'root_dir'
    if not os.path.isdir(root_dir): os.mkdir(root_dir)
    policy_dir = os.path.join(root_dir, 'policy')
    checkpoint_dir = os.path.join(root_dir, 'checkpoint')
    logfile = os.path.join(root_dir, 'log.hdf5')
    train_dir = os.path.join(root_dir, 'train_summaries')

    # Create tf summary writer
    train_summary_writer = tf.compat.v2.summary.create_file_writer(train_dir)
    train_summary_writer.set_as_default()
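    # The global step is expected to advance once per policy update
    # (num_policy_updates times per epoch), so scale the summary interval to
    # keep it counted in epochs.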
    summary_interval *= num_policy_updates
    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):

        # Define action and observation specs
        observation_spec = collect_driver.observation_spec()
        action_spec = collect_driver.action_spec()

        # Preprocessing: flatten and concatenate observation components
        preprocessing_layers = {
            obs: tf.keras.layers.Flatten()
            for obs in observation_spec.keys()
        }
        preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)

        # Define actor network and value network
        if use_rnn:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                input_tensor_spec=observation_spec,
                output_tensor_spec=action_spec,
                preprocessing_layers=preprocessing_layers,
                preprocessing_combiner=preprocessing_combiner,
                input_fc_layer_params=None,
                lstm_size=actor_lstm_size,
                output_fc_layer_params=actor_fc_layers)

            value_net = value_rnn_network.ValueRnnNetwork(
                input_tensor_spec=observation_spec,
                preprocessing_layers=preprocessing_layers,
                preprocessing_combiner=preprocessing_combiner,
                input_fc_layer_params=None,
                lstm_size=value_lstm_size,
                output_fc_layer_params=value_fc_layers)
        else:
            npn = actor_distribution_network._normal_projection_net
            normal_projection_net = lambda specs: npn(
                specs,
                zero_means_kernel_initializer=zero_means_kernel_initializer,
                init_action_stddev=init_action_stddev)

            actor_net = ActorNet(
                input_tensor_spec=observation_spec,
                output_tensor_spec=action_spec,
                preprocessing_layers=preprocessing_layers,
                preprocessing_combiner=preprocessing_combiner,
                fc_layer_params=actor_fc_layers,
                continuous_projection_net=normal_projection_net)

            value_net = value_network.ValueNetwork(
                input_tensor_spec=observation_spec,
                preprocessing_layers=preprocessing_layers,
                preprocessing_combiner=preprocessing_combiner,
                fc_layer_params=value_fc_layers)

        # Create PPO agent
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)
        tf_agent = ppo_agent.PPOAgent(
            time_step_spec=collect_driver.time_step_spec(),
            action_spec=action_spec,
            optimizer=optimizer,
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_policy_updates,
            train_step_counter=global_step,
            discount_factor=discount_factor,
            normalize_observations=normalize_observations,
            normalize_rewards=normalize_rewards,
            initial_adaptive_kl_beta=initial_adaptive_kl_beta,
            kl_cutoff_factor=kl_cutoff_factor,
            importance_ratio_clipping=importance_ratio_clipping,
            gradient_clipping=gradient_clipping,
            value_pred_loss_coef=value_pred_loss_coef,
            entropy_regularization=entropy_regularization,
            log_prob_clipping=log_prob_clipping,
            debug_summaries=True)

        tf_agent.initialize()
        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy

        # Create replay buffer and collection driver
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=tf_agent.collect_data_spec,
            batch_size=train_batch_size,
            max_length=replay_buffer_capacity)

        def train_step():
            experience = replay_buffer.gather_all()
            return tf_agent.train(experience)

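        # Wrap agent training in a tf.function for graph-mode speed.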
        tf_agent.train = common.function(tf_agent.train)

        avg_return_metric = tf_metrics.AverageReturnMetric(
            batch_size=eval_batch_size, buffer_size=eval_batch_size)

        collect_driver.setup(collect_policy, [replay_buffer.add_batch])
        eval_driver.setup(eval_policy, [avg_return_metric])

        # Create a checkpointer and load the saved agent
        train_checkpointer = common.Checkpointer(ckpt_dir=checkpoint_dir,
                                                 max_to_keep=1,
                                                 agent=tf_agent,
                                                 policy=tf_agent.policy,
                                                 replay_buffer=replay_buffer,
                                                 global_step=global_step)

        train_checkpointer.initialize_or_restore()
        global_step = tf.compat.v1.train.get_global_step()

        # Saver for the deterministic policy
        saved_model = policy_saver.PolicySaver(eval_policy,
                                               train_step=global_step)

        # Evaluate policy once before training
        if do_evaluation:
            eval_driver.run(0)
            avg_return = avg_return_metric.result().numpy()
            avg_return_metric.reset()
            log = {
                'returns': [avg_return],
                'epochs': [0],
                'policy_steps': [0],
                'experience_time': [0.0],
                'train_time': [0.0]
            }
            print('-------------------')
            print('Epoch 0')
            print('  Policy steps: 0')
            print('  Experience time: 0.00 mins')
            print('  Policy train time: 0.00 mins')
            print('  Average return: %.5f' % avg_return)

        # Save initial random policy
        path = os.path.join(policy_dir, ('0').zfill(6))
        saved_model.save(path)

        # Training loop
        train_timer = timer.Timer()
        experience_timer = timer.Timer()
        for epoch in range(1, num_epochs + 1):
            # Collect new experience
            experience_timer.start()
            collect_driver.run(epoch)
            experience_timer.stop()
            # Update the policy
            train_timer.start()
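            # Anneal the learning rate on schedule (note: this writes directly
            # to the optimizer's private `_lr` attribute).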
            if lr_schedule: optimizer._lr = lr_schedule(epoch)
            train_loss = train_step()
            replay_buffer.clear()
            train_timer.stop()

            if (epoch % eval_interval == 0) and do_evaluation:
                # Evaluate the policy
                eval_driver.run(epoch)
                avg_return = avg_return_metric.result().numpy()
                avg_return_metric.reset()

                # Print out and log all metrics
                print('-------------------')
                print('Epoch %d' % epoch)
                print('  Policy steps: %d' % (epoch * num_policy_updates))
                print('  Experience time: %.2f mins' %
                      (experience_timer.value() / 60))
                print('  Policy train time: %.2f mins' %
                      (train_timer.value() / 60))
                print('  Average return: %.5f' % avg_return)
                log['epochs'].append(epoch)
                log['policy_steps'].append(epoch * num_policy_updates)
                log['returns'].append(avg_return)
                log['experience_time'].append(experience_timer.value())
                log['train_time'].append(train_timer.value())
                # Save updated log
                save_log(log, logfile, ('%d' % epoch).zfill(6))

            if epoch % save_interval == 0:
                # Save deterministic policy
                path = os.path.join(policy_dir, ('%d' % epoch).zfill(6))
                saved_model.save(path)

            if checkpoint_interval is not None and \
                epoch % checkpoint_interval == 0:
                # Save training checkpoint
                train_checkpointer.save(global_step)
        collect_driver.finish_training()
        eval_driver.finish_training()
Example #21
0
def train_eval_doom_simple(
        # Params for collect
        num_environment_steps=30000000,
        collect_episodes_per_iteration=32,
        num_parallel_environments=32,
        replay_buffer_capacity=301,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=4e-4,
        # Params for eval
        eval_interval=500,
        num_video_episodes=10,
        # Params for summaries and logging
        log_interval=50):
    """A simple train and eval for PPO."""
    # if not os.path.exists(videos_dir):
    # 	os.makedirs(videos_dir)

    # eval_py_env = CSGOEnvironment()
    # eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)
    tf_env = tf_py_environment.TFPyEnvironment(CSGOEnvironment())

    actor_net, value_net = create_networks(tf_env.observation_spec(),
                                           tf_env.action_spec())

    global_step = tf.compat.v1.train.get_or_create_global_step()
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate,
                                                 epsilon=1e-5)

    tf_agent = ppo_agent.PPOAgent(tf_env.time_step_spec(),
                                  tf_env.action_spec(),
                                  optimizer,
                                  actor_net,
                                  value_net,
                                  num_epochs=num_epochs,
                                  train_step_counter=global_step,
                                  discount_factor=0.99,
                                  gradient_clipping=0.5,
                                  entropy_regularization=1e-2,
                                  importance_ratio_clipping=0.2,
                                  use_gae=True,
                                  use_td_lambda_return=True)
    tf_agent.initialize()

    environment_steps_metric = tf_metrics.EnvironmentSteps()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric,
    ]

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        tf_agent.collect_policy,
        observers=[replay_buffer.add_batch] + step_metrics,
        num_episodes=collect_episodes_per_iteration)

    def train_step():
        trajectories = replay_buffer.gather_all()
        return tf_agent.train(experience=trajectories)

    # def evaluate():
    # 	create_video(eval_py_env, eval_tf_env, tf_agent.policy, num_episodes=num_video_episodes, video_filename=os.path.join(videos_dir, "video_%d.mp4" % global_step_val))

    collect_time = 0
    train_time = 0
    timed_at_step = global_step.numpy()

    while environment_steps_metric.result() < num_environment_steps:

        start_time = time.time()
        collect_driver.run()
        collect_time += time.time() - start_time

        start_time = time.time()
        total_loss, _ = train_step()
        replay_buffer.clear()
        train_time += time.time() - start_time

        global_step_val = global_step.numpy()

        if global_step_val % log_interval == 0:
            logging.info('step = %d, loss = %f', global_step_val, total_loss)
            steps_per_sec = ((global_step_val - timed_at_step) /
                             (collect_time + train_time))
            logging.info('%.3f steps/sec', steps_per_sec)
            logging.info('collect_time = {}, train_time = {}'.format(
                collect_time, train_time))

            timed_at_step = global_step_val
            collect_time = 0
            train_time = 0
Example #22
0
    def train(self):
        """ trains a policy using the gym_env.
            Sets training_losses and training_average_returns, depending on the training scheme
            defined in TrainingDuration configuration.
        """
        # Create Training Environment, Optimizer and PpoAgent
        self._log_agent("Creating environment:")
        train_env = self._create_tfagent_env()
        observation_spec = train_env.observation_spec()
        action_spec = train_env.action_spec()
        timestep_spec = train_env.time_step_spec()

        self._log_agent("Creating agent:")
        self._log_agent("  creating  tf.compat.v1.train.AdamOptimizer( ... )")
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=self._learning_rate)

        actor_net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec, action_spec, fc_layer_params=self.fc_layers)
        value_net = value_network.ValueNetwork(observation_spec,
                                               fc_layer_params=self.fc_layers)

        self._log_agent("  creating  PpoAgent( ... )")
        tf_agent = ppo_agent.PPOAgent(
            timestep_spec,
            action_spec,
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=self._training_duration.num_epochs_per_iteration)
        self._log_agent("  executing tf_agent.initialize()")
        tf_agent.initialize()
        self._trained_policy = tf_agent.policy

        # Data collection
        self._log_agent("Creating data collection:")
        collect_data_spec = tf_agent.collect_data_spec
        self._log_agent("  creating TFUniformReplayBuffer()")
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            collect_data_spec,
            batch_size=1,
            max_length=self._num_training_steps_in_replay_buffer)

        collect_policy = tf_agent.collect_policy
        self._log_agent("  creating DynamicEpisodeDriver()")
        collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
            train_env,
            collect_policy,
            observers=[replay_buffer.add_batch],
            num_episodes=self._training_duration.num_episodes_per_iteration)

        # Train
        collect_driver.run = common.function(collect_driver.run,
                                             autograph=False)
        tf_agent.train = common.function(tf_agent.train, autograph=False)

        self._clear_average_rewards_and_steps_log()
        self._record_average_rewards_and_steps()
        self.training_losses = []

        self._log_agent("Starting training:")
        for step in range(1, self._training_duration.num_iterations + 1):
            msg = f'training {step:4} of {self._training_duration.num_iterations:<4}:'
            self._log_agent(msg + " executing collect_driver.run()")
            collect_driver.run()

            self._log_agent(msg + " executing replay_buffer.gather_all()")
            trajectories = replay_buffer.gather_all()

            self._log_agent(msg + " executing tf_agent.train(...)")
            total_loss, _ = tf_agent.train(experience=trajectories)
            self.training_losses.append(float(total_loss))
            self._log_minimal(
                f'{msg} completed tf_agent.train(...) = {total_loss.numpy():>8.3f} [loss]'
            )

            self._log_agent(msg + " executing replay_buffer.clear()")
            replay_buffer.clear()

            if step % self._training_duration.num_iterations_between_eval == 0:
                self._record_average_rewards_and_steps()
        return
Example #23
0
    def testDebugSummaries(self):
        logdir = self.get_temp_dir()
        with tf.contrib.summary.create_file_writer(logdir,
                                                   max_queue=None,
                                                   flush_millis=None,
                                                   filename_suffix=None,
                                                   name=None).as_default():
            agent = ppo_agent.PPOAgent(
                self._time_step_spec,
                self._action_spec,
                tf.train.AdamOptimizer(),
                actor_net=DummyActorNet(self._action_spec, ),
                value_net=DummyValueNet(),
                debug_summaries=True,
            )
            observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
            time_steps = ts.restart(observations, batch_size=2)
            actions = tf.constant([[0], [1]], dtype=tf.float32)
            returns = tf.constant([1.9, 1.0], dtype=tf.float32)
            sample_action_log_probs = tf.constant([0.9, 0.3], dtype=tf.float32)
            advantages = tf.constant([1.9, 1.0], dtype=tf.float32)
            weights = tf.ones_like(advantages)
            sample_action_distribution_parameters = {
                'loc': tf.constant([[9.0], [15.0]], dtype=tf.float32),
                'scale': tf.constant([[8.0], [12.0]], dtype=tf.float32),
            }
            train_step = tf.train.get_or_create_global_step()

            with self.cached_session() as sess:
                tf.contrib.summary.initialize(session=sess)

                (_, _) = (agent.build_train_op(
                    time_steps,
                    actions,
                    sample_action_log_probs,
                    returns,
                    advantages,
                    sample_action_distribution_parameters,
                    weights,
                    train_step,
                    summarize_gradients=False,
                    gradient_clipping=0.0,
                    debug_summaries=False))
                summaries_without_debug = tf.contrib.summary.all_summary_ops()

                (_, _) = (agent.build_train_op(
                    time_steps,
                    actions,
                    sample_action_log_probs,
                    returns,
                    advantages,
                    sample_action_distribution_parameters,
                    weights,
                    train_step,
                    summarize_gradients=False,
                    gradient_clipping=0.0,
                    debug_summaries=True))
                summaries_with_debug = tf.contrib.summary.all_summary_ops()

                self.assertGreater(len(summaries_with_debug),
                                   len(summaries_without_debug))
Example #24
0
  def testSequencePreprocessNotBatched(self, strategy_fn):
    with strategy_fn().scope():
      counter = common.create_variable('test_train_counter')
      n_time_steps = 3
      agent = ppo_agent.PPOAgent(
          self._time_step_spec,
          self._action_spec,
          tf.compat.v1.train.AdamOptimizer(),
          actor_net=DummyActorNet(
              self._obs_spec,
              self._action_spec,
          ),
          value_net=DummyValueNet(self._obs_spec),
          normalize_observations=False,
          num_epochs=1,
          use_gae=False,
          use_td_lambda_return=False,
          compute_value_and_advantage_in_train=False,
          train_step_counter=counter)
      agent.initialize()
    observations = tf.constant([[1, 2], [3, 4], [5, 6]], dtype=tf.float32)

    mid_time_step_val = ts.StepType.MID.tolist()
    time_steps = ts.TimeStep(
        step_type=tf.constant(
            [mid_time_step_val] * n_time_steps, dtype=tf.int32),
        reward=tf.constant([1] * n_time_steps, dtype=tf.float32),
        discount=tf.constant([1] * n_time_steps, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[0], [1], [1]], dtype=tf.float32)

    old_action_distribution_parameters = {
        'loc': tf.constant([[0.0]] * n_time_steps, dtype=tf.float32),
        'scale': tf.constant([[1.0]] * n_time_steps, dtype=tf.float32),
    }

    value_preds = tf.constant([9., 15., 21.], dtype=tf.float32)
    policy_info = {
        'dist_params': old_action_distribution_parameters,
        'value_prediction': value_preds,
    }
    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type, time_steps.reward,
                                       time_steps.discount)

    returned_experience = agent.preprocess_sequence(experience)
    self.evaluate(tf.compat.v1.initialize_all_variables())

    self.assertAllClose(observations, returned_experience.observation)
    self.assertAllClose(actions, returned_experience.action)

    self.assertAllClose(old_action_distribution_parameters,
                        returned_experience.policy_info['dist_params'])
    self.assertEqual(n_time_steps,
                     returned_experience.policy_info['return'].shape)
    self.assertAllClose([40.4821, 30.79],
                        returned_experience.policy_info['return'][:-1])
    self.assertEqual(n_time_steps,
                     returned_experience.policy_info['advantage'].shape)
    self.assertAllClose([31.482101, 15.790001],
                        returned_experience.policy_info['advantage'][:-1])
Example #25
0
def train_eval(
        root_dir,
        env_name='HalfCheetah-v2',
        env_load_fn=suite_mujoco.load,
        random_seed=0,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(200, 100),
        value_fc_layers=(200, 100),
        use_rnns=False,
        # Params for collect
        num_environment_steps=10000000,
        collect_episodes_per_iteration=30,
        num_parallel_environments=30,
        replay_buffer_capacity=1001,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=1e-4,
        # Params for eval
        num_eval_episodes=30,
        eval_interval=500,
        # Params for summaries and logging
        train_checkpoint_interval=500,
        policy_checkpoint_interval=500,
        log_interval=50,
        summary_interval=50,
        summaries_flush_secs=1,
        use_tf_functions=True,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """A simple train and eval for PPO."""
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    saved_model_dir = os.path.join(root_dir, 'policy_saved_model')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf.compat.v1.set_random_seed(random_seed)
        eval_tf_env = tf_py_environment.TFPyEnvironment(env_load_fn(env_name))
        tf_env = tf_py_environment.TFPyEnvironment(
            parallel_py_environment.ParallelPyEnvironment(
                [lambda: env_load_fn(env_name)] * num_parallel_environments))
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        if use_rnns:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                input_fc_layer_params=actor_fc_layers,
                output_fc_layer_params=None)
            value_net = value_rnn_network.ValueRnnNetwork(
                tf_env.observation_spec(),
                input_fc_layer_params=value_fc_layers,
                output_fc_layer_params=None)
        else:
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                fc_layer_params=actor_fc_layers)
            value_net = value_network.ValueNetwork(
                tf_env.observation_spec(), fc_layer_params=value_fc_layers)

        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step)
        tf_agent.initialize()

        environment_steps_metric = tf_metrics.EnvironmentSteps()
        step_metrics = [
            tf_metrics.NumberOfEpisodes(),
            environment_steps_metric,
        ]

        train_metrics = step_metrics + [
            tf_metrics.AverageReturnMetric(
                batch_size=num_parallel_environments),
            tf_metrics.AverageEpisodeLengthMetric(
                batch_size=num_parallel_environments),
        ]

        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            train_dir, 'policy'),
                                                  policy=eval_policy,
                                                  global_step=global_step)
        saved_model = policy_saver.PolicySaver(eval_policy,
                                               train_step=global_step)

        train_checkpointer.initialize_or_restore()

        collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_episodes=collect_episodes_per_iteration)

        def train_step():
            trajectories = replay_buffer.gather_all()
            return tf_agent.train(experience=trajectories)

        if use_tf_functions:
            # TODO(b/123828980): Enable once the cause for slowdown was identified.
            collect_driver.run = common.function(collect_driver.run,
                                                 autograph=False)
            tf_agent.train = common.function(tf_agent.train, autograph=False)
            train_step = common.function(train_step)

        collect_time = 0
        train_time = 0
        timed_at_step = global_step.numpy()

        while environment_steps_metric.result() < num_environment_steps:
            global_step_val = global_step.numpy()
            if global_step_val % eval_interval == 0:
                metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )

            start_time = time.time()
            collect_driver.run()
            collect_time += time.time() - start_time

            start_time = time.time()
            total_loss, _ = train_step()
            replay_buffer.clear()
            train_time += time.time() - start_time

            for train_metric in train_metrics:
                train_metric.tf_summaries(train_step=global_step,
                                          step_metrics=step_metrics)

            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val,
                             total_loss)
                steps_per_sec = ((global_step_val - timed_at_step) /
                                 (collect_time + train_time))
                logging.info('%.3f steps/sec', steps_per_sec)
                logging.info('collect_time = {}, train_time = {}'.format(
                    collect_time, train_time))
                with tf.compat.v2.summary.record_if(True):
                    tf.compat.v2.summary.scalar(name='global_steps_per_sec',
                                                data=steps_per_sec,
                                                step=global_step)

                if global_step_val % train_checkpoint_interval == 0:
                    train_checkpointer.save(global_step=global_step_val)

                if global_step_val % policy_checkpoint_interval == 0:
                    policy_checkpointer.save(global_step=global_step_val)
                    saved_model_path = os.path.join(
                        saved_model_dir,
                        'policy_' + ('%d' % global_step_val).zfill(9))
                    saved_model.save(saved_model_path)

                timed_at_step = global_step_val
                collect_time = 0
                train_time = 0

        # One final eval before exiting.
        metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
Example #26
0
import tensorflow as tf

from tf_agents.agents.ppo import ppo_agent
from tf_agents.networks import actor_distribution_network
from tf_agents.networks.value_network import ValueNetwork

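# `train_env`, `actor_fc_layer_params`, and `actor_learning_rate` are assumed to
# be defined earlier in the original snippet.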
actor_net = actor_distribution_network.ActorDistributionNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=actor_fc_layer_params)

value_net = ValueNetwork(train_env.observation_spec())

global_step = tf.compat.v2.Variable(0)
tf_agent = ppo_agent.PPOAgent(train_env.time_step_spec(),
                              train_env.action_spec(),
                              actor_net=actor_net,
                              value_net=value_net,
                              optimizer=tf.compat.v1.train.AdamOptimizer(
                                  learning_rate=actor_learning_rate),
                              train_step_counter=global_step)
Example #27
0
  def testGetEpochLoss(self):
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(self._obs_spec, self._action_spec),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        normalize_rewards=False,
        value_pred_loss_coef=1.0,
        policy_l2_reg=1e-4,
        value_function_l2_reg=1e-4,
        entropy_regularization=0.1,
        importance_ratio_clipping=10,
    )
    observations = tf.constant([[1, 2], [3, 4], [1, 2], [3, 4]],
                               dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([[0], [1], [0], [1]], dtype=tf.float32)
    returns = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
    sample_action_log_probs = tf.constant([0.9, 0.3, 0.9, 0.3],
                                          dtype=tf.float32)
    advantages = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
    weights = tf.constant([1.0, 1.0, 0.0, 0.0], dtype=tf.float32)
    sample_action_distribution_parameters = {
        'loc': tf.constant([[9.0], [15.0], [9.0], [15.0]], dtype=tf.float32),
        'scale': tf.constant([[8.0], [12.0], [8.0], [12.0]], dtype=tf.float32),
    }
    train_step = tf.compat.v1.train.get_or_create_global_step()

    loss_info = agent.get_epoch_loss(
        time_steps,
        actions,
        sample_action_log_probs,
        returns,
        advantages,
        sample_action_distribution_parameters,
        weights,
        train_step,
        debug_summaries=False)

    self.evaluate(tf.compat.v1.initialize_all_variables())
    total_loss, extra_loss_info = self.evaluate(loss_info)
    (policy_gradient_loss, value_estimation_loss, l2_regularization_loss,
     entropy_reg_loss, kl_penalty_loss) = extra_loss_info

    # Check loss values are as expected. Factor of 2/4 is because four timesteps
    # were included in the data, but two were masked out. Reduce_means in losses
    # will divide by 4, but computed loss values are for first 2 timesteps.
    expected_pg_loss = -0.0164646133 * 2 / 4
    expected_ve_loss = 123.205 * 2 / 4
    expected_l2_loss = 1e-4 * 12 * 2 / 4
    expected_ent_loss = -0.370111 * 2 / 4
    expected_kl_penalty_loss = 0.0
    self.assertAllClose(
        expected_pg_loss + expected_ve_loss + expected_l2_loss +
        expected_ent_loss + expected_kl_penalty_loss,
        total_loss,
        atol=0.001,
        rtol=0.001)
    self.assertAllClose(expected_pg_loss, policy_gradient_loss)
    self.assertAllClose(expected_ve_loss, value_estimation_loss)
    self.assertAllClose(expected_l2_loss, l2_regularization_loss, atol=0.001,
                        rtol=0.001)
    self.assertAllClose(expected_ent_loss, entropy_reg_loss)
    self.assertAllClose(expected_kl_penalty_loss, kl_penalty_loss)
Example #28
0
    def __init__(
        self,
        landscape: flexs.Landscape,
        rounds: int,
        sequences_batch_size: int,
        model_queries_per_batch: int,
        starting_sequence: str,
        alphabet: str,
        log_file: Optional[str] = None,
        model: Optional[flexs.Model] = None,
        num_experiment_rounds: int = 10,
        num_model_rounds: int = 1,
        env_batch_size: int = 4,
    ):
        """
        Args:
            num_experiment_rounds: Number of experiment-based rounds to run. This is
                set to 10 by default, the same as the number of sequence-proposal
                rounds run.
            num_model_rounds: Number of model-based rounds to run.
            env_batch_size: Number of episodes to batch together and run in parallel.

        """
        tf.config.run_functions_eagerly(False)

        name = f"DynaPPO_Agent_{num_experiment_rounds}_{num_model_rounds}"

        if model is None:
            model = DynaPPOEnsemble(
                len(starting_sequence),
                alphabet,
            )
            # Some models in the ensemble need to be trained on dummy dataset before
            # they can predict
            model.train(
                s_utils.generate_random_sequences(len(starting_sequence), 10,
                                                  alphabet),
                [0] * 10,
            )

        super().__init__(
            model,
            name,
            rounds,
            sequences_batch_size,
            model_queries_per_batch,
            starting_sequence,
            log_file,
        )

        self.alphabet = alphabet
        self.num_experiment_rounds = num_experiment_rounds
        self.num_model_rounds = num_model_rounds
        self.env_batch_size = env_batch_size

        env = DynaPPOEnv(self.alphabet, len(starting_sequence), model,
                         landscape, env_batch_size)
        self.tf_env = tf_py_environment.TFPyEnvironment(env)

        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self.tf_env.observation_spec(),
            self.tf_env.action_spec(),
            fc_layer_params=[128],
        )
        value_net = value_network.ValueNetwork(self.tf_env.observation_spec(),
                                               fc_layer_params=[128])

        print(self.tf_env.action_spec())
        self.agent = ppo_agent.PPOAgent(
            time_step_spec=self.tf_env.time_step_spec(),
            action_spec=self.tf_env.action_spec(),
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=10,
            summarize_grads_and_vars=False,
        )
        self.agent.initialize()
Example #29
0
def train_eval(
        root_dir,
        tf_master='',
        env_name='HalfCheetah-v2',
        env_load_fn=suite_mujoco.load,
        random_seed=0,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(200, 100),
        value_fc_layers=(200, 100),
        use_rnns=False,
        # Params for collect
        num_environment_steps=10000000,
        collect_episodes_per_iteration=30,
        num_parallel_environments=30,
        replay_buffer_capacity=1001,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=1e-4,
        # Params for eval
        num_eval_episodes=30,
        eval_interval=500,
        # Params for summaries and logging
        train_checkpoint_interval=100,
        policy_checkpoint_interval=50,
        rb_checkpoint_interval=200,
        log_interval=50,
        summary_interval=50,
        summaries_flush_secs=1,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None):
    """A simple train and eval for PPO."""
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        batched_py_metric.BatchedPyMetric(
            AverageReturnMetric,
            metric_args={'buffer_size': num_eval_episodes},
            batch_size=num_parallel_environments),
        batched_py_metric.BatchedPyMetric(
            AverageEpisodeLengthMetric,
            metric_args={'buffer_size': num_eval_episodes},
            batch_size=num_parallel_environments),
    ]
    eval_summary_writer_flush_op = eval_summary_writer.flush()

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf.compat.v1.set_random_seed(random_seed)
        eval_py_env = parallel_py_environment.ParallelPyEnvironment(
            [lambda: env_load_fn(env_name)] * num_parallel_environments)
        tf_env = tf_py_environment.TFPyEnvironment(
            parallel_py_environment.ParallelPyEnvironment(
                [lambda: env_load_fn(env_name)] * num_parallel_environments))
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        if use_rnns:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                input_fc_layer_params=actor_fc_layers,
                output_fc_layer_params=None)
            value_net = value_rnn_network.ValueRnnNetwork(
                tf_env.observation_spec(),
                input_fc_layer_params=value_fc_layers,
                output_fc_layer_params=None)
        else:
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                fc_layer_params=actor_fc_layers)
            value_net = value_network.ValueNetwork(
                tf_env.observation_spec(), fc_layer_params=value_fc_layers)

        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step)

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

        eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

        environment_steps_metric = tf_metrics.EnvironmentSteps()
        environment_steps_count = environment_steps_metric.result()
        step_metrics = [
            tf_metrics.NumberOfEpisodes(),
            environment_steps_metric,
        ]
        train_metrics = step_metrics + [
            tf_metrics.AverageReturnMetric(),
            tf_metrics.AverageEpisodeLengthMetric(),
        ]

        # Add to replay buffer and other agent specific observers.
        replay_buffer_observer = [replay_buffer.add_batch]

        collect_policy = tf_agent.collect_policy

        collect_op = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=replay_buffer_observer + train_metrics,
            num_episodes=collect_episodes_per_iteration).run()

        trajectories = replay_buffer.gather_all()

        train_op, _ = tf_agent.train(experience=trajectories)

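        # Chain the ops so that a single sess.run(train_op) first trains on the
        # gathered trajectories and then clears the replay buffer.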
        with tf.control_dependencies([train_op]):
            clear_replay_op = replay_buffer.clear()

        with tf.control_dependencies([clear_replay_op]):
            train_op = tf.identity(train_op)

        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            train_dir, 'policy'),
                                                  policy=tf_agent.policy,
                                                  global_step=global_step)
        rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            train_dir, 'replay_buffer'),
                                              max_to_keep=1,
                                              replay_buffer=replay_buffer)

        for train_metric in train_metrics:
            train_metric.tf_summaries(train_step=global_step,
                                      step_metrics=step_metrics)

        with eval_summary_writer.as_default(), \
             tf.compat.v2.summary.record_if(True):
            for eval_metric in eval_metrics:
                eval_metric.tf_summaries(step_metrics=step_metrics)

        init_agent_op = tf_agent.initialize()

        with tf.compat.v1.Session(tf_master) as sess:
            # Initialize graph.
            train_checkpointer.initialize_or_restore(sess)
            rb_checkpointer.initialize_or_restore(sess)
            common.initialize_uninitialized_variables(sess)

            sess.run(init_agent_op)
            sess.run(train_summary_writer.init())
            sess.run(eval_summary_writer.init())

            collect_time = 0
            train_time = 0
            timed_at_step = sess.run(global_step)
            steps_per_second_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(), name='steps_per_sec_ph')
            steps_per_second_summary = tf.contrib.summary.scalar(
                name='global_steps/sec', tensor=steps_per_second_ph)

            while sess.run(environment_steps_count) < num_environment_steps:
                global_step_val = sess.run(global_step)
                if global_step_val % eval_interval == 0:
                    metric_utils.compute_summaries(
                        eval_metrics,
                        eval_py_env,
                        eval_py_policy,
                        num_episodes=num_eval_episodes,
                        global_step=global_step_val,
                        callback=eval_metrics_callback,
                        log=True,
                    )
                    sess.run(eval_summary_writer_flush_op)

                start_time = time.time()
                sess.run(collect_op)
                collect_time += time.time() - start_time
                start_time = time.time()
                total_loss = sess.run(train_op)
                train_time += time.time() - start_time

                global_step_val = sess.run(global_step)
                if global_step_val % log_interval == 0:
                    logging.info('step = %d, loss = %f', global_step_val,
                                 total_loss)
                    steps_per_sec = ((global_step_val - timed_at_step) /
                                     (collect_time + train_time))
                    logging.info('%.3f steps/sec', steps_per_sec)
                    sess.run(steps_per_second_summary,
                             feed_dict={steps_per_second_ph: steps_per_sec})
                    logging.info(
                        '%s', 'collect_time = {}, train_time = {}'.format(
                            collect_time, train_time))
                    timed_at_step = global_step_val
                    collect_time = 0
                    train_time = 0

                if global_step_val % train_checkpoint_interval == 0:
                    train_checkpointer.save(global_step=global_step_val)

                if global_step_val % policy_checkpoint_interval == 0:
                    policy_checkpointer.save(global_step=global_step_val)

                if global_step_val % rb_checkpoint_interval == 0:
                    rb_checkpointer.save(global_step=global_step_val)

            # One final eval before exiting.
            metric_utils.compute_summaries(
                eval_metrics,
                eval_py_env,
                eval_py_policy,
                num_episodes=num_eval_episodes,
                global_step=global_step_val,
                callback=eval_metrics_callback,
                log=True,
            )
            sess.run(eval_summary_writer_flush_op)
def train_eval(
        root_dir,
        env_name='HalfCheetah-v2',
        env_load_fn=suite_mujoco.load,
        random_seed=0,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(512, 256, 256, 30),
        value_fc_layers=(512, 256, 256, 25),
        use_rnns=False,
        # Params for collect
        num_environment_steps=10000000,
        collect_episodes_per_iteration=NumEpisodes,
        num_parallel_environments=1,
        replay_buffer_capacity=10000,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=5e-4,
        # Params for eval
        num_eval_episodes=5,
        eval_interval=500,
        # Params for summaries and logging
        log_interval=50,
        summary_interval=50,
        summaries_flush_secs=1,
        use_tf_functions=True,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """A simple train and eval for PPO."""
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train6')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        tf.compat.v1.set_random_seed(random_seed)
        # The standard env_load_fn(env_name) path is replaced by the custom
        # xSpace environment; wrap it in a TFPyEnvironment if it is a pure
        # Python environment. Note that the same underlying environment
        # instance backs both collection and evaluation.
        env = xSpace()
        if isinstance(env, py_environment.PyEnvironment):
            eval_tf_env = tf_py_environment.TFPyEnvironment(env)
            tf_env = tf_py_environment.TFPyEnvironment(env)
            logging.info('Using a PyEnvironment wrapped as a TFPyEnvironment.')
        elif isinstance(env, tf_environment.TFEnvironment):
            eval_tf_env = env
            tf_env = env
            logging.info('Using a TFEnvironment directly.')
        else:
            raise TypeError(
                'Expected a PyEnvironment or TFEnvironment, got {}.'.format(
                    type(env)))
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

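        # Build the actor and value networks; recurrent variants are used when
        # use_rnns is set.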
        if use_rnns:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                input_fc_layer_params=actor_fc_layers,
                output_fc_layer_params=None)
            value_net = value_rnn_network.ValueRnnNetwork(
                tf_env.observation_spec(),
                input_fc_layer_params=value_fc_layers,
                output_fc_layer_params=None)
        else:
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                fc_layer_params=actor_fc_layers)
            value_net = value_network.ValueNetwork(
                tf_env.observation_spec(), fc_layer_params=value_fc_layers)

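        # PPO agent configured with GAE (lambda_value=0.98) and a 0.995
        # discount; observation normalization is disabled here.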
        tf_agent = ppo_agent.PPOAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer,
            lambda_value=0.98,
            discount_factor=0.995,
            #value_pred_loss_coef=0.005,
            use_gae=True,
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step,
            normalize_observations=False)
        tf_agent.initialize()
        print("************ INITIALIZING **********************")
        environment_steps_metric = tf_metrics.EnvironmentSteps()
        step_metrics = [
            tf_metrics.NumberOfEpisodes(),
            environment_steps_metric,
        ]

        train_metrics = step_metrics + [
            tf_metrics.AverageReturnMetric(),
            tf_metrics.AverageEpisodeLengthMetric(),
        ]

        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy
        # The train metrics above are exported to TensorBoard during training.
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

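        # The driver collects collect_episodes_per_iteration full episodes per
        # call, feeding the replay buffer and the train metrics.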
        collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_episodes=collect_episodes_per_iteration)

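        # Optionally wrap collection and training in tf.function for
        # graph-mode speed.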
        if use_tf_functions:
            # TODO(b/123828980): Enable once the cause of the slowdown is identified.
            collect_driver.run = common.function(collect_driver.run,
                                                 autograph=False)
            tf_agent.train = common.function(tf_agent.train, autograph=False)

        collect_time = 0
        train_time = 0
        timed_at_step = global_step.numpy()

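        # Main loop: evaluate every eval_interval steps, collect a batch of
        # episodes, run num_epochs of PPO updates on them, then log timings
        # and reset the timers at log_interval.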
        while environment_steps_metric.result() < num_environment_steps:
            global_step_val = global_step.numpy()
            eval_tf_env.reset()
            if global_step_val % eval_interval == 0:
                metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )
            #print("eager compute completed")
            eval_tf_env.reset()
            start_time = time.time()
            collect_driver.run()
            #print("collect completed")
            collect_time += time.time() - start_time
            print("collect_time:" + str(collect_time))
            start_time = time.time()
            trajectories = replay_buffer.gather_all()
            #print("start train completed")
            #pdb.set_trace()
            #k=trajectories[5]
            #xMean=tf.reduce_mean(k)

            print('training...')
            total_loss, _ = tf_agent.train(experience=trajectories)
            print('training complete. total loss:' + str(total_loss))
            #print("end train completed")
            replay_buffer.clear()
            train_time += time.time() - start_time

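            # Export train metrics to TensorBoard, plotted against the step
            # metrics as well as the train step.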
            for train_metric in train_metrics:
                train_metric.tf_summaries(train_step=global_step,
                                          step_metrics=step_metrics)

            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val,
                             total_loss)
                steps_per_sec = ((global_step_val - timed_at_step) /
                                 (collect_time + train_time))
                logging.info('%.3f steps/sec', steps_per_sec)
                logging.info('collect_time = %s, train_time = %s',
                             collect_time, train_time)
                with tf.compat.v2.summary.record_if(True):
                    tf.compat.v2.summary.scalar(name='global_steps_per_sec',
                                                data=steps_per_sec,
                                                step=global_step)

                timed_at_step = global_step_val
                collect_time = 0
                train_time = 0

        # One final eval before exiting.
        metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )