    def testCreateAgentWithPrebuiltPreprocessingLayers(self):
        dense_layer = tf.keras.Sequential([
            tf.keras.layers.Dense(10),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Reshape([2, 5]),
        ])
        q_net = KerasLayersNet(self._time_step_spec.observation,
                               self._action_spec, dense_layer)
        with self.assertRaisesRegexp(
                ValueError, 'shares weights with the original network'):
            categorical_dqn_agent.CategoricalDqnAgent(
                self._time_step_spec,
                self._action_spec,
                categorical_q_network=q_net,
                optimizer=None)

        # Explicitly share weights between q and target networks.
        # This would be an unusual setup so we check that an error is thrown.
        q_target_net = KerasLayersNet(self._time_step_spec.observation,
                                      self._action_spec, dense_layer)
        with self.assertRaisesRegexp(
                ValueError, 'shares weights with the original network'):
            categorical_dqn_agent.CategoricalDqnAgent(
                self._time_step_spec,
                self._action_spec,
                categorical_q_network=q_net,
                optimizer=None,
                target_categorical_q_network=q_target_net)
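        # The error above is raised because the target network was built from
        # the very same `dense_layer` instance. A minimal sketch of a target
        # network that shares no weights (assuming the KerasLayersNet helper
        # defined in this test file; compare the DiffAtoms example further
        # below, which builds a separate layer for the target network):
        #   independent_layers = tf.keras.Sequential([
        #       tf.keras.layers.Dense(10),
        #       tf.keras.layers.Flatten(),
        #       tf.keras.layers.Reshape([2, 5]),
        #   ])
        #   q_target_net = KerasLayersNet(self._time_step_spec.observation,
        #                                 self._action_spec, independent_layers)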
Example #2
    def testCreateAgentWithPrebuiltPreprocessingLayers(self):
        dense_layer = tf.keras.layers.Dense(3)
        q_net = KerasLayersNet(self._time_step_spec.observation,
                               self._action_spec, dense_layer)
        with self.assertRaisesRegexp(
                ValueError, 'shares weights with the original network'):
            categorical_dqn_agent.CategoricalDqnAgent(
                self._time_step_spec,
                self._action_spec,
                categorical_q_network=q_net,
                optimizer=None)

        # Explicitly share weights between q and target networks; this is ok.
        q_target_net = KerasLayersNet(self._time_step_spec.observation,
                                      self._action_spec, dense_layer)
        categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec,
            self._action_spec,
            categorical_q_network=q_net,
            optimizer=None,
            target_categorical_q_network=q_target_net)

        q_bad_target_net = KerasLayersNet(self._time_step_spec.observation,
                                          self._action_spec,
                                          dense_layer,
                                          num_atoms=3)

        with self.assertRaisesRegexp(ValueError,
                                     'have different numbers of atoms'):
            categorical_dqn_agent.CategoricalDqnAgent(
                self._time_step_spec,
                self._action_spec,
                categorical_q_network=q_net,
                optimizer=None,
                target_categorical_q_network=q_bad_target_net)
Example #3
    def testCreateAgentDimChecks(self):
        action_spec = tensor_spec.BoundedTensorSpec([1, 2], tf.int32, 0, 1)

        with self.assertRaisesRegex(ValueError, 'Only scalar actions'):
            categorical_dqn_agent.CategoricalDqnAgent(
                self._time_step_spec, action_spec, self._dummy_categorical_net,
                self._optimizer)
Example #4
  def testInitialize(self):
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec,
        self._action_spec,
        self._categorical_net,
        self._optimizer)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_time_steps = ts.transition(observations, rewards, discounts)

    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    loss_info = agent._loss(experience)
    initialize = agent.initialize()

    self.evaluate(tf.compat.v1.global_variables_initializer())
    losses = self.evaluate(loss_info).loss
    self.assertGreater(losses, 0.0)

    critic_variables = agent._q_network.variables
    target_critic_variables = agent._target_q_network.variables
    self.assertTrue(critic_variables)
    self.assertTrue(target_critic_variables)
    self.evaluate(initialize)
    for s, t in zip(critic_variables, target_critic_variables):
      self.assertAllClose(self.evaluate(s), self.evaluate(t))
Example #5
  def testTrain(self):
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec,
        self._action_spec,
        self._dummy_categorical_net,
        self._optimizer)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    train_step = agent.train(experience, weights=None)

    # Due to the constant initialization of the DummyCategoricalNet, we can
    # expect the same loss every time.
    expected_loss = 2.19525
    self.evaluate(tf.compat.v1.global_variables_initializer())
    evaluated_loss, _ = self.evaluate(train_step)
    self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
Example #6
    def testCriticLossNStep(self):
        agent = categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec,
            self._action_spec,
            self._dummy_categorical_net,
            self._optimizer,
            n_step_update=2)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)

        actions = tf.constant([0, 1], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        third_observations = tf.constant([[9, 10], [11, 12]], dtype=tf.float32)
        third_time_steps = ts.transition(third_observations, rewards,
                                         discounts)

        experience1 = trajectory.from_transition(time_steps, action_steps,
                                                 next_time_steps)
        experience2 = trajectory.from_transition(next_time_steps, action_steps,
                                                 third_time_steps)
        experience3 = trajectory.from_transition(third_time_steps,
                                                 action_steps,
                                                 third_time_steps)

        experience = tf.nest.map_structure(
            lambda x, y, z: tf.stack([x, y, z], axis=1), experience1,
            experience2, experience3)

        loss_info = agent._loss(experience)

        # discounted_returns should evaluate to 10 + 0.9 * 10 = 19 and
        # 20 + 0.9 * 20 = 38.
        evaluated_discounted_returns = self.evaluate(agent._discounted_returns)
        self.assertAllClose(evaluated_discounted_returns, [[19], [38]],
                            atol=1e-4)

        # Both final_value_discount values should be 0.9 * 0.9 = 0.81.
        evaluated_final_value_discount = self.evaluate(
            agent._final_value_discount)
        self.assertAllClose(evaluated_final_value_discount, [[0.81], [0.81]],
                            atol=1e-4)

        # Due to the constant initialization of the DummyCategoricalNet, we can
        # expect the same loss every time.
        expected_loss = 2.19525
        self.evaluate(tf.compat.v1.global_variables_initializer())
        evaluated_loss = self.evaluate(loss_info).loss
        self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
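        # Written out, the two-step quantities checked above are, per batch row:
        #   discounted_return    = r_0 + d_0 * r_1 = 10 + 0.9 * 10 = 19
        #                                            (and 20 + 0.9 * 20 = 38)
        #   final_value_discount = d_0 * d_1       = 0.9 * 0.9     = 0.81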
Example #7
    def testCreateAgentNestSizeChecks(self):
        action_spec = [
            tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1),
            tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)
        ]

        with self.assertRaisesRegexp(ValueError,
                                     '.*Only one dimensional actions.*'):
            categorical_dqn_agent.CategoricalDqnAgent(
                self._time_step_spec, action_spec, self._dummy_categorical_net,
                self._optimizer)
Example #8
    def testTrainWithRnn(self):
        action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)

        batch_size = 5
        observations = tf.constant([[[1, 2], [3, 4], [5, 6]]] * batch_size,
                                   dtype=tf.float32)
        actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.int32)
        time_steps = ts.TimeStep(step_type=tf.constant([[1] * 3] * batch_size,
                                                       dtype=tf.int32),
                                 reward=tf.constant([[1] * 3] * batch_size,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 3] * batch_size,
                                                      dtype=tf.float32),
                                 observation=[observations])

        experience = trajectory.Trajectory(step_type=time_steps.step_type,
                                           observation=observations,
                                           action=actions,
                                           policy_info=(),
                                           next_step_type=time_steps.step_type,
                                           reward=time_steps.reward,
                                           discount=time_steps.discount)

        categorical_q_rnn_network = DummyCategoricalQRnnNetwork(
            self._obs_spec,
            action_spec,
            conv_layer_params=None,
            input_fc_layer_params=(16, ),
            preprocessing_combiner=None,
            lstm_size=(40, ),
            output_fc_layer_params=(16, ),
        )

        counter = common.create_variable('test_train_counter')

        agent = categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec,
            action_spec,
            categorical_q_rnn_network,
            optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        )

        # Force variable creation.
        agent.policy.variables()
        if tf.executing_eagerly():
            loss = lambda: agent.train(experience)
        else:
            loss = agent.train(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertEqual(self.evaluate(counter), 0)
        self.evaluate(loss)
Example #9
    def testPolicy(self):
        agent = categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, self._action_spec, self._categorical_net,
            self._optimizer)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions, _, _ = agent.policy.action(time_steps)
        self.assertEqual(actions.shape, [2])
        self.evaluate(tf.compat.v1.global_variables_initializer())
        actions_ = self.evaluate(actions)
        self.assertTrue(all(actions_ <= self._action_spec.maximum))
        self.assertTrue(all(actions_ >= self._action_spec.minimum))
Example #10
  def testCreateAgentWithPrebuiltPreprocessingLayersDiffAtoms(self):
    dense_layer = tf.keras.layers.Dense(3)
    q_net = KerasLayersNet(
        self._time_step_spec.observation, self._action_spec, dense_layer)
    dense_layer_target = tf.keras.layers.Dense(3)
    q_bad_target_net = KerasLayersNet(
        self._time_step_spec.observation, self._action_spec, dense_layer_target,
        num_atoms=3)
    with self.assertRaisesRegexp(ValueError, 'have different numbers of atoms'):
      categorical_dqn_agent.CategoricalDqnAgent(
          self._time_step_spec,
          self._action_spec,
          categorical_q_network=q_net,
          optimizer=None,
          target_categorical_q_network=q_bad_target_net)
Example #11
    def testCriticLossWithMaskedActions(self):
        # Observations are now a tuple of the usual observation and an action mask.
        observation_spec_with_mask = (self._obs_spec,
                                      tensor_spec.BoundedTensorSpec([2],
                                                                    tf.int32,
                                                                    0, 1))
        time_step_spec = ts.time_step_spec(observation_spec_with_mask)
        dummy_categorical_net = DummyCategoricalNet(self._obs_spec)
        agent = categorical_dqn_agent.CategoricalDqnAgent(
            time_step_spec,
            self._action_spec,
            dummy_categorical_net,
            self._optimizer,
            observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

        # For `observations`, the masks are set up so that only one action is valid
        # for each element in the batch.
        observations = (tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
                        tf.constant([[1, 0], [0, 1]], dtype=tf.int32))
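        # For reference, the splitter passed to the agent above simply separates
        # this tuple: lambda x: (x[0], x[1]) maps `observations` to the pair
        # (batched observation [[1., 2.], [3., 4.]], mask [[1, 0], [0, 1]]).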
        time_steps = ts.restart(observations, batch_size=2)

        actions = tf.constant([0, 1], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

        # For `next_observations`, the masks are set up so the opposite actions as
        # before are valid.
        next_observations = (tf.constant([[5, 6], [7, 8]], dtype=tf.float32),
                             tf.constant([[0, 1], [1, 0]], dtype=tf.int32))
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        experience = test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, next_time_steps)

        # Due to the constant initialization of the DummyCategoricalNet, we can
        # expect the same loss every time. Note this is different from the loss in
        # testCriticLoss above due to previously optimal actions being masked out.
        expected_loss = 5.062895
        loss_info = agent._loss(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        evaluated_loss = self.evaluate(loss_info).loss
        self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
Example #12
    def testUpdateTarget(self):
        agent = categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec, self._action_spec, self._categorical_net,
            self._optimizer)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([0, 1], dtype=tf.int32)
        action_steps = policy_step.PolicyStep(actions)
        experience = test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, time_steps)

        loss_info = agent._loss(experience)
        update_targets = agent._update_target()

        self.evaluate(tf.compat.v1.global_variables_initializer())
        losses = self.evaluate(loss_info).loss
        self.assertGreater(losses, 0.0)
        self.evaluate(update_targets)
Example #13
def load_agents_and_create_videos(
        root_dir,
        env_name='CartPole-v0',
        num_iterations=NUM_ITERATIONS,
        max_ep_steps=1000,
        train_sequence_length=1,
        # Params for QNetwork
        fc_layer_params=(100,),
        # Params for QRnnNetwork
        input_fc_layer_params=(50, ),
        lstm_size=(20, ),
        output_fc_layer_params=(20, ),
        # Params for collect
        initial_collect_steps=10000,
        collect_steps_per_iteration=1,
        epsilon_greedy=0.1,
        replay_buffer_capacity=100000,
        # Params for target update
        target_update_tau=0.05,
        target_update_period=5,
        # Params for train
        train_steps_per_iteration=1,
        batch_size=64,
        learning_rate=1e-3,
        num_atoms=51,
        min_q_value=-20,
        max_q_value=20,
        n_step_update=1,
        gamma=0.99,
        reward_scale_factor=1.0,
        gradient_clipping=None,
        use_tf_functions=True,
        # Params for eval
        num_eval_episodes=10,
        num_random_episodes=1,
        eval_interval=1000,
        # Params for checkpoints
        train_checkpoint_interval=10000,
        policy_checkpoint_interval=5000,
        rb_checkpoint_interval=20000,
        # Params for summaries and logging
        log_interval=1000,
        summary_interval=1000,
        summaries_flush_secs=10,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None,
        random_metrics_callback=None):

    # Define the directories to read from
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    random_dir = os.path.join(root_dir, 'random')

    # Match the writers and metrics used in training
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)

    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)

    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    random_summary_writer = tf.compat.v2.summary.create_file_writer(
        random_dir, flush_millis=summaries_flush_secs * 1000)

    random_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Match the environments used in training
    tf_env = tf_py_environment.TFPyEnvironment(
        suite_gym.load(env_name, max_episode_steps=max_ep_steps))
    eval_py_env = suite_gym.load(env_name, max_episode_steps=max_ep_steps)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Match the agents used in training
    categorical_q_net = categorical_q_network.CategoricalQNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        num_atoms=num_atoms,
        fc_layer_params=fc_layer_params)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    tf_agent = categorical_dqn_agent.CategoricalDqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        categorical_q_network=categorical_q_net,
        optimizer=optimizer,
        min_q_value=min_q_value,
        max_q_value=max_q_value,
        n_step_update=n_step_update,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        train_step_counter=global_step)

    tf_agent.initialize()

    train_metrics = [
        # tf_metrics.NumberOfEpisodes(),
        # tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_steps=collect_steps_per_iteration)

    train_checkpointer = common.Checkpointer(ckpt_dir=train_dir,
                                             agent=tf_agent,
                                             global_step=global_step,
                                             metrics=metric_utils.MetricsGroup(
                                                 train_metrics,
                                                 'train_metrics'))

    policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
        train_dir, 'policy'),
                                              policy=eval_policy,
                                              global_step=global_step)

    rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
        train_dir, 'replay_buffer'),
                                          max_to_keep=1,
                                          replay_buffer=replay_buffer)

    train_checkpointer.initialize_or_restore()
    rb_checkpointer.initialize_or_restore()

    if use_tf_functions:
        # To speed up collect use common.function.
        collect_driver.run = common.function(collect_driver.run)
        tf_agent.train = common.function(tf_agent.train)

    random_policy = random_tf_policy.RandomTFPolicy(
        eval_tf_env.time_step_spec(), eval_tf_env.action_spec())

    # Make movies of the trained agent and a random agent
    date_string = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')

    # Finally, used the saved policy to generate the video
    trained_filename = "trainedC51_" + date_string
    create_policy_eval_video(eval_tf_env, eval_py_env, tf_agent.policy,
                             trained_filename)

    # And, create one with a random agent for comparison
    random_filename = 'random_' + date_string
    create_policy_eval_video(eval_tf_env, eval_py_env, random_policy,
                             random_filename)
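
# A possible invocation of the function above; the root directory is a
# placeholder that must contain the checkpoints written by a training run, and
# create_policy_eval_video is assumed to be defined elsewhere in this script.
if __name__ == '__main__':
    load_agents_and_create_videos(
        root_dir='/tmp/c51_cartpole',
        env_name='CartPole-v0')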
Example #14
def main(argv):
    tf.compat.v1.enable_v2_behavior()
    logging.config.dictConfig({
        'version': 1,
        # Other configs ...
        'disable_existing_loggers': True
    })
    argv = argv[0]

    evaluate = argv.eval

    # Mostly copied from https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
    # Hyperparameters
    num_iterations = argv.num_iterations

    collect_steps_per_iteration = argv.collect_steps_per_iteration
    replay_buffer_max_length = 100000

    batch_size = argv.batch_size
    learning_rate = 2.5e-5
    log_interval = argv.log_interval

    num_atoms = argv.num_atoms
    min_q_value = argv.min_q_value
    max_q_value = argv.max_q_value
    n_step_update = argv.n_step_update
    gamma = 0.99

    num_eval_episodes = 10
    eval_interval = argv.eval_interval

    save_interval = argv.save_interval
    n_parallels = argv.n_parallels
    train_in_browser = argv.train_in_browser
    # Environment
    train_py_env = Env2048(evaluate) if evaluate else ParallelPyEnvironment(
        [lambda: Env2048(train_in_browser)] * n_parallels,
        start_serially=False)
    eval_py_env = Env2048(evaluate)
    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Agent
    fc_layer_params = (64, 64, 32)
    conv_layer_params = ((512, (2, 1), (1, 1)), (512, (1, 2), (1, 1)))
    preprocessing_layers = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(512, (1, 1), (1, 1), padding='same'),
        tf.keras.layers.Conv2D(512, (2, 1), (1, 1), padding='same'),
        tf.keras.layers.Conv2D(512, (1, 2), (1, 1), padding='same'),
        tf.keras.layers.Flatten()
    ])
    preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # q_net = q_network.QNetwork(
    #     train_env.observation_spec(),
    #     train_env.action_spec(),
    #     fc_layer_params=fc_layer_params)
    # agent = dqn_agent.DqnAgent(
    #     train_env.time_step_spec(),
    #     train_env.action_spec(),
    #     q_network=q_net,
    #     optimizer=optimizer,
    #     td_errors_loss_fn=common.element_wise_squared_loss,
    #     train_step_counter=global_step)

    categorical_q_net = categorical_q_network.CategoricalQNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        num_atoms=num_atoms,
        fc_layer_params=fc_layer_params,
        # conv_layer_params=conv_layer_params
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=preprocessing_combiner)
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        categorical_q_network=categorical_q_net,
        optimizer=optimizer,
        min_q_value=min_q_value,
        max_q_value=max_q_value,
        n_step_update=n_step_update,
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        train_step_counter=global_step)
    agent.initialize()

    # Replay buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=replay_buffer_max_length)

    # Data Collection
    collect_driver = dynamic_step_driver.DynamicStepDriver(
        train_env,
        agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=collect_steps_per_iteration)

    collect_driver.run()

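    # num_steps=2 samples adjacent (t, t+1) pairs, which matches a single-step
    # update; with n_step_update > 1 the agent would instead need trajectories
    # of n_step_update + 1 time steps.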
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(3)
    iterator = iter(dataset)

    # Checkpointer
    checkpoint_dir = os.path.join(os.getcwd(), 'checkpoint')
    train_checkpointer = common.Checkpointer(ckpt_dir=checkpoint_dir,
                                             max_to_keep=1,
                                             agent=agent,
                                             policy=agent.policy,
                                             replay_buffer=replay_buffer,
                                             global_step=global_step)

    train_checkpointer.initialize_or_restore()
    global_step = tf.compat.v1.train.get_global_step()

    # Training
    if evaluate:
        avg_return, best_eval_score = compute_avg_return(
            eval_env, agent.policy, num_eval_episodes)
        print(f"Average return: {avg_return}, best score = {best_eval_score}")
        train_env.station.shutdown()
        eval_env.station.shutdown()
    else:
        agent.train = common.function(agent.train)
        # agent.train_step_counter.assign(0)
        avg_return = compute_avg_return(eval_env, agent.policy,
                                        num_eval_episodes)
        returns = [avg_return]
        t = trange(global_step.numpy(), num_iterations, leave=True)
        best_scores = np.array(
            list(map(lambda env: env.best_score, train_env.envs)))
        for _ in t:
            # Collect a few steps using collect_policy and save to the replay buffer.
            collect_driver.run()

            # Sample a batch of data from the buffer and update the agent's network.
            experience, unused_info = next(iterator)
            train_loss = agent.train(experience).loss

            scores = list(map(lambda env: env.score, train_env.envs))
            t.set_description(desc=f"Scores = {scores}")

            step = tf.compat.v1.train.get_global_step().numpy()

            if step % log_interval == 0:
                t.write(f"step = {step}: loss = {train_loss}")

            if step % save_interval == 0:
                train_checkpointer.save(step)

            if step % eval_interval == 0:
                avg_return, best_eval_score = compute_avg_return(
                    eval_env, agent.policy, num_eval_episodes)
                new_best_scores = np.array(
                    list(map(lambda env: env.best_score, train_env.envs)))
                diff = np.subtract(new_best_scores, best_scores)
                best_scores = new_best_scores
                if np.count_nonzero(diff) > 0:
                    t.write(f"step = {step}: Best scores = {best_scores}")
                t.write(
                    f'step = {step}: Average Return = {avg_return}, best score reached in training = '
                    f'{max(list(map(lambda env: env.best_score, train_env.envs)))}'
                    f', best score in eval = {best_eval_score}')
                returns.append(avg_return)
        steps = range(0, num_iterations + 1, eval_interval)
        plt.plot(steps, returns)
        plt.ylabel('Average Return')
        plt.xlabel('Step')

    train_env.close()
    eval_env.close()
    train_py_env.close()
Example #15
  def testCreateAgentDefaultNetwork(self):
    categorical_dqn_agent.CategoricalDqnAgent(
        self._time_step_spec,
        self._action_spec,
        self._categorical_net,
        self._optimizer)
Example #16
    momentum=optimizer_momentum,
    epsilon=optimizer_epsilon,
    centered=True)

# Computes epsilon for epsilon greedy policy given the training step
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=epsilon_decay_steps,
    end_learning_rate=epsilon_final)  # final ε
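
# Rough sanity check of the schedule above (eager mode); epsilon_decay_steps
# and epsilon_final are defined earlier in this script:
#   float(epsilon_fn(0))                   -> ~1.0
#   float(epsilon_fn(epsilon_decay_steps)) -> epsilon_final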

agent = categorical_dqn_agent.CategoricalDqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    categorical_q_network=categorical_q_net,
    optimizer=optimizer,
    min_q_value=int(min_q_value),
    max_q_value=int(max_q_value),
    n_step_update=int(n_step_update),
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=discount_factor,
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step))

agent.initialize()

# Speed up as tensorflow function
agent.train = function(agent.train)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
Example #17
    def __init__(
            self,
            root_dir,
            env_name,
            num_iterations=200,
            max_episode_frames=108000,  # ALE frames
            terminal_on_life_loss=False,
            conv_layer_params=((32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3),
                                                                  1)),
            fc_layer_params=(512, ),
            # Params for collect
            initial_collect_steps=80000,  # ALE frames
            epsilon_greedy=0.01,
            epsilon_decay_period=1000000,  # ALE frames
            replay_buffer_capacity=1000000,
            # Params for train
            train_steps_per_iteration=1000000,  # ALE frames
            update_period=16,  # ALE frames
            target_update_tau=1.0,
            target_update_period=32000,  # ALE frames
            batch_size=32,
            learning_rate=2.5e-4,
            n_step_update=2,
            gamma=0.99,
            reward_scale_factor=1.0,
            gradient_clipping=None,
            # Params for eval
            do_eval=True,
            eval_steps_per_iteration=500000,  # ALE frames
            eval_epsilon_greedy=0.001,
            # Params for checkpoints, summaries, and logging
            log_interval=1000,
            summary_interval=1000,
            summaries_flush_secs=10,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            eval_metrics_callback=None):
        """A simple Atari train and eval for DQN.

    Args:
      root_dir: Directory to write log files to.
      env_name: Fully-qualified name of the Atari environment (e.g. Pong-v0).
      num_iterations: Number of train/eval iterations to run.
      max_episode_frames: Maximum length of a single episode, in ALE frames.
      terminal_on_life_loss: Whether to simulate an episode termination when a
        life is lost.
      conv_layer_params: Params for convolutional layers of QNetwork.
      fc_layer_params: Params for fully connected layers of QNetwork.
      initial_collect_steps: Number of ALE frames to process before beginning
        to train. Since this is in ALE frames, there will be
        initial_collect_steps/4 items in the replay buffer when training starts.
      epsilon_greedy: Final epsilon value to decay to for training.
      epsilon_decay_period: Period over which to decay epsilon, from 1.0 to
        epsilon_greedy (defined above).
      replay_buffer_capacity: Maximum number of items to store in the replay
        buffer.
      train_steps_per_iteration: Number of ALE frames to run through for each
        iteration of training.
      update_period: Run a train operation every update_period ALE frames.
      target_update_tau: Coefficient for soft target network updates (1.0 ==
        hard updates).
      target_update_period: Period, in ALE frames, to copy the live network to
        the target network.
      batch_size: Number of frames to include in each training batch.
      learning_rate: RMS optimizer learning rate.
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Applies standard single-step updates when set to 1.
      gamma: Discount for future rewards.
      reward_scale_factor: Scaling factor for rewards.
      gradient_clipping: Norm length to clip gradients.
      do_eval: If True, run an eval every iteration. If False, skip eval.
      eval_steps_per_iteration: Number of ALE frames to run through for each
        iteration of evaluation.
      eval_epsilon_greedy: Epsilon value to use for the evaluation policy (0 ==
        totally greedy policy).
      log_interval: Log stats to the terminal every log_interval training
        steps.
      summary_interval: Write TF summaries every summary_interval training
        steps.
      summaries_flush_secs: Flush summaries to disk every summaries_flush_secs
        seconds.
      debug_summaries: If True, write additional summaries for debugging (see
        dqn_agent for which summaries are written).
      summarize_grads_and_vars: Include gradients in summaries.
      eval_metrics_callback: A callback function that takes (metric_dict,
        global_step) as parameters. Called after every eval with the results of
        the evaluation.
    """
        self._update_period = update_period / ATARI_FRAME_SKIP
        self._train_steps_per_iteration = (train_steps_per_iteration /
                                           ATARI_FRAME_SKIP)
        self._do_eval = do_eval
        self._eval_steps_per_iteration = eval_steps_per_iteration / ATARI_FRAME_SKIP
        self._eval_epsilon_greedy = eval_epsilon_greedy
        self._initial_collect_steps = initial_collect_steps / ATARI_FRAME_SKIP
        self._summary_interval = summary_interval
        self._num_iterations = num_iterations
        self._log_interval = log_interval
        self._eval_metrics_callback = eval_metrics_callback

        with gin.unlock_config():
            gin.bind_parameter(('tf_agents.environments.atari_preprocessing.'
                                'AtariPreprocessing.terminal_on_life_loss'),
                               terminal_on_life_loss)

        root_dir = os.path.expanduser(root_dir)
        train_dir = os.path.join(root_dir, 'train')
        eval_dir = os.path.join(root_dir, 'eval')

        train_summary_writer = tf.compat.v2.summary.create_file_writer(
            train_dir, flush_millis=summaries_flush_secs * 1000)
        train_summary_writer.set_as_default()
        self._train_summary_writer = train_summary_writer

        self._eval_summary_writer = None
        if self._do_eval:
            self._eval_summary_writer = tf.compat.v2.summary.create_file_writer(
                eval_dir, flush_millis=summaries_flush_secs * 1000)
            self._eval_metrics = [
                py_metrics.AverageReturnMetric(name='PhaseAverageReturn',
                                               buffer_size=np.inf),
                py_metrics.AverageEpisodeLengthMetric(
                    name='PhaseAverageEpisodeLength', buffer_size=np.inf),
            ]

        self._global_step = tf.compat.v1.train.get_or_create_global_step()
        with tf.compat.v2.summary.record_if(lambda: tf.math.equal(
                self._global_step % self._summary_interval, 0)):
            self._env = suite_atari.load(
                env_name,
                max_episode_steps=max_episode_frames / ATARI_FRAME_SKIP,
                gym_env_wrappers=suite_atari.
                DEFAULT_ATARI_GYM_WRAPPERS_WITH_STACKING)
            self._env = batched_py_environment.BatchedPyEnvironment(
                [self._env])

            observation_spec = tensor_spec.from_spec(
                self._env.observation_spec())
            time_step_spec = ts.time_step_spec(observation_spec)
            action_spec = tensor_spec.from_spec(self._env.action_spec())

            with tf.device('/cpu:0'):
                epsilon = tf.compat.v1.train.polynomial_decay(
                    1.0,
                    self._global_step,
                    epsilon_decay_period / ATARI_FRAME_SKIP /
                    self._update_period,
                    end_learning_rate=epsilon_greedy)

            with tf.device('/gpu:0'):
                optimizer = tf.compat.v1.train.RMSPropOptimizer(
                    learning_rate=learning_rate,
                    decay=0.95,
                    momentum=0.0,
                    epsilon=0.00001,
                    centered=True)
                categorical_q_net = AtariCategoricalQNetwork(
                    observation_spec,
                    action_spec,
                    conv_layer_params=conv_layer_params,
                    fc_layer_params=fc_layer_params)
                agent = categorical_dqn_agent.CategoricalDqnAgent(
                    time_step_spec,
                    action_spec,
                    categorical_q_network=categorical_q_net,
                    optimizer=optimizer,
                    epsilon_greedy=epsilon,
                    n_step_update=n_step_update,
                    target_update_tau=target_update_tau,
                    target_update_period=(target_update_period /
                                          ATARI_FRAME_SKIP /
                                          self._update_period),
                    gamma=gamma,
                    reward_scale_factor=reward_scale_factor,
                    gradient_clipping=gradient_clipping,
                    debug_summaries=debug_summaries,
                    summarize_grads_and_vars=summarize_grads_and_vars,
                    train_step_counter=self._global_step)

                self._collect_policy = py_tf_policy.PyTFPolicy(
                    agent.collect_policy)

                if self._do_eval:
                    self._eval_policy = py_tf_policy.PyTFPolicy(
                        epsilon_greedy_policy.EpsilonGreedyPolicy(
                            policy=agent.policy,
                            epsilon=self._eval_epsilon_greedy))

                py_observation_spec = self._env.observation_spec()
                py_time_step_spec = ts.time_step_spec(py_observation_spec)
                py_action_spec = policy_step.PolicyStep(
                    self._env.action_spec())
                data_spec = trajectory.from_transition(py_time_step_spec,
                                                       py_action_spec,
                                                       py_time_step_spec)
                self._replay_buffer = py_hashed_replay_buffer.PyHashedReplayBuffer(
                    data_spec=data_spec, capacity=replay_buffer_capacity)

            with tf.device('/cpu:0'):
                ds = self._replay_buffer.as_dataset(
                    sample_batch_size=batch_size, num_steps=n_step_update + 1)
                ds = ds.prefetch(4)
                ds = ds.apply(
                    tf.data.experimental.prefetch_to_device('/gpu:0'))

            with tf.device('/gpu:0'):
                self._ds_itr = tf.compat.v1.data.make_one_shot_iterator(ds)
                experience = self._ds_itr.get_next()
                self._train_op = agent.train(experience)

                self._env_steps_metric = py_metrics.EnvironmentSteps()
                self._step_metrics = [
                    py_metrics.NumberOfEpisodes(),
                    self._env_steps_metric,
                ]
                self._train_metrics = self._step_metrics + [
                    py_metrics.AverageReturnMetric(buffer_size=10),
                    py_metrics.AverageEpisodeLengthMetric(buffer_size=10),
                ]
                # The _train_phase_metrics average over an entire train iteration,
                # rather than the rolling average of the last 10 episodes.
                self._train_phase_metrics = [
                    py_metrics.AverageReturnMetric(name='PhaseAverageReturn',
                                                   buffer_size=np.inf),
                    py_metrics.AverageEpisodeLengthMetric(
                        name='PhaseAverageEpisodeLength', buffer_size=np.inf),
                ]
                self._iteration_metric = py_metrics.CounterMetric(
                    name='Iteration')

                # Summaries written from python should run every time they are
                # generated.
                with tf.compat.v2.summary.record_if(True):
                    self._steps_per_second_ph = tf.compat.v1.placeholder(
                        tf.float32, shape=(), name='steps_per_sec_ph')
                    self._steps_per_second_summary = tf.compat.v2.summary.scalar(
                        name='global_steps_per_sec',
                        data=self._steps_per_second_ph,
                        step=self._global_step)

                    for metric in self._train_metrics:
                        metric.tf_summaries(train_step=self._global_step,
                                            step_metrics=self._step_metrics)

                    for metric in self._train_phase_metrics:
                        metric.tf_summaries(
                            train_step=self._global_step,
                            step_metrics=(self._iteration_metric, ))
                    self._iteration_metric.tf_summaries(
                        train_step=self._global_step)

                    if self._do_eval:
                        with self._eval_summary_writer.as_default():
                            for metric in self._eval_metrics:
                                metric.tf_summaries(
                                    train_step=self._global_step,
                                    step_metrics=(self._iteration_metric, ))

                self._train_checkpointer = common.Checkpointer(
                    ckpt_dir=train_dir,
                    agent=agent,
                    global_step=self._global_step,
                    optimizer=optimizer,
                    metrics=metric_utils.MetricsGroup(
                        self._train_metrics + self._train_phase_metrics +
                        [self._iteration_metric], 'train_metrics'))
                self._policy_checkpointer = common.Checkpointer(
                    ckpt_dir=os.path.join(train_dir, 'policy'),
                    policy=agent.policy,
                    global_step=self._global_step)
                self._rb_checkpointer = common.Checkpointer(
                    ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
                    max_to_keep=1,
                    replay_buffer=self._replay_buffer)

                self._init_agent_op = agent.initialize()
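
                # For orientation, the ALE-frame arguments above map to agent
                # steps via ATARI_FRAME_SKIP (4 in the TF-Agents Atari
                # examples): update_period=16 -> a train op every 4 agent
                # steps, initial_collect_steps=80000 -> 20000 stored
                # transitions, and target_update_period=32000 ->
                # 32000 / 4 / 4 = 2000 train ops between target updates.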
Example #18
def create_agent(
        agent_class,
        environment,
        fc_layer_params,
        learning_rate,
        decaying_epsilon,
        n_step_update,
        target_update_tau,
        target_update_period,
        gamma,
        reward_scale_factor,
        gradient_clipping,
        debug_summaries,
        summarize_grads_and_vars,
        train_step_counter,
        num_atoms=None,  # Only for categorical_dqn
        min_q_value=None,  # Only for categorical_dqn
        max_q_value=None,  # Only for categorical_dqn
):
    """Creates the Hanabi agent.

	Args:
	  agent_class: str, type of agent to construct.
	  environment: The environment.
	  learning_rate: The Learning Rate
	  decaying_epsilon: Epsilon for Epsilon Greedy Policy
	  target_update_tau: Agent parameter
	  target_update_period: Agent parameter
	  gamma: Agent parameter
	  reward_scale_factor: Agent parameter
	  gradient_clipping: Agent parameter
	  debug_summaries: Agent parameter
	  summarize_grads_and_vars: Agent parameter
	  train_step_counter: The train step tf.Variable to be passed to agent


	Returns:
	  An agent for playing Hanabi.

	Raises:
	  ValueError: if an unknown agent type is requested.
	"""
    if agent_class == 'DQN':
        return dqn_agent.DqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'DDQN':
        return dqn_agent.DdqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            q_network=q_network.QNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    elif agent_class == 'categorical_dqn':
        return categorical_dqn_agent.CategoricalDqnAgent(
            environment.time_step_spec(),
            environment.action_spec(),
            categorical_q_network=categorical_q_network.CategoricalQNetwork(
                environment.time_step_spec().observation['observations'],
                environment.action_spec(),
                num_atoms=num_atoms,
                fc_layer_params=fc_layer_params),
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            observation_and_action_constraint_splitter=
            observation_and_action_constraint_splitter,
            epsilon_greedy=decaying_epsilon,
            n_step_update=n_step_update,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            min_q_value=min_q_value,
            max_q_value=max_q_value,
            td_errors_loss_fn=common.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter)
    else:
        raise ValueError(
            'Expected valid agent_type, got {}'.format(agent_class))
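
# Hypothetical usage of the categorical_dqn branch; the `environment` object
# and the hyperparameter values below are placeholders, not from this script:
#   agent = create_agent(
#       agent_class='categorical_dqn',
#       environment=environment,
#       fc_layer_params=(512, 512),
#       learning_rate=1e-3,
#       decaying_epsilon=0.02,
#       n_step_update=1,
#       target_update_tau=1.0,
#       target_update_period=500,
#       gamma=0.99,
#       reward_scale_factor=1.0,
#       gradient_clipping=None,
#       debug_summaries=False,
#       summarize_grads_and_vars=False,
#       train_step_counter=tf.Variable(0, dtype=tf.int64),
#       num_atoms=51,
#       min_q_value=-25,
#       max_q_value=25)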
Example #19
def main(arg, pars):
    """Trains or evaluates a categorical DQN agent on the Car-v0 environment."""
    print("load env ..")
    env_name = "Car-v0"
    # env = gym.make("Car-v0")
    env = suite_gym.load(env_name,
                         discount=arg.gamma,
                         max_episode_steps=arg.max_t)
    print_parameter(arg, pars)
    train_py_env = suite_gym.load(env_name,
                                  discount=arg.gamma,
                                  max_episode_steps=arg.max_t)
    eval_py_env = suite_gym.load(env_name,
                                 discount=arg.gamma,
                                 max_episode_steps=arg.max_t)
    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
    print("env loaded")
    train_dir = os.path.join(arg.root_dir, 'network_weights')
    eval_dir = os.path.join(arg.root_dir, 'eval')

    train_env.reset()
    fc_layer_params = (arg.hidden_size_1, )

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=arg.lr)
    train_step_counter = tf.compat.v2.Variable(0)
    categorical_q_net = CategoricalQNetwork(train_env.observation_spec(),
                                            train_env.action_spec(),
                                            fc_layer_params=fc_layer_params)

    tf_agent = categorical_dqn_agent.CategoricalDqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        categorical_q_network=categorical_q_net,
        optimizer=optimizer,
        epsilon_greedy=arg.eps_start)

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()

    train_checkpointer = common.Checkpointer(ckpt_dir=train_dir,
                                             agent=tf_agent,
                                             global_step=global_step,
                                             metrics=metric_utils.MetricsGroup(
                                                 train_metrics,
                                                 'train_metrics'))

    if not arg.continue_training:
        tf_agent.initialize()
        # Start training from scratch: remove any previously saved weights.
        if os.path.isdir("network_weights"):
            for weight_file in os.listdir("network_weights"):
                os.remove(os.path.join("network_weights", weight_file))
    else:
        print("Continue Training")
        train_checkpointer.initialize_or_restore()
    print("ready to go")
    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy
    random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                    train_env.action_spec())
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=arg.buffer_size)
    tf_agent.collect_data_spec
    tf_agent.collect_data_spec._fields
    collect_data(train_env,
                 random_policy,
                 replay_buffer,
                 steps=arg.learn_start,
                 max_t=40)
    print("create dataset")
    dataset = replay_buffer.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=arg.batch_size,
                                       num_steps=2).prefetch(3)
    iterator = iter(dataset)

    # (Optional) Optimize by wrapping some of the code in a graph using TF function.
    tf_agent.train = common.function(tf_agent.train)
    # Reset the train step
    tf_agent.train_step_counter.assign(0)
    avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                    arg.num_eval_episodes)
    returns = [avg_return]
    returns_average = [avg_return]
    train_loss_average = [1]
    score = 0
    scores_window = deque(maxlen=100)  # last 100 scores
    total_train_loss = deque(maxlen=100)  # last 100 training losses

    train(arg, tf_agent, train_env, eval_env, replay_buffer, iterator,
          train_checkpointer)
Example #20
categorical_q_net = categorical_q_network.CategoricalQNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    num_atoms=num_atoms,
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

agent = categorical_dqn_agent.CategoricalDqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    categorical_q_network=categorical_q_net,
    optimizer=optimizer,
    min_q_value=min_q_value,
    max_q_value=max_q_value,
    n_step_update=n_step_update,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=gamma,
    train_step_counter=train_step_counter)
agent.initialize()
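
# For reference, C51 represents the return distribution with num_atoms evenly
# spaced atoms on [min_q_value, max_q_value]; the agent builds this support
# internally, roughly:
#   support = tf.linspace(float(min_q_value), float(max_q_value), num_atoms)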

# setup the policies
eval_policy = agent.policy  # The main policy that is used for evaluation and deployment
collect_policy = agent.collect_policy  # A second policy that is used for data collection
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())

# Data Collection