Example #1
    def testActorLoss(self, num_bc_steps, expected_loss):
        agent = cql_sac_agent.CqlSacAgent(self._time_step_spec,
                                          self._action_spec,
                                          critic_network=DummyCriticNet(),
                                          actor_network=DummyActorNet(
                                              self._obs_spec,
                                              self._action_spec),
                                          actor_optimizer=None,
                                          critic_optimizer=None,
                                          alpha_optimizer=None,
                                          cql_alpha=1.0,
                                          num_cql_samples=1,
                                          include_critic_entropy_term=False,
                                          use_lagrange_cql_alpha=False,
                                          num_bc_steps=num_bc_steps,
                                          actor_policy_ctor=DummyActorPolicy)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([[5], [6]], dtype=tf.float32)

        loss = agent.actor_loss(time_steps, actions)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
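The extra arguments in the signature of testActorLoss (num_bc_steps, expected_loss) indicate that the case is driven by a parameterized-test decorator that this snippet does not show, and DummyActorNet, DummyCriticNet, and DummyActorPolicy are test doubles defined elsewhere in the test file. A minimal sketch of how such a case might be wired up, assuming absl's parameterized library; the decorator values are hypothetical placeholders, not the real expected losses:

from absl.testing import parameterized


class CqlSacAgentTest(parameterized.TestCase):

    @parameterized.parameters(
        (0, 1.0),    # (num_bc_steps, expected_loss) -- placeholder values
        (10, 2.0),
    )
    def testActorLoss(self, num_bc_steps, expected_loss):
        ...  # body as in the example above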
Example #2
    def testAgentTransitionTrain(self):
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self._obs_spec,
            self._action_spec,
            fc_layer_params=(10, ),
            continuous_projection_net=tanh_normal_projection_network.
            TanhNormalProjectionNetwork)

        agent = cql_sac_agent.CqlSacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=DummyCriticNet(),
            actor_network=actor_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            cql_alpha=5.0,
            num_cql_samples=1,
            include_critic_entropy_term=False,
            use_lagrange_cql_alpha=False)

        time_step_spec = self._time_step_spec._replace(
            reward=tensor_spec.BoundedTensorSpec(
                [], tf.float32, minimum=0.0, maximum=1.0, name='reward'))

        transition_spec = trajectory.Transition(
            time_step=time_step_spec,
            action_step=policy_step.PolicyStep(action=self._action_spec,
                                               state=(),
                                               info=()),
            next_time_step=time_step_spec)

        sample_trajectory_experience = tensor_spec.sample_spec_nest(
            transition_spec, outer_dims=(3, ))
        agent.train(sample_trajectory_experience)
Example #3
    def testCqlLoss(self, cql_alpha, num_cql_samples, expected_loss):
        agent = cql_sac_agent.CqlSacAgent(self._time_step_spec,
                                          self._action_spec,
                                          critic_network=DummyCriticNet(),
                                          actor_network=None,
                                          actor_optimizer=None,
                                          critic_optimizer=None,
                                          alpha_optimizer=None,
                                          cql_alpha=cql_alpha,
                                          num_cql_samples=num_cql_samples,
                                          include_critic_entropy_term=False,
                                          use_lagrange_cql_alpha=False,
                                          random_seed=self._random_seed,
                                          actor_policy_ctor=DummyActorPolicy)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([[5], [6]], dtype=tf.float32)

        loss = agent._cql_loss(time_steps, actions,
                               training=False) * agent._get_cql_alpha()

        self.initialize_v1_variables()
        loss_ = self.evaluate(loss)

        self.assertAllClose(loss_, expected_loss)
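For context, the quantity evaluated here is the conservative (CQL) penalty scaled by cql_alpha. A conceptual sketch of that penalty, based on the CQL paper rather than this agent's source: it contrasts a temperature-scaled logsumexp of Q-values over sampled actions with the Q-values of the dataset actions.

import tensorflow as tf


def cql_penalty(q_sampled, q_data, temperature=1.0):
    """Illustrative CQL regularizer (an assumption, not the agent's exact code).

    q_sampled: [batch, num_cql_samples] Q-values for sampled actions.
    q_data: [batch] Q-values for the actions stored in the dataset.
    """
    logsumexp = temperature * tf.reduce_logsumexp(
        q_sampled / temperature, axis=-1)
    return tf.reduce_mean(logsumexp - q_data)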
Example #4
    def testCriticLoss(self, include_critic_entropy_term,
                       reward_noise_variance, use_tf_variable, td_targets):
        if use_tf_variable:
            reward_noise_variance = tf.Variable(reward_noise_variance)
        agent = cql_sac_agent.CqlSacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=DummyCriticNet(),
            actor_network=None,
            actor_optimizer=None,
            critic_optimizer=None,
            alpha_optimizer=None,
            cql_alpha=1.0,
            num_cql_samples=1,
            include_critic_entropy_term=include_critic_entropy_term,
            use_lagrange_cql_alpha=False,
            reward_noise_variance=reward_noise_variance,
            actor_policy_ctor=DummyActorPolicy)

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([[5], [6]], dtype=tf.float32)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        pred_td_targets = [7., 10.]
        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Expected critic loss has a factor of 2, for the two TD3-style critics.
        expected_loss = self.evaluate(
            2 * tf.compat.v1.losses.mean_squared_error(
                tf.constant(td_targets), tf.constant(pred_td_targets)))

        loss = agent._critic_loss_with_optional_entropy_term(
            time_steps,
            actions,
            next_time_steps,
            td_errors_loss_fn=tf.math.squared_difference)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_ = self.evaluate(loss)
        self.assertAllClose(loss_, expected_loss)
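The td_targets fed into this test are precomputed, but they follow the standard SAC-style target: reward plus the discounted minimum of the two target critics, optionally minus the entropy term when include_critic_entropy_term is enabled. A small sketch of that formula as an illustration, not the agent's exact implementation:

import tensorflow as tf


def sac_td_targets(rewards, discounts, q1_target, q2_target,
                   next_log_pis=None, entropy_alpha=1.0, gamma=1.0):
    """Illustrative SAC-style TD targets (assumed formula)."""
    target_q = tf.minimum(q1_target, q2_target)
    if next_log_pis is not None:
        # Entropy term, used only when include_critic_entropy_term=True.
        target_q -= entropy_alpha * next_log_pis
    return rewards + gamma * discounts * target_q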
Example #5
    def testTrainWithLagrange(
            self, use_lagrange_cql_alpha, use_variable_for_cql_alpha,
            log_cql_alpha_clipping, expected_cql_alpha_step_one,
            expected_cql_alpha_step_two, expected_cql_loss_step_one,
            expected_cql_loss_step_two):
        if use_variable_for_cql_alpha:
            cql_alpha = tf.Variable(5.0)
            cql_alpha_var = cql_alpha  # Getting around type checking.
        else:
            cql_alpha = 5.0
        cql_alpha_learning_rate = 0.5
        cql_tau = 10
        num_cql_samples = 5

        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self._obs_spec, self._action_spec, fc_layer_params=None)
        critic_net = critic_network.CriticNetwork(
            (self._obs_spec, self._action_spec),
            observation_fc_layer_params=(16, ),
            action_fc_layer_params=(16, ),
            joint_fc_layer_params=(16, ),
            kernel_initializer='glorot_uniform',
            last_kernel_initializer='glorot_uniform')

        counter = common.create_variable('test_train_counter')
        optimizer_fn = tf.compat.v1.train.AdamOptimizer
        agent = cql_sac_agent.CqlSacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=critic_net,
            actor_network=actor_net,
            actor_optimizer=optimizer_fn(1e-3),
            critic_optimizer=optimizer_fn(1e-3),
            alpha_optimizer=optimizer_fn(1e-3),
            cql_alpha=cql_alpha,
            num_cql_samples=num_cql_samples,
            include_critic_entropy_term=False,
            use_lagrange_cql_alpha=use_lagrange_cql_alpha,
            cql_alpha_learning_rate=cql_alpha_learning_rate,
            cql_tau=cql_tau,
            random_seed=self._random_seed,
            log_cql_alpha_clipping=log_cql_alpha_clipping,
            train_step_counter=counter)

        batch_size = 5
        observations = tf.constant([[[1, 2], [3, 4]]] * batch_size,
                                   dtype=tf.float32)
        actions = tf.constant([[[0], [1]]] * batch_size, dtype=tf.float32)
        time_steps = ts.TimeStep(step_type=tf.constant([[1] * 2] * batch_size,
                                                       dtype=tf.int32),
                                 reward=tf.constant([[1] * 2] * batch_size,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 2] * batch_size,
                                                      dtype=tf.float32),
                                 observation=observations)

        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, (), time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)

        # Force variable creation.
        agent.policy.variables()

        if not tf.executing_eagerly():
            # Get experience first to make sure optimizer variables are created and
            # can be initialized.
            experience = agent.train(experience)
            with self.cached_session() as sess:
                common.initialize_uninitialized_variables(sess)
            self.assertEqual(self.evaluate(counter), 0)
            self.evaluate(experience)
            self.assertEqual(self.evaluate(counter), 1)
        else:
            # Training step one.
            self.assertEqual(self.evaluate(counter), 0)
            loss = self.evaluate(agent.train(experience))
            self.assertEqual(self.evaluate(counter), 1)
            self.assertAllClose(loss.extra.cql_loss,
                                expected_cql_loss_step_one)
            self.assertAllClose(loss.extra.cql_alpha,
                                expected_cql_alpha_step_one)
            if use_lagrange_cql_alpha:
                self.assertGreater(loss.extra.cql_alpha_loss, 0)
            else:
                self.assertEqual(loss.extra.cql_alpha_loss, 0)

            # Training step two.
            if use_variable_for_cql_alpha:
                cql_alpha_var.assign_add(1)
            loss = self.evaluate(agent.train(experience))
            self.assertEqual(self.evaluate(counter), 2)
            self.assertAllClose(loss.extra.cql_loss,
                                expected_cql_loss_step_two)
            # GPU (V100) needs slightly increased tolerance to pass.
            if tf.test.is_gpu_available():
                self.assertAllClose(loss.extra.cql_alpha,
                                    expected_cql_alpha_step_two,
                                    atol=4.5e-5,
                                    rtol=1.5e-5)
            else:
                self.assertAllClose(loss.extra.cql_alpha,
                                    expected_cql_alpha_step_two)
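When use_lagrange_cql_alpha is True, cql_alpha is no longer a fixed constant: it is adapted so that the CQL penalty is pushed toward the budget cql_tau, which is why this test expects a positive cql_alpha_loss in that branch. A conceptual sketch of such a dual update, an assumption based on the CQL paper rather than this agent's source:

import tensorflow as tf

log_cql_alpha = tf.Variable(tf.math.log(5.0))  # hypothetical initial value
alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=0.5)


def lagrange_cql_alpha_step(cql_penalty, cql_tau=10.0):
    """One illustrative dual step on cql_alpha."""
    with tf.GradientTape() as tape:
        cql_alpha = tf.exp(log_cql_alpha)
        # Descending this loss raises cql_alpha when the penalty exceeds
        # cql_tau and lowers it otherwise.
        cql_alpha_loss = -cql_alpha * (tf.stop_gradient(cql_penalty) - cql_tau)
    grads = tape.gradient(cql_alpha_loss, [log_cql_alpha])
    alpha_optimizer.apply_gradients(zip(grads, [log_cql_alpha]))
    return tf.exp(log_cql_alpha)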
Example #6
    def testTrainWithRnn(self, cql_alpha, num_cql_samples,
                         include_critic_entropy_term, use_lagrange_cql_alpha,
                         expected_loss):
        actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
            self._obs_spec,
            self._action_spec,
            input_fc_layer_params=None,
            output_fc_layer_params=None,
            conv_layer_params=None,
            lstm_size=(40, ),
        )

        critic_net = critic_rnn_network.CriticRnnNetwork(
            (self._obs_spec, self._action_spec),
            observation_fc_layer_params=(16, ),
            action_fc_layer_params=(16, ),
            joint_fc_layer_params=(16, ),
            lstm_size=(16, ),
            output_fc_layer_params=None,
        )

        counter = common.create_variable('test_train_counter')

        optimizer_fn = tf.compat.v1.train.AdamOptimizer

        agent = cql_sac_agent.CqlSacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=critic_net,
            actor_network=actor_net,
            actor_optimizer=optimizer_fn(1e-3),
            critic_optimizer=optimizer_fn(1e-3),
            alpha_optimizer=optimizer_fn(1e-3),
            cql_alpha=cql_alpha,
            num_cql_samples=num_cql_samples,
            include_critic_entropy_term=include_critic_entropy_term,
            use_lagrange_cql_alpha=use_lagrange_cql_alpha,
            random_seed=self._random_seed,
            train_step_counter=counter,
        )

        batch_size = 5
        observations = tf.constant([[[1, 2], [3, 4], [5, 6]]] * batch_size,
                                   dtype=tf.float32)
        actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.float32)
        time_steps = ts.TimeStep(step_type=tf.constant([[1] * 3] * batch_size,
                                                       dtype=tf.int32),
                                 reward=tf.constant([[1] * 3] * batch_size,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 3] * batch_size,
                                                      dtype=tf.float32),
                                 observation=observations)

        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, (), time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)

        # Force variable creation.
        agent.policy.variables()

        if not tf.executing_eagerly():
            # Get experience first to make sure optimizer variables are created and
            # can be initialized.
            experience = agent.train(experience)
            with self.cached_session() as sess:
                common.initialize_uninitialized_variables(sess)
            self.assertEqual(self.evaluate(counter), 0)
            self.evaluate(experience)
            self.assertEqual(self.evaluate(counter), 1)
        else:
            self.assertEqual(self.evaluate(counter), 0)
            loss = self.evaluate(agent.train(experience))
            self.assertAllClose(loss.loss, expected_loss)
            self.assertEqual(self.evaluate(counter), 1)
Example #7
def train_eval(
        root_dir,
        dataset_path,
        env_name,
        # Training params
        tpu=False,
        use_gpu=False,
        num_gradient_updates=1000000,
        actor_fc_layers=(256, 256),
        critic_joint_fc_layers=(256, 256, 256),
        # Agent params
        batch_size=256,
        bc_steps=0,
        actor_learning_rate=3e-5,
        critic_learning_rate=3e-4,
        alpha_learning_rate=3e-4,
        reward_scale_factor=1.0,
        cql_alpha_learning_rate=3e-4,
        cql_alpha=5.0,
        cql_tau=10.0,
        num_cql_samples=10,
        reward_noise_variance=0.0,
        include_critic_entropy_term=False,
        use_lagrange_cql_alpha=True,
        log_cql_alpha_clipping=None,
        softmax_temperature=1.0,
        # Data params
        reward_shift=0.0,
        action_clipping=None,
        use_trajectories=False,
        data_shuffle_buffer_size_per_record=1,
        data_shuffle_buffer_size=100,
        data_num_shards=1,
        data_block_length=10,
        data_parallel_reads=None,
        data_parallel_calls=10,
        data_prefetch=10,
        data_cycle_length=10,
        # Others
        policy_save_interval=10000,
        eval_interval=10000,
        summary_interval=1000,
        learner_iterations_per_call=1,
        eval_episodes=10,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        seed=None):
    """Trains and evaluates CQL-SAC."""
    logging.info('Training CQL-SAC on: %s', env_name)
    tf.random.set_seed(seed)
    np.random.seed(seed)

    # Load environment.
    env = load_d4rl(env_name)
    tf_env = tf_py_environment.TFPyEnvironment(env)
    strategy = strategy_utils.get_strategy(tpu, use_gpu)

    if not dataset_path.endswith('.tfrecord'):
        dataset_path = os.path.join(dataset_path, env_name,
                                    '%s*.tfrecord' % env_name)
    logging.info('Loading dataset from %s', dataset_path)
    dataset_paths = tf.io.gfile.glob(dataset_path)

    # Create dataset.
    with strategy.scope():
        dataset = create_tf_record_dataset(
            dataset_paths,
            batch_size,
            shuffle_buffer_size_per_record=data_shuffle_buffer_size_per_record,
            shuffle_buffer_size=data_shuffle_buffer_size,
            num_shards=data_num_shards,
            cycle_length=data_cycle_length,
            block_length=data_block_length,
            num_parallel_reads=data_parallel_reads,
            num_parallel_calls=data_parallel_calls,
            num_prefetch=data_prefetch,
            strategy=strategy,
            reward_shift=reward_shift,
            action_clipping=action_clipping,
            use_trajectories=use_trajectories)

    # Create agent.
    time_step_spec = tf_env.time_step_spec()
    observation_spec = time_step_spec.observation
    action_spec = tf_env.action_spec()
    with strategy.scope():
        train_step = train_utils.create_train_step()

        actor_net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=actor_fc_layers,
            continuous_projection_net=tanh_normal_projection_network.
            TanhNormalProjectionNetwork)

        critic_net = critic_network.CriticNetwork(
            (observation_spec, action_spec),
            joint_fc_layer_params=critic_joint_fc_layers,
            kernel_initializer='glorot_uniform',
            last_kernel_initializer='glorot_uniform')

        agent = cql_sac_agent.CqlSacAgent(
            time_step_spec,
            action_spec,
            actor_network=actor_net,
            critic_network=critic_net,
            actor_optimizer=tf.keras.optimizers.Adam(
                learning_rate=actor_learning_rate),
            critic_optimizer=tf.keras.optimizers.Adam(
                learning_rate=critic_learning_rate),
            alpha_optimizer=tf.keras.optimizers.Adam(
                learning_rate=alpha_learning_rate),
            cql_alpha=cql_alpha,
            num_cql_samples=num_cql_samples,
            include_critic_entropy_term=include_critic_entropy_term,
            use_lagrange_cql_alpha=use_lagrange_cql_alpha,
            cql_alpha_learning_rate=cql_alpha_learning_rate,
            target_update_tau=5e-3,
            target_update_period=1,
            random_seed=seed,
            cql_tau=cql_tau,
            reward_noise_variance=reward_noise_variance,
            num_bc_steps=bc_steps,
            td_errors_loss_fn=tf.math.squared_difference,
            gamma=0.99,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=None,
            log_cql_alpha_clipping=log_cql_alpha_clipping,
            softmax_temperature=softmax_temperature,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step)
        agent.initialize()

    # Create learner.
    saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
    collect_env_step_metric = py_metrics.EnvironmentSteps()
    learning_triggers = [
        triggers.PolicySavedModelTrigger(saved_model_dir,
                                         agent,
                                         train_step,
                                         interval=policy_save_interval,
                                         metadata_metrics={
                                             triggers.ENV_STEP_METADATA_KEY:
                                             collect_env_step_metric
                                         }),
        triggers.StepPerSecondLogTrigger(train_step, interval=100)
    ]
    cql_learner = learner.Learner(root_dir,
                                  train_step,
                                  agent,
                                  experience_dataset_fn=lambda: dataset,
                                  triggers=learning_triggers,
                                  summary_interval=summary_interval,
                                  strategy=strategy)

    # Create actor for evaluation.
    tf_greedy_policy = greedy_policy.GreedyPolicy(agent.policy)
    eval_greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
        tf_greedy_policy, use_tf_function=True)
    eval_actor = actor.Actor(env,
                             eval_greedy_policy,
                             train_step,
                             metrics=actor.eval_metrics(eval_episodes),
                             summary_dir=os.path.join(root_dir, 'eval'),
                             episodes_per_run=eval_episodes)

    # Run.
    dummy_trajectory = trajectory.mid((), (), (), 0., 1.)
    num_learner_iterations = int(num_gradient_updates /
                                 learner_iterations_per_call)
    for _ in range(num_learner_iterations):
        # Mimic collecting environment steps since we loaded a static dataset.
        for _ in range(learner_iterations_per_call):
            collect_env_step_metric(dummy_trajectory)

        cql_learner.run(iterations=learner_iterations_per_call)
        if eval_interval and train_step.numpy() % eval_interval == 0:
            eval_actor.run_and_log()
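A minimal sketch of invoking train_eval directly; the directories and the D4RL task name are hypothetical placeholders, and a real run would typically wire these up through absl flags or gin bindings instead:

if __name__ == '__main__':
    train_eval(
        root_dir='/tmp/cql_sac/halfcheetah',   # hypothetical output directory
        dataset_path='/tmp/d4rl_tfrecords',    # hypothetical TFRecord location
        env_name='halfcheetah-medium-v0',      # example D4RL task name
        num_gradient_updates=1000,
        eval_interval=500)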