def test_alf_metrics(self, num_envs, learn_queue_cap, unroll_length,
                         actor_queue_cap, num_actors, num_iterations):
        episode_length = 5
        env_f = lambda: TFPyEnvironment(
            ValueUnittestEnv(batch_size=1, episode_length=episode_length))

        envs = [env_f() for _ in range(num_envs)]
        common.set_global_env(envs[0])
        alg = _create_ac_algorithm()
        driver = AsyncOffPolicyDriver(envs, alg, num_actors, unroll_length,
                                      learn_queue_cap, actor_queue_cap)
        driver.start()
        total_num_steps_ = 0
        for _ in range(num_iterations):
            total_num_steps_ += driver.run_async()
        driver.stop()

        total_num_steps = int(driver.get_metrics()[1].result())
        self.assertGreaterEqual(total_num_steps_, total_num_steps)

        # An exp is only put in the log queue after it's put in the learning queue
        # So when we stop the driver (which will force all queues to stop),
        # some exps might be missing from the metric. Here we assert an arbitrary
        # lower bound of 2/5. The upper bound is due to the fact that StepType.LAST
        # is not recorded by the metric (episode_length==5).
        self.assertLessEqual(total_num_steps, int(total_num_steps_ * 4 // 5))
        self.assertGreaterEqual(total_num_steps,
                                int(total_num_steps_ * 2 // 5))

        average_reward = int(driver.get_metrics()[2].result())
        self.assertEqual(average_reward, episode_length - 1)

        episode_length = int(driver.get_metrics()[3].result())
        self.assertEqual(episode_length, episode_length)
    def test_alf_metrics(self, num_envs, learn_queue_cap, unroll_length,
                         actor_queue_cap, num_actors, num_iterations):
        episode_length = 5
        env_f = lambda: TFPyEnvironment(
            ValueUnittestEnv(batch_size=1, episode_length=episode_length))
        alg = _create_ac_algorithm(env_f())
        driver = AsyncOffPolicyDriver(env_f, alg, num_envs, num_actors,
                                      unroll_length, learn_queue_cap,
                                      actor_queue_cap)
        driver.start()
        total_num_steps_ = 0
        for _ in range(num_iterations):
            total_num_steps_ += driver.run_async()
        driver.stop()

        total_num_steps = int(driver.get_metrics()[1].result())
        self.assertGreaterEqual(total_num_steps_, total_num_steps)
        self.assertGreaterEqual(
            total_num_steps,  # multiply by 2/3 because 1/3 of steps are StepType.LAST
            total_num_steps_ * 2 // 3)
        average_reward = int(driver.get_metrics()[2].result())
        self.assertEqual(average_reward, episode_length - 1)
        episode_length = int(driver.get_metrics()[3].result())
        self.assertEqual(episode_length, episode_length)