def test_max_seconds():
    """A task's max_seconds wall-clock budget limits which episodes are scored."""
    bench = registration.Benchmark(
        id='TestBenchmark-v0',
        scorer=scoring.ClipTo01ThenAverage(num_episodes=2),
        tasks=[{
            'env_id': 'CartPole-v0',
            'trials': 1,
            'max_seconds': 1,
        }],
    )

    # Both episodes complete in time: score averages rewards 0 and 100.
    result = _benchmark_result_helper(
        bench,
        data_sources=[0, 0],
        episode_lengths=[100, 100],
        episode_rewards=[0, 100],
        episode_types=['t', 't'],
        timestamps=[1.5, 2])
    assert _is_close(result['scores'][0], 0.5), "benchmark_result={}".format(result)

    # make sure we only include the first result because of wall clock time
    result = _benchmark_result_helper(
        bench,
        data_sources=[0, 0],
        episode_lengths=[100, 100],
        episode_rewards=[0, 100],
        episode_types=['t', 't'],
        timestamps=[2, 100])
    assert _is_close(result['scores'][0], 0.0), "benchmark_result={}".format(result)
def test_clip_average_max_timesteps():
    """A task's max_timesteps budget limits which episodes are scored."""
    bench = registration.Benchmark(
        id='TestBenchmark-v0',
        scorer=scoring.ClipTo01ThenAverage(num_episodes=2),
        tasks=[{
            'env_id': 'CartPole-v0',
            'trials': 1,
            'max_timesteps': 2,
        }],
    )

    # Two single-step episodes fit inside the timestep budget.
    result = _benchmark_result_helper(
        bench,
        data_sources=[0, 0],
        episode_lengths=[1, 1],
        episode_rewards=[1, 1],
        episode_types=['t', 't'],
        timestamps=[2, 3])
    _assert_benchmark_result(result, score=0.01)

    # make sure we only include the first result because of timesteps
    result = _benchmark_result_helper(
        bench,
        data_sources=[0, 0, 0],
        episode_lengths=[1, 100, 100],
        episode_rewards=[1, 100, 100],
        episode_types=['t', 't', 't'],
        timestamps=[2, 102, 202])
    _assert_benchmark_result(result, score=0.005, solves=False)
def test_max_timesteps():
    """Episodes beyond the max_timesteps budget are excluded from scoring."""
    bench = registration.Benchmark(
        id='TestBenchmark-v0',
        scorer=scoring.ClipTo01ThenAverage(num_episodes=2),
        tasks=[{
            'env_id': 'CartPole-v0',
            'trials': 1,
            'max_timesteps': 2,
        }],
    )

    # Both one-step episodes fit within the budget.
    result = _benchmark_result_helper(
        bench,
        data_sources=[0, 0],
        episode_lengths=[1, 1],
        episode_rewards=[1, 1],
        episode_types=['t', 't'],
        timestamps=[2, 3])
    assert _is_close(result['scores'][0], 0.01), "benchmark_result={}".format(result)

    # make sure we only include the first result because of timesteps
    result = _benchmark_result_helper(
        bench,
        data_sources=[0, 0],
        episode_lengths=[1, 100],
        episode_rewards=[1, 100],
        episode_types=['t', 't'],
        timestamps=[2, 102])
    assert _is_close(result['scores'][0], 0.005), "benchmark_result={}".format(result)
    assert not np.any(result['solves'][0]), "benchmark_result={}".format(result)
def test():
    """End-to-end: record CartPole rollouts under a Monitor, then score them."""
    bench = registration.Benchmark(
        id='MyBenchmark-v0',
        scorer=scoring.ClipTo01ThenAverage(),
        tasks=[
            {'env_id': 'CartPole-v0', 'trials': 1, 'max_timesteps': 5},
            {'env_id': 'CartPole-v0', 'trials': 1, 'max_timesteps': 100},
        ])

    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env = wrappers.Monitor(directory=temp, video_callable=False)(env)
        env.seed(0)

        # One evaluation episode, two training episodes, then a "good" evaluation.
        env.set_monitor_mode('evaluation')
        rollout(env)

        env.set_monitor_mode('training')
        for _ in range(2):
            rollout(env)

        env.set_monitor_mode('evaluation')
        rollout(env, good=True)

        env.close()

        results = monitoring.load_results(temp)
        evaluation_score = bench.score_evaluation(
            'CartPole-v0',
            results['data_sources'],
            results['initial_reset_timestamps'],
            results['episode_lengths'],
            results['episode_rewards'],
            results['episode_types'],
            results['timestamps'])
        benchmark_score = bench.score_benchmark({
            'CartPole-v0': evaluation_score['scores'],
        })

        assert np.all(np.isclose(
            evaluation_score['scores'],
            [0.00089999999999999998, 0.0054000000000000003]
        )), "evaluation_score={}".format(evaluation_score)
        assert np.isclose(benchmark_score, 0.00315), \
            "benchmark_score={}".format(benchmark_score)
def test():
    """End-to-end scoring via the legacy env.monitor API (task_groups form)."""
    bench = registration.Benchmark(
        id='MyBenchmark-v0',
        scorer=scoring.ClipTo01ThenAverage(),
        task_groups={
            'CartPole-v0': [
                {'seeds': 1, 'timesteps': 5},
                {'seeds': 1, 'timesteps': 100},
            ],
        })

    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        env.monitor.start(temp, video_callable=False, seed=0)

        # One evaluation episode, two training episodes, then a "good" evaluation.
        env.monitor.configure(mode='evaluation')
        rollout(env)

        env.monitor.configure(mode='training')
        for _ in range(2):
            rollout(env)

        env.monitor.configure(mode='evaluation')
        rollout(env, good=True)

        env.monitor.close()

        results = monitoring.load_results(temp)
        evaluation_score = bench.score_evaluation(
            'CartPole-v0',
            results['episode_lengths'],
            results['episode_rewards'],
            results['episode_types'],
            results['timestamps'],
            results['initial_reset_timestamp'])
        benchmark_score = bench.score_benchmark({
            'CartPole-v0': evaluation_score['scores'],
        })

        assert np.all(np.isclose(
            evaluation_score['scores'],
            [0.00089999999999999998, 0.0054000000000000003]
        )), "evaluation_score={}".format(evaluation_score)
        assert np.isclose(benchmark_score, 0.00315), \
            "benchmark_score={}".format(benchmark_score)
def test_clip_average_evaluation_scoring():
    """ClipTo01ThenAverage produces the expected score for simple evaluations."""
    bench = registration.Benchmark(
        id='TestBenchmark-v0',
        scorer=scoring.ClipTo01ThenAverage(num_episodes=1),
        tasks=[{
            'env_id': 'CartPole-v0',
            'trials': 1,
            'max_timesteps': 5,
        }],
    )

    # simple scoring
    result = _benchmark_result_helper(bench)
    _assert_benchmark_result(result, score=0.01)

    # test a successful run
    result = _benchmark_result_helper(
        bench, episode_rewards=[100, 100], episode_lengths=[1, 1])
    _assert_benchmark_result(result, score=1.0, solves=True)
def test_clip_average_evaluation_not_enough_rewards():
    """With fewer episodes than num_episodes, missing entries are zero-padded."""
    bench = registration.Benchmark(
        id='TestBenchmark-v0',
        scorer=scoring.ClipTo01ThenAverage(num_episodes=2),
        tasks=[{
            'env_id': 'CartPole-v0',
            'trials': 1,
            'max_timesteps': 5,
        }],
    )

    # simple scoring
    result = _benchmark_result_helper(bench)
    _assert_evaluation_result(
        result,
        score=0.005,
        rewards=[np.array([1, 0])],
        lengths=[np.array([1, 0])],
    )
def test_clip_scoring():
    """Scores are clipped to [0, 1] before averaging; a max-reward run solves."""
    bench = registration.Benchmark(
        id='TestBenchmark-v0',
        scorer=scoring.ClipTo01ThenAverage(num_episodes=1),
        tasks=[{
            'env_id': 'CartPole-v0',
            'trials': 1,
            'max_timesteps': 5,
        }],
    )

    # simple scoring
    result = _benchmark_result_helper(bench)
    assert _is_close(result['scores'][0], 0.01), "benchmark_result={}".format(result)

    # test a successful run
    result = _benchmark_result_helper(bench, episode_rewards=[100])
    assert _is_close(result['scores'][0], 1.0), "benchmark_result={}".format(result)
    assert np.all(result['solves'][0]), "benchmark_result={}".format(result)
import numpy as np from collections import defaultdict from gym.benchmarks import registration, scoring import gym gym.undo_logger_setup() benchmark = registration.Benchmark(id='TestBenchmark-v0', scorer=scoring.ClipTo01ThenAverage(), tasks=[ { 'env_id': 'CartPole-v0', 'trials': 1, 'max_timesteps': 100, }, { 'env_id': 'Pendulum-v0', 'trials': 1, 'max_timesteps': 100, }, ]) def _is_close(x, target): return np.all(np.isclose(x, target)) def _assert_benchmark_result(result, score=None, solves=None, summed_training_seconds=None,
'max_timesteps': int(4e7), 'reward_floor': 2047.2, 'reward_ceiling': 5000.0, }, { 'env_id': 'VentureNoFrameskip-v4', 'trials': 2, 'max_timesteps': int(4e7), 'reward_floor': 18.0, 'reward_ceiling': 100.0, }]) register_benchmark(id='ClassicControl2-v0', name='ClassicControl2', view_group="Control", description='Simple classic control benchmark', scorer=scoring.ClipTo01ThenAverage(), tasks=[ { 'env_id': 'CartPole-v0', 'trials': 1, 'max_timesteps': 2000, }, { 'env_id': 'Pendulum-v0', 'trials': 1, 'max_timesteps': 1000, }, ]) register_benchmark(id='ClassicControl-v0', name='ClassicControl',