def __init__(
    self,
    gym_env_name: t.Text,
    model_shape: t.Iterable[int] = (20, 20, 20),
    report_every_num_of_episodes: int = 100,
):
  """Ctor.

  Default implementations are provided for all objects. They can be changed
  by directly setting the public properties after creation.

  Args:
    gym_env_name: name of the Gym environment, like "LunarLander-v2".
    model_shape: a list of the number of nodes per hidden layer.
    report_every_num_of_episodes: report progress every this number of
      episodes.
  """
  self._gym_env_name = gym_env_name
  self._model_shape = tuple(model_shape)

  self.env = environment_impl.GymEnvironment(gym.make(gym_env_name))
  self.qfunc = qfunc_impl.DDQN(
      model_pair=(
          qfunc_impl.CreateModel(
              state_shape=self.env.GetStateShape(),
              action_space_size=self.env.GetActionSpaceSize(),
              hidden_layer_sizes=model_shape),
          qfunc_impl.CreateModel(
              state_shape=self.env.GetStateShape(),
              action_space_size=self.env.GetActionSpaceSize(),
              hidden_layer_sizes=model_shape),
      ),
      training_batch_size=DEFAULT_BATCH_SIZE,
      discount_factor=0.99,
  )
  logging.printf(
      'Using qfunc implementation: %s', string.GetClassName(self.qfunc))

  self.policy = policy_impl.GreedyPolicyWithDecreasingRandomness(
      initial_epsilon=1.0,
      final_epsilon=0.1,
      decay_by_half_after_num_of_episodes=500)
  logging.printf(
      'Using policy implementation: %s', string.GetClassName(self.policy))

  self.runner = runner_impl.ExperienceReplayRunner(
      experience_capacity=100000,
      experience_sample_batch_size=DEFAULT_BATCH_SIZE)
  logging.printf(
      'Using runner implementation: %s', string.GetClassName(self.runner))

  self._progress_tracer = runner_extension_impl.ProgressTracer(
      report_every_num_of_episodes=report_every_num_of_episodes)
  self._model_saver = runner_extension_impl.ModelSaver(
      self._GetModelWeightsFilepath())
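# A minimal usage sketch of the "override the defaults" pattern described in
# the docstring above. `SimpleDDQN` is a hypothetical name for the enclosing
# class (this snippet does not show it); the attribute names and component
# constructors come from the constructor itself and the other snippets here.
session = SimpleDDQN(gym_env_name='LunarLander-v2')
# Swap a default component by assigning the public property after creation:
session.policy = policy_impl.GreedyPolicyWithRandomness(epsilon=0.05)
session.runner = runner_impl.ExperienceReplayRunner(
    experience_capacity=50000,
    experience_sample_batch_size=32)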
def main(_):
  batch_size = 64  # used in qfunc and runner.
  env = environment_impl.GymEnvironment(gym.make('MountainCar-v0'))
  env.SetGymEnvMaxEpisodeSteps(400)
  qfunc = qfunc_impl.DQN(
      model=qfunc_impl.CreateModel(
          state_shape=env.GetStateShape(),
          action_space_size=env.GetActionSpaceSize(),
          hidden_layer_sizes=(64,)),
      training_batch_size=batch_size,
      discount_factor=0.99,
  )
  qfunc.Load('saved_models/mountaincar_shape_64_rmsprop_gamma_099.weights')
  policy = policy_impl.GreedyPolicy()
  runner = runner_impl.NoOpRunner()

  env.TurnOnRendering(should_render=True, fps=24)
  logging.ENV.debug_verbosity = 9
  env.StartRecording(video_filename='mountaincar_demo.mp4')

  # First 5 runs with random actions:
  rand_qfunc = qfunc_impl.RandomQFunction(env.GetActionSpaceSize())
  runner.Run(env=env, brain=rand_qfunc, policy=policy, num_of_episodes=5)
  # Then 10 runs with the trained qfunc:
  runner.Run(env=env, brain=qfunc, policy=policy, num_of_episodes=10)

  env.StopRecording()
def main(_):
  batch_size = 64  # used in qfunc and runner.
  env = environment_impl.GymEnvironment(gym.make('CartPole-v0'))
  qfunc = qfunc_impl.DQN(
      model=qfunc_impl.CreateModel(
          state_shape=env.GetStateShape(),
          action_space_size=env.GetActionSpaceSize(),
          hidden_layer_sizes=(20, 20, 20)),
      training_batch_size=batch_size,
      discount_factor=0.99,
  )
  runner = runner_impl.ExperienceReplayRunner(
      experience_capacity=100000,
      experience_sample_batch_size=batch_size)

  # Train for 500 episodes.
  logging.ENV.debug_verbosity = 3
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=0.1)
  runner.Run(env=env, brain=qfunc, policy=policy, num_of_episodes=500)

  # Test for 100 episodes.
  logging.ENV.debug_verbosity = 4
  policy = policy_impl.GreedyPolicy()
  runner.Run(env=env, brain=qfunc, policy=policy, num_of_episodes=100)

  # Demo with video.
  env.TurnOnRendering(should_render=True, fps=24)
  # env.StartRecording(video_filename='demo.mp4')  # Uncomment to record video.

  # First 5 runs with random actions:
  runner.Run(
      env=env,
      brain=qfunc_impl.RandomQFunction(env.GetActionSpaceSize()),
      policy=policy,
      num_of_episodes=5)
  # Then 10 runs with the trained qfunc:
  runner.Run(env=env, brain=qfunc, policy=policy, num_of_episodes=10)
def test_saveLoad(self):
  tmp_file = '/tmp/DDQNTest_savedata.tmp'
  self.qfunc._SetValues(self.states, self.values)
  self.qfunc.Save(tmp_file)

  qfunc = qfunc_impl.DDQN(
      model_pair=(
          qfunc_impl.CreateModel(
              state_shape=(3,),
              action_space_size=2,
              hidden_layer_sizes=(3,),
          ),
          qfunc_impl.CreateModel(
              state_shape=(3,),
              action_space_size=2,
              hidden_layer_sizes=(3,),
          ),
      ))
  qfunc.Load(tmp_file)

  numpy_util.TestUtil.AssertModelWeightsEqual(qfunc._q1, self.qfunc._model)
  numpy_util.TestUtil.AssertModelWeightsEqual(qfunc._q2, self.qfunc._model)
def setUp(self) -> None:
  # State space size is 3; action space size is 2.
  self.qfunc = qfunc_impl.DDQN(
      model_pair=(
          qfunc_impl.CreateModel(
              state_shape=(3,),
              action_space_size=2,
              hidden_layer_sizes=(3,),
          ),
          qfunc_impl.CreateModel(
              state_shape=(3,),
              action_space_size=2,
              hidden_layer_sizes=(3,),
          ),
      ),
      discount_factor=0.9,
  )
  self.states = numpy.array([
      [1, 2, 3],
      [4, 5, 6],
  ])
  self.values = numpy.array([
      [0.5, 0.5],
      [0.3, 0.7],
  ])
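# For reference: a sketch of the double-DQN target that motivates the
# model_pair above. This is the standard technique (van Hasselt et al.,
# 2015), not necessarily qfunc_impl.DDQN's exact internals, and
# `_DoubleQTargets` is a hypothetical helper: one network picks the best
# next action, the other evaluates it.
import numpy

def _DoubleQTargets(rewards, q1_next, q2_next, gamma=0.9):
  """rewards: (batch,); q1_next, q2_next: (batch, action_space_size)."""
  best_actions = numpy.argmax(q1_next, axis=1)  # Action chosen by q1.
  evaluated = q2_next[numpy.arange(len(rewards)), best_actions]  # Valued by q2.
  return rewards + gamma * evaluated

# E.g. with gamma=0.9, a reward of 1.0, and q2 assigning 0.7 to q1's argmax
# action, the target is 1.0 + 0.9 * 0.7 = 1.63.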
def _RunEnv(gym_env):
  env = environment_impl.GymEnvironment(gym_env)
  env.SetGymEnvMaxEpisodeSteps(10)
  qfunc = qfunc_impl.DQN(
      model=qfunc_impl.CreateModel(
          state_shape=env.GetStateShape(),
          action_space_size=env.GetActionSpaceSize(),
          hidden_layer_sizes=(4,),
      ))
  env.Reset()
  policy = policy_impl.GreedyPolicyWithRandomness(epsilon=1.0)
  runner_impl.SimpleRunner().Run(
      env=env, qfunc=qfunc, policy=policy, num_of_episodes=10)
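# A hypothetical call site for the smoke-test helper above. With epsilon=1.0
# the policy acts uniformly at random, so this only checks that the
# environment and the untrained DQN can step through 10 short episodes
# without raising.
_RunEnv(gym.make('CartPole-v0'))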
def main(_):
  batch_size = 64  # used in qfunc and runner.
  env = environment_impl.GymEnvironment(gym.make('Acrobot-v1'))
  qfunc = qfunc_impl.DQN(
      model=qfunc_impl.CreateModel(
          state_shape=env.GetStateShape(),
          action_space_size=env.GetActionSpaceSize(),
          hidden_layer_sizes=(20, 20, 20)),
      training_batch_size=batch_size,
      discount_factor=0.99,
  )
  qfunc.LoadModel(
      'saved_models/acrobot_v1_shape_20-20-20_rmsprop_gamma_0.99.model')
  policy = policy_impl.GreedyPolicy()
  runner = runner_impl.NoOpRunner()

  env.TurnOnRendering(should_render=True, fps=10)
  logging.ENV.debug_verbosity = 9
  runner.Run(env=env, qfunc=qfunc, policy=policy, num_of_episodes=10)
def setUp(self) -> None:
  # State space size is 3; action space size is 2.
  self.qfunc = qfunc_impl.DQN_TargetNetwork(
      model=qfunc_impl.CreateModel(
          state_shape=(3,),
          action_space_size=2,
          hidden_layer_sizes=(6, 4),
      ),
      update_target_network_every_num_of_steps=2,
  )
  self.states = numpy.array([
      [1, 2, 3],
      [4, 5, 6],
  ])
  self.values = numpy.array([
      [0.5, 0.5],
      [0.3, 0.7],
  ])
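# For reference: a sketch of the bookkeeping that
# update_target_network_every_num_of_steps controls. This is the standard
# target-network technique, not necessarily qfunc_impl.DQN_TargetNetwork's
# exact internals, and `_TargetNetworkSketch` is a hypothetical class: every
# N training steps, the online model's weights are copied into a frozen
# target model that is used to compute the TD targets.
class _TargetNetworkSketch:

  def __init__(self, model, target_model, every_num_of_steps=2):
    self._model = model                # Trained on every step.
    self._target_model = target_model  # Updated only by periodic copying.
    self._every_num_of_steps = every_num_of_steps
    self._num_of_steps = 0

  def _AfterTrainingStep(self):
    self._num_of_steps += 1
    if self._num_of_steps % self._every_num_of_steps == 0:
      # Keras models expose get_weights/set_weights for this copy.
      self._target_model.set_weights(self._model.get_weights())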