def test_convergence(self):
  a3c = a3c_impl.A3C(
      model=a3c_impl.CreateModel(
          state_shape=(3,),
          action_space_size=2,
          hidden_layer_sizes=(3,),
      ),
      # optimizer=a3c_impl.CreateDefaultOptimizer(learning_rate=0.05),
  )
  s = numpy.array([[1, 2, 3]])
  a1 = numpy.array([[1, 0]])
  a2 = numpy.array([[0, 1]])
  for _ in range(10):
    # Needs to train for both actions as one step, otherwise it shows some
    # "staggering" effect.
    a3c.UpdateFromTransitions([
        base.Transition(s=s, a=a1, r=1.0, sp=None),
    ])
    a3c.UpdateFromTransitions([
        base.Transition(s=s, a=a2, r=-1.0, sp=s),
    ])
    logging.printf('%s', a3c.GetValues(s))

  old_value_a1 = a3c.GetActionValues(a3c.GetValues(s), a1)
  # Trains for one step, for both actions.
  a3c.UpdateFromTransitions([
      base.Transition(s=s, a=a1, r=1.0, sp=None),
  ])
  a3c.UpdateFromTransitions([
      base.Transition(s=s, a=a2, r=-1.0, sp=s),
  ])
  self.assertGreaterEqual(
      a3c.GetActionValues(a3c.GetValues(s), a1), old_value_a1)

def OnCompletionCallback(self):
  logging.printf(
      'Total: avg_reward = %3.2f, avg_steps = %3.2f',
      float(numpy.mean(self._episode_rewards)),
      float(numpy.mean(self._episode_steps)),
  )
  # Note that since "block=False", if you run it on CLI the image will be
  # shown then disappear immediately. The result will persist if you run it
  # in notebooks.
  pyplot.title('Episode Rewards')
  pyplot.plot(self._episode_rewards)
  pyplot.show(block=False)
  if self._report_steps:
    pyplot.title('Episode Steps')
    pyplot.plot(self._episode_steps)
    pyplot.show(block=False)

def __init__(
    self,
    model: keras.Model,
    optimizer: tensorflow.train.Optimizer = None,
    discount_factor: float = _DEFAULT_DISCOUNT_FACTOR,
    loss_v: float = _DEFAULT_LOSS_V,
    loss_entropy: float = _DEFAULT_LOSS_ENTROPY,
):
  """Ctor.

  Args:
    model: a Keras model. Its first layer's input shape is used as the
      state batch shape; its second-to-last layer outputs the values of Pi
      over a 1D action space, and its last layer outputs V.
    optimizer: the optimizer used for training. Defaults to
      CreateDefaultOptimizer().
    discount_factor: the reward discount factor (gamma).
    loss_v: coefficient for the value-loss term of the total loss.
    loss_entropy: coefficient for the entropy term of the total loss.
  """
  if _ACTIVE_INSTANCES:
    instance = _ACTIVE_INSTANCES[0]
    logging.printf(
        'WARNING: only one A3C instance can be active; the previous '
        'instance %s is now deactivated.', instance)
    instance.Deactivate()
    _ACTIVE_INSTANCES.pop()

  self._model = model
  self._optimizer = optimizer if optimizer else CreateDefaultOptimizer()
  self._gamma = discount_factor
  self._loss_v = loss_v
  self._loss_entropy = loss_entropy

  self._state_batch_shape = self._model.layers[0].input_shape
  # Layer -1 is the output for V, -2 is for the values of Pi.
  output_shape = self._model.layers[-2].output_shape[
      1:]  # type: t.Tuple[int]
  if len(output_shape) != 1:
    raise NotImplementedError(
        'Only supports 1D action space; got: %s' % str(output_shape))
  self._action_space_size = output_shape[0]

  self._graph = self._BuildGraph(self._model)
  self.session = tensorflow.Session()
  backend.set_session(self.session)
  self.session.run(tensorflow.global_variables_initializer())

  # Only one A3C instance can be active at a time.
  self._active = True
  _ACTIVE_INSTANCES.append(self)

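# A minimal usage sketch for the ctor above (hedged: the shapes, reward, and
# the helper name A3CUsageSketch are made up for illustration; the calls
# mirror the ones exercised in test_convergence and are not the canonical
# training loop).
def A3CUsageSketch():
  a3c = a3c_impl.A3C(
      model=a3c_impl.CreateModel(
          state_shape=(4,), action_space_size=2, hidden_layer_sizes=(16,)))
  s = numpy.array([[0.0, 0.0, 0.0, 0.0]])
  a = numpy.array([[1, 0]])  # One-hot action, as in test_convergence.
  # sp=None follows test_convergence above (presumably a terminal state).
  a3c.UpdateFromTransitions([base.Transition(s=s, a=a, r=1.0, sp=None)])
  return a3c.GetValues(s)  # Per-action values for state s.
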
def OnCompletionCallback(self, env: Environment, qfunc: QFunction,
                         num_of_episodes: int):
  logging.printf(
      'Total: ran %d episodes, avg_reward = %3.2f, avg_steps = %3.2f',
      num_of_episodes,
      float(numpy.mean(self._episode_rewards[-num_of_episodes:])),
      float(numpy.mean(self._episode_steps[-num_of_episodes:])),
  )
  # Note that since "block=False", if you run it on CLI the image will be
  # shown then disappear immediately. The result will persist if you run it
  # in notebooks.
  pyplot.title('Episode Rewards')
  pyplot.plot(self._episode_rewards)
  pyplot.show(block=False)

  pyplot.title('Episode Steps')
  pyplot.plot(self._episode_steps)
  pyplot.show(block=False)

def OnEpisodeFinishedCallback(self, env: Environment, brain: Brain,
                              episode_idx: int, num_of_episodes: int,
                              episode_reward: float, steps: int):
  """Reports episode progress and rewards."""
  self._episode_rewards.append(episode_reward)
  self._episode_steps.append(steps)

  episode_idx += 1  # make it 1-based.
  if episode_idx % self._report_every_num_of_episodes == 0:
    logging.printf(
        'Episode %d/%d: avg_reward = %3.2f, '
        'avg_steps = %3.2f (over %d episodes)',
        episode_idx,
        num_of_episodes,
        float(numpy.mean(
            self._episode_rewards[-self._report_every_num_of_episodes:])),
        float(numpy.mean(
            self._episode_steps[-self._report_every_num_of_episodes:])),
        self._report_every_num_of_episodes,
    )

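# The rolling averages above are plain negative-index slices; a worked
# example with made-up numbers (hypothetical helper, for illustration only):
def _RollingAverageExample():
  rewards = [1.0, 2.0, 3.0, 4.0]
  window = 2
  # mean([3.0, 4.0]) -> 3.5, the average over the last 2 episodes.
  return float(numpy.mean(rewards[-window:]))
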
def __init__(
    self,
    gym_env_name: t.Text,
    model_shape: t.Iterable[int] = (20, 20, 20),
    report_every_num_of_episodes: int = 100,
):
  """Ctor.

  Default implementations are provided for all objects. They can be changed
  by directly setting the public properties after creation.

  Args:
    gym_env_name: name of the gym environment, like "LunarLander-v2".
    model_shape: a list of the number of nodes per hidden layer.
    report_every_num_of_episodes: report progress every this number of
      episodes.
  """
  self._gym_env_name = gym_env_name
  self._model_shape = tuple(model_shape)

  self.env = environment_impl.GymEnvironment(gym.make(gym_env_name))
  self.qfunc = qfunc_impl.DDQN(
      model_pair=(
          qfunc_impl.CreateModel(
              state_shape=self.env.GetStateShape(),
              action_space_size=self.env.GetActionSpaceSize(),
              hidden_layer_sizes=model_shape),
          qfunc_impl.CreateModel(
              state_shape=self.env.GetStateShape(),
              action_space_size=self.env.GetActionSpaceSize(),
              hidden_layer_sizes=model_shape),
      ),
      training_batch_size=DEFAULT_BATCH_SIZE,
      discount_factor=0.99,
  )
  logging.printf('Using qfunc implementation: %s',
                 string.GetClassName(self.qfunc))
  self.policy = policy_impl.GreedyPolicyWithDecreasingRandomness(
      initial_epsilon=1.0,
      final_epsilon=0.1,
      decay_by_half_after_num_of_episodes=500)
  logging.printf('Using policy implementation: %s',
                 string.GetClassName(self.policy))
  self.runner = runner_impl.ExperienceReplayRunner(
      experience_capacity=100000,
      experience_sample_batch_size=DEFAULT_BATCH_SIZE)
  logging.printf('Using runner implementation: %s',
                 string.GetClassName(self.runner))

  self._progress_tracer = runner_extension_impl.ProgressTracer(
      report_every_num_of_episodes=report_every_num_of_episodes)
  self._model_saver = runner_extension_impl.ModelSaver(
      self._GetModelWeightsFilepath())

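# Usage sketch (hedged: "Pipeline" stands in for the enclosing class, whose
# name is not shown in this snippet; the override relies only on the public
# properties documented in the ctor docstring above).
def PipelineUsageSketch():
  pipeline = Pipeline('LunarLander-v2', model_shape=(20, 20, 20))
  # Default implementations can be swapped by setting public properties:
  pipeline.policy = policy_impl.GreedyPolicyWithDecreasingRandomness(
      initial_epsilon=0.5,
      final_epsilon=0.05,
      decay_by_half_after_num_of_episodes=250)
  return pipeline
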
def MainTest():
  images, labels = CreateImageData(num_blank_images=5, num_annotated_images=5)
  for idx in range(5):
    logging.printf('Label %d: %s', idx, labels[idx])
    PlotImage(images[idx])

def __init__(
    self,
    gym_env_name: t.Text,
    gym_env=None,
    report_every_num_of_episodes: int = 1,
    use_ddqn: bool = True,
    use_large_model: bool = True,
):
  """Ctor.

  Args:
    gym_env_name: name of the gym environment that will be created.
    gym_env: a Gym environment. If set, the provided environment is used
      and gym_env_name serves only as a tag.
    report_every_num_of_episodes: report progress every this number of
      episodes.
    use_ddqn: whether to use DDQN instead of DQN_TargetNetwork.
    use_large_model: whether to use the larger model, which is very slow to
      train without a GPU.
  """
  self._gym_env_name = gym_env_name
  if gym_env:
    env = gym_env
  else:
    env = gym.make(gym_env_name)
  self.env = screen_learning.ScreenGymEnvironment(env)

  if use_large_model:
    model_pair = (
        screen_learning.CreateOriginalConvolutionModel(
            action_space_size=self.env.GetActionSpaceSize()),
        screen_learning.CreateOriginalConvolutionModel(
            action_space_size=self.env.GetActionSpaceSize()),
    )
  else:
    model_pair = (
        screen_learning.CreateConvolutionModel(
            action_space_size=self.env.GetActionSpaceSize()),
        screen_learning.CreateConvolutionModel(
            action_space_size=self.env.GetActionSpaceSize()),
    )

  if use_ddqn:
    self.qfunc = qfunc_impl.DDQN(
        model_pair=model_pair,
        training_batch_size=DEFAULT_BATCH_SIZE,
        discount_factor=0.99,
    )
  else:
    self.qfunc = qfunc_impl.DQN_TargetNetwork(
        model=model_pair[0],
        training_batch_size=DEFAULT_BATCH_SIZE,
        discount_factor=0.99)
  logging.printf('Using qfunc implementation: %s',
                 string.GetClassName(self.qfunc))
  self.policy = policy_impl.GreedyPolicyWithDecreasingRandomness(
      initial_epsilon=1.0,
      final_epsilon=0.1,
      decay_by_half_after_num_of_episodes=50)
  logging.printf('Using policy implementation: %s',
                 string.GetClassName(self.policy))
  self.runner = runner_impl.ExperienceReplayRunner(
      experience_capacity=100000,
      experience_sample_batch_size=DEFAULT_BATCH_SIZE)
  logging.printf('Using runner implementation: %s',
                 string.GetClassName(self.runner))

  self._progress_tracer = runner_extension_impl.ProgressTracer(
      report_every_num_of_episodes=report_every_num_of_episodes)
  self._model_saver = runner_extension_impl.ModelSaver(
      self._GetModelWeightsFilepath(),
      use_averaged_value_over_num_of_episodes=report_every_num_of_episodes)

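# Usage sketch (hedged: "ScreenPipeline" stands in for the enclosing class,
# whose name is not shown in this snippet, and "Breakout-v0" is only an
# example of a screen-based gym environment).
def ScreenPipelineUsageSketch():
  # The small model is much faster to train without a GPU.
  pipeline = ScreenPipeline(
      'Breakout-v0', use_ddqn=True, use_large_model=False)
  # Alternatively, pass a pre-built env; gym_env_name then only tags it:
  tagged = ScreenPipeline(
      'my-tag', gym_env=gym.make('Breakout-v0'), use_large_model=False)
  return pipeline, tagged
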