def __init__(
        self,
        env_name,
        save_dir=None,
        max_episodes=constants.DEFAULT_MAX_EPISODES,
        gamma_discount=constants.DEFAULT_GAMMA,
        learning_rate=constants.DEFAULT_LEARNING_RATE,
        thread_count=None,
        worker_update_frequency=constants.DEFAULT_UPDATE_FREQUENCY,
):
    """Set up the global A3C agent: shared model, shared optimizer, and config.

    Args:
        env_name: Gym environment id, forwarded to the base class.
        save_dir: Optional directory for checkpoints (base-class behavior).
        max_episodes: Total episode budget across all workers.
        gamma_discount: Reward discount factor passed to workers.
        learning_rate: Learning rate for the shared Adam optimizer.
        thread_count: Number of worker threads; defaults to the machine's
            CPU count when None.
        worker_update_frequency: How often (in steps) workers push gradients
            to the global model.
    """
    super().__init__(env_name, "a3c", save_dir=save_dir)
    self.max_episodes = max_episodes
    self.gamma_discount = gamma_discount
    self.worker_update_frequency = worker_update_frequency
    # Fall back to one worker per CPU core when no count is given.
    self.thread_count = thread_count or multiprocessing.cpu_count()
    # Global optimizer and model
    # use_locking=True guards the shared variables against concurrent
    # updates from multiple worker threads.
    self.optimizer = tf.train.AdamOptimizer(use_locking=True, learning_rate=learning_rate)
    self.model = model.A3CModel(self.env)
    # Calling the model once will essentially tell it what to expect as
    # input and thus initialize all variables and weights.
    # If we don't do this, the model cannot be updated from worker threads.
    self.model(
        tf.convert_to_tensor(np.zeros((1, ) + self.env.observation_space.shape), dtype=tf.float32))
def __init__(
        self,
        env,
        global_model,
        global_optimizer,
        tracker,
        index,
        max_episodes,
        max_steps,
        gamma_discount,
        update_frequency,
):
    """Initialize one worker thread with its shared and per-worker state.

    Args:
        env: The environment instance this worker interacts with.
        global_model: Shared model that receives gradient updates.
        global_optimizer: Shared optimizer applying those updates.
        tracker: Shared object for cross-worker bookkeeping.
        index: Numeric id of this worker.
        max_episodes: Episode budget for training.
        max_steps: Step limit per episode.
        gamma_discount: Reward discount factor.
        update_frequency: Steps between pushes to the global model.
    """
    super().__init__()
    # State shared across all workers.
    self.global_model = global_model
    self.global_optimizer = global_optimizer
    self.tracker = tracker
    # Per-worker configuration.
    self.env = env
    self.index = index
    self.max_episodes = max_episodes
    self.max_steps = max_steps
    self.gamma_discount = gamma_discount
    self.update_frequency = update_frequency
    # Each worker keeps its own local copy of the model.
    self.model = model.A3CModel(env)
    # TODO: Make this configurable
    self.terminal_reward = -1
def test_rescale_mean(pendulum_env):
    """rescale_mean scales symmetrically around 0: for Pendulum, ±1 → ±2."""
    a3c_model = model.A3CModel(pendulum_env)
    cases = [
        (0, 0),
        (0.5, 1),
        (1, 2),
        (-1, -2),
        (-0.5, -1),
    ]
    for raw_mean, expected in cases:
        assert actor.rescale_mean(a3c_model, raw_mean) == expected
def test_model_call_discrete(cartpole_env):
    """A discrete-action env yields per-action logits plus one value estimate."""
    a3c_model = model.A3CModel(cartpole_env)
    initial_state = cartpole_env.reset()
    batch = tf.convert_to_tensor([initial_state], dtype=tf.float32)
    # Assert it's callable
    logits, values = a3c_model(batch)
    # Cartpole has two actions, so a batch of one gives two logits.
    assert logits.shape == (1, 2)
    # One scalar value estimate per batch element.
    assert values.shape == (1, 1)
def test_model_call_continuous(pendulum_env):
    """A continuous-action env yields a pair of actor outputs plus one value."""
    a3c_model = model.A3CModel(pendulum_env)
    initial_state = pendulum_env.reset()
    batch = tf.convert_to_tensor([initial_state], dtype=tf.float32)
    # Pendulum has a continuous action space, so the actor head returns a
    # tuple of two outputs (presumably distribution parameters such as
    # mean/variance — one per action dimension).
    (out_a, out_b), values = a3c_model(batch)
    assert out_a.shape == (1, 1)
    assert out_b.shape == (1, 1)
    assert values.shape == (1, 1)
def test_model_variable_scope(cartpole_env):
    """Value and actor heads each contribute their weights under a named scope."""
    a3c_model = model.A3CModel(cartpole_env)
    state_tensor = tf.convert_to_tensor([cartpole_env.reset()], dtype=tf.float32)
    # A forward pass is required to build the weights before inspecting them.
    a3c_model(state_tensor)

    def weights_in(scope_name):
        return [w for w in a3c_model.trainable_weights if scope_name in w.name]

    # Two sets of weight layers per scope, each with a bias → 4 variables.
    assert len(weights_in("value_scope")) == 4
    assert len(weights_in("actor_scope")) == 4