class ActorCriticAgent(Agent):
    """Basic actor-critic agent."""

    def __init__(
        self,
        policy_network: FunctionApproximatorABC,
        value_network: FunctionApproximatorABC,
        advantage: AdvantageABC,
        agent_id: str = "ActorCritic_agent",
    ):
        super().__init__()
        self._id = agent_id
        self.policy_network = policy_network
        self.value_network = value_network
        self.advantage = advantage
        self.memory = None
        self._should_reset = True

    @property
    def id(self):
        return self._id

    @timeit
    def train_iteration(self, env: EnvironmentABC, n_steps: int = 32, discount_factor: float = 1.0):
        if self._should_reset:
            self.memory = Memory(initial_state=env.reset(), action_type=np.int32, maximum_length=n_steps)
            self._should_reset = False
        self.play_steps(env, n_steps, self.memory)
        # Predict values for all visited states, including the final one, which is
        # only needed to bootstrap the advantage estimate.
        states = self.memory.get_states(include_last=True)
        values = self.value_network.predict(states).squeeze(axis=-1)
        states = states[:-1, ...]
        actions = self.memory.get_actions()
        rewards = self.memory.get_rewards()
        dones = self.memory.get_dones()
        advantages = self.advantage(rewards, values, dones, discount_factor)
        # Drop the bootstrap value before forming the value-function targets.
        values = values[:-1]
        policy_loss = self.policy_network.train(states, actions, advantages)
        target_values = values + advantages
        value_loss = self.value_network.train(states, target_values)
        # TODO: entropy loss (add to PolicyGradientLoss)
        return (None, self.memory)

    @timeit
    def act(self, state: State) -> Action:
        state = state.reshape(1, *state.shape)
        act_probs = self.policy_network.predict(state)[0]
        # Sample an action from the predicted policy distribution.
        return np.random.choice(len(act_probs), p=act_probs)
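# --- Usage sketch (not part of the repository) --------------------------------
# The agent above is driven purely through `train_iteration`, which rolls out
# `n_steps` transitions and then performs one policy update and one value
# update. A minimal training loop built on that interface could look like the
# function below; `run_actor_critic` and `n_iterations` are illustrative names
# introduced here, and the construction of the environment and the two
# function approximators is assumed to happen elsewhere.
def run_actor_critic(agent: "ActorCriticAgent", env: "EnvironmentABC",
                     n_iterations: int = 1000, n_steps: int = 32,
                     discount_factor: float = 0.99) -> "ActorCriticAgent":
    for _ in range(n_iterations):
        # `train_iteration` returns (loss placeholder, rollout memory); both are
        # ignored here but could be logged by a caller.
        _, _memory = agent.train_iteration(env, n_steps=n_steps,
                                           discount_factor=discount_factor)
    return agent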
def test_include_last(self):
    memory = Memory(self.states[0], np.int32, 1)
    self._update_memory(memory, 1)
    np.testing.assert_array_equal(memory.get_states(include_last=True), self.states[:2])

def test_add_one_frame_after_clear(self):
    memory = Memory(self.states[0], np.int32, 5)
    self._update_memory(memory, 1)
    memory.clear(self.states[0])
    self._update_memory(memory, 1)
    self._assert_memory_contents(memory, 0, 1)

def test_empty_after_clear(self):
    memory = Memory(self.states[0], np.int32, 5)
    self._update_memory(memory, 3)
    memory.clear(self.states[0])
    self._assert_memory_empty(memory)

def test_add_SARDs(self, _, num, from_, to):
    memory = Memory(self.states[0], np.int32, 5)
    self._update_memory(memory, num)
    self._assert_memory_contents(memory, from_, to)

def test_new_state_update(self):
    memory = Memory(self.states[0], np.int32, 5)
    new_new_state = np.array([100.0, 100.0])
    memory.new_state_update(new_new_state)
    np.testing.assert_array_equal(memory.get_last_state(), new_new_state)
    self._assert_memory_empty(memory)

def test_init(self):
    memory = Memory(self.states[0], np.int32, 5)
    self._assert_memory_empty(memory)
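# --- Memory interface sketch (not part of the repository) ---------------------
# The tests above go through helpers (`_update_memory`, `_assert_memory_contents`,
# `_assert_memory_empty`) that are defined elsewhere in the test class. The
# function below exercises only the Memory methods visible in this section; the
# import path for Memory is not shown here, and the positional constructor
# arguments are read as (initial state, action dtype, maximum length), matching
# the keyword form used by ActorCriticAgent above.
import numpy as np
# from ??? import Memory  # module path not shown in this section


def memory_smoke_check() -> None:
    initial_state = np.zeros(2, dtype=np.float32)
    memory = Memory(initial_state, np.int32, 5)

    # Mirrors test_new_state_update: moving the "next state" pointer does not
    # store a transition, so the buffer stays empty.
    new_state = np.array([100.0, 100.0])
    memory.new_state_update(new_state)
    np.testing.assert_array_equal(memory.get_last_state(), new_state)

    # Mirrors test_empty_after_clear / test_add_one_frame_after_clear:
    # clear(...) drops stored transitions and re-seeds the starting state.
    memory.clear(initial_state)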
class DQNAgent(Agent):
    """Agent using the DQN algorithm."""

    def __init__(
        self,
        q_network: FunctionApproximatorABC,
        replay_buffer_size: int = 10000,
        start_epsilon: float = 1.0,
        end_epsilon: float = 0.05,
        epsilon_decay: int = 1000,
        training_set_size: int = 64,
        target_network_copy_iter: int = 100,
        steps_between_training: int = 10,
        agent_id: str = "DQN_agent",
    ):
        super().__init__()
        self._id = agent_id
        self.action_space = None
        self.start_epsilon = start_epsilon
        self.epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.epsilon_decay = epsilon_decay
        self.replay_buffer_size = replay_buffer_size
        self.q_network = q_network
        self.target_network = deepcopy(q_network)
        self.batch_size = training_set_size
        self.target_network_copy_iter = target_network_copy_iter
        self.steps_between_training = steps_between_training
        # Per-iteration step of the linear epsilon schedule.
        self.epsilon_diff = (self.start_epsilon - self.end_epsilon) / self.epsilon_decay
        self.replay_buffer = None

    @property
    def id(self):
        return self._id

    def pre_train_setup(self, env: EnvironmentABC, discount_factor: float = 1.0, **kwargs):
        assert 0.0 <= discount_factor <= 1.0
        state = env.reset()
        self.replay_buffer = Memory(state, np.int32, self.replay_buffer_size)
        # To ensure that we have the next state after doing the first step.
        self.play_steps(env, n_steps=1, storage=self.replay_buffer)

    @timeit
    def train_iteration(self, env: EnvironmentABC, discount_factor: float = 1.0):
        # Linearly anneal epsilon, never letting it drop below end_epsilon.
        if self.epsilon_decay < self.iteration_count:
            self.epsilon = max(self.end_epsilon, self.epsilon - self.epsilon_diff)
        # Periodically refresh the frozen target network.
        if self.iteration_count % self.target_network_copy_iter == 0:
            self.target_network = deepcopy(self.q_network)
        self.play_steps(env, self.steps_between_training, self.replay_buffer)
        states, actions, rewards, dones, next_states = self.replay_buffer.sample_batch(
            self.replay_buffer_size, self.batch_size, next_states=True)
        # Bootstrap the TD target from the target network's maximum Q-value;
        # terminal transitions contribute only the immediate reward.
        target_vals = self.target_network.predict(next_states)
        target_ind = np.argmax(target_vals, axis=1)
        target_max = target_vals[np.arange(target_vals.shape[0]), target_ind]
        target_q = rewards + discount_factor * target_max * (~dones)
        loss = self.q_network.train(states, actions, target_q)
        return loss, self.replay_buffer

    def act(self, state: State) -> Action:
        state = state.reshape(1, *state.shape)
        act_qvals = self.q_network.predict(state)[0]
        # Epsilon-greedy exploration: random action with probability epsilon.
        if np.random.uniform() < self.epsilon:
            return np.random.choice(len(act_qvals))
        else:
            return np.argmax(act_qvals)
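# --- Usage sketch (not part of the repository) --------------------------------
# DQNAgent is driven in two phases: `pre_train_setup` allocates the replay
# buffer and plays a single step so a next state exists, and `train_iteration`
# is then called repeatedly. The loop below sketches that calling pattern;
# `run_dqn` and `n_iterations` are illustrative names, and the
# `iteration_count` bookkeeping used inside `train_iteration` is assumed to be
# maintained by the Agent base class (not shown in this section).
def run_dqn(agent: "DQNAgent", env: "EnvironmentABC",
            n_iterations: int = 10_000, discount_factor: float = 0.99) -> "DQNAgent":
    agent.pre_train_setup(env, discount_factor=discount_factor)
    for _ in range(n_iterations):
        # Each iteration plays `steps_between_training` environment steps,
        # samples a batch from the replay buffer, and fits the Q-network.
        _loss, _replay_buffer = agent.train_iteration(env, discount_factor=discount_factor)
    return agent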
def pre_train_setup(self, env: Environment, **kwargs):
    self.action_space = env.action_space
    state = env.reset()
    self.replay_buffer = Memory(state, np.int32, self.replay_buffer_size)
    # To ensure that we have the next state after doing the first step.
    self.play_steps(env, n_steps=1, storage=self.replay_buffer)