Example no. 1
 def pre_train_setup(self,
                     env: EnvironmentABC,
                     discount_factor: float = 1.0,
                     **kwargs):
     assert 0.0 <= discount_factor <= 1.0
     state = env.reset()
     self.replay_buffer = Memory(state, np.int32, self.replay_buffer_size)
     # To ensure that we have the next state after doing the first step.
     self.play_steps(env, n_steps=1, storage=self.replay_buffer)
Example no. 2
class ActorCriticAgent(Agent):
    """Basic actor-critic agent."""
    def __init__(
        self,
        policy_network: FunctionApproximatorABC,
        value_network: FunctionApproximatorABC,
        advantage: AdvantageABC,
        agent_id: str = "ActorCritic_agent",
    ):
        super().__init__()
        self._id = agent_id
        self.policy_network = policy_network
        self.value_network = value_network
        self.advantage = advantage
        self.memory = None
        self._should_reset = True

    @property
    def id(self):
        return self._id

    @timeit
    def train_iteration(self,
                        env: EnvironmentABC,
                        n_steps: int = 32,
                        discount_factor: float = 1.0):
        if self._should_reset:
            self.memory = Memory(initial_state=env.reset(),
                                 action_type=np.int32,
                                 maximum_length=n_steps)
            self._should_reset = False
        self.play_steps(env, n_steps, self.memory)
        states = self.memory.get_states(include_last=True)
        values = self.value_network.predict(states).squeeze(axis=-1)
        states = states[:-1, ...]
        actions = self.memory.get_actions()
        rewards = self.memory.get_rewards()
        dones = self.memory.get_dones()
        advantages = self.advantage(rewards, values, dones, discount_factor)
        values = values[:-1]
        policy_loss = self.policy_network.train(states, actions, advantages)
        target_values = values + advantages
        value_loss = self.value_network.train(states, target_values)
        # TODO: entropy loss (add to PolicyGradientLoss)
        return (None, self.memory)

    @timeit
    def act(self, state: State) -> Action:
        state = state.reshape(1, *state.shape)
        act_probs = self.policy_network.predict(state)[0]
        return np.random.choice(len(act_probs), p=act_probs)
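The AdvantageABC implementation is not shown in these examples. As a point of reference, here is a minimal sketch of a one-step TD advantage that matches the shapes used in train_iteration above, where values holds n_steps + 1 entries and the extra entry is the bootstrap value of the state reached after the last step. The function name one_step_advantage is illustrative only and not part of the library.

import numpy as np

def one_step_advantage(rewards, values, dones, discount_factor):
    # Sketch only: one possible AdvantageABC-style callable.
    # values is expected to have one more entry than rewards; the extra
    # entry is the bootstrap value of the final state.
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=bool)
    next_values = values[1:] * (~dones)  # drop the bootstrap on terminal steps
    return rewards + discount_factor * next_values - values[:-1]

With a definition like this, target_values = values + advantages in train_iteration reduces to the usual one-step TD target rewards + discount_factor * next_values for non-terminal steps.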
Example no. 3
 def train_iteration(self,
                     env: EnvironmentABC,
                     n_steps: int = 32,
                     discount_factor: float = 1.0):
     if self._should_reset:
         self.memory = Memory(initial_state=env.reset(),
                              action_type=np.int32,
                              maximum_length=n_steps)
         self._should_reset = False
     self.play_steps(env, n_steps, self.memory)
     states = self.memory.get_states(include_last=True)
     values = self.value_network.predict(states).squeeze(axis=-1)
     states = states[:-1, ...]
     actions = self.memory.get_actions()
     rewards = self.memory.get_rewards()
     dones = self.memory.get_dones()
     advantages = self.advantage(rewards, values, dones, discount_factor)
     values = values[:-1]
     policy_loss = self.policy_network.train(states, actions, advantages)
     target_values = values + advantages
     value_loss = self.value_network.train(states, target_values)
     # TODO: entropy loss (add to PolicyGradientLoss)
     return (None, self.memory)
Example no. 4
 def test_include_last(self):
     memory = Memory(self.states[0], np.int32, 1)
     self._update_memory(memory, 1)
     np.testing.assert_array_equal(memory.get_states(include_last=True),
                                   self.states[:2])
Example no. 5
 def test_add_one_frame_after_clear(self):
     memory = Memory(self.states[0], np.int32, 5)
     self._update_memory(memory, 1)
     memory.clear(self.states[0])
     self._update_memory(memory, 1)
     self._assert_memory_contents(memory, 0, 1)
Example no. 6
 def test_empty_after_clear(self):
     memory = Memory(self.states[0], np.int32, 5)
     self._update_memory(memory, 3)
     memory.clear(self.states[0])
     self._assert_memory_empty(memory)
Example no. 7
 def test_add_SARDs(self, _, num, from_, to):
     memory = Memory(self.states[0], np.int32, 5)
     self._update_memory(memory, num)
     self._assert_memory_contents(memory, from_, to)
Example no. 8
 def test_new_state_update(self):
     memory = Memory(self.states[0], np.int32, 5)
     new_new_state = np.array([100.0, 100.0])
     memory.new_state_update(new_new_state)
     np.testing.assert_array_equal(memory.get_last_state(), new_new_state)
     self._assert_memory_empty(memory)
Example no. 9
 def test_init(self):
     memory = Memory(self.states[0], np.int32, 5)
     self._assert_memory_empty(memory)
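The tests above go through helpers (self.states, _update_memory, _assert_memory_contents, _assert_memory_empty) that are not reproduced here, so the call that appends a single transition is not visible in this section. Pulling together only the Memory calls that do appear in these examples gives the small sketch below; the initial state is made up for illustration.

import numpy as np

# Sketch only: the Memory calls visible in this section, in one place.
# Appending transitions is not shown here (the tests use a _update_memory
# helper and the agents go through play_steps), so it is omitted.
initial_state = np.zeros(2, dtype=np.float32)
memory = Memory(initial_state, np.int32, 5)  # initial state, action dtype, max length

# new_state_update replaces the stored last state without recording a step
# (Example no. 8).
memory.new_state_update(np.array([100.0, 100.0]))
np.testing.assert_array_equal(memory.get_last_state(),
                              np.array([100.0, 100.0]))

# clear resets the buffer back to a given initial state (Examples no. 5-6).
memory.clear(initial_state)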
Example no. 10
class DQNAgent(Agent):
    """Agent using DQN algorithm"""
    def __init__(
        self,
        q_network: FunctionApproximatorABC,
        replay_buffer_size: int = 10000,
        start_epsilon: float = 1.0,
        end_epsilon: float = 0.05,
        epsilon_decay: int = 1000,
        training_set_size: int = 64,
        target_network_copy_iter: int = 100,
        steps_between_training: int = 10,
        agent_id: str = "DQN_agent",
    ):
        super().__init__()
        self._id = agent_id
        self.action_space = None
        self.start_epsilon = start_epsilon
        self.epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.epsilon_decay = epsilon_decay
        self.replay_buffer_size = replay_buffer_size
        self.q_network = q_network
        self.target_network = deepcopy(q_network)
        self.batch_size = training_set_size
        self.target_network_copy_iter = target_network_copy_iter
        self.steps_between_training = steps_between_training

        self.epsilon_diff = (self.start_epsilon -
                             self.end_epsilon) / self.epsilon_decay
        self.replay_buffer = None

    @property
    def id(self):
        return self._id

    def pre_train_setup(self,
                        env: EnvironmentABC,
                        discount_factor: float = 1.0,
                        **kwargs):
        assert 0.0 <= discount_factor <= 1.0
        state = env.reset()
        self.replay_buffer = Memory(state, np.int32, self.replay_buffer_size)
        # To ensure that we have the next state after doing the first step.
        self.play_steps(env, n_steps=1, storage=self.replay_buffer)

    @timeit
    def train_iteration(self,
                        env: EnvironmentABC,
                        discount_factor: float = 1.0):

        # Linearly anneal epsilon from start_epsilon towards end_epsilon over
        # the first epsilon_decay iterations.
        if self.iteration_count < self.epsilon_decay:
            self.epsilon -= self.epsilon_diff
        if self.iteration_count % self.target_network_copy_iter == 0:
            self.target_network = deepcopy(self.q_network)
        self.play_steps(env, self.steps_between_training, self.replay_buffer)
        states, actions, rewards, dones, next_states = self.replay_buffer.sample_batch(
            self.replay_buffer_size, self.batch_size, next_states=True)
        target_vals = self.target_network.predict(next_states)
        target_ind = np.argmax(target_vals, axis=1)
        target_max = target_vals[np.arange(target_vals.shape[0]), target_ind]
        target_q = rewards + discount_factor * target_max * (~dones)
        loss = self.q_network.train(states, actions, target_q)
        return loss, self.replay_buffer

    def act(self, state: State) -> Action:
        state = state.reshape(1, *state.shape)
        act_qvals = self.q_network.predict(state)[0]
        if np.random.uniform() < self.epsilon:
            return np.random.choice(len(act_qvals))
        else:
            return np.argmax(act_qvals)
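In train_iteration above, the argmax-then-gather over target_vals is just a row-wise maximum, so target_q is the standard Q-learning target r + discount_factor * max_a Q_target(s', a), with the bootstrap term zeroed on terminal transitions by the (~dones) factor. A small self-contained sketch with made-up numbers, independent of the classes above:

import numpy as np

# Made-up batch of 3 transitions with 2 actions each.
target_vals = np.array([[1.0, 3.0],
                        [0.5, 0.2],
                        [2.0, 2.5]])
rewards = np.array([1.0, 0.0, -1.0])
dones = np.array([False, True, False])
discount_factor = 0.99

target_ind = np.argmax(target_vals, axis=1)
target_max = target_vals[np.arange(target_vals.shape[0]), target_ind]
# Equivalent to: target_max = target_vals.max(axis=1)

target_q = rewards + discount_factor * target_max * (~dones)
# -> [1.0 + 0.99 * 3.0, 0.0, -1.0 + 0.99 * 2.5]

Note that both the action selection (argmax) and its evaluation use the target network here; taking the argmax from q_network instead, and only reading that action's value from the target network, would turn this into the Double DQN target.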
Example no. 11
 def pre_train_setup(self, env: Environment, **kwargs):
     self.action_space = env.action_space
     state = env.reset()
     self.replay_buffer = Memory(state, np.int32, self.replay_buffer_size)
     # To ensure that we have the next state after doing the first step.
     self.play_steps(env, n_steps=1, storage=self.replay_buffer)