Example 1
def __init__(self, actor_id, env_conf, shared_state, shared_replay_mem,
             actor_params):
    super(Actor, self).__init__()
    self.actor_id = actor_id  # Used to compose a unique key for the transitions generated by each actor
    state_shape = tuple(env_conf['state_shape'])
    action_dim = env_conf['action_dim']
    self.params = actor_params
    self.shared_state = shared_state
    self.T = self.params["T"]
    self.Q = DuellingDQN(state_shape, action_dim)
    self.Q.load_state_dict(shared_state["Q_state_dict"])
    self.env = make_local_env(env_conf['name'])
    self.policy = self.epsilon_greedy_Q
    self.local_experience_buffer = ExperienceBuffer(
        self.params["num_steps"], self.actor_id)
    self.global_replay_queue = shared_replay_mem
    eps = self.params['epsilon']
    N = self.params['num_actors']
    alpha = self.params['alpha']
    self.epsilon = eps**(1 + alpha * self.actor_id / (N - 1))
    self.gamma = self.params['gamma']
    self.num_buffered_steps = 0  # Used to compose a unique key for the transitions generated by each actor
    self.rgb2gray = lambda x: np.dot(
        x, np.array([[0.299, 0.587, 0.114]]).T)  # RGB to Gray scale
    self.torch_shape = lambda x: np.reshape(
        self.rgb2gray(x), (1, x.shape[1], x.shape[0]))  # WxHxC to CxWxH
    self.obs_preproc = lambda x: np.resize(self.torch_shape(x), state_shape)
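The per-actor exploration rate above follows eps**(1 + alpha * actor_id / (N - 1)), so each actor runs with its own fixed epsilon. A minimal sketch of the resulting schedule, assuming illustrative values eps=0.4, alpha=7 and num_actors=4 (placeholder numbers, not taken from a real config):

# Hypothetical illustration of the per-actor epsilon schedule used in __init__ above.
eps, alpha, N = 0.4, 7, 4  # assumed values, for illustration only
for actor_id in range(N):
    print(actor_id, eps ** (1 + alpha * actor_id / (N - 1)))
# Actor 0 keeps epsilon = eps (most exploratory); higher-indexed actors get
# progressively smaller epsilon and therefore act more greedily.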
Example 2
def __init__(self, env_conf, learner_params, shared_state,
             shared_replay_memory):
    self.state_shape = env_conf['state_shape']
    action_dim = env_conf['action_dim']
    self.params = learner_params
    self.shared_state = shared_state
    self.Q = DuellingDQN(self.state_shape, action_dim)
    self.Q_double = DuellingDQN(
        self.state_shape, action_dim
    )  # Target Q network: a slow-moving replica of self.Q
    if self.params['load_saved_state']:
        try:
            saved_state = torch.load(self.params['load_saved_state'])
            self.Q.load_state_dict(saved_state['Q_state'])
        except FileNotFoundError:
            print("WARNING: No trained model found. Training from scratch")
    self.shared_state["Q_state_dict"] = self.Q.state_dict()
    self.replay_memory = shared_replay_memory
    self.optimizer = torch.optim.RMSprop(self.Q.parameters(),
                                         lr=0.00025 / 4,
                                         weight_decay=0.95,
                                         eps=1.5e-7)
    self.num_q_updates = 0
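This constructor and the learn() loop shown in Example 5 below read several keys from learner_params. A sketch of the expected dict follows; the key names come from the code, while the values are placeholder assumptions:

learner_params = {
    "load_saved_state": "",        # path to a saved checkpoint; leave falsy to train from scratch
    "q_target_sync_freq": 100,     # assumed: sync Q_double with Q every 100 updates
    "min_replay_mem_size": 1000,   # assumed: wait for this many transitions before learning starts
    "replay_sample_size": 32,      # assumed: prioritized batch size per update
    "remove_old_xp_freq": 100,     # assumed: how often old experience is evicted from replay
}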
Example 3
class Actor(mp.Process):
    def __init__(self, actor_id, env_conf, shared_state, shared_replay_mem,
                 actor_params):
        super(Actor, self).__init__()
        self.actor_id = actor_id  # Used to compose a unique key for the transitions generated by each actor
        state_shape = tuple(env_conf['state_shape'])
        action_dim = env_conf['action_dim']
        self.params = actor_params
        self.shared_state = shared_state
        self.T = self.params["T"]
        self.Q = DuellingDQN(state_shape, action_dim)
        self.Q.load_state_dict(shared_state["Q_state_dict"])
        self.env = make_local_env(env_conf['name'])
        self.policy = self.epsilon_greedy_Q
        self.local_experience_buffer = ExperienceBuffer(
            self.params["num_steps"], self.actor_id)
        self.global_replay_queue = shared_replay_mem
        eps = self.params['epsilon']
        N = self.params['num_actors']
        alpha = self.params['alpha']
        self.epsilon = eps**(1 + alpha * self.actor_id / (N - 1))
        self.gamma = self.params['gamma']
        self.num_buffered_steps = 0  # Used to compose a unique key for the transitions generated by each actor
        self.rgb2gray = lambda x: np.dot(
            x, np.array([[0.299, 0.587, 0.114]]).T)  # RGB to Gray scale
        self.torch_shape = lambda x: np.reshape(
            self.rgb2gray(x), (1, x.shape[1], x.shape[0]))  # WxHxC to CxWxH
        self.obs_preproc = lambda x: np.resize(self.torch_shape(x),
                                               state_shape)

    def epsilon_greedy_Q(self, qS_t):
        if random.random() >= self.epsilon:
            return np.argmax(qS_t)
        else:
            return random.choice(list(range(len(qS_t))))

    def compute_priorities(self, n_step_transitions):
        n_step_transitions = N_Step_Transition(*zip(*n_step_transitions))
        # Convert tuple to numpy array
        rew_t_to_tpB = np.array(n_step_transitions.R_ttpB)
        gamma_t_to_tpB = np.array(n_step_transitions.Gamma_ttpB)
        qS_tpn = np.array(n_step_transitions.qS_tpn)
        A_t = np.array(n_step_transitions.A_t, dtype=np.int64)
        qS_t = np.array(n_step_transitions.qS_t)

        #print("np.max(qS_tpn,1):", np.max(qS_tpn, 1))
        #  Calculate the absolute n-step TD errors
        n_step_td_target = rew_t_to_tpB + gamma_t_to_tpB * np.max(qS_tpn, 1)
        #print("td_target:", n_step_td_target)
        n_step_td_error = n_step_td_target - np.array(
            [qS_t[i, A_t[i]] for i in range(A_t.shape[0])])
        #print("td_err:", n_step_td_error)
        priorities = {
            k: abs_err
            for k, abs_err in zip(n_step_transitions.key,
                                  np.abs(n_step_td_error))
        }
        return priorities

    def run(self):
        """
        Gather experiences using the Actor's policy and the Actor's environment instance.
          - Periodically syncs the parameters of the Q network used by the Actor with the latest
            Q parameters made available by the Learner process.
          - Stores the single-step transitions and the n-step transitions in a local experience buffer.
          - Periodically flushes the n-step transition experiences to the global replay queue.
        The total number of time steps to gather experience for is given by self.T.
        :return: None
        """
        # 3. Get initial state from environment
        obs = self.obs_preproc(self.env.reset())
        ep_reward = []
        for t in range(self.T):
            with torch.no_grad():
                qS_t = self.Q(torch.from_numpy(obs).unsqueeze(
                    0).float())[2].squeeze().numpy()
            # 5. Select the action using the current policy
            action = self.policy(qS_t)
            # 6. Apply action in the environment
            next_obs, reward, done, _ = self.env.step(action)
            # 7. Add data to local buffer
            self.local_experience_buffer.add(
                Transition(obs, action, reward, self.gamma, qS_t))
            obs = self.obs_preproc(next_obs)
            ep_reward.append(reward)
            print("Actor#",
                  self.actor_id,
                  "t=",
                  t,
                  "action=",
                  action,
                  "reward:",
                  reward,
                  "1stp_buf_size:",
                  self.local_experience_buffer.B,
                  end='\r')

            if done:  # Not mentioned in the paper's algorithm
                # Truncate the n-step transition as the episode has ended; NOTE: Reward is set to 0
                self.local_experience_buffer.construct_nstep_transition(
                    Transition(obs, action, 0, self.gamma, qS_t))
                # Reset the environment
                obs = self.obs_preproc(self.env.reset())
                print("Actor#:", self.actor_id, "t:", t, "  ep_len:",
                      len(ep_reward), "  ep_reward:", np.sum(ep_reward))
                ep_reward = []

            # 8. Periodically send data to replay
            if self.local_experience_buffer.size >= self.params[
                    'n_step_transition_batch_size']:
                # 9. Get batches of multi-step transitions
                n_step_experience_batch = self.local_experience_buffer.get(
                    self.params['n_step_transition_batch_size'])
                # 10.Calculate the priorities for experience
                priorities = self.compute_priorities(n_step_experience_batch)
                # 11. Send the experience to the global replay memory
                self.global_replay_queue.put(
                    [priorities, n_step_experience_batch])

            if t % self.params['Q_network_sync_freq'] == 0:
                # 13. Obtain latest network parameters
                self.Q.load_state_dict(self.shared_state["Q_state_dict"])
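Since Actor subclasses mp.Process, several actors can be launched as separate processes that share the latest Q parameters and a common replay queue. A minimal sketch, assuming env_conf and params dicts like the ones in the standalone test of Example 4 below (DuellingDQN and Actor are the classes defined in this code):

import multiprocessing as mp

mp_manager = mp.Manager()
shared_state = mp_manager.dict()
shared_state["Q_state_dict"] = DuellingDQN(env_conf["state_shape"],
                                           env_conf["action_dim"]).state_dict()
shared_replay_mem = mp_manager.Queue()

actors = [Actor(i, env_conf, shared_state, shared_replay_mem, params)
          for i in range(params["num_actors"])]
for a in actors:
    a.start()   # each child process executes Actor.run() for params["T"] steps
for a in actors:
    a.join()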
Example 4
if __name__ == "__main__":
    """ 
    Simple standalone test routine for Actor class
    """
    env_conf = {
        "state_shape": (1, 84, 84),
        "action_dim": 4,
        "name": "Breakout-v0"
    }
    params = {
        "local_experience_buffer_capacity": 10,
        "epsilon": 0.4,
        "alpha": 7,
        "gamma": 0.99,
        "num_actors": 2,
        "n_step_transition_batch_size": 5,
        "Q_network_sync_freq": 10,
        "num_steps": 3,
        "T": 101
    }
    dummy_q = DuellingDQN(env_conf['state_shape'], env_conf['action_dim'])
    mp_manager = mp.Manager()
    shared_state = mp_manager.dict()
    shared_state["Q_state_dict"] = dummy_q.state_dict()
    shared_replay_mem = mp_manager.Queue()
    actor = Actor(1, env_conf, shared_state, shared_replay_mem, params)
    actor.run()  # run the actor for params["T"] steps in the current process
    print("Main: replay_mem.size:", shared_replay_mem.qsize())
    for i in range(shared_replay_mem.qsize()):
        p, xp_batch = shared_replay_mem.get()
        print("priority:", p)
Example 5
class Learner(object):
    def __init__(self, env_conf, learner_params, shared_state,
                 shared_replay_memory):
        self.state_shape = env_conf['state_shape']
        action_dim = env_conf['action_dim']
        self.params = learner_params
        self.shared_state = shared_state
        self.Q = DuellingDQN(self.state_shape, action_dim)
        self.Q_double = DuellingDQN(
            self.state_shape, action_dim
        )  # Target Q network: a slow-moving replica of self.Q
        if self.params['load_saved_state']:
            try:
                saved_state = torch.load(self.params['load_saved_state'])
                self.Q.load_state_dict(saved_state['Q_state'])
            except FileNotFoundError:
                print("WARNING: No trained model found. Training from scratch")
        self.shared_state["Q_state_dict"] = self.Q.state_dict()
        self.replay_memory = shared_replay_memory
        self.optimizer = torch.optim.RMSprop(self.Q.parameters(),
                                             lr=0.00025 / 4,
                                             weight_decay=0.95,
                                             eps=1.5e-7)
        self.num_q_updates = 0

    def compute_loss_and_priorities(self, xp_batch):
        """
        Computes the double-Q learning loss and the proportional experience priorities.
        :param xp_batch: list of experiences of type N_Step_Transition
        :return: double-Q learning loss and the proportional experience priorities
        """
        n_step_transitions = N_Step_Transition(*zip(*xp_batch))
        # Convert tuple to numpy array; Convert observations(S_t and S_tpn) to c x w x h torch Tensors (aka Variable)
        S_t = torch.from_numpy(np.array(
            n_step_transitions.S_t)).float().requires_grad_(True)
        S_tpn = torch.from_numpy(np.array(
            n_step_transitions.S_tpn)).float().requires_grad_(True)
        rew_t_to_tpB = np.array(n_step_transitions.R_ttpB)
        gamma_t_to_tpB = np.array(n_step_transitions.Gamma_ttpB)
        A_t = np.array(n_step_transitions.A_t)

        with torch.no_grad():
            G_t = rew_t_to_tpB + gamma_t_to_tpB * \
                             self.Q_double(S_tpn)[2].gather(1, torch.argmax(self.Q(S_tpn)[2], 1).view(-1, 1)).squeeze()
        Q_S_A = self.Q(S_t)[2].gather(1,
                                      torch.from_numpy(A_t).reshape(
                                          -1, 1)).squeeze()
        batch_td_error = G_t.float() - Q_S_A
        loss = 1 / 2 * (batch_td_error)**2
        # Compute the new priorities of the experience
        priorities = {
            k: v
            for k, v in zip(n_step_transitions.key,
                            np.abs(batch_td_error.detach().numpy()))
        }

        return loss.mean(), priorities

    def update_Q(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.num_q_updates += 1

        if self.num_q_updates % self.params['q_target_sync_freq'] == 0:
            self.Q_double.load_state_dict(self.Q.state_dict())

    def learn(self, T):
        while self.replay_memory.size() <= self.params["min_replay_mem_size"]:
            time.sleep(1)
        for t in range(T):
            # 4. Sample a prioritized batch of transitions
            prioritized_xp_batch = self.replay_memory.sample(
                int(self.params['replay_sample_size']))
            # 5. & 7. Apply double-Q learning rule, compute loss and experience priorities
            loss, priorities = self.compute_loss_and_priorities(
                prioritized_xp_batch)
            #print("\nLearner: t=", t, "loss:", loss, "RPM.size:", self.replay_memory.size(), end='\r')
            # 6. Update parameters of the Q network(s)
            self.update_Q(loss)
            self.shared_state['Q_state_dict'] = self.Q.state_dict()
            # 8. Update priorities
            self.replay_memory.set_priorities(priorities)

            # 9. Periodically remove old experience from replay memory
            if t % self.params['remove_old_xp_freq'] == 0:
                self.replay_memory.remove_to_fit()
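For intuition, the target and priority computed in compute_loss_and_priorities() follow the n-step double-Q rule: the online network self.Q selects the greedy action at S_tpn, the target network self.Q_double evaluates it, and the absolute TD error becomes the proportional priority. A small numeric sketch with made-up numbers (all values below are illustrative, not taken from a real run):

import numpy as np

R_ttpB = 1.0                                       # assumed n-step discounted return from t to t+n
Gamma_ttpB = 0.99 ** 3                             # assumed accumulated discount for n = 3 steps
q_tpn_online = np.array([0.2, 0.9, 0.4, 0.1])      # Q(S_tpn, .) from the online network (made up)
q_tpn_target = np.array([0.3, 0.7, 0.5, 0.0])      # Q_double(S_tpn, .) from the target network (made up)

a_star = np.argmax(q_tpn_online)                   # action selection by the online network
G_t = R_ttpB + Gamma_ttpB * q_tpn_target[a_star]   # evaluation by the target network
Q_S_A = 1.5                                        # assumed current estimate Q(S_t, A_t)

td_error = G_t - Q_S_A
loss = 0.5 * td_error ** 2                         # matches 1/2 * (batch_td_error)**2 above
priority = abs(td_error)                           # proportional priority sent back to the replay memory
print(G_t, td_error, loss, priority)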