import os
import random

import torch
import torch.nn.functional as F
import torch.optim as optim

# CriticNetwork (and the companion Actor agent passed into step/learn) are
# assumed to be defined elsewhere in the project.


class Critic:

    def __init__(self, device, state_size, action_size, random_seed,
                 gamma, TAU, lr, weight_decay,
                 checkpoint_folder='./Saved_Model/'):
        self.DEVICE = device

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR = lr
        self.WEIGHT_DECAY = weight_decay
        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Critic Network (w/ Target Network)
        self.local = CriticNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.target = CriticNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR,
                                    weight_decay=self.WEIGHT_DECAY)

        # Resume from a previous checkpoint if one exists
        self.checkpoint_full_name = self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth'
        if os.path.isfile(self.checkpoint_full_name):
            state_dict = torch.load(self.checkpoint_full_name, map_location=self.DEVICE)
            self.local.load_state_dict(state_dict)
            self.target.load_state_dict(state_dict)

    def step(self, actor, memory):
        # Learn, if enough samples are available in memory
        experiences = memory.sample()
        if not experiences:
            return

        self.learn(actor, experiences)

    def learn(self, actor, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            target(state, action) -> Q-value

        Params
        ======
            actor: Actor agent whose local/target networks are updated alongside the critic
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = actor.target(next_states)
        Q_targets_next = self.target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.local.parameters(), 1)
        self.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = actor.local(states)
        actor_loss = -self.local(states, actions_pred).mean()
        # Minimize the loss
        actor.optimizer.zero_grad()
        actor_loss.backward()
        actor.optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.local, self.target)
        self.soft_update(actor.local, actor.target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        tau = self.TAU
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def checkpoint(self):
        torch.save(self.local.state_dict(), self.checkpoint_full_name)
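# --- Illustrative sketch (not part of the original agent code) --------------
# A minimal, self-contained demonstration of the two update rules the Critic
# above relies on: the TD target  Q_targets = r + gamma * Q_next * (1 - done)
# and the soft update  theta_target = tau*theta_local + (1 - tau)*theta_target.
# The toy tensors and the two small linear layers are assumptions made purely
# for illustration; they are not produced by the agent.

import torch
import torch.nn as nn

gamma, tau = 0.99, 1e-3

# Toy batch: rewards, bootstrapped next-state Q values, and done flags
rewards = torch.tensor([[1.0], [0.0], [0.5]])
q_next = torch.tensor([[2.0], [3.0], [1.0]])
dones = torch.tensor([[0.0], [0.0], [1.0]])  # the last transition is terminal

# Terminal transitions are not bootstrapped because (1 - done) zeroes the tail
q_targets = rewards + gamma * q_next * (1 - dones)
print(q_targets)  # -> 2.98, 2.97, 0.50

# Soft update between two toy "networks", exactly as Critic.soft_update does
local, target = nn.Linear(4, 1), nn.Linear(4, 1)
for target_param, local_param in zip(target.parameters(), local.parameters()):
    target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)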
import gym
import numpy as np
import torch

# ActorNetwork and CriticNetwork are assumed to be defined elsewhere in the project.


class Actor(object):

    def __init__(self, opt, actor_id, q_trace, learner):
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed + actor_id)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.n_episodes = 0
        self.n_steps = 0

        self.gamma = opt.gamma

        # Per-actor epsilon for exploration
        self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
            if opt.n_actors > 1 else 0.4

        # Models
        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)

    def performing(self):
        torch.manual_seed(self.opt.seed)

        while True:
            self.load_model()
            self.train_episode()

            if self.n_episodes % 100 == 0:
                rewards = self.evaluation(self.env)
                rewards_mu = np.array(
                    [np.sum(np.array(l_i), 0) for l_i in rewards]).mean()
                print("Episode %d, Average Reward %.2f" % (self.n_episodes, rewards_mu))

    def _softmax_action(self, state):
        state = torch.FloatTensor([state]).to(self.device)
        # The actor outputs log-probabilities; exponentiate to get action probabilities
        softmax_action = torch.exp(self.actor(state))
        softmax_action = softmax_action.cpu().detach().numpy()
        return softmax_action

    def exploration_action(self, state):
        softmax_action = self._softmax_action(state)
        # Epsilon-greedy: act greedily with probability (1 - eps), otherwise explore
        if np.random.rand() > self.eps_greedy:
            return np.argmax(softmax_action)
        else:
            return np.random.choice(self.n_act)

    def train_episode(self):
        done = False
        state = self.env.reset()
        self.env_state = state
        self.next_done = done

        while not done:
            self.n_steps += 1
            states = np.zeros((self.opt.n_step, self.n_state))
            actions = np.zeros(self.opt.n_step)
            rewards = np.zeros(self.opt.n_step)
            log_probs = np.zeros((self.opt.n_step, self.n_act))
            dones = np.ones(self.opt.n_step)

            for i in range(self.opt.n_step):
                states[i] = self.env_state
                dones[i] = self.next_done
                log_prob = self.actor(
                    torch.FloatTensor([self.env_state]).to(
                        self.device)).detach().cpu().numpy()[0]
                action = self.exploration_action(self.env_state)
                next_state, reward, done, info = self.env.step(action)

                # Reshape the reward: 0 every step, +1/-1 only at the end of the episode
                reward = 0
                if done:
                    reward = 1 if self.n_steps > 190 else -1

                log_probs[i] = log_prob
                actions[i] = action
                rewards[i] = reward
                self.env_state = next_state
                self.next_done = done
                if done:
                    self.env_state = self.env.reset()
                    break

            # End of the n-step rollout
            if done:
                self.n_steps = 0
                self.n_episodes += 1
                self.episode_done = True
            else:
                self.episode_done = False

            self.q_trace.put((states, actions, rewards, dones, log_probs),
                             block=True)

    # Choose an action based on state for execution (greedy, no exploration)
    def action(self, state):
        softmax_action = self._softmax_action(state)
        action = np.argmax(softmax_action)
        return action

    def value(self, state):
        # Output the critic's value estimate for the given state
        state_var = torch.FloatTensor([state]).to(self.device)
        q_var = self.critic(state_var)
        q = q_var.cpu().detach().numpy()
        return q

    def _discount_reward(self, rewards, final_value):
        discounted_r = np.zeros_like(rewards)
        R = final_value  # Q(s_t, a_t)
        for t in reversed(range(0, len(rewards))):
            R = rewards[t] + self.gamma * R
            discounted_r[t] = R
        return discounted_r

    def evaluation(self, env_eval):
        rewards = []
        for i in range(10):
            rewards_i = []
            state = env_eval.reset()
            action = self.action(state)
            state, reward, done, _ = env_eval.step(action)
            rewards_i.append(reward)
            while not done:
                action = self.action(state)
                state, reward, done, _ = env_eval.step(action)
                rewards_i.append(reward)
            rewards.append(rewards_i)
        return rewards

    def load_model(self):
        # Pull the latest network weights from the central learner
        try:
            self.actor.load_state_dict(self.learner.actor.state_dict())
            self.critic.load_state_dict(self.learner.critic.state_dict())
        except Exception:
            print('load error')
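# --- Illustrative sketch (not part of the original agent code) --------------
# A minimal, self-contained demonstration of two pieces of the Actor above:
# the n-step discounted return computed by _discount_reward, which folds the
# rewards backwards from a bootstrap value R = V(s_{t+n}) via
# R_t = r_t + gamma * R_{t+1}, and the per-actor epsilon schedule used in
# __init__. The toy rollout and the actor count are assumed values chosen
# only for illustration.

import numpy as np

gamma = 0.9
rewards = np.array([0.0, 0.0, 1.0])  # assumed 3-step rollout
final_value = 2.0                    # assumed critic bootstrap V(s_{t+3})

discounted = np.zeros_like(rewards)
R = final_value
for t in reversed(range(len(rewards))):
    R = rewards[t] + gamma * R
    discounted[t] = R
print(discounted)  # -> [2.268 2.52  2.8  ]

# Per-actor exploration rates for, e.g., 4 actors
n_actors = 4
eps = [0.4 ** (1 + i * 7 / (n_actors - 1)) for i in range(n_actors)]
print(eps)  # roughly [0.4, 0.047, 0.0056, 0.00066]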