class Actor:
    """DDPG-style actor: owns the local/target policy networks for one agent.

    Handles action selection (with optional exploration noise), storing
    transitions into a shared replay memory, and checkpointing the local
    network.  The learning step itself is driven elsewhere.
    """

    def __init__(self, device, key, state_size, action_size, random_seed,
                 memory, noise, lr, weight_decay,
                 checkpoint_folder='./Saved_Model/'):
        """
        Params
        ======
            device: torch device that networks and tensors are moved to
            key: identifier used to name this agent's checkpoint file
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): seed forwarded to the networks and stdlib RNG
            memory: shared replay buffer exposing .add(...)
            noise: exploration-noise process exposing .sample() / .reset()
            lr (float): learning rate for the actor optimizer
            weight_decay (float): stored hyperparameter
                NOTE(review): never passed to the optimizer — confirm intent.
            checkpoint_folder (str): directory holding saved weights
        """
        self.DEVICE = device
        self.KEY = key

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; attribute kept for interface compatibility.
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.LR = lr
        self.WEIGHT_DECAY = weight_decay
        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Actor Network (w/ Target Network)
        self.local = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.target = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR)

        # os.path.join handles folders with or without a trailing slash.
        self.checkpoint_full_name = os.path.join(
            self.CHECKPOINT_FOLDER, 'checkpoint_actor_' + str(self.KEY) + '.pth')
        if os.path.isfile(self.checkpoint_full_name):
            # Load the file once (it was deserialized twice before) and map it
            # onto the current device so a GPU-saved checkpoint still loads on
            # a CPU-only machine.
            state_dict = torch.load(self.checkpoint_full_name, map_location=self.DEVICE)
            self.local.load_state_dict(state_dict)
            self.target.load_state_dict(state_dict)

        # Replay memory (shared; this class only writes into it)
        self.memory = memory

        # Noise process
        self.noise = noise

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.DEVICE)

        # eval/no_grad for a pure inference pass, then restore train mode.
        self.local.eval()
        with torch.no_grad():
            action = self.local(state).cpu().data.numpy()
        self.local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory for a later learning step."""
        self.memory.add(state, action, reward, next_state, done)

    def reset(self):
        """Reset the exploration-noise process (call at episode start)."""
        self.noise.reset()

    def checkpoint(self):
        """Persist the local network weights to the checkpoint file."""
        torch.save(self.local.state_dict(), self.checkpoint_full_name)
class Agent():
    """Interacts with and learns from the environment.

    MADDPG-style agent: owns its actor (local + target) networks, while the
    critic is the module-level shared ``mCritic`` wrapper.  ``GAMMA`` and
    ``TAU`` are likewise module-level hyperparameters defined elsewhere in
    this file.
    """

    def __init__(self, state_size, action_size, agent_id, args):
        """
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            agent_id (int): 0 or 1; selects which pretrained weight file to load
            args (dict): hyperparameters — expects 'seed', 'device',
                'LR_ACTOR', 'BUFFER_SIZE', 'BATCH_SIZE', 'agent_p0_path',
                'agent_p1_path'
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Actor network (w/ target network)
        self.actor_network = ActorNetwork(state_size, action_size, args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size, args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(),
                                          lr=args['LR_ACTOR'])

        # Model takes too long to run --> load model weights from a previous
        # run (took > 24 hours on my machine).  Player 0 and player 1 keep
        # separate files; skip loading when the file is absent instead of
        # crashing (mirrors the checkpoint guard used by Actor in this file).
        weight_path = args['agent_p0_path'] if not agent_id else args['agent_p1_path']
        if os.path.isfile(weight_path):
            # map_location so GPU-saved weights also load on CPU-only machines;
            # load once instead of deserializing the file twice.
            state_dict = torch.load(weight_path, map_location=self.device)
            self.actor_network.load_state_dict(state_dict, strict=False)
            self.actor_target.load_state_dict(state_dict, strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn once enough samples are buffered."""
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):
        """Return a noisy action for the given state under the current policy."""
        input_state = torch.from_numpy(current_state).float().to(self.device)

        # Single no_grad block (the original nested torch.no_grad() twice).
        self.actor_network.eval()
        with torch.no_grad():
            action = self.actor_network(input_state).cpu().data.numpy()
        self.actor_network.train()

        # Exploration noise is always added here (no add_noise switch).
        action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration-noise process (call at episode start)."""
        self.noise.reset()

    def train(self, experiences):
        """One DDPG update of the shared critic and this agent's actor.

        Relies on module-level ``mCritic`` (shared critic wrapper exposing
        .network/.target/.optimizer) plus ``GAMMA`` and ``TAU``.
        """
        # (Removed five unused ``global`` declarations left over from debugging.)
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: maximize Q under the current policy.
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Actor(object):
    """Asynchronous rollout worker for a shared actor-critic learner.

    Repeatedly syncs weights from the learner, collects n-step transition
    chunks from its own gym environment with epsilon-greedy exploration,
    and pushes the chunks onto the shared trace queue.
    """

    def __init__(self, opt, actor_id, q_trace, learner):
        """
        Params
        ======
            opt: options namespace (env, seed, gamma, n_step, n_actors, ...)
            actor_id (int): worker index; offsets the env seed and sets a
                per-actor exploration epsilon
            q_trace: queue receiving (states, actions, rewards, dones,
                log_probs) rollout chunks
            learner: shared learner exposing .actor / .critic modules
        """
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = gym.make(self.opt.env)
        self.env.seed(self.opt.seed + actor_id)
        self.n_state = self.env.observation_space.shape[0]
        self.n_act = self.env.action_space.n

        self.n_episodes = 0
        self.n_steps = 0
        self.gamma = opt.gamma

        # Per-actor epsilon, spread geometrically across workers
        # (Ape-X-style schedule); single worker uses the base 0.4.
        self.eps_greedy = 0.4 ** (1 + actor_id * 7 / (opt.n_actors - 1)) \
            if opt.n_actors > 1 else 0.4

        # Local copies of the learner's models.
        self.actor = ActorNetwork(self.n_state, self.n_act).to(self.device)
        self.critic = CriticNetwork(self.n_state).to(self.device)

    def performing(self):
        """Main worker loop: sync weights, roll out, report periodically."""
        torch.manual_seed(self.opt.seed)

        while True:
            self.load_model()
            self.train_episode()

            if self.n_episodes % 100 == 0:
                rewards = self.evaluation(self.env)
                rewards_mu = np.array(
                    [np.sum(np.array(l_i), 0) for l_i in rewards]).mean()
                print("Episode %d, Average Reward %.2f" %
                      (self.n_episodes, rewards_mu))

    def _softmax_action(self, state):
        """Exponentiate the actor's log-probabilities into action probabilities."""
        state = torch.FloatTensor([state]).to(self.device)
        softmax_action = torch.exp(self.actor(state))
        return softmax_action.cpu().detach().numpy()

    def exploration_action(self, state):
        """Epsilon-greedy: greedy action w.p. (1 - eps), else uniform random."""
        softmax_action = self._softmax_action(state)
        if np.random.rand() > self.eps_greedy:
            return np.argmax(softmax_action)
        else:
            return np.random.choice(self.n_act)

    def train_episode(self):
        """Collect n-step chunks until the episode ends, enqueueing each chunk."""
        done = False
        state = self.env.reset()
        self.env_state = state
        self.next_done = done

        while not done:
            self.n_steps += 1
            states = np.zeros((self.opt.n_step, self.n_state))
            actions = np.zeros(self.opt.n_step)
            rewards = np.zeros(self.opt.n_step)
            log_probs = np.zeros((self.opt.n_step, self.n_act))
            dones = np.ones(self.opt.n_step)

            for i in range(self.opt.n_step):
                states[i] = self.env_state
                dones[i] = self.next_done
                # BUGFIX: act on the *current* env state; the original used
                # the stale episode-start ``state`` for every step, so the
                # whole rollout was conditioned on the first observation.
                log_prob = self.actor(
                    torch.FloatTensor([self.env_state]).to(
                        self.device)).detach().cpu().numpy()[0]
                action = self.exploration_action(self.env_state)
                next_state, reward, done, info = self.env.step(action)

                # Reward shaping: 0 while running, +1/-1 at episode end
                # depending on survival length (CartPole-style).
                reward = 0
                if done:
                    if self.n_steps > 190:
                        reward = 1
                    else:
                        reward = -1

                log_probs[i] = log_prob
                actions[i] = action
                rewards[i] = reward
                self.env_state = next_state
                self.next_done = done

                if done:
                    self.env_state = self.env.reset()
                    break

            # n-step chunk finished
            if done:
                self.n_steps = 0
                self.n_episodes += 1
                self.episode_done = True
            else:
                self.episode_done = False

            self.q_trace.put((states, actions, rewards, dones, log_probs),
                             block=True)

    def action(self, state):
        """Choose the greedy action based on the state (for evaluation)."""
        softmax_action = self._softmax_action(state)
        return np.argmax(softmax_action)

    def value(self, state):
        """Return the critic's value estimate for the given state."""
        state_var = torch.FloatTensor([state]).to(self.device)
        q_var = self.critic(state_var)
        return q_var.cpu().detach().numpy()

    def _discount_reward(self, rewards, final_value):
        """Discounted returns, bootstrapped from final_value = Q(s_t, a_t)."""
        discounted_r = np.zeros_like(rewards)
        R = final_value
        for t in reversed(range(0, len(rewards))):
            R = rewards[t] + self.gamma * R
            discounted_r[t] = R
        return discounted_r

    def evaluation(self, env_eval):
        """Run 10 greedy episodes; return the per-episode reward lists."""
        rewards = []
        for i in range(10):
            rewards_i = []
            state = env_eval.reset()
            action = self.action(state)
            state, reward, done, _ = env_eval.step(action)
            rewards_i.append(reward)
            while not done:
                action = self.action(state)
                state, reward, done, _ = env_eval.step(action)
                rewards_i.append(reward)
            rewards.append(rewards_i)
        return rewards

    def load_model(self):
        """Best-effort sync of the local nets from the shared learner."""
        try:
            self.actor.load_state_dict(self.learner.actor.state_dict())
            self.critic.load_state_dict(self.learner.critic.state_dict())
        except Exception as exc:  # narrowed from bare except; keep best-effort
            print('load error', exc)