class TestCritic(unittest.TestCase):
    def setUp(self):
        self.state_dim = (2, 80, 80)
        self.critic = Critic()

    def test_forward(self):
        n = 2
        batch = torch.tensor(np.random.random_sample((n,) + self.state_dim),
                             dtype=torch.float)
        values = self.critic.forward(batch)
        self.assertEqual((n, 1), values.size())
class DDPG:
    def __init__(self, state_dim, action_dim):
        self.critic = Critic(state_dim, action_dim).to(device)
        self.target_c = copy.deepcopy(self.critic)
        self.actor = Actor(state_dim).to(device)
        self.target_a = copy.deepcopy(self.actor)
        self.optimizer_c = optim.Adam(self.critic.parameters(), lr=LR)
        self.optimizer_a = optim.Adam(self.actor.parameters(), lr=LR)

    def act(self, state):
        state = torch.from_numpy(np.array(state)).float().to(device)
        return self.actor.forward(state).detach().squeeze(0).cpu().numpy()

    def update(self, batch):
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.from_numpy(np.array(states)).float().to(device)
        actions = torch.from_numpy(np.array(actions)).float().to(device)
        rewards = torch.from_numpy(
            np.array(rewards)).float().to(device).unsqueeze(1)
        next_states = torch.from_numpy(
            np.array(next_states)).float().to(device)
        dones = torch.from_numpy(
            np.array(dones, dtype=np.float32)).to(device).unsqueeze(1)

        Q_current = self.critic(states, actions)
        Q_next = self.target_c(next_states,
                               self.target_a(next_states).detach())
        # Bootstrap only from non-terminal next states.
        y = (rewards + GAMMA * (1 - dones) * Q_next).detach()

        ##################Update critic#######################
        loss_c = F.mse_loss(Q_current, y)
        self.optimizer_c.zero_grad()
        loss_c.backward()
        self.optimizer_c.step()

        ##################Update actor#######################
        loss_a = -self.critic.forward(states, self.actor(states)).mean()
        self.optimizer_a.zero_grad()
        loss_a.backward()
        self.optimizer_a.step()

        ##################Update targets#######################
        for target_pr, pr in zip(self.target_a.parameters(),
                                 self.actor.parameters()):
            target_pr.data.copy_(TAU * pr.data + (1 - TAU) * target_pr.data)
        for target_pr, pr in zip(self.target_c.parameters(),
                                 self.critic.parameters()):
            target_pr.data.copy_(TAU * pr.data + (1 - TAU) * target_pr.data)
class TestCritic(unittest.TestCase):
    def setUp(self):
        self.state_dim = 24 * 2
        self.action_dim = 2 * 2
        self.critic = Critic(state_dim=self.state_dim,
                             action_dim=self.action_dim,
                             fc1_units=64,
                             fc2_units=64,
                             seed=0)

    def test_forward(self):
        n = 2
        states = torch.tensor(np.random.random_sample((n, self.state_dim)),
                              dtype=torch.float)
        actions = torch.tensor(np.random.random_sample((n, self.action_dim)),
                               dtype=torch.float)
        values = self.critic.forward(states, actions)
        self.assertEqual((n, 1), values.size())
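# The vector-input test above pins down only the Critic's interface: a
# constructor taking (state_dim, action_dim, fc1_units, fc2_units, seed) and a
# forward pass mapping a (batch, state_dim) state tensor and a
# (batch, action_dim) action tensor to a (batch, 1) value.  A minimal
# fully-connected critic consistent with those assumptions is sketched below;
# the layer wiring is illustrative, not the project's actual implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim, fc1_units=64, fc2_units=64,
                 seed=0):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_dim, fc1_units)
        # Concatenate the action after the first hidden layer, a common
        # choice for DDPG critics.
        self.fc2 = nn.Linear(fc1_units + action_dim, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, states, actions):
        x = F.relu(self.fc1(states))
        x = F.relu(self.fc2(torch.cat([x, actions], dim=1)))
        return self.fc3(x)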
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if len(envs.observation_space.shape) == 3: actor_critic = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) target_actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) critic = Critic(in_channels=4, num_actions=envs.action_space.n) critic_target = Critic(in_channels=4, num_actions=envs.action_space.n) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if args.cuda: actor_critic.cuda() critic.cuda() critic_target.cuda() target_actor.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) critic_optim = optim.Adam(critic.parameters(), lr=1e-4) gamma = 0.99 tau = 0.001 #memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length) mem_buffer = ReplayBuffer() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, envs.action_space.n) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) value = critic.forward( Variable(rollouts.observations[step], volatile=True), action_log_prob) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks pre_state = rollouts.observations[step].cpu().numpy() update_current_obs(obs) mem_buffer.add((pre_state, current_obs, action_log_prob.data.cpu().numpy(), reward, done)) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True)) #[0].data next_value = critic.forward( Variable(rollouts.observations[-1], volatile=True), action_log_prob).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if True: state, next_state, action, reward, done = mem_buffer.sample(5) next_state = next_state.reshape([-1, *obs_shape]) state = state.reshape([-1, *obs_shape]) action = action.reshape([-1, 6]) next_q_values = critic_target( to_tensor(next_state, volatile=True), target_actor(to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True))[0]) next_q_values.volatile = False target_q_batch = to_tensor(reward) + args.gamma * to_tensor( done.astype(np.float)) * next_q_values critic.zero_grad() q_batch = critic(to_tensor(state), to_tensor(action)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() critic_optim.step() actor_critic.zero_grad() policy_loss = -critic( to_tensor(state), actor_critic(to_tensor(state), to_tensor(state), to_tensor(state))[0]) policy_loss = policy_loss.mean() policy_loss.backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() soft_update(target_actor, actor_critic, tau) soft_update(critic_target, critic, tau) ''' if args.algo in ['a2c', 'acktr']: action_log_probs, probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = critic.forward(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), probs).data values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) #advantages = Variable(rollouts.returns[:-1]) - values advantages = rollouts.returns[:-1] - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages) * action_log_probs).mean() #action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() critic_optim.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() critic_optim.step() ''' rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] 
torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), value_loss.data.cpu().numpy()[0], policy_loss.data.cpu().numpy()[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
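# The script above calls to_tensor(...) and criterion(...) without defining
# them in this excerpt.  A plausible pair of definitions, assuming criterion
# is a mean-squared-error loss and to_tensor wraps numpy arrays as Variables
# on the current device, is sketched below; the volatile flag is accepted
# only for compatibility with the call sites above.
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable

criterion = nn.MSELoss()


def to_tensor(ndarray, volatile=False, requires_grad=False):
    # volatile is ignored here; on the old PyTorch API it would be forwarded
    # to Variable to disable graph tracking.
    tensor = torch.from_numpy(np.asarray(ndarray, dtype=np.float32))
    if torch.cuda.is_available():
        tensor = tensor.cuda()
    return Variable(tensor, requires_grad=requires_grad)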
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed num agents (int): number of agents """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) ####self.num_agents = num_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target.forward(next_states) Q_targets_next = self.critic_target.forward(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local.forward(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local.forward(states) actor_loss = -self.critic_local.forward(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
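# Several agents in this section depend on an OUNoise class (reset() and
# sample() methods) that is not defined in the excerpt.  A minimal
# Ornstein-Uhlenbeck process matching that interface is sketched below;
# mu, theta and sigma are typical defaults rather than the values the
# original code necessarily used, and the constructor signature varies
# slightly between the agents shown here.
import copy
import random

import numpy as np


class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        # Restart the process from its long-running mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + \
            self.sigma * np.random.standard_normal(x.shape)
        self.state = x + dx
        return self.state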
class Agent(): def __init__(self, state_size, action_size, num_agents, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.seed = random.seed(seed) self.state_size = state_size self.action_size = action_size self.num_agents = num_agents #Actor Network self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #Critic Network self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents, action_size), seed) # Replay memory self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) action = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() # set module to evaluation mode with torch.no_grad(): for agent_idx, state_ in enumerate(state): action[agent_idx, :] = self.actor_local.forward( state_).cpu().data.numpy() self.actor_local.train() # reset it back to training mode if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) # restrict the output boundary -1, 1 def reset(self): self.noise.reset() def step(self, state, action, reward, next_state, done, timeStep): """Save experience in replay memory, and use random sample from buffer to updateWeight_local.""" for i in range(self.num_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) if len(self.memory) > BATCH_SIZE and timeStep % 2 == 0: self.updateWeight_local(self.memory.sample(), GAMMA) def updateWeight_local(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models next_actions = self.actor_target(next_states) # Sarsa? 
        Q_target_next = self.critic_target.forward(next_states, next_actions)
        Q_target = rewards + gamma * Q_target_next * (1 - dones)
        Q_local = self.critic_local.forward(states, actions)
        critic_loss = F.mse_loss(Q_local, Q_target.detach())
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local.forward(states)
        actor_loss = -self.critic_local(
            states, actions_pred).mean()  # '-' for reward maximization (gradient ascent)
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.updateWeight_target(self.critic_local, self.critic_target, TAU)
        self.updateWeight_target(self.actor_local, self.actor_target, TAU)

    def updateWeight_target(self, local_model, target_model, tau):
        """Soft update TARGET model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DDPG: def __init__(self, args): """ init function Args: - args: class with args parameter """ self.state_size = args.state_size self.action_size = args.action_size self.bs = args.bs self.gamma = args.gamma self.epsilon = args.epsilon self.tau = args.tau self.discrete = args.discrete self.randomer = OUNoise(args.action_size) self.buffer = ReplayBuffer(args.max_buff) self.actor = Actor(self.state_size, self.action_size) self.actor_target = Actor(self.state_size, self.action_size) self.actor_opt = AdamW(self.actor.parameters(), args.lr_actor) self.critic = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) self.critic_opt = AdamW(self.critic.parameters(), args.lr_critic) hard_update(self.actor_target, self.actor) hard_update(self.critic_target, self.critic) def reset(self): """ reset noise and model """ self.randomer.reset() def get_action(self, state): """ get distribution of action Args: - state: list, shape == [state_size] """ state = torch.tensor(state, dtype=torch.float).unsqueeze(0) action = self.actor(state).detach() action = action.squeeze(0).numpy() action += self.epsilon * self.randomer.noise() action = np.clip(action, -1.0, 1.0) return action def learning(self): """ learn models """ s1, a1, r1, t1, s2 = self.buffer.sample_batch(self.bs) # bool -> int t1 = 1 - t1 s1 = torch.tensor(s1, dtype=torch.float) a1 = torch.tensor(a1, dtype=torch.float) r1 = torch.tensor(r1, dtype=torch.float) t1 = torch.tensor(t1, dtype=torch.float) s2 = torch.tensor(s2, dtype=torch.float) a2 = self.actor_target(s2).detach() q2 = self.critic_target(s2, a2).detach() q2_plus_r = r1[:, None] + t1[:, None] * self.gamma * q2 q1 = self.critic.forward(s1, a1) # critic gradient critic_loss = nn.MSELoss() loss_critic = critic_loss(q1, q2_plus_r) self.critic_opt.zero_grad() loss_critic.backward() self.critic_opt.step() # actor gradient pred_a = self.actor.forward(s1) loss_actor = (-self.critic.forward(s1, pred_a)).mean() self.actor_opt.zero_grad() loss_actor.backward() self.actor_opt.step() # Notice that we only have gradient updates for actor and critic, not target # actor_opt.step() and critic_opt.step() soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return loss_actor.item(), loss_critic.item()
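# The DDPG class above (and the training scripts earlier in this section)
# call free functions hard_update(target, source) and
# soft_update(target, source, tau) that are not shown in this excerpt.  The
# conventional definitions, matching the parameter-wise updates the other
# agents here perform inline, would look like this:
def hard_update(target, source):
    """Copy source parameters into target: θ_target = θ_source."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak averaging: θ_target = τ*θ_source + (1 - τ)*θ_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)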
class DDPG: """Interacts with and learns from the environment. There are two agents and the observations of each agent has 24 dimensions. Each agent's action has 2 dimensions. Will use two separate actor networks (one for each agent using each agent's observations only and output that agent's action). The critic for each agents gets to see the actions and observations of all agents. """ def __init__(self, state_size, action_size, num_agents): """Initialize an Agent object. Params ====== state_size (int): dimension of each state for each agent action_size (int): dimension of each action for each agent """ self.state_size = state_size self.action_size = action_size self.actor_local = Actor(state_size, action_size).to(DEVICE) self.actor_target = Actor(state_size, action_size).to(DEVICE) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.critic_local = Critic(num_agents * state_size, num_agents * action_size).to(DEVICE) self.critic_target = Critic(num_agents * state_size, num_agents * action_size).to(DEVICE) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) self.noise_scale = NOISE_START self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" if self.noise_scale > NOISE_END: self.noise_scale *= NOISE_REDUCTION if not add_noise: self.noise_scale = 0.0 states = torch.from_numpy(states).float().to(DEVICE) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() actions += self.noise_scale * self.noise() return np.clip(actions, -1, 1) def noise(self): return 0.5 * np.random.randn(1, self.action_size) def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ full_states, actor_full_actions, full_actions, agent_rewards, \ agent_dones, full_next_states, critic_full_next_actions = experiences # ---------------------------- update critic ---------------------------- # Q_target_next = self.critic_target(full_next_states, critic_full_next_actions) Q_target = agent_rewards + gamma * Q_target_next * (1 - agent_dones) Q_expected = self.critic_local(full_states, full_actions) critic_loss = F.mse_loss(input=Q_expected, target=Q_target) self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # actor_loss = -self.critic_local.forward(full_states, actor_full_actions).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) @staticmethod def soft_update(local_model, target_model, tau): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    @staticmethod
    def hard_update(target, source):
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.copy_(source_param.data)
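# For the two-agent setup described in the class docstring above
# (24-dimensional observations and 2-dimensional actions per agent), the
# experiences tuple consumed by learn() would carry tensors with roughly the
# shapes sketched below; the batch size and the zero placeholders are purely
# illustrative.
import torch

batch_size = 128
full_states = torch.zeros(batch_size, 2 * 24)              # both agents' observations
actor_full_actions = torch.zeros(batch_size, 2 * 2)        # actions recomputed by the local actors
full_actions = torch.zeros(batch_size, 2 * 2)              # actions actually taken
agent_rewards = torch.zeros(batch_size, 1)                 # this agent's reward
agent_dones = torch.zeros(batch_size, 1)                   # this agent's done flag
full_next_states = torch.zeros(batch_size, 2 * 24)
critic_full_next_actions = torch.zeros(batch_size, 2 * 2)  # target-actor actions for the next states
experiences = (full_states, actor_full_actions, full_actions, agent_rewards,
               agent_dones, full_next_states, critic_full_next_actions)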
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' # logger = Logger(environment_name = args.env_name, entropy_coff= 'entropy_coeff_' + str(args.entropy_coef), folder = args.folder) # logger.save_args(args) # print ("---------------------------------------") # print ('Saving to', logger.save_folder) # print ("---------------------------------------") if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] ### for the number of processes to use if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) ## ALE Environments : mostly has Discrete action_space type if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] ### shape==3 for ALE Environments : States are 3D (Image Pi) if len(envs.observation_space.shape) == 3: actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) target_actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) critic = Critic(in_channels=4, num_actions=envs.action_space.n) critic_target = Critic(in_channels=4, num_actions=envs.action_space.n) baseline_target = Baseline_Critic(in_channels=4, num_actions=envs.action_space.n) if args.cuda: actor.cuda() critic.cuda() critic_target.cuda() target_actor.cuda() baseline_target.cuda() actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr) critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr) baseline_optim = optim.Adam(actor.parameters(), lr=1e-4) tau_soft_update = 0.001 mem_buffer = ReplayBuffer() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor.state_size, envs.action_space.n) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): temperature = 1.0 ## num_steps = 5 as in A2C for step in range(args.num_steps): temperature = temperature / (step + 1) # Sample actions action, action_log_prob, states, dist_entropy = actor.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True), temperature, envs.action_space.n, args.num_processes) value = critic.forward( Variable(rollouts.observations[step], volatile=True), action_log_prob) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks pre_state = rollouts.observations[step].cpu().numpy() update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, dist_entropy.data, value.data, reward, masks) nth_step_return = rollouts.returns[0].cpu().numpy() current_state = rollouts.observations[0].cpu().numpy() nth_state = rollouts.observations[-1].cpu().numpy() current_action = rollouts.action_log_probs[0].cpu().numpy() current_action_dist_entropy = rollouts.dist_entropy[0].cpu().numpy() mem_buffer.add((current_state, nth_state, current_action, nth_step_return, done, current_action_dist_entropy)) action, action_log_prob, states, dist_entropy = actor.act( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True), temperature, envs.action_space.n, args.num_processes) #[0].data next_value = critic.forward( Variable(rollouts.observations[-1], volatile=True), action_log_prob).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) bs_size = args.batch_size if len(mem_buffer.storage) >= bs_size: ##samples from the replay buffer state, next_state, action, returns, done, entropy_log_prob = mem_buffer.sample( bs_size) next_state = next_state.reshape([-1, *obs_shape]) state = state.reshape([-1, *obs_shape]) action = action.reshape([-1, envs.action_space.n]) #current Q estimate q_batch = critic(to_tensor(state), to_tensor(action)) # target Q estimate next_state_action_probs = target_actor( to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True)) next_q_values = critic_target(to_tensor(next_state, volatile=True), next_state_action_probs[1]) next_q_values.volatile = False target_q_batch = to_tensor(returns) + args.gamma * to_tensor( done.astype(np.float)) * next_q_values critic.zero_grad() value_loss = criterion(q_batch, target_q_batch) if args.gradient_penalty == True: gradients = torch.autograd.grad(value_loss, critic.parameters(), allow_unused=True, retain_graph=True, create_graph=True, only_inputs=True)[0] gradient_penalty = ((gradients.norm(2, dim=1) - 1)** 2).mean() * args.lambda_grad_penalty gradient_penalty.backward() else: value_loss = criterion(q_batch, 
target_q_batch) value_loss.backward() critic_optim.step() actor.zero_grad() policy_loss = -critic( to_tensor(state), actor(to_tensor(state), to_tensor(state), to_tensor(state))[0]) ### Soft trust region constraint for the actor current_action_probs = actor(to_tensor(state, volatile=False), to_tensor(state, volatile=False), to_tensor(state, volatile=False))[0] target_action_probs = target_actor(to_tensor(state, volatile=True), to_tensor(state, volatile=True), to_tensor(state, volatile=True))[0] policy_regularizer = criterion(current_action_probs, target_action_probs) ## Actor update with entropy penalty policy_loss = policy_loss.mean() - args.entropy_coef * Variable(torch.from_numpy(np.expand_dims(entropy_log_prob.mean(), axis=0))).cuda() \ + args.actor_kl_lambda * policy_regularizer if args.actor_several_updates == True: for p in range(args.actor_updates): policy_loss.backward(retain_variables=True) else: policy_loss.backward() ##clipping of gradient norms gradient_norms = nn.utils.clip_grad_norm(actor.parameters(), args.max_grad_norm) print("gradient_norms", gradient_norms) actor_optim.step() if args.second_order_grads == True: """ Training the Baseline critic (f(s, \mu(s))) """ baseline_target.zero_grad() ## f(s, \mu(s)) current_baseline = baseline_target( to_tensor(state), actor(to_tensor(state), to_tensor(state), to_tensor(state))[0]) ## \grad f(s,a) grad_baseline_params = torch.autograd.grad( current_baseline.mean(), actor.parameters(), retain_graph=True, create_graph=True) ## MSE : (Q - f)^{2} baseline_loss = (q_batch.detach() - current_baseline).pow(2).mean() # baseline_loss.volatile=True actor.zero_grad() baseline_target.zero_grad() grad_norm = 0 for grad_1, grad_2 in zip(grad_params, grad_baseline_params): grad_norm += grad_1.data.pow(2).sum() - grad_2.pow(2).sum() grad_norm = grad_norm.sqrt() ##Loss for the Baseline approximator (f) overall_loss = baseline_loss + args.lambda_second_order_grads * grad_norm overall_loss.backward() baseline_optim.step() soft_update(target_actor, actor, tau_soft_update) soft_update(critic_target, critic, tau_soft_update) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "" and len( mem_buffer.storage) >= bs_size: save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor if args.cuda: save_model = copy.deepcopy(actor).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(mem_buffer.storage) >= bs_size: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}, Entropy {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), value_loss.data.cpu().numpy()[0], policy_loss.data.cpu().numpy()[0], entropy_log_prob.mean())) final_rewards_mean = [final_rewards.mean()] final_rewards_median = [final_rewards.median()] final_rewards_min = [final_rewards.min()] final_rewards_max = [final_rewards.max()] all_value_loss = [value_loss.data.cpu().numpy()[0]] all_policy_loss = [policy_loss.data.cpu().numpy()[0]] # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max, all_value_loss, 
all_policy_loss) # # logger.save() if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
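# In the second-order-gradients branch of the script above, grad_params is
# zipped against grad_baseline_params but is never defined in this excerpt.
# It presumably holds the actor-parameter gradients of the critic-based
# policy objective, computed the same way grad_baseline_params is computed
# for the baseline; a plausible definition would be:
grad_params = torch.autograd.grad(
    critic(to_tensor(state),
           actor(to_tensor(state), to_tensor(state),
                 to_tensor(state))[0]).mean(),
    actor.parameters(),
    retain_graph=True,
    create_graph=True)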
class Agent(): def __init__(self, action_space_shape, observation_space_shape, n_train_steps=50 * 1000000, replay_memory_size=1000000, k=3): # Cuda self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") # Hyperparameters - dynamic self.action_space_shape = action_space_shape self.observation_space_shape = observation_space_shape self.k = k self.observation_input_shape = multiply_tuple( self.observation_space_shape, self.k) self.n_train_steps = n_train_steps self.replay_memory_size = replay_memory_size self.replay_memory = deque(maxlen=self.replay_memory_size) # Hyperparameters - static self.training_start_time_step = 1000 # Minimum: k * minibatch_size == 3 * 64 = 192 self.gamma = 0.99 # For reward discount self.tau = 0.001 # For soft update # Hyperparameters - Ornstein_Uhlenbeck_noise self.theta = 0.15 self.sigma = 0.2 self.Ornstein_Uhlenbeck_noise = OUNoise( action_space_shape=self.action_space_shape, theta=self.theta, sigma=self.sigma) # Hyperparameters - NN model self.minibatch_size = 64 # For training NN self.lr_actor = 10e-4 self.lr_critic = 10e-3 self.weight_decay_critic = 10e-2 # Parameters - etc self.action = None self.time_step = 0 self.train_step = 0 self.train_complete = False # Modules self.actor = Actor( action_space_shape=self.action_space_shape, observation_space_shape=self.observation_input_shape).to( self.device) self.critic = Critic( action_space_shape=self.action_space_shape, observation_space_shape=self.observation_input_shape).to( self.device) self.actor_hat = copy.deepcopy(self.actor) self.critic_hat = copy.deepcopy(self.critic) self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.lr_actor) self.optimizer_critic = optim.Adam( self.critic.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic) # Operations self.mode('train') def reset(self, observation): self.previous_observation = torch.tensor([observation] * self.k).to( dtype=torch.float, device=self.device).view(self.observation_input_shape) self.observation_buffer = list() self.reward = torch.tensor([0]) # Tensor form for compatibility self.Ornstein_Uhlenbeck_noise.reset() # Since replay memory is somewhat full, we can decrease waiting time for sufficient data to fill in the replay memory. self.training_start_time_step = max( 0, self.training_start_time_step - self.time_step) self.time_step = 0 # Don't reset replay_memory # self.replay_memory = deque(maxlen = self.replay_memory_size) def mode(self, mode): self.mode = mode if self.mode == 'train': pass elif self.mode == 'test': pass else: assert False, 'mode not specified' def wakeup(self): # Frame skipping # See & Select actions every kth frame. 
Modify ations every kth frame # Otherwise, skip frame if self.time_step % self.k == 0: return True else: return False def act(self): if self.wakeup() == True: self.action = self.actor.forward( self.previous_observation) + torch.as_tensor( self.Ornstein_Uhlenbeck_noise(), dtype=torch.float, device=self.device) self.time_step += 1 # Return numpy version return self.action.detach().numpy() def observe(self, observation, reward): if self.wakeup() == True: # Append observation self.observation_buffer.append(observation) self.new_observation = torch.tensor(self.observation_buffer).to( dtype=torch.float, device=self.device).view(self.observation_input_shape) # Add reward self.reward += reward # Store transition in replay memory # If memory size exceeds, the oldest memory is popped (deque property) # wrap self.action with torch.tensor() to reset requires_grad = False self.replay_memory.append( (self.previous_observation, self.action.clone().detach(), self.reward, self.new_observation )) # self.action.new_tensor() == self.action.clone().detach() # The new observation will be the previous observation next time self.previous_observation = self.new_observation # Empty observation buffer, reset reward self.observation_buffer = list() self.reward = torch.tensor([0]) # Tensor form for compatibility else: self.observation_buffer.append(observation) self.reward += reward def random_sample_data(self): memory_size = len(self.replay_memory) # state, action, reward, state_next s_i = list() a_i = list() r_i = list() s_i_1 = list() # Random Sample transitions, append them into np arrays random_index = np.random.choice( memory_size, size=self.minibatch_size, replace=False ) # random_index: [0,5,4,9, ...] // "replace = False" makes the indices exclusive. for index in random_index: # Random sample transitions, 'minibatch' times s, a, r, s_1 = self.replay_memory[index] s_i.append( s ) # s_i Equivalent to [self.replay_memory[index][0] for index in random_index] a_i.append(a) r_i.append(r) s_i_1.append(s_1) s_i = torch.stack(s_i).to(dtype=torch.float, device=self.device) a_i = torch.stack(a_i).to(dtype=torch.float, device=self.device) r_i = torch.stack(r_i).to(dtype=torch.float, device=self.device) s_i_1 = torch.stack(s_i_1).to(dtype=torch.float, device=self.device) return s_i, a_i, r_i, s_i_1 def train(self): if self.wakeup( ) == True and self.time_step >= self.training_start_time_step: # 1. Sample random minibatch of transitions from replay memory # state, action, reward, state_next s_i, a_i, r_i, s_i_1 = self.random_sample_data( ) # **minibatch info included in "self" # 2. Set y_i y_i = r_i + self.gamma * self.critic_hat.forward( s_i_1, self.actor_hat.forward(s_i_1)) # 3. Calculate Loss self.optimizer_critic.zero_grad() critic_loss = F.mse_loss(y_i, self.critic.forward(s_i, a_i)) # 4. Update Critic critic_loss.backward() self.optimizer_critic.step() # 5. Update Actor self.optimizer_actor.zero_grad() critic_Q_mean = -self.critic.forward( s_i, self.actor.forward(s_i)).mean() critic_Q_mean.backward() self.optimizer_actor.step() # 6. Update target networks self.critic_hat = self.tau * self.critic + ( 1 - self.tau) * self.critic_hat self.actor = self.tau * self.actor + (1 - self.tau) * self.actor_hat # 7. Increment train step. # If train step meets its scheduled training steps, change "train_complete" status self.train_step += 1 if self.train_step >= self.n_train_steps: self.train_complete = True
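# Step 6 above forms tau * self.critic + (1 - tau) * self.critic_hat directly
# on the nn.Module objects, which PyTorch does not support, and the second
# line assigns to self.actor rather than self.actor_hat.  The target update
# is normally written parameter-wise, as the other agents in this section do;
# a method-level sketch for this Agent would be:
def update_targets(self):
    for target_param, param in zip(self.critic_hat.parameters(),
                                   self.critic.parameters()):
        target_param.data.copy_(self.tau * param.data +
                                (1 - self.tau) * target_param.data)
    for target_param, param in zip(self.actor_hat.parameters(),
                                   self.actor.parameters()):
        target_param.data.copy_(self.tau * param.data +
                                (1 - self.tau) * target_param.data)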
class Agent(): def __init__(self, state_size, action_size, num_agents, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.seed = random.seed(seed) self.state_size = state_size # 24 self.action_size = action_size # 2 self.num_agents = num_agents # 2 self.eps = eps_start #Actor Network: State -> Action self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #Critic Network: State1 x State2 x Action1 x Action2 ... -> Qvalue self.critic_local = Critic(state_size * num_agents, action_size * num_agents, seed).to(device) self.critic_target = Critic(state_size * num_agents, action_size * num_agents, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, seed) # Replay memory self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed) def act(self, state, add_noise): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() # set module to evaluation mode with torch.no_grad(): action = self.actor_local.forward(state).cpu().data.numpy() self.actor_local.train() # reset it back to training mode if add_noise: action += self.noise.sample() * self.eps return np.clip(action, -1, 1) # restrict the output boundary -1, 1 def reset(self): self.noise.reset() def step(self, state, action, reward, next_state, done, timestep, agent_index): """Save experience in replay memory, and use random sample from buffer to updateWeight_local.""" # for i in range(self.num_agents): self.memory.add(state, action, reward, next_state, done) if len(self.memory) > BATCH_SIZE and timestep % UPDATE_FREQUENCY == 0: self.updateWeight_local(agent_index, self.memory.sample(), GAMMA) def updateWeight_local(self, agent_index, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # states: (batchsize, 24x2) # actions: (batchsize, 2x2) # rewards: (batchsize, 1x2) # next_states: (batchsize, 24x2) # dones: (batchsize, 1x2) # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models self_next_actions = self.actor_target( next_states[:, self.state_size * agent_index:self.state_size * (agent_index + 1)]) # actor by self obser notSelf_actions = actions[:, self.action_size * (1 - agent_index):self.action_size * (2 - agent_index)] # competitor's actions if agent_index == 0: # concat order by agent index next_acitons = torch.cat((self_next_actions, notSelf_actions), dim=1).to(device) # index0-> self:first else: next_acitons = torch.cat((notSelf_actions, self_next_actions), dim=1).to(device) # index1 -> self:second Q_target_next = self.critic_target.forward( next_states, next_acitons) # critic by both agent's obs and actions Q_target = rewards + gamma * Q_target_next * (1 - dones) Q_local = self.critic_local.forward(states, actions) critic_loss = F.mse_loss(Q_local, Q_target) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss self_actions_pred = self.actor_local.forward( states[:, self.state_size * agent_index:self.state_size * (agent_index + 1)]) #actor by self agent's obser notSelf_actions = actions[:, self.action_size * (1 - agent_index):self.action_size * (2 - agent_index)] # competitor's actions if agent_index == 0: actions_pred = torch.cat((self_actions_pred, notSelf_actions), dim=1).to(device) else: actions_pred = torch.cat((notSelf_actions, self_actions_pred), dim=1).to(device) actor_loss = -self.critic_local( states, actions_pred).mean() # '-' for Reward Maxim, gradient ascent # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.updateWeight_target(self.critic_local, self.critic_target, TAU) self.updateWeight_target(self.actor_local, self.actor_target, TAU) # Update epsilon noise value self.eps = self.eps - (1 / eps_decay) if self.eps < eps_end: self.eps = eps_end def updateWeight_target(self, local_model, target_model, tau): """Soft update TARGET model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPGAgent:
    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate,
                 actor_learning_rate):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # hyperparameters
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau

        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim,
                                    self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim,
                                  self.action_dim).to(self.device)

        # Copy critic parameters into the critic target
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        self.noise = OUNoise(self.env.action_space)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def update(self, batch_size):
        # sample a single minibatch (the original drew two batches and
        # discarded the first)
        state_batch, action_batch, reward_batch, next_state_batch, masks = \
            self.replay_buffer.sample(batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch,
                                            next_actions.detach())
        # masks is assumed to be 1 for non-terminal transitions, so terminal
        # states are not bootstrapped from
        expected_Q = reward_batch + self.gamma * masks * next_Q

        # update critic
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())
        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # update actor
        policy_loss = -self.critic.forward(
            state_batch, self.actor.forward(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau +
                                    target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau +
                                    target_param.data * (1.0 - self.tau))
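# A minimal interaction loop for the DDPGAgent above might look like the
# sketch below.  It assumes BasicBuffer exposes push(state, action, reward,
# next_state, done) and __len__, which are not shown in this excerpt;
# exploration noise (agent.noise) is omitted for brevity.
def train_loop(agent, env, max_episodes=100, max_steps=500, batch_size=64):
    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            if len(agent.replay_buffer) > batch_size:
                agent.update(batch_size)
            state = next_state
            episode_reward += reward
            if done:
                break
        print("episode {}: reward {:.2f}".format(episode, episode_reward))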
class Agent(object): """ Interacts with and learns from the environment. """ def __init__(self, state_space, hidden_size, action_size, num_agents, seed=0, buffer_size=int(1e6), actor_lr=1e-4, actor_hidden_sizes=(128, 256), actor_weight_decay=0, critic_lr=1e-4, critic_hidden_sizes=(128, 256, 128), critic_weight_decay=0, batch_size=128, gamma=0.99, tau=1e-3): """ Initialize an Agent object. Params ====== state_space (tuple): dimension of each states hidden_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents to train seed (int): random seed, default value is 0 buffer_size (int): buffer size of experience memory, default value is 100000 actor_lr (float): learning rate of actor model, default value is 1e-4 actor_lr (float): learning rate of actor model, default value is 1e-4 actor_hidden_sizes (tuple): size of hidden layer of actor model, default value is (128, 256) critic_lr (float): learning rate of critic model, default value is 1e-4 critic_hidden_sizes (tuple): size of hidden layer of critic model, default value is (128, 256, 128) batch_size (int): mini-batch size gamma (float): discount factor tau (float): interpolation parameter """ self.state_space = state_space self.hidden_size = hidden_size self.action_size = action_size self.num_agents = num_agents self.seed = seed self.batch_size = batch_size # mini-batch size self.gamma = gamma # discount factor self.tau = tau # for soft update of target parameters # Actor Network self.actor_local = Actor(state_space, hidden_size, action_size, seed, hidden_units=actor_hidden_sizes).to(DEVICE) self.actor_target = Actor(state_space, hidden_size, action_size, seed, hidden_units=actor_hidden_sizes).to(DEVICE) self.actor_target.eval() self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actor_lr, weight_decay=actor_weight_decay) # Critic Network self.critic_local = Critic(state_space, hidden_size, action_size, seed, hidden_units=critic_hidden_sizes).to(DEVICE) self.critic_target = Critic(state_space, hidden_size, action_size, seed, hidden_units=critic_hidden_sizes).to(DEVICE) self.critic_target.eval() self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=critic_lr, weight_decay=critic_weight_decay) # Noise process self.noise = OUNoise((num_agents, action_size), seed) # Replay memory self.memory = ReplyBuffer(buffer_size=buffer_size, seed=seed) # copy parameters of the local model to the target model self.soft_update(self.critic_local, self.critic_target, 1.) self.soft_update(self.actor_local, self.actor_target, 1.) self.seed = random.seed(seed) np.random.seed(seed) self.reset() def reset(self): self.noise.reset() def act(self, state, add_noise=True): state = np.asarray([state]) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1., 1.) def step(self, state, action, reward, next_state, done): """ Save experience in replay memory, and use random sample from buffer to learn. 
""" # Save experience / reward # for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample(batch_size=self.batch_size) self.learn(experiences, self.gamma) def learn(self, experiences, gamma): """ Update policy and experiences parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-experiences Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences actions, rewards, dones = torch.from_numpy(actions).float().to(DEVICE), \ torch.from_numpy(rewards).float().to(DEVICE), \ torch.from_numpy(dones).to(DEVICE) # ------- update critic ------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) q_targets = rewards + (gamma * q_targets_next * (1 - dones)) q_targets = q_targets.detach() # Compute critic loss q_expected = self.critic_local(states, actions) assert q_expected.shape == q_targets.shape critic_loss = F.mse_loss(q_expected, q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0) # clip the gradient (Udacity) self.critic_optimizer.step() # ------- update actor ------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local.forward(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # update target networks self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) return actor_loss.item(), critic_loss.item() def soft_update(self, local_model, target_model, tau): """ Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.detach_() target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save(self): """ Save model state """ torch.save(self.actor_local.state_dict(), "checkpoints/checkpoint_actor.pth") torch.save(self.actor_target.state_dict(), "checkpoints/checkpoint_actor_target.pth") torch.save(self.critic_local.state_dict(), "checkpoints/checkpoint_critic.pth") torch.save(self.critic_target.state_dict(), "checkpoints/checkpoint_critic_target.pth") def load(self): """ Load model state """ if not os.path.exists("checkpoints/checkpoint_actor.pth") or \ not os.path.exists("checkpoints/checkpoint_actor_target.pth") or \ not os.path.exists("checkpoints/checkpoint_critic.pth") or \ not os.path.exists("checkpoints/checkpoint_critic_target.pth"): return self.actor_local.load_state_dict(torch.load("checkpoints/checkpoint_actor.pth"), strict=False) self.actor_target.load_state_dict(torch.load("checkpoints/checkpoint_actor_target.pth"), strict=False) self.critic_local.load_state_dict(torch.load("checkpoints/checkpoint_critic.pth"), strict=False) self.critic_target.load_state_dict(torch.load("checkpoints/checkpoint_critic_target.pth"), strict=False) def __str__(self): return f"{str(self.actor_local)}\n{str(self.critic_local)}"
class MADDPG:
    def __init__(self, num_agents, local_obs_dim, local_action_size,
                 global_obs_dim, global_action_size, discount_factor=0.95,
                 tau=0.02, device=device, random_seed=4, lr_critic=1.0e-4,
                 weight_decay=0.0):
        super(MADDPG, self).__init__()

        # parameter configuration
        self.num_agents = num_agents
        self.device = device
        self.discount_factor = discount_factor
        self.tau = tau
        self.global_action_size = global_action_size
        self.global_obs_dim = global_obs_dim
        torch.manual_seed(random_seed)
        random.seed(random_seed)
        self.random_seed = random_seed
        self.weight_decay = weight_decay

        # define actors (one decentralized actor per agent)
        self.actors = [
            DDPGActor(num_agents, local_obs_dim, local_action_size,
                      global_obs_dim, global_action_size, device=device)
            for _ in range(num_agents)
        ]

        # define centralized critic (sees all observations and actions)
        self.critic = Critic(global_obs_dim, global_action_size,
                             self.random_seed).to(self.device)
        self.target_critic = Critic(global_obs_dim, global_action_size,
                                    self.random_seed).to(self.device)
        hard_update(self.target_critic, self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,
                                     weight_decay=self.weight_decay)

        # noise coefficient and its decay
        self.noise_coef = 1.0
        self.noise_coef_decay = 1e-6

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, obs_all_agents):
        actions = [
            ddpg_actor.act(local_obs, self.noise_coef)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def target_act(self, obs_all_agents):
        actions = [
            ddpg_actor.target_act(local_obs, noise_coef=0, add_noise=False)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def step(self, obs, obs_full, actions, rewards, next_obs, next_obs_full,
             dones, timestep):
        self.memory.add(obs, obs_full, actions, rewards, next_obs,
                        next_obs_full, dones)
        timestep = timestep % TRAIN_EVERY

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep == 0:
            for _ in range(N_LEARN_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, self.discount_factor)

    def learn(self, experiences, gamma):
        obs, obs_full, action, reward, next_obs, next_obs_full, done = experiences

        obs = obs.permute(1, 0, -1)  # agent_id * batch_size * state_size
        obs_full = obs_full.view(-1, self.global_obs_dim)
        next_obs = next_obs.permute(1, 0, -1)
        next_obs_full = next_obs_full.view(-1, self.global_obs_dim)
        action = action.reshape(-1, self.global_action_size)

        # ---------------- update centralized critic ----------------------- #
        self.critic_optimizer.zero_grad()

        # get target actions from all target_actors; stack the tensors directly
        # instead of a numpy round-trip, which breaks for GPU tensors
        target_actions = torch.stack(self.target_act(next_obs)).detach()
        target_actions = target_actions.permute(1, 0, -1)
        target_actions = target_actions.reshape(-1, self.global_action_size)

        # update critic
        with torch.no_grad():
            q_next = self.target_critic.forward(next_obs_full,
                                                target_actions.to(self.device))
        y = reward + gamma * q_next * (1 - done)
        q = self.critic.forward(obs_full, action)
        critic_loss = 0
        for i in range(self.num_agents):
            critic_loss += F.mse_loss(q, y[:, i].detach().reshape(-1, 1)) / self.num_agents
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------- update actor for all agents --------------------- #
        for ii in range(len(self.actors)):
            self.actors[ii].actor_optimizer.zero_grad()
            # only agent ii's action keeps its gradient for the policy update
            q_action = [
                self.actors[i].actor_local(ob) if i == ii
                else self.actors[i].actor_local(ob).detach()
                for i, ob in enumerate(obs)
            ]
            q_action = torch.stack(q_action).permute(1, 0, -1)
            q_action = q_action.reshape(-1, self.global_action_size).to(self.device)
            # policy gradient
            actor_loss = -self.critic.forward(obs_full, q_action).mean()
            actor_loss.backward()
            self.actors[ii].actor_optimizer.step()

        # --------------- soft update all target networks ------------------ #
        soft_update(self.target_critic, self.critic, self.tau)
        for actor in self.actors:
            actor.update_target(self.tau)

        # -------------- reset noise ---------------------------------------- #
        for actor in self.actors:
            actor.action_noise.reset()
        self.noise_coef -= self.noise_coef_decay
        if self.noise_coef < 0.01:
            self.noise_coef = 0.01
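For orientation, here is a rough sketch of how this MADDPG class would be driven from an episode loop. The environment handle env, its reset/step interface, and the constants NUM_EPISODES and MAX_T are illustrative assumptions rather than part of the class above; the dimensions match the two-agent, 24-observation, 2-action setting described elsewhere in this document.

# minimal driver sketch (hypothetical env API; names are illustrative only)
maddpg = MADDPG(num_agents=2, local_obs_dim=24, local_action_size=2,
                global_obs_dim=48, global_action_size=4)

for episode in range(NUM_EPISODES):                    # assumed constant
    obs = env.reset()                                  # list of per-agent observations
    for t in range(MAX_T):                             # assumed constant
        actions = maddpg.act(obs)                      # one action per agent
        next_obs, rewards, dones, _ = env.step(actions)  # hypothetical env step
        obs_full = np.concatenate(obs)                 # flattened global observation
        next_obs_full = np.concatenate(next_obs)
        maddpg.step(obs, obs_full, actions, rewards,
                    next_obs, next_obs_full, dones, timestep=t)
        obs = next_obs
        if any(dones):
            break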
class DDPG(object):
    """Interacts with and learns from the environment.

    There are two agents, and each agent's observation has 24 dimensions
    while each agent's action has 2 dimensions. Two separate actor networks
    are used (one per agent, taking only that agent's observations and
    producing that agent's action). The critic for each agent gets to see
    the actions and observations of all agents.
    """

    def __init__(self, state_size, action_size, num_agents):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state for each agent
            action_size (int): dimension of each action for each agent
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(DEVICE)
        self.actor_target = Actor(state_size, action_size).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(num_agents * state_size,
                                   num_agents * action_size).to(DEVICE)
        self.critic_target = Critic(num_agents * state_size,
                                    num_agents * action_size).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_critic)

        # Noise process
        self.noise = OUNoise(action_size)  # single agent only
        self.noise_scale = NOISE_START

        # Make sure the targets are initialized with the same weights as the
        # source networks (makes a big difference)
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

    def act(self, states, i_episode, add_noise=True):
        """Returns actions for given state as per current policy."""
        if i_episode > EPISODES_BEFORE_TRAINING and self.noise_scale > NOISE_END:
            # self.noise_scale *= NOISE_REDUCTION
            self.noise_scale = NOISE_REDUCTION ** (i_episode - EPISODES_BEFORE_TRAINING)
            # else keep the previous value

        if not add_noise:
            self.noise_scale = 0.0

        states = torch.from_numpy(states).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()

        # add noise; plain Gaussian noise works much better here than the
        # OU noise process
        actions += self.noise_scale * self.add_noise2()
        # actions += self.noise_scale * self.noise.sample()
        return np.clip(actions, -1, 1)

    def add_noise2(self):
        # sigma of 0.5: with a sigma of 1, a lot of actions would just get clipped
        noise = 0.5 * np.random.randn(1, self.action_size)
        return noise

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):  # for MADDPG
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        (full_states, actor_full_actions, full_actions, agent_rewards,
         agent_dones, full_next_states, critic_full_next_actions) = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get Q values from target models
        Q_target_next = self.critic_target(full_next_states, critic_full_next_actions)
        # Compute Q targets for current states (y_i)
        Q_target = agent_rewards + gamma * Q_target_next * (1 - agent_dones)
        # Compute critic loss; detaching Q_target is optional here, since the
        # target networks are not touched by this optimizer either way
        Q_expected = self.critic_local(full_states, full_actions)
        critic_loss = F.mse_loss(input=Q_expected, target=Q_target)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)  # clip the critic gradient (Udacity hint)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss (negative because we want gradient ascent on Q)
        actor_loss = -self.critic_local.forward(full_states, actor_full_actions).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def soft_update_all(self):
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.data.copy_(source_param.data)
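The learn() method above expects an already-assembled experience tuple. Below is a hedged sketch of how a two-agent wrapper might build that tuple from a sampled batch; the tensors states, actions, rewards, next_states, dones of shape (batch, num_agents, ...) and the list agents of two DDPG instances are assumed names, not part of the class above.

# sketch: assembling the per-agent experience tuple for DDPG.learn()
batch_size = states.shape[0]
full_states = states.reshape(batch_size, -1)        # (batch, 2*24)
full_actions = actions.reshape(batch_size, -1)      # (batch, 2*2)
full_next_states = next_states.reshape(batch_size, -1)

# actions of all target actors at the next states (used for the critic target)
critic_full_next_actions = torch.cat(
    [agent.actor_target(next_states[:, i, :]) for i, agent in enumerate(agents)],
    dim=1)

for i, agent in enumerate(agents):
    # actions of all current actors at the current states; only agent i's
    # action keeps its gradient for the policy update
    actor_full_actions = torch.cat(
        [a.actor_local(states[:, j, :]) if j == i
         else a.actor_local(states[:, j, :]).detach()
         for j, a in enumerate(agents)], dim=1)
    experiences = (full_states, actor_full_actions, full_actions,
                   rewards[:, i].unsqueeze(1).float(),
                   dones[:, i].unsqueeze(1).float(),
                   full_next_states, critic_full_next_actions)
    agent.learn(experiences, gamma=0.99)
    agent.soft_update_all()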
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, num_agents=20): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ print("Running on: " + str(device)) self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(seed) self.eps = EPS_START self.eps_decay = 0.0005 # Actor network self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic network self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.noise = OUNoise((num_agents, action_size), seed) def step(self, state, action, reward, next_state, done, agent_id): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) self.t_step += 1 # Learn every UPDATE_EVERY time steps. if (self.t_step % UPDATE_EVERY) == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_id) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for i, state in enumerate(states): actions[i, :] = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.eps * self.noise.sample() return np.clip(actions, -1, 1) def learn(self, experiences, gamma, agent_id): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ------------------- update critic network ------------------- # target_actions = self.actor_target.forward(next_states) # Construct next actions vector relative to the agent if agent_id == 0: target_actions = torch.cat((target_actions, actions[:, 2:]), dim=1) else: target_actions = torch.cat((actions[:, :2], target_actions), dim=1) next_critic_value = self.critic_target.forward(next_states, target_actions) critic_value = self.critic_local.forward(states, actions) # Q targets for current state # If the episode is over, the reward from the future state will not be incorporated Q_targets = rewards + (gamma * next_critic_value * (1 - dones)) critic_loss = F.mse_loss(critic_value, Q_targets) # Minimizing loss self.critic_local.train() self.critic_optim.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optim.step() self.critic_local.eval() # ------------------- update actor network ------------------- # self.actor_local.train() self.actor_optim.zero_grad() mu = self.actor_local.forward(states) # Construct mu vector relative to each agent if agent_id == 0: mu = torch.cat((mu, actions[:, 2:]), dim=1) else: mu = torch.cat((actions[:, :2], mu), dim=1) actor_loss = -self.critic_local(states, mu).mean() actor_loss.backward() self.actor_optim.step() self.actor_local.eval() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update noise decay parameter self.eps -= self.eps_decay self.eps = max(self.eps, EPS_FINAL) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def reset(self): self.noise.reset()
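The torch.cat calls in learn() above assume a fixed two-agent layout of the joint action tensor. A small worked example of that layout (the numeric values are arbitrary):

import torch

# the joint action tensor holds both agents' 2-dimensional actions side by
# side: [a0_x, a0_y, a1_x, a1_y]
actions = torch.tensor([[0.1, 0.2, 0.3, 0.4]])   # one batch row, both agents
my_action = torch.tensor([[0.9, 0.8]])           # e.g. the actor output for one agent

# agent_id == 0: replace the first two columns, keep agent 1's columns
joint_0 = torch.cat((my_action, actions[:, 2:]), dim=1)   # [[0.9, 0.8, 0.3, 0.4]]
# agent_id == 1: keep agent 0's columns, replace the last two
joint_1 = torch.cat((actions[:, :2], my_action), dim=1)   # [[0.1, 0.2, 0.9, 0.8]]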
class DDPGAgent:
    def __init__(self, plot=True, seed=1, env: gym.Env = None, batch_size=128,
                 learning_rate_actor=0.001, learning_rate_critic=0.001,
                 weight_decay=0.01, gamma=0.999):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.weight_decay = weight_decay
        self.gamma = gamma
        self.tau = 0.001
        self._to_tensor = util.to_tensor
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate_actor,
                                                weight_decay=self.weight_decay)

        self.critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.target_critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 self.learning_rate_critic,
                                                 weight_decay=self.weight_decay)

        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        self.t = 0

    def _learn_from_memory(self, memory):
        '''Learn from memory and update the parameters of both networks.'''
        # randomly sample a batch of transitions from memory
        trans_pieces = memory.sample(self.batch_size)
        s0 = np.vstack([x.state for x in trans_pieces])
        a0 = np.vstack([x.action for x in trans_pieces])
        r1 = np.vstack([x.reward for x in trans_pieces])
        s1 = np.vstack([x.next_state for x in trans_pieces])
        terminal_batch = np.vstack([x.is_done for x in trans_pieces])

        # optimize the critic network parameters
        s1 = self._to_tensor(s1, device=self.device)
        s0 = self._to_tensor(s0, device=self.device)
        next_q_values = self.target_critic.forward(
            state=s1, action=self.target_actor.forward(s1)).detach()
        # mask out the bootstrapped term for terminal transitions
        not_terminal = 1.0 - self._to_tensor(terminal_batch.astype(float),
                                             device=self.device)
        target_q_batch = self._to_tensor(r1, device=self.device) + \
            self.gamma * not_terminal * next_q_values
        q_batch = self.critic.forward(s0, self._to_tensor(a0, device=self.device))

        # compute the critic loss and update the critic network parameters
        loss_critic = F.mse_loss(q_batch, target_q_batch)
        # self.critic_optimizer.zero_grad()
        self.critic.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # backpropagate, using the critic's value estimate of the current
        # states as the policy objective (gradient ascent on Q)
        loss_actor = -self.critic.forward(s0, self.actor.forward(s0))
        loss_actor = loss_actor.mean()
        self.actor.zero_grad()
        # self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # soft-update the target network parameters
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return (loss_critic.item(), loss_actor.item())

    def learning(self, memory):
        self.actor.train()
        return self._learn_from_memory(memory)

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print('Models loaded successfully')
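_learn_from_memory() above only assumes that memory.sample(batch_size) returns transition objects exposing .state, .action, .reward, .next_state and .is_done. A minimal sketch of a buffer with that interface (class and field names here are illustrative, not taken from the original code):

import random
from collections import namedtuple, deque

Transition = namedtuple('Transition',
                        ['state', 'action', 'reward', 'next_state', 'is_done'])

class SimpleMemory:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, is_done):
        self.buffer.append(Transition(state, action, reward, next_state, is_done))

    def sample(self, batch_size):
        # uniform random sample of stored transitions
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)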
class DDPG(object):
    """Interacts with and learns from the environment.

    There are two agents; each agent's observation has 24 dimensions, while
    each agent's action has 2 dimensions. Here we use two separate actor
    networks (one for each agent, using only that agent's observations and
    outputting that agent's action). The critic for each agent gets to see
    the full observations and full actions of all agents.
    """

    def __init__(self, agent_id, state_size, full_state_size, action_size,
                 full_action_size, actor_hidden_sizes=(256, 128), actor_lr=1e-4,
                 actor_weight_decay=0., critic_hidden_sizes=(256, 128),
                 critic_lr=1e-3, critic_weight_decay=0., is_action_continuous=True):
        """Initialize an Agent object.

        :param agent_id (int): ID of the agent.
        :param state_size (int): Dimension of each state for each agent.
        :param full_state_size (int): Dimension of the full state for all agents.
        :param action_size (int): Dimension of each action for each agent.
        :param full_action_size (int): Dimension of the full action for all agents.
        :param actor_hidden_sizes (tuple): Hidden units of the actor network.
        :param actor_lr (float): Learning rate of the actor network.
        :param actor_weight_decay (float): Weight decay (L2 penalty) of the actor network.
        :param critic_hidden_sizes (tuple): Hidden units of the critic network.
        :param critic_lr (float): Learning rate of the critic network.
        :param critic_weight_decay (float): Weight decay (L2 penalty) of the critic network.
        :param is_action_continuous (bool): Whether the action space is continuous or discrete.
        """
        self.id = agent_id
        self.state_size = state_size
        self.full_state_size = full_state_size
        self.action_size = action_size
        self.full_action_size = full_action_size
        self.is_action_continuous = is_action_continuous

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(
            state_size, actor_hidden_sizes, action_size,
            out_gate=nn.Tanh if is_action_continuous else None)
        self.actor_target = Actor(
            state_size, actor_hidden_sizes, action_size,
            out_gate=nn.Tanh if is_action_continuous else None)
        self.update(self.actor_local, self.actor_target, 1.)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=actor_lr,
                                          weight_decay=actor_weight_decay)

        # Critic Network (w/ Target Network)
        num_agents = int(full_action_size / action_size)
        self.critic_local = Critic(
            full_state_size,
            full_action_size if is_action_continuous else num_agents,
            critic_hidden_sizes)
        self.critic_target = Critic(
            full_state_size,
            full_action_size if is_action_continuous else num_agents,
            critic_hidden_sizes)
        # self.critic_local, self.critic_target = get_critic(full_state_size, full_action_size, critic_hidden_sizes)
        self.update(self.critic_local, self.critic_target, 1.)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=critic_lr,
                                           weight_decay=critic_weight_decay)

        self.use_actor = True

        # Noise Process
        self.noise_scale = 0.
        self.noise = OUNoise(action_size)

    def reset(self):
        self.noise.reset()

    def act(self, state, noise_scale=0.0):
        """Returns the action for a given state using the current policy."""
        states = torch.from_numpy(state[np.newaxis]).float()

        # calculate actions
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states)
        self.actor_local.train()
        actions = actions.cpu().numpy().squeeze()

        # add noise
        actions += noise_scale * self.noise.sample()
        return np.clip(actions, -1, 1) if self.is_action_continuous else np.argmax(actions)

    def learn(self, states, actions, rewards, next_states, dones,
              full_actions_predicted, critic_full_next_actions, gamma=0.99):
        """Update policy and value parameters.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        :param states: Full states for training, of size (BATCHES, NUM_AGENTS, STATE_SIZE).
        :param actions: Full actions for training, of size (BATCHES, NUM_AGENTS, ACTION_SIZE).
        :param rewards: Full rewards for training, of size (BATCHES, NUM_AGENTS).
        :param next_states: Full next states for training, of size (BATCHES, NUM_AGENTS, STATE_SIZE).
        :param dones: Full dones for training, of size (BATCHES, NUM_AGENTS).
        :param full_actions_predicted: Per-agent actions predicted by the current (local) actors.
        :param critic_full_next_actions: Per-agent next actions from the target actors,
            concatenated to size (BATCHES, NUM_AGENTS * ACTION_SIZE).
        :param gamma: discount factor
        """
        full_states = states.view(-1, self.full_state_size)
        full_actions = actions.view(states.shape[0], -1).float()
        full_next_states = next_states.view(-1, self.full_state_size)
        critic_full_next_actions = torch.cat(critic_full_next_actions,
                                             dim=1).float().to(DEVICE)
        actor_rewards = rewards[:, self.id].view(-1, 1)
        actor_dones = dones[:, self.id].view(-1, 1)

        # ---------------------------- update critic ---------------------------- #
        q_next = self.critic_target.forward(full_next_states, critic_full_next_actions)
        q_target = actor_rewards + gamma * q_next * (1 - actor_dones)
        q_expected = self.critic_local(full_states, full_actions)
        # Compute critic loss
        critic_loss = F.mse_loss(q_expected, q_target.detach())
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        if self.use_actor:
            # detach actions from other agents
            full_actions_predicted = [
                actions if i == self.id else actions.detach()
                for i, actions in enumerate(full_actions_predicted)
            ]
            full_actions_predicted = torch.cat(full_actions_predicted,
                                               dim=1).float().to(DEVICE)
            # Compute actor loss
            actor_loss = -self.critic_local.forward(full_states,
                                                    full_actions_predicted).mean()
            # Minimize the loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
        else:
            actor_loss = torch.tensor(0)

        return actor_loss.cpu().item(), critic_loss.cpu().item()

    def update(self, source, target, tau=0.01):
        """Update target model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target.

        :param source: PyTorch model whose parameters are copied from.
        :param target: PyTorch model whose parameters are copied to.
        :param tau: interpolation parameter
        """
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(target_param.data * (1 - tau) + param.data * tau)