class MADDPGAgent(): def __init__(self, seed, checkpoint_filename=None): self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, DEVICE, seed) self.t = 0 self.agents = [ DDPGAgent(index, NUM_AGENTS, seed, DEVICE) for index in range(NUM_AGENTS) ] if checkpoint_filename: for i, to_load in enumerate(self.agents): f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights" actor_file = torch.load( f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights", map_location=DEVICE) critic_file = torch.load( f"{os.getcwd()}/models/{checkpoint_filename}_critic_{i}.weights", map_location=DEVICE) to_load.actor_local.load_state_dict(actor_file) to_load.actor_target.load_state_dict(actor_file) to_load.critic_local.load_state_dict(critic_file) to_load.critic_target.load_state_dict(critic_file) print(f'Files loaded with prefix {checkpoint_filename}') def step(self, all_states, all_actions, all_rewards, all_next_states, all_dones): all_states = all_states.reshape(1, -1) all_next_states = all_next_states.reshape(1, -1) self.memory.add(all_states, all_actions, all_rewards, all_next_states, all_dones) self.t = (self.t + 1) % UPDATE_FREQUENCY if self.t == 0 and (len(self.memory) > BATCH_SIZE): experiences = [self.memory.sample() for _ in range(NUM_AGENTS)] self.learn(experiences, GAMMA) def act(self, all_states, random): all_actions = [] for agent, state in zip(self.agents, all_states): action = agent.act(state, random=random) all_actions.append(action) return np.array(all_actions).reshape(1, -1) def learn(self, experiences, gamma): all_actions = [] all_next_actions = [] for i, agent in enumerate(self.agents): states, _, _, next_states, _ = experiences[i] agent_id = torch.tensor([i]).to(DEVICE) state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1) next_state = next_states.reshape(-1, 2, 24).index_select( 1, agent_id).squeeze(1) all_actions.append(agent.actor_local(state)) all_next_actions.append(agent.actor_target(next_state)) for i, agent in enumerate(self.agents): agent.learn(i, experiences[i], gamma, all_next_actions, all_actions)
class MADDPG(): def __init__(self, num_agents, state_size, action_size, random_seed): """ Initialize multiple Agents each with a Actor-Critic network but they share the replay buffer to learn from experience """ self.num_agents = num_agents self.agents = [] for _ in range(num_agents): agent = Agent(state_size, action_size, random_seed) self.agents.append(agent) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def act(self, states, add_noise=True): clipped_actions = [] for state, agent in zip(states, self.agents): clipped_actions.append(agent.act(state, add_noise)) return clipped_actions def reset(self): for agent in self.agents: agent.reset() def learn(self, experiences, gamma): for agent in self.agents: agent.learn(experiences, gamma) def saveCheckPoints(self): for i, agent in enumerate(self.agents): torch.save(agent.actor_local.state_dict(), f"checkpoints/actor_agent_{i}.pth") torch.save(agent.critic_local.state_dict(), f"checkpoints/critic_agent_{i}.pth") def loadCheckPoints(self): for i, agent in enumerate(self.agents): agent.actor_local.load_state_dict( torch.load(f"checkpoints/actor_agent_{i}.pth")) agent.critic_local.load_state_dict( torch.load(f"checkpoints/critic_agent_{i}.pth")) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(self.num_agents): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for agent in self.agents: experiences = self.memory.sample() self.learn(experiences, GAMMA)
class MADDPG: def __init__(self, num_agents=2, random_seed=1): #np.random.randint(1000) super(MADDPG, self).__init__() self.maddpg_agent = [ DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed), DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed) ] self.num_agents = num_agents # Replay memory action_size = 2 self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def act(self, obs_all_agents, noise_ampl=1): """get actions from all agents in the MADDPG object""" actions = [ agent.act(obs, noise_ampl) for agent, obs in zip(self.maddpg_agent, obs_all_agents) ] return actions def add_memory(self, state, action, reward, next_state, done): # Save experience / reward self.memory.num_agents = self.num_agents self.memory.add(state, action, reward, next_state, done) def step(self): """Save experience in replay memory, and use random sample from buffer to learn.""" # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for n in range(0, self.num_agents): experiences = self.memory.sample() self.maddpg_agent[n].step(experiences) def reset(self): for n in range(0, self.num_agents): self.maddpg_agent[n].reset()
class MultiAgent: """Interacts with and learns from the environment.""" def __init__(self, agent_count, state_size, action_size, random_seed): """Initialize a MultiAgent object. Params ====== agent_count (int): Number of agents """ self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.agents = [ Agent( memory=self.memory, state_size=state_size, action_size=action_size, random_seed=random_seed, ) for _ in range(agent_count) ] def step(self, states, actions, rewards, next_states, dones, timestep): # Save experience in replay memory for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0: for agent in self.agents: agent.learn(self.memory.sample(), GAMMA) def act(self, all_states): """Get actions from all agents""" actions = [ agent.act(np.expand_dims(states, axis=0)) for agent, states in zip(self.agents, all_states) ] return actions def reset(self): for agent in self.agents: agent.reset()
class MultiAgent: def __init__(self, state_size, action_size, num_agents, random_seed): self.agents = [ DDPGAgent(state_size, action_size, random_seed) for _ in range(num_agents) ] self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device, random_seed) self.t_step = 0 def step_all(self, states, actions, rewards, next_states, dones): # Save experience in replay memory for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: for agent in self.agents: experiences = self.memory.sample() agent.learn(experiences, GAMMA) def act_all(self, multi_states): actions = [ agent.act(np.expand_dims(states, axis=0)) for agent, states in zip(self.agents, multi_states) ] return actions def save_weights_all(self): for index, agent in enumerate(self.agents): torch.save(agent.actor_local.state_dict(), 'agent{}_checkpoint_actor.pth'.format(index + 1)) torch.save(agent.critic_local.state_dict(), 'agent{}_checkpoint_critic.pth'.format(index + 1)) def reset_all(self): for agent in self.agents: agent.reset()
class MultiAgent: def __init__(self, config): self.random_seeds = config['random_seeds'] self.params = config['params'] self.memory = ReplayBuffer(self.params['action_size'], self.params['buffer_size'], self.params['batch_size'], device, self.random_seeds[0]) self.params['memory'] = self.memory self.ddpg_agents = [ Agent(self.params, self.random_seeds[i]) for i in range(2) ] self.t_step = 0 def act(self, states): actions = [ agent.act(np.expand_dims(state, axis=0)) for agent, state in zip(self.ddpg_agents, states) ] #actions = [agent.act(states) for agent in self.ddpg_agents] return actions def step(self, states, actions, rewards, next_states, dones): self.t_step += 1 for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) if (len(self.memory) > self.params['batch_size']) and ( self.t_step % self.params['num_steps_per_update'] == 0): for agent in self.ddpg_agents: experiences = self.memory.sample() agent.learn(experiences, self.params['gamma']) def reset(self): for agent in self.ddpg_agents: agent.reset()
class MADDPG(object): """ The main class that defines and trains all the DDPG agents. """ def __init__( self, num_agents, state_size, action_size, buffer_size=int(1e6), batch_size=128, writer=None, actor_hidden_sizes=(256, 128), actor_lr=1e-4, actor_weight_decay=0., critic_hidden_sizes=(256, 128), critic_lr=1e-3, critic_weight_decay=0., model_folder_path=None, ): self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.batch_size = batch_size self.full_state_size = num_agents * state_size self.full_action_size = num_agents * action_size # Replay memory self.memory = ReplayBuffer(buffer_size) # TensorboardX Writer self.writer = writer # Actor Network Parameters self.actor_hidden_sizes = actor_hidden_sizes self.actor_lr = actor_lr self.actor_weight_decay = actor_weight_decay # Critic Network Parameters self.critic_hidden_sizes = critic_hidden_sizes self.critic_lr = critic_lr self.critic_weight_decay = critic_weight_decay # Model Folder self.folder_path = Path() if model_folder_path is None else Path( model_folder_path) # MADDPG Agents self.agents = [] self._init_agents() def reset(self): for agent in self.agents: agent.reset() def act(self, states, noise=0.): return [ agent.act(obs, noise) for agent, obs in zip(self.agents, states) ] def step(self, i_episode, states, actions, rewards, next_states, dones, tau=0.01, num_learns=1): # save to replay buffer self.memory.add(states, actions, rewards, next_states, dones) # train the model if len(self.memory) >= self.batch_size and num_learns > 0: actor_loss_list, critic_loss_list = [], [] for _ in range(num_learns): # learn multiple times at every step states, actions, rewards, next_states, dones = self.memory.sample( self.batch_size) for agent_id in range(self.num_agents): # Learn one time for the agents actor_loss, critic_loss = self._learn( agent_id, states, actions, next_states, rewards, dones) actor_loss_list.append(actor_loss) critic_loss_list.append(critic_loss) # Record Losses for actor & critic if self.writer: for agent_id in range(self.num_agents): self.writer.add_scalars( f'agent{agent_id}/losses', { 'critic loss': np.mean(critic_loss_list), 'actor_loss': np.mean(actor_loss_list) }, i_episode) # Soft update self._update_all(tau) def save(self): for agent in self.agents: torch.save( agent.actor_local.state_dict(), self.folder_path / f'checkpoint_actor_local_{agent.id}.pth') torch.save( agent.critic_local.state_dict(), self.folder_path / f'checkpoint_critic_local_{agent.id}.pth') def load(self, agent_id=None): for agent in self.agents: agent_id_ = agent.id if agent_id is None else agent_id agent.actor_local.load_state_dict( torch.load(self.folder_path / f'checkpoint_actor_local_{agent_id_}.pth')) agent.critic_local.load_state_dict( torch.load(self.folder_path / f'checkpoint_critic_local_{agent_id_}.pth')) def _init_agents(self): for i in range(self.num_agents): agent = DDPG(i, self.state_size, self.full_state_size, self.action_size, self.full_action_size, self.actor_hidden_sizes, self.actor_lr, self.actor_weight_decay, self.critic_hidden_sizes, self.critic_lr, self.critic_weight_decay) self.agents.append(agent) def _learn(self, agent_id, states, actions, next_states, rewards, dones): critic_full_actions, critic_full_next_actions = [], [] for agent in self.agents: # current actions actor_actions = agent.actor_local(states[:, agent.id, :]) critic_full_actions.append(actor_actions) # next actions actor_next_actions = agent.actor_target.forward( next_states[:, agent.id, :]) 
critic_full_next_actions.append(actor_next_actions) # learn for the agent current_agent = self.agents[agent_id] actor_loss, critic_loss = current_agent.learn( states, actions, rewards, next_states, dones, critic_full_actions, critic_full_next_actions) return actor_loss, critic_loss def _update_all(self, tau): for agent in self.agents: agent.update(agent.actor_local, agent.actor_target, tau) agent.update(agent.critic_local, agent.critic_target, tau)
def main(env_name, num_actors, num_iters, logdir, cluster): logdir = pathlib.Path(logdir) if logdir.exists(): shutil.rmtree(logdir) summary_writer = tf.summary.create_file_writer(str(logdir)) if not cluster: ray.init() epsilons = np.linspace(0.01, 0.8, num_actors) print("==== ACTORS launch ====") actors = [ Actor.remote(pid=i, env_name=env_name, epsilon=epsilons[i]) for i in range(num_actors) ] replaybuffer = ReplayBuffer(buffer_size=2**15) print("==== LEARNER launch ====") learner = Learner.remote(env_name=env_name) current_weights = ray.put(ray.get(learner.get_weights.remote())) print("==== TESTER launch ====") tester = Actor.remote(pid=None, env_name=env_name, epsilon=0.0) wip_actors = [actor.rollout.remote(current_weights) for actor in actors] n = 0 print("==== Initialize buffer ====") for _ in range(50): finished_actor, wip_actors = ray.wait(wip_actors, num_returns=1) td_errors, transitions, pid = ray.get(finished_actor[0]) replaybuffer.add(td_errors, transitions) wip_actors.extend([actors[pid].rollout.remote(current_weights)]) n += 1 minibatchs = [ replaybuffer.sample_minibatch(batch_size=512) for _ in range(16) ] wip_learner = learner.update_network.remote(minibatchs) minibatchs = [ replaybuffer.sample_minibatch(batch_size=512) for _ in range(16) ] wip_tester = tester.test_play.remote(current_weights) t = time.time() lap_count = 0 while n <= num_iters: finished_actor, wip_actors = ray.wait(wip_actors, num_returns=1, timeout=0) if finished_actor: td_errors, transitions, pid = ray.get(finished_actor[0]) replaybuffer.add(td_errors, transitions) wip_actors.extend([actors[pid].rollout.remote(current_weights)]) n += 1 lap_count += 1 finished_learner, _ = ray.wait([wip_learner], num_returns=1, timeout=0) if finished_learner: current_weights, indices, td_errors, loss_info = ray.get( finished_learner[0]) wip_learner = learner.update_network.remote(minibatchs) current_weights = ray.put(current_weights) replaybuffer.update_priority(indices, td_errors) minibatchs = [ replaybuffer.sample_minibatch(batch_size=512) for _ in range(16) ] with summary_writer.as_default(): tf.summary.scalar("Buffer", len(replaybuffer), step=n) tf.summary.scalar("loss", loss_info, step=n) tf.summary.scalar("lap_count", lap_count, step=n) tf.summary.scalar("lap_time", time.time() - t, step=n) t = time.time() lap_count = 0 if n % 200 == 0: test_score = ray.get(wip_tester) wip_tester = tester.test_play.remote(current_weights) with summary_writer.as_default(): tf.summary.scalar("test_score", test_score, step=n)
class Agent(): """ Class implementation of a so-called "intelligent" agent. This agent interacts with and learns from the environment. """ double_dqn = False """ True for the Double-DQN method. """ dueling_network = False """ True for the Dueling Network (DN) method. """ prioritized_replay = False """ True for the Prioritized Replay memory buffer. """ def __init__(self, state_size, action_size, seed, lr_decay=9999e-4, double_dqn=False, dueling_network=False, prioritized_replay=False): """ Initialize an Agent instance. Params ====== state_size (int): Dimension of each state action_size (int): Dimension of each action seed (int): Random seed lr_decay (float): Multiplicative factor of learning rate decay double_dqn (bool): Toogle for using the Double-DQN method dueling_network (bool): Toogle for using the Dueling Network (DN) method prioritized_replay (bool): Toogle for using the Prioritized Replay method """ # Set the parameters. self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.double_dqn = double_dqn self.dueling_network = dueling_network self.prioritized_replay = prioritized_replay # Q-Network hidden layers. hidden_layers = [128, 32] # Use the Dueling Network (DN) method. if self.dueling_network: # DN requires a hidden state value. hidden_state_value = [64, 32] self.qnetwork_local = DuelingQNetwork( state_size, action_size, seed, hidden_layers, hidden_state_value).to(device) self.qnetwork_target = DuelingQNetwork( state_size, action_size, seed, hidden_layers, hidden_state_value).to(device) self.qnetwork_target.eval() else: # Use the Deep Q-Network (DQN) method. self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_layers).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_layers).to(device) self.qnetwork_target.eval() # Optimize using Adam. self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LEARNING_RATE) self.lr_scheduler = optim.lr_scheduler.ExponentialLR( self.optimizer, lr_decay) # Use the Prioritized Replay memory buffer if enabled. if self.prioritized_replay: self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device, alpha=0.6, beta=0.4, beta_scheduler=1.0) else: # Use the Replay memory buffer instead. self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device) # Initialize the time step (until the THRESHOLD is reached). self.t_step = 0 def step(self, state, action, reward, next_state, done): """ Update the network on each step. Params ====== state (array_like): Current state """ # Save experience in replay memory. self.memory.add(state, action, reward, next_state, done) # Learn every time step till THRESHOLD. self.t_step = (self.t_step + 1) % THRESHOLD if self.t_step == 0: # Initial time step. # If enough samples are available in memory, get random subset and learn. if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """ Return the actions for a given state as per current policy. Params ====== state (array_like): Current state eps (float): Epsilon (ε), for epsilon-greedy action selection """ # Epsilon-greedy action selection. if random.random() > eps: state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) # Train the network. self.qnetwork_local.train() # Return the action. return np.argmax(action_values.cpu().data.numpy()) else: # Return a random action. 
return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done, w) tuples gamma (float): Discount factor """ # Set the parameters. states, actions, rewards, next_states, dones, w = experiences # Compute and minimize the loss. with torch.no_grad(): if self.double_dqn: # Use of Double-DQN method. # Select the greedy actions using the QNetwork Local. # Calculate the pair action/reward for each of the next_states. next_action_rewards_local = self.qnetwork_local(next_states) # Select the action with the maximum reward for each of the next actions. greedy_actions_local = next_action_rewards_local.max( dim=1, keepdim=True)[1] ## Get the rewards for the greedy actions using the QNetwork Target. # Calculate the pair action/reward for each of the next_states. next_action_rewards_target = self.qnetwork_target(next_states) # Get the target reward for each of the greedy actions selected, # following the local network. target_rewards = next_action_rewards_target.gather( 1, greedy_actions_local) else: # Use of the fixed Q-target method. # Calculate the pair action/reward for each of the next_states. next_action_rewards = self.qnetwork_target(next_states) # Select the maximum reward for each of the next actions. target_rewards = next_action_rewards.max(dim=1, keepdim=True)[0] # Calculate the discounted target rewards. target_rewards = rewards + (gamma * target_rewards * (1 - dones)) # Calculate the pair action/rewards for each of the states. # Here, shape: [batch_size, action_size]. expected_action_rewards = self.qnetwork_local(states) # Get the reward for each of the actions. # Here, shape: [batch_size, 1]. expected_rewards = expected_action_rewards.gather(1, actions) # If the Prioritized Replay memory buffer if enabled. if self.prioritized_replay: target_rewards.sub_(expected_rewards) target_rewards.squeeze_() target_rewards.pow_(2) with torch.no_grad(): td_error = target_rewards.detach() td_error.pow_(0.5) self.memory.update_priorities(td_error) target_rewards.mul_(w) loss = target_rewards.mean() else: # Calculate the loss. loss = F.mse_loss(expected_rewards, target_rewards) # Perform the back-propagation. self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.lr_scheduler.step() # Update the target network. self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """ Soft update model parameters: θ_target = τ * θ_local + (1 - τ) * θ_target. Params ====== local_model (PyTorch model): Weights will be copied from target_model (PyTorch model): Weights will be copied to tau (float): Interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
return sum_reward / 10 obs = env.reset() while setps < max_steps: p = agents.acting.predict(np.array([obs])) for i in range(n_ant): if setps < 10000: p[i] = 2 * np.random.rand(n_actions) - 1 else: p[i] = np.clip(p[i][0] + 0.1 * np.random.randn(n_actions), -1, 1) next_obs, reward, terminated, info = env.step(np.hstack(p)) setps += 1 ep_len += 1 buff.add(obs, p, reward, next_obs, terminated) obs = next_obs if (terminated) | (ep_len == max_ep_len): obs = env.reset() terminated = False ep_len = 0 if setps % 10000 == 0: print(test_agent()) if (setps < 1000) | (setps % 50 != 0): continue for e in range(50): batch = buff.getBatch(batch_size)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, memory=None, random_seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory if memory is not None: self.memory = memory else: self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() if add_noise: action += self.noise.sample() self.actor_local.train() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
def train_model(context, data, training_batch): # Crear clase de datos sintéticos context.synthetic_data = SyntheticData(context=context, data=data, window=10000, frequency=30) # Crear configuración del modelo junto con redes neuronales create_model(context) # Configurar resumen de operaciones summary_ops, summary_vars = build_summaries() writer = tf.summary.FileWriter( "/home/enzo/PycharmProjects/DDPGPorfolioOptimization/summaries", context.sess.graph) if os.path.exists(context.model_path): context.saver.restore(context.sess, context.model_path) # Inicializar la memoria de repetición replay_buffer = ReplayBuffer(context.buffer_size) for episode in range(context.max_episodes): data, close_prices = context.synthetic_data.get_trayectory( t_intervals=context.max_ep_steps + context.n) # Resetear los valores del portafolio al inicio de cada episodio context.portfolio_value_memory = [] context.portfolio_value_memory.append(context.init_train_portfolio) context.train_invested_quantity = 0.0 context.assets_quantity_invested = [] context.portfolio_w_memory = [] context.init_portfolio_w = [] for i in range(len(context.assets) + 1): context.init_portfolio_w.append(0.0) context.portfolio_w_memory.append(context.init_portfolio_w) for i in range(len(context.assets)): context.assets_quantity_invested.append(0.0) context.train_cash = context.init_train_portfolio context.last_train_operation = 2 context.open_trade = False ep_reward = 0 ep_ave_max_q = 0 ep_loss = 0 # Se resta uno para tomar el cuenta la obtención del siguiente estado for i in range(context.max_ep_steps - 1): # Obtener el estado s = data[:, i:i + context.n, :] # Aplicar un error a la acción que permita equilibrar el problema de explotación/exploración random = np.random.rand() if random > context.epsilon: if s.shape == (len(context.assets), context.n, len(context.features)): a = context.actor.predict([s])[0] else: print("Episodio:", episode, "Paso:", i, "La forma del estado actual es incorrecta") continue else: rand_array = np.random.rand(len(context.assets) + 1) a = np.exp(rand_array) / np.sum(np.exp(rand_array)) context.epsilon = context.epsilon * context.epsilon_decay # Siguiente estado s2 = data[:, i + 1:i + 1 + context.n, :] if not s2.shape == (len( context.assets), context.n, len(context.features)): print("Episodio:", episode, "Paso:", i, "La forma del siguiente estado es incorrecta") continue # Recompensa this_closes = close_prices[:, i + context.n] previous_closes = close_prices[:, i + context.n - 1] r = get_reward(context, this_closes, previous_closes, a) # Punto terminal if i == (context.max_ep_steps - context.n - 2): t = True else: t = False replay_buffer.add(s, a, r, t, s2) if replay_buffer.size() > context.minibatch_size: s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch( context.minibatch_size) # Calcular objetivos target_q = context.critic.predict_target( s2_batch, context.actor.predict_target(s2_batch)) y_i = [] for k in range(context.minibatch_size): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + context.gamma * target_q[k]) # Actualizar el crítico dados los objetivos predicted_q_value_batch = np.reshape( y_i, (context.minibatch_size, 1)) predicted_q_value, losses, _ = context.critic.train( s_batch, a_batch, predicted_q_value_batch) ep_loss += np.mean(losses) ep_ave_max_q += np.amax(predicted_q_value) # Actualizar la política del actor utilizando el ejemplar de gradiente a_outs = context.actor.predict(s_batch) grads = context.critic.action_gradients(s_batch, a_outs) 
context.actor.train(s_batch, grads[0]) # Actualizar las redes objetivo context.actor.update_target_network() context.critic.update_target_network() ep_reward += r if i == (context.max_ep_steps - 2): summary_str = context.sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_ave_max_q / float(i), summary_vars[2]: ep_loss / float(i) }) writer.add_summary(summary_str, episode) writer.flush() print( '| Reward: {:.5f} | Episode: {:d} | Qmax: {:.4f} | Porfolio value: {:.4f} | Epsilon: {:.5f} ' .format(ep_reward, episode, (ep_ave_max_q / float(i)), context.portfolio_value_memory[-1], context.epsilon)) _ = context.saver.save(context.sess, context.model_path)
class MADDPG: def __init__(self, num_agents, state_size, action_size, hidden_layers, seed, gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, weight_decay=WEIGHT_DECAY, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE): """Initialize MADDPG agent.""" super(MADDPG, self).__init__() self.seed = random.seed(seed) self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.gamma = gamma self.tau = tau self.lr_actor = lr_actor self.lr_critic = lr_critic self.weight_decay = weight_decay self.buffer_size = buffer_size self.batch_size = batch_size self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma, \ tau, lr_actor, lr_critic, weight_decay, seed) \ for _ in range(num_agents)] self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size) def act(self, states): actions = np.zeros([self.num_agents, self.action_size]) for index, agent in enumerate(self.agents): actions[index, :] = agent.act(states[index]) return actions def step(self, states, actions, rewards, next_states, dones): """One step for MADDPG agent, include store the current transition and update parameters.""" self.replay_buffer.add(states, actions, rewards, next_states, dones) if len(self.replay_buffer) > self.batch_size: ''' experiences = self.replay_buffer.sample() states_list, _, _, _, _ = experiences next_actions_list = [self.agents[idx].target_actor(states).detach() \ for idx, states in enumerate(states_list)] for i in range(self.num_agents): self.agents[i].step_learn(experiences, next_actions_list, i) ''' for agent in self.agents: experiences = self.replay_buffer.sample() agent.step_learn(experiences) def save_weights(self): for index, agent in enumerate(self.agents): torch.save( agent.critic.state_dict(), 'agent{}_critic_trained_with_DDPG.pth'.format(index + 1)) torch.save(agent.actor.state_dict(), 'agent{}_actor_trained_with_DDPG.pth'.format(index + 1)) def reset(self): for agent in self.agents: agent.reset()
loss = [] for step in range(max_steps + 1): # transition action = actor.get_action(observation, episode, mainQNet) next_observation, reward, done, _ = env.step(action) next_observation = np.reshape(next_observation, (1, input_size)) # if terminal if done: next_observation = np.zeros_like(observation) if step < 195: # failure reward = -1 else: #success reward = 1 memory.add((observation, action, reward, next_observation)) break else: reward = 0 score += 1 memory.add((observation, action, reward, next_observation)) observation = next_observation if memory.length() > batch_size: loss_value = mainQNet.train(batch_size, gamma, memory, targetQNet) loss.append(loss_value) # record score_record.append(score)
class Agent(): """ Class implementation of a so-called "intelligent" agent. This agent interacts with and learns from the environment. This agent employs the DDPG algorithm to solve this problem. """ # actor_local = None # actor_target = None # actor_optimizer = None """ Class-level Actor properties. """ # critic_local = None # critic_target = None # critic_optimizer = None """ Class-level Critic properties. """ # memory = None """ Class-level memory variable. """ def __init__(self, state_size, action_size, seed, add_noise=True): """ Initialize an Agent instance. Params ====== state_size (int): Dimension of each state action_size (int): Dimension of each action seed (int): Random seed add_noise (bool): Toggle for using the stochastic process """ # Set the parameters. self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Setting the Actor network (with the Target Network). self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) # Optimize the Actor using Adam. self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Setting the Critic network (with the Target Network). self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) # Optimize the Critic using Adam. self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Set up noise processing. if add_noise: self.noise = Noise((20, action_size), seed) # Use the Replay memory buffer (once per class). self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device) # Initialize the time step (until max NUM_TIME_STEPS is reached). # self.t_step = 0 def step(self, time_step, states, actions, rewards, next_states, dones): """ Update the network on each step. In other words, save the experience in replay memory, and then use random sampling from the buffer to learn. """ # Save experience in replay memory. for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn every time step till NUM_TIME_STEPS is reached. # if time_step % NUM_TIME_STEPS != 0: # return # Save the experience in replay memory, then use random sampling from the buffer to learn. self.sample_and_learn() def sample_and_learn(self): """ For a specified number of agents, use random sampling from the buffer to learn. """ # If enough samples are available in memory, get random subset and learn. if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) # for _ in range(NUM_LEARN_UPDATES): # experiences = Agent.memory.sample() # self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """ Return the actions for a given state as per current policy. Params ====== state (array_like): Current state add_noise (bool): Toggle for using the stochastic process """ state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() # If the stochastic process is enabled. if add_noise: action += self.noise.sample() # Return the action. return np.clip(action, -1, 1) def reset(self): """ Reset the state. """ # Reset the internal state (noise) to mean (mu). 
self.noise.reset() def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples. i.e., Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where actor_target(state) -> action, and critic_target(state, action) -> Q-value. Params ====== experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done, w) tuples gamma (float): Discount factor """ # Set the parameters. states, actions, rewards, next_states, dones = experiences """ Update the Critic. """ # Get the predicted next-state actions and Q-values from the target models. # Calculate the pair action/reward for each of the next_states. actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q-targets for the current states, (y_i). Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute the Critic loss. Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss. self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() """ Update the Actor. """ # Compute the Actor loss. actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss. self.actor_optimizer.zero_grad() # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) actor_loss.backward() self.actor_optimizer.step() """ Update the target networks. """ self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """ Soft update model parameters. i.e., θ_target = τ * θ_local + (1 - τ) * θ_target. Params ====== local_model (PyTorch model): Weights will be copied from target_model (PyTorch model): Weights will be copied to tau (float): Interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
class MPOAgent: def __init__(self, env_id: str, logdir: Path): self.env_id = env_id self.summary_writer = tf.summary.create_file_writer( str(logdir)) if logdir else None self.action_space = gym.make(self.env_id).action_space.shape[0] self.replay_buffer = ReplayBuffer(maxlen=10000) self.policy = GaussianPolicyNetwork(action_space=self.action_space) self.target_policy = GaussianPolicyNetwork( action_space=self.action_space) self.critic = QNetwork() self.target_critic = QNetwork() self.log_temperature = tf.Variable(1.) self.log_alpha_mu = tf.Variable(1.) self.log_alpha_sigma = tf.Variable(1.) self.eps = 0.1 self.eps_mu = 0.01 self.eps_sigma = 0.001 self.policy_optimizer = tf.keras.optimizers.Adam(lr=0.0005) self.critic_optimizer = tf.keras.optimizers.Adam(lr=0.0005) self.temperature_optimizer = tf.keras.optimizers.Adam(lr=0.0005) self.alpha_optimizer = tf.keras.optimizers.Adam(lr=0.0005) self.batch_size = 128 self.n_samples = 10 self.update_period = 4 self.gamma = 0.99 self.target_policy_update_period = 400 self.target_critic_update_period = 400 self.global_steps = 0 self.episode_count = 0 self.setup() def setup(self): """ Initialize network weights """ env = gym.make(self.env_id) dummy_state = env.reset() dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32) dummy_action = np.random.normal(0, 0.1, size=self.action_space) dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32) self.policy(dummy_state) self.target_policy(dummy_state) self.critic(dummy_state, dummy_action) self.target_critic(dummy_state, dummy_action) self.target_policy.set_weights(self.policy.get_weights()) self.target_critic.set_weights(self.critic.get_weights()) def save(self, save_dir): save_dir = Path(save_dir) self.policy.save_weights(str(save_dir / "policy")) self.critic.save_weights(str(save_dir / "critic")) def load(self, load_dir=None): load_dir = Path(load_dir) self.policy.load_weights(str(load_dir / "policy")) self.target_policy.load_weights(str(load_dir / "policy")) self.critic.load_weights(str(load_dir / "critic")) self.target_critic.load_weights(str(load_dir / "critic")) def rollout(self): episode_rewards, episode_steps = 0, 0 done = False env = gym.make(self.env_id) state = env.reset() while not done: action = self.policy.sample_action(np.atleast_2d(state)) action = action.numpy()[0] try: next_state, reward, done, _ = env.step(action) except Exception as err: print(err) import pdb pdb.set_trace() #: Bipedalwalkerの転倒ペナルティ-100は大きすぎるためclip transition = Transition(state, action, np.clip(reward, -1., 1.), next_state, done) self.replay_buffer.add(transition) state = next_state episode_rewards += reward episode_steps += 1 self.global_steps += 1 if (len(self.replay_buffer) >= 5000 and self.global_steps % self.update_period == 0): self.update_networks() if self.global_steps % self.target_critic_update_period == 0: self.target_critic.set_weights(self.critic.get_weights()) if self.global_steps % self.target_policy_update_period == 0: self.target_policy.set_weights(self.policy.get_weights()) self.episode_count += 1 with self.summary_writer.as_default(): tf.summary.scalar("episode_reward_stp", episode_rewards, step=self.global_steps) tf.summary.scalar("episode_steps_stp", episode_steps, step=self.global_steps) tf.summary.scalar("episode_reward", episode_rewards, step=self.episode_count) tf.summary.scalar("episode_steps", episode_steps, step=self.episode_count) return episode_rewards, episode_steps def update_networks(self): (states, actions, rewards, next_states, dones) = 
self.replay_buffer.get_minibatch(batch_size=self.batch_size) B, M = self.batch_size, self.n_samples # [B, obs_dim] -> [B, obs_dim * M] -> [B * M, obs_dim] next_states_tiled = tf.reshape(tf.tile(next_states, multiples=(1, M)), shape=(B * M, -1)) target_mu, target_sigma = self.target_policy(next_states_tiled) # For MultivariateGaussianPolicy #target_dist = tfd.MultivariateNormalFullCovariance(loc=target_mu, covariance_matrix=target_sigma) # For IndependentGaussianPolicy target_dist = tfd.Independent(tfd.Normal(loc=target_mu, scale=target_sigma), reinterpreted_batch_ndims=1) sampled_actions = target_dist.sample() # [B * M, action_dim] #sampled_actions = tf.clip_by_value(sampled_actions, -1.0, 1.0) # Update Q-network: sampled_qvalues = tf.reshape(self.target_critic( next_states_tiled, sampled_actions), shape=(B, M, -1)) mean_qvalues = tf.reduce_mean(sampled_qvalues, axis=1) TQ = rewards + self.gamma * (1.0 - dones) * mean_qvalues with tf.GradientTape() as tape1: Q = self.critic(states, actions) loss_critic = tf.reduce_mean(tf.square(TQ - Q)) variables = self.critic.trainable_variables grads = tape1.gradient(loss_critic, variables) grads, _ = tf.clip_by_global_norm(grads, 40.) self.critic_optimizer.apply_gradients(zip(grads, variables)) # E-step: # Obtain η* by minimising g(η) with tf.GradientTape() as tape2: temperature = tf.math.softplus(self.log_temperature) q_logsumexp = tf.math.reduce_logsumexp(sampled_qvalues / temperature, axis=1) loss_temperature = temperature * ( self.eps + tf.reduce_mean(q_logsumexp, axis=0)) grad = tape2.gradient(loss_temperature, self.log_temperature) if tf.math.is_nan(grad).numpy().sum() != 0: print("NAN GRAD in TEMPERATURE !!!!!!!!!") import pdb pdb.set_trace() else: self.temperature_optimizer.apply_gradients([ (grad, self.log_temperature) ]) # Obtain sample-based variational distribution q(a|s) temperature = tf.math.softplus(self.log_temperature) # M-step: Optimize the lower bound J with respect to θ weights = tf.squeeze(tf.math.softmax(sampled_qvalues / temperature, axis=1), axis=2) # [B, M, 1] if tf.math.is_nan(weights).numpy().sum() != 0: print("NAN in weights !!!!!!!!!") import pdb pdb.set_trace() with tf.GradientTape(persistent=True) as tape3: online_mu, online_sigma = self.policy(next_states_tiled) # For MultivariateGaussianPolicy #online_dist = tfd.MultivariateNormalFullCovariance(loc=online_mu, covariance_matrix=online_sigma) # For IndependentGaussianPolicy online_dist = tfd.Independent(tfd.Normal(loc=online_mu, scale=online_sigma), reinterpreted_batch_ndims=1) log_probs = tf.reshape(online_dist.log_prob(sampled_actions) + 1e-6, shape=(B, M)) # [B * M, ] -> [B, M] cross_entropy_qp = tf.reduce_sum(weights * log_probs, axis=1) # [B, M] -> [B,] # For MultivariateGaussianPolicy # online_dist_fixedmu = tfd.MultivariateNormalFullCovariance(loc=target_mu, covariance_matrix=online_sigma) # online_dist_fixedsigma = tfd.MultivariateNormalFullCovariance(loc=online_mu, covariance_matrix=target_sigma) # For IndependentGaussianPolicy online_dist_fixedmu = tfd.Independent(tfd.Normal( loc=target_mu, scale=online_sigma), reinterpreted_batch_ndims=1) online_dist_fixedsigma = tfd.Independent( tfd.Normal(loc=online_mu, scale=target_sigma), reinterpreted_batch_ndims=1) kl_mu = tf.reshape( target_dist.kl_divergence(online_dist_fixedsigma), shape=(B, M)) # [B * M, ] -> [B, M] kl_sigma = tf.reshape( target_dist.kl_divergence(online_dist_fixedmu), shape=(B, M)) # [B * M, ] -> [B, M] alpha_mu = tf.math.softplus(self.log_alpha_mu) alpha_sigma = 
tf.math.softplus(self.log_alpha_sigma) loss_policy = -cross_entropy_qp # [B,] loss_policy += tf.stop_gradient(alpha_mu) * tf.reduce_mean(kl_mu, axis=1) loss_policy += tf.stop_gradient(alpha_sigma) * tf.reduce_mean( kl_sigma, axis=1) loss_policy = tf.reduce_mean(loss_policy) # [B,] -> [1] loss_alpha_mu = tf.reduce_mean( alpha_mu * tf.stop_gradient(self.eps_mu - tf.reduce_mean(kl_mu, axis=1))) loss_alpha_sigma = tf.reduce_mean( alpha_sigma * tf.stop_gradient(self.eps_sigma - tf.reduce_mean(kl_sigma, axis=1))) loss_alpha = loss_alpha_mu + loss_alpha_sigma variables = self.policy.trainable_variables grads = tape3.gradient(loss_policy, variables) grads, _ = tf.clip_by_global_norm(grads, 40.) self.policy_optimizer.apply_gradients(zip(grads, variables)) variables = [self.log_alpha_mu, self.log_alpha_sigma] grads = tape3.gradient(loss_alpha, variables) grads, _ = tf.clip_by_global_norm(grads, 40.) self.alpha_optimizer.apply_gradients(zip(grads, variables)) del tape3 with self.summary_writer.as_default(): tf.summary.scalar("loss_policy", loss_policy, step=self.global_steps) tf.summary.scalar("loss_critic", loss_critic, step=self.global_steps) tf.summary.scalar("sigma", tf.reduce_mean(online_sigma), step=self.global_steps) tf.summary.scalar("kl_mu", tf.reduce_mean(kl_mu), step=self.global_steps) tf.summary.scalar("kl_sigma", tf.reduce_mean(kl_sigma), step=self.global_steps) tf.summary.scalar("temperature", temperature, step=self.global_steps) tf.summary.scalar("alpha_mu", alpha_mu, step=self.global_steps) tf.summary.scalar("alpha_sigma", alpha_sigma, step=self.global_steps) tf.summary.scalar("replay_buffer", len(self.replay_buffer), step=self.global_steps) def testplay(self, name, monitor_dir): total_rewards = [] env = wrappers.RecordVideo(gym.make(self.env_id), video_folder=monitor_dir, step_trigger=lambda i: True, name_prefix=name) state = env.reset() done = False total_reward = 0 while not done: action = self.policy.sample_action(np.atleast_2d(state)) action = action.numpy()[0] next_state, reward, done, _ = env.step(action) total_reward += reward state = next_state total_rewards.append(total_reward) print(f"{name}", total_reward)
class Agent(object): def __init__(self, state_size, action_size, seed, config): self.state_size = state_size self.action_size = action_size self.config = config self.seed = random.seed(seed) self.local_q_net = QNetwork(state_size, action_size, seed).to(device) self.target_q_net = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.local_q_net.parameters(), lr=config["LR"]) self.memory = ReplayBuffer(action_size, config["BUFFER_SIZE"], config["BATCH_SIZE"], seed) self.t_step = 0 def step(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) self.t_step = (self.t_step + 1) % self.config["UPDATE_EVERY"] if self.t_step == 0: # if agent experienced enough if len(self.memory) > self.config["BATCH_SIZE"]: experiences = self.memory.sample() # Learn from previous experiences self.learn(experiences, self.config["GAMMA"]) def act(self, state, eps=0.0): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.local_q_net.eval() with torch.no_grad(): action_values = self.local_q_net(state) self.local_q_net.train() if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): # Double Q Learning states, actions, rewards, next_states, dones = experiences # Get next action estimation with local q network q_targets_next_expected = self.local_q_net(next_states).detach() q_targets_next_expected_actions = q_targets_next_expected.max( 1)[1].unsqueeze(1) # Calculate Next Targets q_targets_next = self.target_q_net(next_states).gather( 1, q_targets_next_expected_actions) # Non over-estimated targets q_targets = rewards + (gamma * q_targets_next * (1 - dones)) # Expected value q_expected = self.local_q_net(states).gather(1, actions) loss = torch.nn.functional.mse_loss(q_expected, q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.local_q_net, self.target_q_net, self.config["TAU"]) def soft_update(self, local_net, target_net, tau): for target_param, local_param in zip(target_net.parameters(), local_net.parameters()): target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
import gym import torch from buffer import ReplayBuffer from model import Actor gym.logger.set_level(40) num_episode = 5 env = gym.make('Pendulum-v0') buffer = ReplayBuffer(max_size=100) actor = Actor(env.observation_space.shape[0], env.action_space.shape[0]) for e in range(num_episode): cumulative_reward = 0 state = env.reset() for i in range(env.spec.max_episode_steps): action = actor(torch.FloatTensor(state)).detach().numpy() next_state, reward, done, info = env.step(action * env.action_space.high[0]) buffer.add([state, next_state, reward, done]) state = next_state cumulative_reward += reward print(f'Episode: {e:>3}, Reward: {cumulative_reward:>8.2f}') print(len(buffer))
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_shape, action_size, seed, cnn=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed cnn (bool): whether to use convolutional NN """ self.state_shape = state_shape self.action_size = action_size self.seed = random.seed(seed) self.cnn = cnn if cnn: self.qnetwork_local = QNetworkFullyConvolutional( state_shape, action_size, seed).to(device) self.qnetwork_target = QNetworkFullyConvolutional( state_shape, action_size, seed).to(device) else: self.qnetwork_local = QNetworkFullyConnected( state_shape, action_size, seed).to(device) self.qnetwork_target = QNetworkFullyConnected( state_shape, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ if self.cnn: state = torch.from_numpy(state).float().to(device) else: state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences if self.cnn: n, x, y, c = states.shape states = states.reshape(n, c, x, y) # Get max predicted Q values (for next states) from target model q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states q_targets = rewards + (gamma * q_targets_next * (1 - dones)) # Get expected Q values from local model q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(q_expected, q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class MADDPG(): def __init__(self, num_agents, state_size, action_size, random_seed): super(MADDPG, self).__init__() self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.random_seed = random_seed self.maddpg_agent = [ Agent(self.state_size, self.action_size, self.num_agents * self.state_size, self.num_agents * self.action_size, self.random_seed) for i in range(self.num_agents) ] self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.noise_amplitud = 1 self.noise_reduction = 0.9995 self.t_step = 0 def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) self.t_step += 1 if len(self.memory) > BATCH_SIZE and self.t_step % UPDATE_EVERY == 0: # Learn, if enough samples are available in memory for _ in range(round(UPDATE_AMOUNT)): for agent in range(self.num_agents): experiences = self.memory.sample() self.learn(experiences, agent, GAMMA) self.update_targets() def act(self, states): """get actions from all agents in the MADDPG object""" if self.t_step < NOISE_START: noise_amplitud = 0 else: noise_amplitud = self.noise_amplitud self.noise_amplitud = max( self.noise_amplitud * self.noise_reduction, 0.1) actions = np.array([ agent.act(state, noise_amplitud) for agent, state in zip(self.maddpg_agent, states) ]) return actions def target_actors(self, states): target_actions = torch.cat([ agent.actor_target(states[:, i, :]) for i, agent in enumerate(self.maddpg_agent) ], dim=1) return target_actions def actors(self, states): actions = torch.cat([ agent.actor(states[:, i, :]) for i, agent in enumerate(self.maddpg_agent) ], dim=1) return actions def learn(self, experiences, agent_number, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences agent = self.maddpg_agent[agent_number] # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models target_actions_full = self.target_actors(next_states) next_states_full = next_states.view(-1, self.num_agents * self.state_size) # target_critic_input = torch.cat((next_states_full,target_actions_full), dim = 1) Q_targets_next = agent.critic_target(next_states_full, target_actions_full) # Compute Q targets for current states (y_i) Q_targets = rewards[:, agent_number].view( -1, 1) + (gamma * Q_targets_next * (1 - dones[:, agent_number].view(-1, 1))) # Compute critic loss actions_full = actions.view(-1, self.action_size * self.num_agents) states_full = states.view(-1, self.num_agents * self.state_size) # critic_input = torch.cat((states_full,actions_full), dim = 1) Q_expected = agent.critic(states_full, actions_full) critic_loss = F.mse_loss(Q_expected, Q_targets) # critic_loss = huber_loss(Q_expected, Q_targets.detach()) # Minimize the loss agent.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 1) agent.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_full_pred = self.actors(states) # critic_input_loss = torch.cat((states_batch, actions_full), dim = 1) actor_loss = -agent.critic(states_full, actions_full_pred).mean() # Minimize the loss agent.actor_optimizer.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 1) agent.actor_optimizer.step() def update_targets(self): """soft update target networks""" for agent in self.maddpg_agent: self.soft_update(agent.actor, agent.actor_target, TAU) self.soft_update(agent.critic, agent.critic_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def reset(self): for ddpg_agent in self.maddpg_agent: ddpg_agent.noise.reset()
# start collecting data obs = env.reset() obs = np.expand_dims(np.array(obs), axis=0) reset = True duration = [] episode_start = 0 episode_end = 0 for t in range(total_timesteps): env.render() update_eps = tf.constant(exploration.value(t)) action = agent.step(tf.constant(obs), update_eps=update_eps) action = action[0].numpy() # convert the tensor to numpy for env input reset = False new_obs, rew, done, _ = env.step(action) new_obs = np.expand_dims(np.array(new_obs), axis=0) replay_buffer.add(obs[0], action, rew, new_obs[0], float(done)) obs = new_obs episode_rewards[-1] += rew if done: episode_end = t duration.append(episode_end - episode_start) episode_start = t obs = env.reset() obs = np.expand_dims(np.array(obs), axis=0) episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
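# The epsilon used in the data-collection loop above comes from exploration.value(t).
# A minimal linear-decay schedule like the one assumed there could look as follows
# (a sketch, not the original implementation):
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the decay completed, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)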
def train(sess, env, args, actor_critic): sess.run(tf.global_variables_initializer()) global_summary = tf.summary.FileWriter( 'summaries/' + 'feeding_sac_all' + datetime.datetime.now().strftime('%d-%m-%y%H%M'), sess.graph) actor_critic.update_target_network() replay_buffer = ReplayBuffer(int(args['buffer_size'])) pbar = tqdm(total=int(args['max_steps']), dynamic_ncols=True) tfirststart = time.perf_counter() total_step = 0 while total_step < int(args['max_steps']): state = env.reset() episode_reward = 0 end_step = 0 while True: action, greedy_action = actor_critic.actor_predict([state]) action = action[0] greedy_action = greedy_action[0] state2, reward, done, info = env.step(action) episode_reward += reward end_step += 1 total_step += 1 replay_buffer.add(state, action, reward, state2, done) state = state2 if total_step > 100 * int(args['minibatch_size']): batch_state, batch_actions, batch_rewards, batch_state2, batch_dones = replay_buffer.sample( int(args['minibatch_size'])) actor_loss, critic_loss, value_loss, all_loss, _ = actor_critic.all_train( batch_state, batch_state2, batch_actions, batch_rewards, batch_dones) actor_critic.update_target_network() summary = tf.Summary() summary.value.add(tag='loss/value_loss', simple_value=value_loss) summary.value.add(tag='loss/critic_loss', simple_value=critic_loss) summary.value.add(tag='loss/actor_loss', simple_value=actor_loss) summary.value.add(tag='loss/total_loss', simple_value=all_loss) global_summary.add_summary(summary, total_step) global_summary.flush() if total_step % 1000000 == 0 and total_step != 0: tnow = time.perf_counter() print('consume time', tnow - tfirststart) savepath = osp.join("my_model_sac/", '%.5i' % total_step) os.makedirs(savepath, exist_ok=True) savepath = osp.join(savepath, 'sacmodel') print('Saving to', savepath) save_state(savepath) if done: success_time = env.success_time() fall_time = env.fall_times() msg = 'step: {},episode reward: {},episode len: {},success_time: {},fall_time: {}' pbar.update(total_step) pbar.set_description( msg.format(total_step, episode_reward, end_step, success_time, fall_time)) summary = tf.Summary() summary.value.add(tag='Perf/Reward', simple_value=episode_reward) summary.value.add(tag='Perf/episode_len', simple_value=end_step) summary.value.add(tag='Perf/success_time', simple_value=success_time) summary.value.add(tag='Perf/fall_time', simple_value=fall_time) global_summary.add_summary(summary, total_step) global_summary.flush() break
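# The SAC train() function above indexes args by string keys and casts values to int,
# which suggests an argparse-style dictionary. A minimal sketch of such a parser;
# the keys are inferred from the usages above and the defaults are placeholders:
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='SAC training options')
    parser.add_argument('--buffer-size', dest='buffer_size', type=int, default=1000000)
    parser.add_argument('--minibatch-size', dest='minibatch_size', type=int, default=256)
    parser.add_argument('--max-steps', dest='max_steps', type=int, default=10000000)
    return vars(parser.parse_args())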
class MADDPG(): """Agent that contains the two DDPG agents and shared replay buffer.""" def __init__(self, action_size=2, n_agents=2, seed=0): """ Params ====== action_size (int): dimension of each action seed (int): Random seed n_agents (int): number of agents """ self.n_agents = n_agents self.t_step = 0 self.noise_on = True # create two agents, each with their own actor and critic models = [ model.Actor_Critic_Models(n_agents=n_agents) for _ in range(n_agents) ] self.agents = [DDPG(i, models[i]) for i in range(n_agents)] # create shared replay buffer self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) def step(self, all_states, all_actions, all_rewards, all_next_states, all_dones): all_states = all_states.reshape(1, -1) all_next_states = all_next_states.reshape(1, -1) self.memory.add(all_states, all_actions, all_rewards, all_next_states, all_dones) self.t_step = self.t_step + 1 if self.t_step % UPDATE_EVERY == 0: if len(self.memory) > BATCH_SIZE: experiences = [ self.memory.sample() for _ in range(self.n_agents) ] self.learn(experiences, GAMMA) def act(self, all_states, add_noise=True): # pass each agent's state from the environment and calculate its action all_actions = [] for agent, state in zip(self.agents, all_states): action = agent.act(state, add_noise=self.noise_on) #self.noise_weight *= noise_decay all_actions.append(action) return np.array(all_actions).reshape( 1, -1) # reshape 2x2 into 1x4 dim vector def learn(self, experiences, gamma): all_next_actions = [] all_actions = [] for i, agent in enumerate(self.agents): states, _, _, next_states, _ = experiences[i] agent_id = torch.tensor([i]).to(device) # extract agent i's state and get action via actor network state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1) action = agent.actor_local(state) all_actions.append(action) # extract agent i's next state and get action via target actor network next_state = next_states.reshape(-1, 2, 24).index_select( 1, agent_id).squeeze(1) next_action = agent.actor_target(next_state) all_next_actions.append(next_action) for i, agent in enumerate(self.agents): agent.learn(i, experiences[i], gamma, all_next_actions, all_actions) def save_agents(self): for i, agent in enumerate(self.agents): torch.save(agent.actor_local.state_dict(), f"checkpoint_actor_agent_{i}.pth") torch.save(agent.critic_local.state_dict(), f"checkpoint_critic_agent_{i}.pth")
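# The class above saves per-agent checkpoints in save_agents() but has no loader;
# a matching load helper might look like this (a sketch; the file names mirror
# save_agents, and map_location is an assumed convenience argument):
import torch

def load_agents(maddpg, map_location='cpu'):
    for i, agent in enumerate(maddpg.agents):
        agent.actor_local.load_state_dict(
            torch.load(f"checkpoint_actor_agent_{i}.pth", map_location=map_location))
        agent.critic_local.load_state_dict(
            torch.load(f"checkpoint_critic_agent_{i}.pth", map_location=map_location))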
class Agent(): def __init__(self, state_size, action_size, seed=0, lr=1e-3, update_every=4, batch_size=4, buffer_size=64, gamma=0.0994, tau=1e-3, model_path="model.pth"): self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") print("=== AGENT ===") print(f"Created agent on device: {self.device}") self.model_path = model_path self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.update_every = update_every self.batch_size = batch_size self.gamma = gamma self.tau = tau # network variables self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) self.load() # Control variables self.memory = ReplayBuffer(action_size, buffer_size, self.batch_size, seed, self.device) self.t_step = 0 def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss and backpropagate loss = F.mse_loss(Q_expected, Q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() # Update target network soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def save(self): torch.save(self.qnetwork_local.state_dict(), self.model_path) torch.save(self.qnetwork_target.state_dict(), self.model_path.replace('.pth', '_target.pth')) print("Saved agent model.") def load(self): if (os.path.isfile(self.model_path)): self.qnetwork_local.load_state_dict(torch.load(self.model_path)) self.qnetwork_target.load_state_dict( torch.load(self.model_path.replace('.pth', '_target.pth'))) print(f"Loaded agent model: {self.model_path}")
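# Agent.learn() above calls a module-level soft_update helper that is not shown in
# this snippet; a standard Polyak-averaging implementation consistent with that
# call signature (local network, target network, interpolation factor tau):
def soft_update(local_model, target_model, tau):
    """theta_target = tau * theta_local + (1 - tau) * theta_target"""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)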
class MADDPG_Trainer: def __init__(self, n_agents, act_spcs, ob_spcs, writer, args): self.args = args self.memory = ReplayBuffer(args.buffer_length, n_agents, device) self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS, FINAL_STD, INITIAL_STD, warmup_steps=WARMUP_STEPS) self.n_agents = n_agents self.act_spcs = act_spcs self.ob_spcs = ob_spcs self.agents = [ DDPG_agent(self.act_spcs[i], self.ob_spcs[i], np.sum(self.ob_spcs), np.sum(self.act_spcs)) for i in range(n_agents) ] self.n_steps = 0 self.n_updates = 0 self.writer = writer self.criterion = nn.MSELoss() def get_actions(self, states): return [ agent.select_action(state)[0] for agent, state in zip(self.agents, states) ] def store_transitions(self, states, actions, rewards, next_states, dones): self.memory.add(states, actions, rewards, next_states, dones) def reset(self): pass def transform_states(self, states, N): obses = [] for i in range(N): states_ = [] for j in range(self.n_agents): states_.append(states[j][i]) obses.append(torch.cat([f.float().to(device) for f in states_])) return torch.stack(obses) def transform_actions(self, actions, N): acts = [] for i in range(N): actions_ = [] for j in range(self.n_agents): actions_.append(actions[j][i]) acts.append(torch.cat([f.float().to(device) for f in actions_])) return torch.stack(acts) def update_all_targets(self): for agent in self.agents: soft_update(agent.policy_targ, agent.policy, TAU) soft_update(agent.qnet_targ, agent.qnet, TAU) def prep_training(self): for agent in self.agents: agent.qnet.train() agent.policy.train() agent.qnet_targ.train() agent.policy_targ.train() def eval(self): for agent in self.agents: agent.qnet.eval() agent.policy.eval() agent.qnet_targ.eval() agent.policy_targ.eval() def sample_and_train(self, batch_size): # TODO ADD Model saving, optimize code batch = self.memory.sample(min(batch_size, len(self.memory))) states_i, actions_i, rewards_i, next_states_i, dones_i = batch states_all = torch.cat(states_i, 1) next_states_all = torch.cat(next_states_i, 1) actions_all = torch.cat(actions_i, 1) for i, agent in enumerate(self.agents): next_actions_all = [ onehot_from_logits(ag.policy_targ(next_state)) for ag, next_state in zip(self.agents, next_states_i) ] # computing target total_obs = torch.cat( [next_states_all, torch.cat(next_actions_all, 1)], 1) target_q = self.agents[i].qnet_targ(total_obs).detach() rewards = rewards_i[i].view(-1, 1) dones = dones_i[i].view(-1, 1) target_q = rewards + (1 - dones) * GAMMA * target_q # computing the inputs input_q = self.agents[i].qnet( torch.cat([states_all, actions_all], 1)) self.agents[i].q_optimizer.zero_grad() loss = self.criterion(input_q, target_q.detach()) # print("LOSS", loss) loss.backward() torch.nn.utils.clip_grad_norm_(self.agents[i].qnet.parameters(), 0.5) self.agents[i].q_optimizer.step() actor_loss = 0 # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø # use gumbel softmax max temp trick policy_out = self.agents[i].policy(states_i[i]) gumbel_sample = gumbel_softmax(policy_out, hard=True) actions_curr_pols = [ onehot_from_logits(agent_.policy(state)) for agent_, state in zip(self.agents, states_i) ] for action_batch in actions_curr_pols: action_batch.detach_() actions_curr_pols[i] = gumbel_sample actor_loss = -self.agents[i].qnet( torch.cat( [states_all.detach(), torch.cat(actions_curr_pols, 1)], 1)).mean() actor_loss += (policy_out**2).mean() * 1e-3 self.agents[i].p_optimizer.zero_grad() actor_loss.backward() # nn.utils.clip_grad_norm_(self.policy.parameters(), 5) 
torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(), 0.5) self.agents[i].p_optimizer.step() # detach the forward propagated action samples actions_i[i].detach_() if self.args.use_writer: self.writer.add_scalars("Agent_%i" % i, { "vf_loss": loss, "actor_loss": actor_loss }, self.n_updates) self.update_all_targets() self.n_updates += 1
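# The trainer above relies on onehot_from_logits and gumbel_softmax helpers
# (straight-through Gumbel-Softmax for discrete MADDPG actions). A minimal sketch
# of both, assuming logits of shape (batch, n_actions); the logprobs=True variant
# used by the SAC/TD3 trainer later is not covered here:
import torch
import torch.nn.functional as F

def onehot_from_logits(logits):
    # Greedy one-hot encoding of the argmax action
    argmax = logits.argmax(dim=-1, keepdim=True)
    return torch.zeros_like(logits).scatter_(-1, argmax, 1.0)

def gumbel_softmax(logits, temperature=1.0, hard=True):
    # Differentiable sample from the Gumbel-Softmax distribution; with hard=True
    # the forward pass is one-hot while gradients flow through the soft sample
    # (straight-through estimator).
    gumbels = -torch.log(-torch.log(torch.rand_like(logits) + 1e-20) + 1e-20)
    y_soft = F.softmax((logits + gumbels) / temperature, dim=-1)
    if not hard:
        return y_soft
    y_hard = onehot_from_logits(y_soft)
    return (y_hard - y_soft).detach() + y_soft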
class MADDPG_Trainer: def __init__(self, n_agents, act_spcs, ob_spcs, writer, args): self.args = args self.memory = ReplayBuffer(args.buffer_length, n_agents, device) # self.memory = ReplayMemory(args.buffer_length, n_agents, device) self.use_maddpg = args.algo == "maddpg" self.use_sac = args.use_sac self.use_td3 = args.use_td3 self.use_single_q = args.single_q self.all_obs = args.all_obs self.n_agents = n_agents self.act_spcs = act_spcs self.ob_spcs = ob_spcs qnet_actspcs = [np.sum(self.act_spcs) if self.use_maddpg else self.act_spcs[i] for i in range(n_agents)] qnet_obspcs = [np.sum(self.ob_spcs) if self.use_maddpg else self.ob_spcs[i] for i in range(n_agents)] if self.use_sac and not self.use_td3: self.agents = [SAC_agent(self.act_spcs[i], qnet_obspcs[i] if self.all_obs else self.ob_spcs[i], qnet_obspcs[i], qnet_actspcs[i]) for i in range(n_agents)] elif self.use_td3: self.agents = [TD3_agent(self.act_spcs[i], qnet_obspcs[i] if self.all_obs else self.ob_spcs[i], qnet_obspcs[i], qnet_actspcs[i]) for i in range(n_agents)] else: self.agents = [DDPG_agent(self.act_spcs[i], qnet_obspcs[i] if self.all_obs else self.ob_spcs[i], qnet_obspcs[i], qnet_actspcs[i]) for i in range(n_agents)] self.n_steps = 0 self.n_updates = 0 self.writer = writer self.criterion = nn.MSELoss() self.sac_alpha = args.sac_alpha self.agent_actions = [[] for i in range(self.n_agents)] def plot_actions(self): for i in range(self.n_agents): sns.distplot(self.agent_actions[i], bins=self.agents[i].act_sp, kde=False) # __import__('ipdb').set_trace() plt.show() def get_actions(self, states): result = [] # with torch.no_grad(): for i, (agent, state) in enumerate(zip(self.agents, states)): action = agent.select_action(state)[0] result.append(action) # if self.args.use_writer: self.agent_actions[i].append(np.argmax(action.cpu()).item()) self.n_steps += 1 return result def store_transitions(self, states, actions, rewards, next_states, dones): # print(sys.getsizeof(states) + sys.getsizeof(actions) + sys.getsizeof(rewards) # + sys.getsizeof(next_states) + sys.getsizeof(dones)) self.memory.add(states, actions, rewards, next_states, dones) def reset(self): pass def transform_states(self, states, N): obses = [] for i in range(N): states_ = [] for j in range(self.n_agents): states_.append(states[j][i]) obses.append(torch.cat([f.float().to(device) for f in states_])) return torch.stack(obses) def transform_actions(self, actions, N): acts = [] for i in range(N): actions_ = [] for j in range(self.n_agents): actions_.append(actions[j][i]) acts.append(torch.cat([f.float().to(device) for f in actions_])) return torch.stack(acts) def update_all_targets(self): for agent in self.agents: agent.update_targets(TAU) def prep_training(self): for agent in self.agents: agent.set_train() def eval(self): for agent in self.agents: agent.set_eval() def sample_and_train_td3(self, batch_size): t = self.n_steps # print(self.n_steps) update_every = self.agents[0].update_every update_after = self.agents[0].update_after if (t + 1) > update_after and (t + 1) % update_every == 0: for i in range(update_every): self.train_td3(batch_size, i) def batch_add_random_acts(self, tensor, ag_i): # __import__('ipdb').set_trace() n_clip =self.agents[ag_i].target_noise_clip noise = (self.agents[ag_i].target_noise**0.5)*torch.randn(tensor.shape) noise = torch.clamp(noise, -n_clip, n_clip) tensor[:] = tensor[:] + noise # __import__('ipdb').set_trace() def train_td3(self, batch_size): self.n_updates += 1 batch = self.memory.sample(min(batch_size, len(self.memory))) states_i, 
actions_i, rewards_i, next_states_i, dones_i = batch # __import__('ipdb').set_trace() if self.use_maddpg: states_all = torch.cat(states_i, 1) next_states_all = torch.cat(next_states_i, 1) actions_all = torch.cat(actions_i, 1) for i, agent in enumerate(self.agents): # print("training_qnet") if not self.use_maddpg: states_all = states_i[i] next_states_all = next_states_i[i] actions_all = actions_i[i] if self.use_maddpg: next_actions_all = [ag.policy(next_state) for ag, next_state in zip(self.agents, next_states_i)] [self.batch_add_random_acts(e, i) for i, e in enumerate(next_actions_all)] next_actions_all = [onehot_from_logits(e) for e in next_actions_all] else: actions_and_logits = [onehot_from_logits(agent.policy(next_states_i[i]))] next_actions_all = [e[0] for e in actions_and_logits] total_obs = torch.cat([next_states_all, torch.cat(next_actions_all, 1)], 1) qnet_targs = [] for qnet in self.agents[i].qnet_targs: qnet_targs.append(qnet(total_obs).detach()) rewards = rewards_i[i].view(-1, 1) dones = dones_i[i].view(-1, 1) qnet_mins = torch.min(qnet_targs[0], qnet_targs[1]) target_q = rewards + (1 - dones) * GAMMA * (qnet_mins) losses = [] for j, qnet in enumerate(self.agents[i].qnets): input_q = qnet(torch.cat([states_all, actions_all], 1)) self.agents[i].q_optimizers[j].zero_grad() loss = self.criterion(input_q, target_q.detach()) losses.append(loss.item()) loss.backward() # torch.nn.utils.clip_grad_norm_(qnet.parameters(), 0.5) self.agents[i].q_optimizers[j].step() if self.args.use_writer: self.writer.add_scalar(f"Agent_{i}: q_net_loss: ", np.mean(losses), self.n_updates) if self.n_updates % 2 == 0: for i in range(self.n_agents): # print("training policy") actor_loss = 0 # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø # use gumbel softmax max temp trick policy_out = self.agents[i].policy(states_i[i]) gumbel_sample = gumbel_softmax(policy_out, hard=True) if self.use_maddpg: actions_curr_pols = [onehot_from_logits(agent_.policy(state)) for agent_, state in zip(self.agents, states_i)] for action_batch in actions_curr_pols: action_batch.detach_() actions_curr_pols[i] = gumbel_sample actor_loss = - self.agents[i].qnets[0](torch.cat([states_all.detach(), torch.cat(actions_curr_pols, 1)], 1)).mean() else: actor_loss = - self.agents[i].qnets[0](torch.cat([states_all.detach(), gumbel_sample], 1)).mean() self.agents[i].p_optimizer.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(), 0.5) self.agents[i].p_optimizer.step() actions_i[i].detach_() if self.args.use_writer: self.writer.add_scalar(f"Agent_{i}: policy_objective: ", actor_loss.item(), self.n_updates) self.update_all_targets() # self.n_updates += 1 def sample_and_train_sac(self, batch_size): # TODO ADD Model saving, optimize code batch = self.memory.sample(min(batch_size, len(self.memory))) states_i, actions_i, rewards_i, next_states_i, dones_i = batch # __import__('ipdb').set_trace() if self.use_maddpg: states_all = torch.cat(states_i, 1) next_states_all = torch.cat(next_states_i, 1) actions_all = torch.cat(actions_i, 1) for i, agent in enumerate(self.agents): if not self.use_maddpg: states_all = states_i[i] next_states_all = next_states_i[i] actions_all = actions_i[i] if self.use_maddpg: actions_and_logits = [onehot_from_logits(ag.policy(next_state), logprobs=True) for ag, next_state in zip(self.agents, next_states_i)] next_actions_all = [e[0] for e in actions_and_logits] next_logits_all = [self.sac_alpha*e[1] for e in actions_and_logits] # __import__('ipdb').set_trace() else: 
actions_and_logits = [onehot_from_logits(agent.policy(next_states_i[i]), logprobs=True)] next_actions_all = [e[0] for e in actions_and_logits] next_logits_all = [self.sac_alpha*e[1] for e in actions_and_logits] # computing target total_obs = torch.cat([next_states_all, torch.cat(next_actions_all, 1)], 1) # target_q = self.agents[i].qnet_targ(total_obs).detach() qnet_targs = [] for qnet in self.agents[i].qnet_targs: qnet_targs.append(qnet(total_obs).detach()) rewards = rewards_i[i].view(-1, 1) dones = dones_i[i].view(-1, 1) qnet_mins = torch.min(qnet_targs[0], qnet_targs[1]) # __import__('ipdb').set_trace() logits_idx = i if self.use_maddpg else 0 logits_agent = next_logits_all[logits_idx] # if len(qnet_mins.squeeze(-1)) != len(logits_agent.squeeze(-1)): # __import__('ipdb').set_trace() target_q = rewards + (1 - dones) * GAMMA * (qnet_mins - logits_agent.reshape(qnet_mins.shape)) # __import__('ipdb').set_trace() # computing the inputs for j, qnet in enumerate(self.agents[i].qnets): input_q = qnet(torch.cat([states_all, actions_all], 1)) self.agents[i].q_optimizers[j].zero_grad() # print("----") # __import__('ipdb').set_trace() loss = self.criterion(input_q, target_q.detach()) # print('after') loss.backward() torch.nn.utils.clip_grad_norm_(qnet.parameters(), 0.5) self.agents[i].q_optimizers[j].step() # __import__('ipdb').set_trace() actor_loss = 0 # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø # use gumbel softmax max temp trick policy_out = self.agents[i].policy(states_i[i]) gumbel_sample, act_logprobs = gumbel_softmax(policy_out, hard=True, logprobs=True) act_logprobs = self.sac_alpha*act_logprobs # __import__('ipdb').set_trace() if self.use_maddpg: with torch.no_grad(): actions_curr_pols = [onehot_from_logits(agent_.policy(state)) for agent_, state in zip(self.agents, states_i)] actions_curr_pols[i] = gumbel_sample total_obs = torch.cat([states_all, torch.cat(actions_curr_pols, 1)], 1) qnet_outs = [] for qnet in self.agents[i].qnets: qnet_outs.append(qnet(total_obs)) qnet_mins = torch.min(qnet_outs[0], qnet_outs[1]) actor_loss = - qnet_mins.mean() # __import__('ipdb').set_trace() else: # actor_loss = - self.agents[i].qnet(torch.cat([states_all.detach(), # gumbel_sample], 1)).mean() # actions_curr_pols[i] = gumbel_sample # __import__('ipdb').set_trace() total_obs = torch.cat([states_all, gumbel_sample], 1) qnet_outs = [] for qnet in self.agents[i].qnets: qnet_outs.append(qnet(total_obs)) qnet_mins = torch.min(qnet_outs[0], qnet_outs[1]) actor_loss = - qnet_mins.mean() # actor_loss += (policy_out**2).mean() * 1e-3 self.agents[i].p_optimizer.zero_grad() actor_loss.backward() # nn.utils.clip_grad_norm_(self.policy.parameters(), 5) # torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(), 0.5) self.agents[i].p_optimizer.step() # detach the forward propagated action samples actions_i[i].detach_() # __import__('ipdb').set_trace() if self.args.use_writer: self.writer.add_scalars("Agent_%i" % i, { "vf_loss": loss, "actor_loss": actor_loss }, self.n_updates) self.update_all_targets() self.n_updates += 1 def sample_and_train(self, batch_size): return # TODO ADD Model saving, optimize code batch = self.memory.sample(min(batch_size, len(self.memory))) states_i, actions_i, rewards_i, next_states_i, dones_i = batch # __import__('ipdb').set_trace() if self.use_maddpg: states_all = torch.cat(states_i, 1) next_states_all = torch.cat(next_states_i, 1) actions_all = torch.cat(actions_i, 1) for i, agent in enumerate(self.agents): if not self.use_maddpg: states_all = states_i[i] 
next_states_all = next_states_i[i] actions_all = actions_i[i] if self.use_maddpg: next_actions_all = [onehot_from_logits(ag.policy_targ(next_state)) for ag, next_state in zip(self.agents, next_states_i)] else: next_actions_all = [onehot_from_logits(agent.policy_targ(next_states_i[i]))] # computing target total_obs = torch.cat([next_states_all, torch.cat(next_actions_all, 1)], 1) target_q = self.agents[i].qnet_targ(total_obs).detach() rewards = rewards_i[i].view(-1, 1) dones = dones_i[i].view(-1, 1) target_q = rewards + (1 - dones) * GAMMA * target_q # computing the inputs input_q = self.agents[i].qnet(torch.cat([states_all, actions_all], 1)) self.agents[i].q_optimizer.zero_grad() loss = self.criterion(input_q, target_q.detach()) # print("LOSS", loss) loss.backward() torch.nn.utils.clip_grad_norm_(self.agents[i].qnet.parameters(), 0.5) self.agents[i].q_optimizer.step() actor_loss = 0 # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø # use gumbel softmax max temp trick policy_out = self.agents[i].policy(states_i[i]) gumbel_sample = gumbel_softmax(policy_out, hard=True) if self.use_maddpg: actions_curr_pols = [onehot_from_logits(agent_.policy(state)) for agent_, state in zip(self.agents, states_i)] for action_batch in actions_curr_pols: action_batch.detach_() actions_curr_pols[i] = gumbel_sample actor_loss = - self.agents[i].qnet(torch.cat([states_all.detach(), torch.cat(actions_curr_pols, 1)], 1)).mean() else: actor_loss = - self.agents[i].qnet(torch.cat([states_all.detach(), gumbel_sample], 1)).mean() actor_loss += (policy_out**2).mean() * 1e-3 self.agents[i].p_optimizer.zero_grad() actor_loss.backward() # nn.utils.clip_grad_norm_(self.policy.parameters(), 5) torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(), 0.5) self.agents[i].p_optimizer.step() # detach the forward propagated action samples actions_i[i].detach_() # __import__('ipdb').set_trace() if self.args.use_writer: self.writer.add_scalars("Agent_%i" % i, { "vf_loss": loss, "actor_loss": actor_loss }, self.n_updates) self.update_all_targets() self.n_updates += 1
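# The TD3 branch above builds its critic target with target-policy smoothing and a
# min over two target critics (clipped double Q-learning). The same computation in
# isolation, as a sketch; tensor shapes follow the code above and noise_std is a
# standard deviation rather than the variance used by batch_add_random_acts:
import torch

def smooth_target_actions(actions, noise_std, noise_clip):
    # Add clipped Gaussian noise to the target-policy actions (TD3 target smoothing)
    noise = torch.clamp(noise_std * torch.randn_like(actions), -noise_clip, noise_clip)
    return actions + noise

def clipped_double_q_target(rewards, dones, q_targ_1, q_targ_2, gamma=0.99):
    # rewards, dones: (batch, 1); q_targ_*: (batch, 1) outputs of the two target critics
    q_min = torch.min(q_targ_1, q_targ_2)
    return rewards + (1.0 - dones) * gamma * q_min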
class MADDPGAgent: """Interacts and learns from the environment using multiple DDPG agents""" def __init__(self): """Initialize a MADDPG Agent object.""" super(MADDPGAgent, self).__init__() self.config = Config.getInstance() self.action_num = self.config.action_size * self.config.num_agents self.t_step = 0 self.maddpg_agent = [ DDPGAgent() for _ in range(self.config.num_agents) ] self.memory = ReplayBuffer() def get_actors(self): """get actors of all the agents in the MADDPG object""" actors = [ddpg_agent.actor for ddpg_agent in self.maddpg_agent] return actors # def get_target_actors(self): # """get target_actors of all the agents in the MADDPG object""" # target_actors = [ # ddpg_agent.target_actor for ddpg_agent in self.maddpg_agent] # return target_actors def act(self, obs_all_agents, noise=0.0): """get actions from all agents in the MADDPG object""" actions = [ agent.act(obs, noise) for agent, obs in zip(self.maddpg_agent, obs_all_agents) ] return np.concatenate(actions) def update_act(self, obs_all_agents, agent_num, noise_decay_parameter=0.0): """ get target network actions from all the agents in the MADDPG object """ actions_ = [] for a_i, ddpg_agent in enumerate(self.maddpg_agent): obs = obs_all_agents[:, a_i, :].to(self.config.device) acn = ddpg_agent.actor( obs) + noise_decay_parameter * ddpg_agent.noise.sample() if a_i != agent_num: acn = acn.detach() actions_.append(acn) return actions_ def target_act(self, obs_all_agents, noise=0.0): """ get target network actions from all the agents in the MADDPG object """ target_actions = [ ddpg_agent.target_act(obs_all_agents[:, a_i, :], noise) for a_i, ddpg_agent in enumerate(self.maddpg_agent) ] return target_actions def step(self, _states, _actions, _rewards, _next_states, _dones): """Save experience in replay memory, and use random sample from buffer to learn.""" states_full = np.reshape(_states, newshape=(-1)) next_states_full = np.reshape(_next_states, newshape=(-1)) self.memory.add(_states, states_full, _actions, _rewards, _next_states, next_states_full, _dones) # Learn every UPDATE_EVERY time steps. 
self.t_step = (self.t_step + 1) % self.config.update_every if self.t_step == 0: if len(self.memory) > self.config.batch_size: for a_i in range(self.config.num_agents): samples = self.memory.sample() self.update(samples, a_i) self.update_targets() def update_critic(self, samples, agent_number): """Update critic weights""" states, states_full, actions, rewards, next_states, next_states_full, dones = samples agent = self.maddpg_agent[agent_number] agent.critic_optimizer.zero_grad() # ---------------------------- update critic ---------------------- # actions_next = self.target_act(next_states) actions_next = torch.cat(actions_next, dim=1) Q_target_next = agent.target_critic(next_states_full, actions_next) Q_targets = rewards[:, agent_number].view(-1, 1) + self.config.gamma * \ Q_target_next * (1 - dones[:, agent_number].view(-1, 1)) Q_expected = agent.critic(states_full, actions.reshape(-1, self.action_num)) critic_loss = F.mse_loss(Q_expected, Q_targets) critic_loss.backward() agent.critic_optimizer.step() def update_actor(self, samples, agent_number): """Update actor weights""" states, states_full, actions, rewards, next_states, next_states_full, dones = samples agent = self.maddpg_agent[agent_number] agent.actor_optimizer.zero_grad() actions_pred = self.update_act(states, agent_number) actions_pred = torch.cat(actions_pred, dim=1) actor_loss = -agent.critic(states_full, actions_pred).mean() actor_loss.backward() agent.actor_optimizer.step() def update(self, samples, agent_number): """update the critics and actors of all the agents """ # ---------------------------- update critic ---------------------- # self.update_critic(samples, agent_number) # ---------------------------- update actor ------------------------- # self.update_actor(samples, agent_number) def update_targets(self): """soft update targets""" for ddpg_agent in self.maddpg_agent: soft_update(ddpg_agent.target_actor, ddpg_agent.actor, self.config.tau) soft_update(ddpg_agent.target_critic, ddpg_agent.critic, self.config.tau) def reset(self): """Resets weight of all agents""" for ddpg_agent in self.maddpg_agent: ddpg_agent.reset()
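# The MADDPGAgent above reads every hyperparameter from Config.getInstance(); a
# minimal singleton holding the fields referenced in this snippet. The values are
# placeholders for illustration, not the original settings:
class Config:
    _instance = None

    def __init__(self):
        self.num_agents = 2
        self.action_size = 2
        self.batch_size = 256
        self.update_every = 2
        self.gamma = 0.99
        self.tau = 1e-3
        self.device = 'cpu'

    @classmethod
    def getInstance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance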
class DDPGAgent: def __init__(self, state_size, action_size, random_seed): self.state_size = state_size self.action_size = action_size self.seed = random_seed # ------------------ actor ------------------ # self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) # ------------------ critic ----------------- # self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) # ------------------ optimizers ------------- # self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) # ----------------------- initialize target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, 1) self.soft_update(self.actor_local, self.actor_target, 1) self.t_step = 0 # Noise process self.noise = OUNoise(action_size, random_seed) # Replay Buffer self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device, random_seed) def step(self, states, actions, rewards, next_states, dones): # Save experience in replay memory for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save_weights(self): torch.save(self.actor_local.state_dict(), 'actor_checkpoint_actor.pth') torch.save(self.critic_local.state_dict(), 'critic_checkpoint_critic.pth')
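# The DDPGAgent above samples exploration noise from an Ornstein-Uhlenbeck process
# via OUNoise(action_size, random_seed). A common implementation consistent with
# that constructor and the .sample()/.reset() calls; the mu/theta/sigma defaults
# are assumptions:
import copy
import random
import numpy as np

class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        # Revert the internal state to the long-running mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # Mean-reverting random walk: dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state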
class MADDPG(MultiAgentAlgorithm): def __init__(self, action_size, n_agents, seed, state_size): super().__init__(action_size, n_agents, seed) # critic input = obs_full + actions = 14+2+2+2=20 self.agents = [ DDPGAgent(state_size, ACTOR_FC1_UNITS, ACTOR_FC2_UNITS, action_size, (state_size + action_size) * n_agents, CRITIC_FC1_UNITS, CRITIC_FC2_UNITS, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY_ACTOR, WEIGHT_DECAY_CRITIC) for i in range(n_agents) ] self.n_agents = n_agents self.epsilon = 0 self.iter = 0 self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE) def save_model(self, model_file): """Save networks and all other model parameters Params ====== model_file (string): name of the file that will store the model """ checkpoint = { 'actor_local1': self.agents[0].actor.state_dict(), 'critic_local1': self.agents[0].critic.state_dict(), 'actor_target1': self.agents[0].target_actor.state_dict(), 'critic_target1': self.agents[0].target_critic.state_dict(), 'actor_local2': self.agents[1].actor.state_dict(), 'critic_local2': self.agents[1].critic.state_dict(), 'actor_target2': self.agents[1].target_actor.state_dict(), 'critic_target2': self.agents[1].target_critic.state_dict() } torch.save(checkpoint, model_file) def load_model(self, model_file): """Load networks and all other model parameters Params ====== model_file (string): name of the file that stores the model """ checkpoint = torch.load(model_file) self.agents[0].actor.load_state_dict(checkpoint['actor_local1']) self.agents[0].critic.load_state_dict(checkpoint['critic_local1']) self.agents[0].target_actor.load_state_dict( checkpoint['actor_target1']) self.agents[0].target_critic.load_state_dict( checkpoint['critic_target1']) self.agents[1].actor.load_state_dict(checkpoint['actor_local2']) self.agents[1].critic.load_state_dict(checkpoint['critic_local2']) self.agents[1].target_actor.load_state_dict( checkpoint['actor_target2']) self.agents[1].target_critic.load_state_dict( checkpoint['critic_target2']) def act(self, states): """get actions from all agents in the MADDPG object""" actions = [] for agent, state in zip(self.agents, states): if np.random.rand() < self.epsilon: actions_agent = np.random.randn(2) actions_agent = np.clip(actions_agent, -1, 1) actions.append(actions_agent) else: actions.append(agent.act(state)) return actions def target_act(self, states): """get target network actions from all the agents in the MADDPG object """ target_actions = [ agent.target_act(obs) for agent, obs in zip(self.agents, states) ] return target_actions def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn. 
Params ====== states (array_like): current state (for each agent) actions (array_like): action taken at the current state (for each agent) rewards (array_like): reward from an action (for each agent) next_states (array_like): next state of environment (for each agent) dones (array_like): true if the next state is the final one, false otherwise (for each agent) """ # Save experience / reward self.buffer.add(states, actions, rewards, next_states, dones) self.iter = (self.iter + 1) % UPDATE_EVERY if self.iter == 0: # Learn, if enough samples are available in buffer if len(self.buffer) > BATCH_SIZE: for i in range(N_UPDATES): experiences = self.buffer.sample() for agent in range(self.n_agents): self.learn(experiences, agent) self.update_targets(agent) def learn(self, experiences, agent_number): """update the critics and actors of all the agents """ # need to transpose each element of the samples # to flip obs[parallel_agent][agent_number] to # obs[agent_number][parallel_agent] states, actions, rewards, next_states, dones = experiences agent = self.agents[agent_number] agent.critic_optimizer.zero_grad() #critic loss = batch mean of (y- Q(s,a) from target network)^2 #y = reward of this timestep + discount * Q(st+1,at+1) from target network target_actions = self.target_act(next_states) target_actions = torch.cat(target_actions, dim=1) t = torch.tensor(transpose_list(next_states.cpu().data.numpy())) next_states_all = t.view(t.shape[0], -1).to('cpu') target_critic_input = torch.cat( (next_states_all, target_actions.to('cpu')), dim=1).to(device) with torch.no_grad(): q_next = agent.target_critic(target_critic_input) y = rewards[agent_number].view( -1, 1) + GAMMA * q_next * (1 - dones[agent_number].view(-1, 1)) actions_all = torch.cat(torch.unbind(actions), dim=1) t = torch.tensor(transpose_list(states.cpu().data.numpy())) states_all = t.view(t.shape[0], -1).to('cpu') critic_input = torch.cat((states_all, actions_all.to('cpu')), dim=1).to(device) q = agent.critic(critic_input) critic_loss = F.mse_loss(q, y.detach()) critic_loss.backward(retain_graph=True) agent.critic_optimizer.step() # update actor network using policy gradient agent.actor_optimizer.zero_grad() # make input to agent # detach the other agents to save computation # saves some time for computing derivative q_input = [self.agents[i].actor(state) if i == agent_number \ else self.agents[i].actor(state).detach() for i, state in enumerate(states)] q_input = torch.cat(q_input, dim=1) # combine all the actions and observations for input to critic # many of the obs are redundant, and obs[1] contains all useful information already q_input2 = torch.cat((states_all.to('cpu'), q_input.to('cpu')), dim=1) # get the policy gradient actor_loss = -agent.critic(q_input2).mean() actor_loss.backward(retain_graph=True) agent.actor_optimizer.step() def update_targets(self, i): """soft update targets""" soft_update(self.agents[i].target_actor, self.agents[i].actor, TAU) soft_update(self.agents[i].target_critic, self.agents[i].critic, TAU)
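# learn() above calls a transpose_list helper to flip obs[parallel_env][agent] into
# obs[agent][parallel_env]; a minimal version consistent with that use:
def transpose_list(list_of_lists):
    return list(map(list, zip(*list_of_lists)))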
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 1024 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) rewards = None # avoid returning an undefined name before the buffer fills # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() rewards = self.learn(experiences) # Roll over last state and action self.last_state = next_state return rewards def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.noise()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) return rewards def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
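# DDPG.learn() above iterates over experiences exposing .state/.action/.reward/
# .next_state/.done attributes, which implies a namedtuple-based replay buffer
# roughly like this sketch (constructor and sample() follow the calls above):
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        # Uniform random minibatch of stored experience tuples
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)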