class Agent():
    """Meta agent that contains the two DDPG agents and shared replay buffer."""

    def __init__(self, action_size=2, seed=0, load_file=None,
                 n_agents=2,
                 buffer_size=int(3e4),
                 batch_size=128,
                 gamma=0.99,
                 update_every=2,
                 noise_start=1.0,
                 noise_decay=1.0,
                 evaluation_only=False):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): random seed
            load_file (str): path of checkpoint file to load
            n_agents (int): number of distinct agents
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            noise_start (float): initial noise weighting factor
            noise_decay (float): noise decay rate
            update_every (int): how often to update the network
            evaluation_only (bool): set to True to disable updating gradients and adding noise
        """
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.n_agents = n_agents
        self.noise_weight = noise_start
        self.noise_decay = noise_decay
        self.t_step = 0
        self.evaluation_only = evaluation_only

        # create two agents, each with their own actor and critic
        models = [model.LowDim2x(n_agents=n_agents) for _ in range(n_agents)]
        self.agents = [DDPG(0, models[0], load_file=None),
                       DDPG(1, models[1], load_file=None)]

        # create shared replay buffer
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

        if load_file:
            for i, save_agent in enumerate(self.agents):
                actor_file = torch.load(load_file + '.' + str(i) + '.actor.pth', map_location='cpu')
                critic_file = torch.load(load_file + '.' + str(i) + '.critic.pth', map_location='cpu')
                save_agent.actor_local.load_state_dict(actor_file)
                save_agent.actor_target.load_state_dict(actor_file)
                save_agent.critic_local.load_state_dict(critic_file)
                save_agent.critic_target.load_state_dict(critic_file)
                print('Loaded: {}.{}.actor.pth'.format(load_file, i))
                print('Loaded: {}.{}.critic.pth'.format(load_file, i))

    def step(self, all_states, all_actions, all_rewards, all_next_states, all_dones):
        all_states = all_states.reshape(1, -1)            # reshape 2x24 into 1x48 dim vector
        all_next_states = all_next_states.reshape(1, -1)  # reshape 2x24 into 1x48 dim vector
        self.memory.add(all_states, all_actions, all_rewards, all_next_states, all_dones)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0 and not self.evaluation_only:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                # each agent does its own sampling from the shared replay buffer
                experiences = [self.memory.sample() for _ in range(self.n_agents)]
                self.learn(experiences, self.gamma)

    def act(self, all_states, add_noise=True):
        # pass each agent's state from the environment and calculate its action
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, noise_weight=self.noise_weight, add_noise=add_noise)
            self.noise_weight *= self.noise_decay
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)  # reshape 2x2 into 1x4 dim vector

    def learn(self, experiences, gamma):
        # each agent uses its own target actor to calculate next_actions
        all_next_actions = []
        for i, agent in enumerate(self.agents):
            _, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            next_state = next_states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        # each agent uses its own local actor to calculate actions
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, _, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)

        # each agent learns from its own experience sample
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions, all_actions)
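# A minimal sketch of how this meta agent could drive a two-agent episode loop,
# assuming a Unity ML-Agents style Tennis environment. `env`, `brain_name`, and
# `n_episodes` are hypothetical stand-ins; only the Agent step()/act() API above
# is taken from this file.
def train_maddpg(env, brain_name, n_episodes=2000):
    agent = Agent(action_size=2, n_agents=2)
    for episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        all_states = env_info.vector_observations            # shape (2, 24)
        while True:
            all_actions = agent.act(all_states)              # shape (1, 4)
            env_info = env.step(all_actions)[brain_name]
            all_next_states = env_info.vector_observations
            agent.step(all_states, all_actions, env_info.rewards,
                       all_next_states, env_info.local_done)
            all_states = all_next_states
            if np.any(env_info.local_done):
                break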
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, model, action_size, seed=0, load_file=None,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 use_double_dqn=True,
                 use_prioritized_experience_replay=False,
                 alpha_start=0.5,
                 alpha_decay=0.9992,
                 action_map=None):
        """
        Params
        ======
            model: model object
            action_size (int): dimension of each action
            seed (int): random seed
            load_file (str): path of checkpoint file to load
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr (float): learning rate
            update_every (int): how often to update the network
            use_double_dqn (bool): whether to use the double DQN algorithm
            use_prioritized_experience_replay (bool): whether to use the PER algorithm
            alpha_start (float): initial value for alpha, used in PER
            alpha_decay (float): decay rate for alpha, used in PER
            action_map (dict): how to map action indexes from model output to gym environment
        """
        random.seed(seed)
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.use_double_dqn = use_double_dqn
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        self.loss_list = []     # track loss across steps
        self.entropy_list = []  # track entropy across steps

        # Q-Network
        self.qnetwork_local = model.local
        self.qnetwork_target = model.target
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        if use_prioritized_experience_replay:
            self.memory = PrioritizedReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
        # initialize alpha (used in prioritized experience sampling probability)
        self.alpha_start = alpha_start
        self.alpha_decay = alpha_decay
        self.alpha = self.alpha_start

        if load_file:
            self.qnetwork_local.load_state_dict(torch.load(load_file + '.pth'))
            self.qnetwork_target.load_state_dict(torch.load(load_file + '.pth'))
            print('Loaded: {}'.format(load_file))

        self.action_map = action_map

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        if self.use_prioritized_experience_replay:
            priority = 100.0  # set initial priority to max value
            self.memory.add(state, action, reward, next_state, done, priority)
        else:
            self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                # if prioritized experience replay is enabled
                if self.use_prioritized_experience_replay:
                    self.memory.sort()
                    indexes, experiences = self.memory.sample(self.alpha)
                    self.learn(experiences, self.gamma, indexes)
                    self.alpha = self.alpha_decay * self.alpha
                else:
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        if len(state.shape) == 1:
            # reshape 1-D states into 2-D (as expected by the model)
            state = np.expand_dims(state, axis=0)
        state = torch.from_numpy(state).float().to(device)
        # calculate action values
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, indexes=None):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_prioritized_experience_replay:
            states, actions, rewards, next_states, dones, priorities = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        # Select double DQN or regular DQN
        if self.use_double_dqn:
            # get greedy actions (for next states) from local model
            q_local_argmax = self.qnetwork_local(next_states).detach().argmax(dim=1).unsqueeze(1)
            # get predicted q values (for next states) from target model, indexed by q_local_argmax
            q_targets_next = self.qnetwork_target(next_states).gather(1, q_local_argmax).detach()
        else:
            # get max predicted q values (for next states) from target model
            q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # get q values from local model
        q_local = self.qnetwork_local(states)
        # get q values for the chosen actions
        predictions = q_local.gather(1, actions)
        # calculate td targets
        targets = rewards + (gamma * q_targets_next * (1 - dones))

        # calculate new priorities
        if self.use_prioritized_experience_replay:
            with torch.no_grad():
                new_priorities = torch.abs(targets - predictions).to(device)
                self.memory.batch_update(indexes, (states, actions, rewards, next_states, dones, new_priorities))

        # calculate loss using mean squared error: (targets - predictions).pow(2).mean()
        loss = F.mse_loss(predictions, targets)
        # minimize loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update stats
        with torch.no_grad():
            self.loss_list.append(loss.item())
            # calculate sparse softmax cross entropy
            self.entropy_list.append(F.cross_entropy(q_local, actions.squeeze(1)).item())

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
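# A minimal sketch of an epsilon-greedy training loop for the DQN agent above,
# assuming a classic Gym-style environment and a `model` object exposing the
# .local/.target networks the constructor expects. `env`, `n_episodes`, and the
# epsilon schedule are illustrative assumptions, not part of this file.
def train_dqn(env, model, n_episodes=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    agent = Agent(model, action_size=env.action_space.n)
    eps = eps_start
    for episode in range(n_episodes):
        state, done = env.reset(), False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
        eps = max(eps_end, eps_decay * eps)  # decay exploration after each episode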
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, model, action_size, seed=0, load_file=None,
                 n_agents=1,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0.0001,
                 clip_gradients=False,
                 theta=0.15,
                 sigma=0.2,
                 update_every=1,
                 use_prioritized_experience_replay=False,
                 alpha_start=0.5,
                 alpha_decay=0.9992,
                 evaluation_only=False):
        """
        Params
        ======
            model: model object
            action_size (int): dimension of each action
            seed (int): random seed
            load_file (str): path of checkpoint file to load
            n_agents (int): number of agents to train simultaneously
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate for the actor
            lr_critic (float): learning rate for the critic
            weight_decay (float): L2 weight decay
            clip_gradients (bool): whether to clip gradients on both actor and critic
            theta (float): OU noise parameter
            sigma (float): OU noise parameter
            update_every (int): how often to update the network
            use_prioritized_experience_replay (bool): whether to use the PER algorithm
            alpha_start (float): initial value for alpha, used in PER
            alpha_decay (float): decay rate for alpha, used in PER
            evaluation_only (bool): set to True to disable updating gradients and adding noise
        """
        random.seed(seed)
        self.action_size = action_size
        self.n_agents = n_agents
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.use_prioritized_experience_replay = use_prioritized_experience_replay
        self.clip_gradients = clip_gradients
        self.evaluation_only = evaluation_only
        self.loss_list = []  # track loss across steps

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise((n_agents, action_size), seed, theta=theta, sigma=sigma)

        # Replay memory
        if use_prioritized_experience_replay:
            self.memory = PrioritizedReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)
        else:
            self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
        # initialize alpha (used in prioritized experience sampling probability)
        self.alpha_start = alpha_start
        self.alpha_decay = alpha_decay
        self.alpha = self.alpha_start

        if load_file:
            # map checkpoints onto the CPU when CUDA is unavailable
            map_location = 'cpu' if device.type == 'cpu' else None
            self.actor_local.load_state_dict(torch.load(load_file + '.actor.pth', map_location=map_location))
            self.actor_target.load_state_dict(torch.load(load_file + '.actor.pth', map_location=map_location))
            self.critic_local.load_state_dict(torch.load(load_file + '.critic.pth', map_location=map_location))
            self.critic_target.load_state_dict(torch.load(load_file + '.critic.pth', map_location=map_location))
            print('Loaded: {}'.format(load_file))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        if self.use_prioritized_experience_replay:
            priority = 100.0  # set initial priority to max value
            if self.n_agents == 1:
                self.memory.add(state, action, reward, next_state, done, priority)
            else:
                for i in range(self.n_agents):
                    self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i], priority)
        else:
            if self.n_agents == 1:
                self.memory.add(state, action, reward, next_state, done)
            else:
                for i in range(self.n_agents):
                    self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i])

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0 and not self.evaluation_only:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                # if prioritized experience replay is enabled
                if self.use_prioritized_experience_replay:
                    self.memory.sort()
                    indexes, experiences = self.memory.sample(self.alpha)
                    self.learn(experiences, self.gamma, indexes)
                    self.alpha = self.alpha_decay * self.alpha
                else:
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for the given state as per the current policy."""
        if len(state.shape) == 1:
            # reshape 1-D states into 2-D (as expected by the model)
            state = np.expand_dims(state, axis=0)
        state = torch.from_numpy(state).float().to(device)
        # calculate action values
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, indexes=None):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_prioritized_experience_replay:
            states, actions, rewards, next_states, dones, priorities = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        # get expected Q values from the local critic
        q_expected = self.critic_local(states, actions)
        # compute Q targets for current states (y_i) and the critic loss
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        critic_loss = F.mse_loss(q_expected, q_targets)
        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.clip_gradients:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 10, norm_type=2)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if self.clip_gradients:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 10, norm_type=2)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # ---------------- update prioritized experience replay ---------------- #
        if self.use_prioritized_experience_replay:
            with torch.no_grad():
                new_priorities = torch.abs(q_targets - q_expected).to(device)
                self.memory.batch_update(indexes, (states, actions, rewards, next_states, dones, new_priorities))

        # ---------------------------- update stats ---------------------------- #
        with torch.no_grad():
            self.loss_list.append(critic_loss.item())

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
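# The OUNoise process referenced in __init__ above is defined elsewhere in the
# repo; below is a minimal sketch of a compatible implementation, assuming the
# standard Ornstein-Uhlenbeck update dx = theta * (mu - x) + sigma * N(0, 1).
# The constructor signature matches the call in Agent.__init__; the body is an
# illustrative assumption, not the repo's actual implementation.
class OUNoise():
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta  # strength of the pull back toward the mean
        self.sigma = sigma  # scale of the random perturbation
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = self.mu.copy()

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state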