class DDPGAgent(Agent):
    def __init__(self, env, config: DDPGConfig):
        super().__init__(env)
        self.config = config
        self.replay_buffer = ReplayBuffer(config.buffer_size, config.batch_size)

        # Actor
        self.actor_current = Actor(env.state_size, env.action_size, config.fc1_units, config.fc2_units).to(device)
        self.actor_target = Actor(env.state_size, env.action_size, config.fc1_units, config.fc2_units).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor_current.parameters(), lr=config.learning_rate)

        # Critic
        self.critic_current = Critic(env.state_size, env.action_size, config.fc1_units, config.fc2_units).to(device)
        self.critic_target = Critic(env.state_size, env.action_size, config.fc1_units, config.fc2_units).to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic_current.parameters(), lr=config.learning_rate)

        self.metrics = Metrics()

    def restore(self, actor_file, critic_file):
        self.actor_current.load_state_dict(torch.load(actor_file))
        self.critic_current.load_state_dict(torch.load(critic_file))

    def compute_action(self, state, epsilon=0):
        action = self.actor_current.action_values_for(state)
        if np.random.random() < epsilon:
            action += np.random.randn(self.env.action_size) * epsilon
            action = np.clip(action, -1, 1)
        return action

    def train(self, n_steps, update_every, print_every, epsilon_init=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        epsilon = epsilon_init
        state = self._warmup(epsilon)
        self.metrics.plot()
        for t_step in range(1, n_steps + 1):
            state = self._step(state, epsilon)
            epsilon = max(epsilon_min, epsilon * epsilon_decay)
            if t_step % update_every == 0:
                self._batch_train()
                if self._check_solved():
                    break
            if t_step % print_every == 0:
                print(f"Step #{t_step}"
                      + f", Running score {self.metrics.running_score():.2f}"
                      + f", Total episodes {self.metrics.episode_count}")

    def _warmup(self, epsilon):
        state = self.env.reset(train_mode=True)
        needed_experiences = max(0, self.replay_buffer.batch_size - len(self.replay_buffer))
        for _ in range(needed_experiences):
            state = self._step(state, epsilon)
        return state

    def _step(self, state, epsilon):
        action = self.compute_action(state, epsilon)
        next_state, reward, done = self.env.step(action)
        self.replay_buffer.add(Experience(state, action, reward, next_state, done))
        self.metrics.on_step(reward, done)
        if done:
            return self.env.reset(train_mode=True)
        return next_state

    def _batch_train(self):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample()

        # Update Critic
        target_actions_next = self.actor_target(next_states)
        target_values_next = self.critic_target(next_states, target_actions_next).detach().max(1)[0].unsqueeze(1)
        target_values = rewards + (self.config.gamma * target_values_next * (1 - dones))
        expected_values = self.critic_current(states, actions)
        critic_loss = F.mse_loss(expected_values, target_values)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        self.critic_target.soft_update(self.critic_current, self.config.tau)

        # Update Actor
        current_actions = self.actor_current(states)
        actor_loss = -self.critic_current(states, current_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.actor_target.soft_update(self.actor_current, self.config.tau)

    def _check_solved(self):
        if self.metrics.running_score() >= 30:
            print(f"\nEnvironment solved in {self.metrics.episode_count} episodes!\t"
                  + f"Average Score: {self.metrics.running_score():.2f}")
            torch.save(self.actor_current.state_dict(), "actor_model.pt")
            torch.save(self.critic_current.state_dict(), "critic_model.pt")
            return True
        return False
state_size = env.state_size
action_size = env.action_size

print('There are {} agents.'.format(states.shape[0]))
print('Each agent observes a state with length: {}'.format(state_size))
print('Each agent performs an action of size: {}'.format(action_size))
print('The state for the first agent looks like:', states[0])
print('The state shape looks like:', states.shape)

####################################################################################################

BUFFER_SIZE = int(1e6)
BATCH_SIZE = 256
random_seed = 0

# Local and Target Actor Networks
actor_local = Actor(state_size, action_size, random_seed)
actor_target = Actor(state_size, action_size, random_seed)

# Local and Target Critic Networks
state_action_size = state_size + action_size
critic_local = Critic(num_agents * state_action_size, num_agents, random_seed)
critic_target = Critic(num_agents * state_action_size, num_agents, random_seed)

# Noise processes (one per agent)
noise_process1 = OUNoise(action_size, random_seed, mu=0., theta=0.15, sigma=0.1)
noise_process2 = OUNoise(action_size, random_seed, mu=0., theta=0.15, sigma=0.1)
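# None of the snippets include the OUNoise class they construct. This is a
# minimal sketch of an Ornstein-Uhlenbeck noise process consistent with the
# constructor calls in this section (size, seed, mu, theta, sigma); treat it as
# an illustration of the interface, not the exact class used in these projects.
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state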
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) self.copy_weights(self.actor_local, self.actor_target) self.copy_weights(self.critic_local, self.critic_target) print("\nActor network...\n", self.actor_local) print("\nCritic network...\n", self.critic_local) # Noise process self.noise = OUNoise(self.num_agents * action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) @classmethod def copy_weights(cls, src, dst): """Clones the weights from the source to the target""" for dst_wts, src_wts in zip(src.parameters(), dst.parameters()): dst_wts.data.copy_(src_wts.data) def step(self, states, actions, rewards, next_states, is_dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward # Doing this for state, action, reward, next_state, is_done in zip( states, actions, rewards, next_states, is_dones): self.memory.add(state, action, reward, next_state, is_done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True, weight=1.0): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += weight * self.noise.sample().reshape( (-1, self.action_size)) return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
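# The ReplayBuffer that the Agent above constructs is not shown in the excerpt.
# Below is a minimal sketch consistent with how that Agent uses it —
# ReplayBuffer(action_size, buffer_size, batch_size, seed), add(), sample(),
# __len__() — with sample() returning batched float tensors. (Note that the
# prioritized-replay agent further down instead expects sample() to return raw
# experience tuples.) Field names and tensor handling here are assumptions.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)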
class Agent(): """Agent that interacts with and learns from the environment.""" def __init__(self, id, state_size, action_size, config = Config()): """Initialize an Agent object. Params ====== id (int): id used to identify the agent state_size (int): dimension of each state action_size (int): dimension of each action config (Config): the agents configuration """ self.state_size = state_size self.action_size = action_size self.id = id self.t_step = 0 self.config = config random.seed(config.random_seed) self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Actor & Target Network self.actor_local = Actor(state_size, action_size, config.random_seed, config.actor_hidden_units, config.use_bn).to(self.device) self.actor_target = Actor(state_size, action_size, config.random_seed, config.actor_hidden_units, config.use_bn).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor) # Critic & Target Network self.critic_local = Critic(state_size, action_size, config.random_seed, config.critic_hidden_units, config.use_bn).to(self.device) self.critic_target = Critic(state_size, action_size, config.random_seed, config.critic_hidden_units, config.use_bn).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.lr_critic, weight_decay=config.weight_decay) # Noise process self.noise = OUNoise(action_size, config.random_seed, config.noise_mu, config.noise_theta, config.noise_sigma) # Replay memory if config.use_per: self.memory = NaivePrioritizedReplayBuffer(action_size, config.buffer_size, config.batch_size, config.random_seed, config.per_alpha,config.per_epsilon) else: self.memory = ReplayBuffer(action_size, config.buffer_size, config.batch_size, config.random_seed) def step(self, state, action, reward, next_state, done, beta=None): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every n time steps. 
self.t_step = (self.t_step + 1) % self.config.update_n_step if self.t_step != 0: return # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.config.batch_size: if self.config.use_per: assert(beta != None) experiences, weights = self.memory.sample(beta) states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device) actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device) rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device) next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device) dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device) weights = torch.from_numpy(np.vstack(weights)).float().to(self.device) experiences = (states, actions, rewards, next_states, dones) self.learn(experiences, self.config.gamma, weights) else: experiences = self.memory.sample() states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device) actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device) rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device) next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device) dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device) experiences = (states, actions, rewards, next_states, dones) self.learn(experiences, self.config.gamma) def act(self, state): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if self.config.add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, weights=None): """ Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor weights (array_like): list of weights for compensation the non-uniform sampling (used only with prioritized experience replay) """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) if self.config.use_per: td_error = Q_expected - Q_targets critic_loss = (td_error) ** 2 critic_loss = critic_loss * weights critic_loss = critic_loss.mean() self.memory.update_priorities(np.hstack(td_error.detach().cpu().numpy())) else: critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) critic_loss.backward() 
self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ------------------- update target networks ------------------- # self.soft_update(self.critic_local, self.critic_target, self.config.tau) self.soft_update(self.actor_local, self.actor_target, self.config.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def getId(self): """ Return the ID of the agent """ return self.id def summary(self): """ Return a brief summary of the agent""" s = 'DDPG Agent {}:\n'.format(self.id) s += self.config.__str__() s += self.actor_local.__str__() s += self.critic_local.__str__() return s
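# The prioritized-replay branch of Agent.step() above expects a beta value for
# the importance-sampling correction, but no schedule for it is shown. A common
# choice is to anneal beta linearly from a starting value toward 1.0 over
# training; the sketch below illustrates that idea. beta_start and beta_frames
# are hypothetical parameters, not taken from the original code.
def beta_by_frame(frame_idx, beta_start=0.4, beta_frames=100000):
    """Linearly anneal the importance-sampling exponent from beta_start to 1.0."""
    return min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)

# Hypothetical usage inside a training loop:
#     beta = beta_by_frame(t)
#     agent.step(state, action, reward, next_state, done, beta=beta)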
class Agent():
    ''' DDPG agent '''

    def __init__(self, state_size, action_size, num_agents, seed, actor_hidden_layers, critic_hidden_layers,
                 use_batch_norm=False, use_noise=False):
        super(Agent, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random.seed(seed)

        # Actor networks
        self.actor_local = Actor(state_size, action_size, seed, actor_hidden_layers, use_batch_norm).to(device)
        self.actor_target = Actor(state_size, action_size, seed, actor_hidden_layers, use_batch_norm).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR)
        copy_weights(self.actor_local, self.actor_target)

        # Critic networks
        self.critic_local = Critic(state_size, action_size, seed, critic_hidden_layers).to(device)
        self.critic_target = Critic(state_size, action_size, seed, critic_hidden_layers).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR)
        copy_weights(self.critic_local, self.critic_target)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), seed)
        self.use_noise = use_noise

        self.t_step = 0

    def step(self, states, actions, rewards, next_states, dones):
        ''' Save experience in replay memory, and use random sample from buffer to learn. '''
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Update the time-step counter
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Time to learn again, provided there are enough samples in memory
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state):
        ''' Returns actions for a given state as per current policy. '''
        # Make the current state a tensor that can be passed as input to the network
        state = torch.from_numpy(state).float().to(device)
        # Set the network to evaluation mode to prevent things like dropout from happening
        self.actor_local.eval()
        # Turn off the autograd engine
        with torch.no_grad():
            # Do a forward pass through the network
            action_values = self.actor_local(state).cpu().data.numpy()
        # Put the network back into training mode
        self.actor_local.train()
        if self.use_noise:
            action_values += self.noise.sample()
        return np.clip(action_values, -1, 1)

    def reset(self):
        ''' Reset the noise in the OU process. '''
        self.noise.reset()

    def learn(self, experiences, gamma):
        ''' Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) '''
        states, actions, rewards, next_states, dones = experiences

        # ------------------------ Update Critic Network ------------------------ #
        next_actions = self.actor_target(next_states)
        Q_targets_prime = self.critic_target(next_states, next_actions)
        # Compute y_i
        Q_targets = rewards + (gamma * Q_targets_prime * (1 - dones))
        # Compute the critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimise the loss
        self.critic_optimizer.zero_grad()   # Reset the gradients to prevent accumulation
        critic_loss.backward()              # Compute gradients
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()        # Update weights

        # ------------------------ Update Actor Network ------------------------- #
        # Compute the actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimise the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------------ Update Target Networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
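# The agents in this section reference module-level hyperparameters
# (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, ACTOR_LR/LR_ACTOR, CRITIC_LR/LR_CRITIC,
# WEIGHT_DECAY, UPDATE_EVERY, GRAD_CLIPPING, device) that are defined elsewhere
# in their respective projects. The values below are illustrative defaults
# commonly paired with DDPG, not the exact settings of any implementation here.
import torch

BUFFER_SIZE = int(1e6)        # replay buffer size
BATCH_SIZE = 128              # minibatch size
GAMMA = 0.99                  # discount factor
TAU = 1e-3                    # soft-update interpolation factor
ACTOR_LR = LR_ACTOR = 1e-4    # actor learning rate
CRITIC_LR = LR_CRITIC = 1e-3  # critic learning rate
WEIGHT_DECAY = 0.0            # L2 weight decay for the critic optimizer
UPDATE_EVERY = 20             # learn every N environment steps
GRAD_CLIPPING = 1.0           # max gradient norm for the critic (0 disables clipping)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")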
class Agent: """Interacts with and learns from the environment.""" def __init__( self, state_size, action_size, observed_state_size, observed_action_size, random_seed, actor_local_load_filename=None, actor_target_load_filename=None, critic_local_load_filename=None, critic_target_load_filename=None, ): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action observed_state_size(int): dimension of the states of all agents observed_action_size(int): dimension of the actions of all agents random_seed (int): random seed actor_local_load_filename : if given, the initial weights of the local NN critic_local_load_filename : if given, the initial weights if the target NN actor_target_load_filename : if given, the initial weights of the local NN critic_target_load_filename : if given, the initial weights if the target NN """ self.state_size = state_size self.action_size = action_size self.observed_state_size = observed_state_size self.observed_action_size = observed_action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=cfg.LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(observed_state_size, observed_action_size, random_seed).to(device) self.critic_target = Critic(observed_state_size, observed_action_size, random_seed).to(device) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=cfg.LR_CRITIC, weight_decay=cfg.WEIGHT_DECAY, ) self.load( actor_local_load_filename, actor_target_load_filename, critic_local_load_filename, critic_target_load_filename, ) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(cfg.BUFFER_SIZE, cfg.BATCH_SIZE, random_seed) self.t_step = 0 self.epsilon = cfg.EPSILON self.epsilon_decay = cfg.EPSILON_DECAY def step(self, states, actions, rewards, next_states, dones, t_step): pass def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: self.epsilon = self.epsilon_decay * self.epsilon action += self.epsilon * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): pass def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save( self, actor_local_save_filename, actor_target_save_filename, critic_local_save_filename, critic_target_save_filename, ): torch.save(self.actor_local.state_dict(), actor_local_save_filename) torch.save(self.actor_target.state_dict(), actor_target_save_filename) torch.save(self.critic_local.state_dict(), critic_local_save_filename) torch.save(self.critic_target.state_dict(), critic_target_save_filename) def load( self, actor_local_load_filename, actor_target_load_filename=None, critic_local_load_filename=None, critic_target_load_filename=None, ): if actor_local_load_filename is not None: self.actor_local.load_state_dict( torch.load(actor_local_load_filename)) if actor_target_load_filename is not None: self.actor_target.load_state_dict( torch.load(actor_target_load_filename)) if critic_local_load_filename is not None: self.critic_local.load_state_dict( torch.load(critic_local_load_filename)) if critic_target_load_filename is not None: self.critic_target.load_state_dict( torch.load(critic_target_load_filename))
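# A hypothetical checkpoint/restore round-trip for the Agent above. The state
# and action dimensions, filenames, and seed are illustrative assumptions only.
agent = Agent(state_size=24, action_size=2,
              observed_state_size=48, observed_action_size=4,
              random_seed=0)

# ... after training ...
agent.save("actor_local.pt", "actor_target.pt", "critic_local.pt", "critic_target.pt")

# Later, rebuild the agent with the saved weights as its starting point.
restored = Agent(state_size=24, action_size=2,
                 observed_state_size=48, observed_action_size=4,
                 random_seed=0,
                 actor_local_load_filename="actor_local.pt",
                 actor_target_load_filename="actor_target.pt",
                 critic_local_load_filename="critic_local.pt",
                 critic_target_load_filename="critic_target.pt")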
class Agent():

    def __init__(self, state_size, action_size, random_seed=42):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        self.noise = OUNoise(action_size, random_seed)

        # The replay buffer is shared between agents and passed in through step()
        # rather than owned by this agent.

    def step(self, memory, state, action, reward, next_state, done):
        memory.add(state, action, reward, next_state, done)
        if len(memory) > BATCH_SIZE:
            experiences = memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if GRAD_CLIPPING > 0.0:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def hard_copy_weights(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
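# A minimal sketch of how two of the agents defined above might share a single
# replay buffer in a multi-agent episode loop, matching the step(memory, ...)
# signature. The environment interface (reset()/step() returning per-agent
# arrays), the episode count, and the constants used here are assumptions for
# illustration, not part of the original code.
shared_memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
agents = [Agent(state_size, action_size, random_seed) for _ in range(2)]

for episode in range(1, 1001):
    states = env.reset()                  # one observation per agent
    for agent in agents:
        agent.reset()                     # reset each agent's OU noise process
    while True:
        actions = [agent.act(state) for agent, state in zip(agents, states)]
        next_states, rewards, dones = env.step(actions)
        for i, agent in enumerate(agents):
            # Each agent adds to, and learns from, the shared buffer
            agent.step(shared_memory, states[i], actions[i], rewards[i], next_states[i], dones[i])
        states = next_states
        if any(dones):
            break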