class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        self.seed = random.seed(random_seed)
        np.random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = ActorNet(self.state_size, self.action_size, random.random()).to(device)
        self.actor_target = ActorNet(self.state_size, self.action_size, random.random()).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    def act(self, states, add_noise=True, noise_strength=1.0):
        """Returns actions for given states as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            noise = np.random.randn(*actions.shape) * NOISE * noise_strength
            actions += noise
        return np.clip(actions, -1, 1)
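# --- Illustration (added; not part of the original snippet) ---
# Self-contained sketch of the exploration scheme used in `act` above: Gaussian noise scaled
# by a NOISE constant and a decaying noise_strength, then clipped back to the valid action
# range. The constant and the example values below are placeholder assumptions.
import numpy as np

NOISE = 0.1                                     # assumed scale; the original defines it elsewhere
actions = np.array([[0.95, -0.4, 0.2, -0.99]])  # e.g. tanh outputs from the actor
noise_strength = 0.5                            # typically annealed from 1.0 towards 0 during training
noisy = actions + np.random.randn(*actions.shape) * NOISE * noise_strength
clipped = np.clip(noisy, -1, 1)                 # keep actions in the actor's output range
print(clipped)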
def train(train_env_id: str,
          eval_env_id: str,
          logdir: str,
          cfg: ExperimentConfig,
          save_path: str,
          pretrain_path: Optional[str] = None) -> DDPGAgent:
    pretrain = torch.load(os.path.join(pretrain_path)) \
        if pretrain_path is not None \
        else None
    env = set_env_metadata(train_env_id, cfg)
    train_env = make_vec_env(train_env_id,
                             num_envs=cfg.episodes_per_cycle,
                             no_timeout=True,
                             seed=cfg.seed)
    eval_env = make_vec_env(eval_env_id,
                            num_envs=cfg.num_eval_envs,
                            no_timeout=True,
                            seed=cfg.seed + 100)
    replay = HERReplayBuffer(cfg=cfg)
    tf_logger = TensorboardLogger(logdir)
    actor = ActorNet(obs_dim=cfg.obs_dim,
                     goal_dim=cfg.goal_dim,
                     action_dim=cfg.action_dim,
                     action_range=cfg.action_range,
                     zero_last=(pretrain_path is not None))
    critic = CriticNet(obs_dim=cfg.obs_dim,
                       goal_dim=cfg.goal_dim,
                       action_dim=cfg.action_dim,
                       action_range=cfg.action_range)
    normalizer = Normalizer(cfg.obs_dim + cfg.goal_dim) \
        if pretrain is None \
        else pretrain.normalizer
    agent = DDPGAgent(cfg=cfg,
                      actor=actor,
                      critic=critic,
                      normalizer=normalizer,
                      reward_fn=env.compute_reward,
                      pretrain=getattr(pretrain, 'actor', None))
    engine = DDPGEngine(cfg=cfg,
                        agent=agent,
                        train_env=train_env,
                        eval_env=eval_env,
                        replay=replay,
                        tf_logger=tf_logger)
    engine.train()
    env.close()
    train_env.close()
    eval_env.close()
    torch.save(agent, os.path.join(save_path))
    return agent
def __init__(self, env, batch_size, replay_capacity, episodes_before_train, device='cpu'):
    self.env = env
    self.n_agents = env.n
    self.memory = memory.ReplayMemory(replay_capacity)
    self.actors = [
        ActorNet(env.observation_space[i].shape[0], env.action_space[i].n)
        for i in range(self.n_agents)
    ]
    self.critics = [
        CriticNet(env.observation_space[i].shape[0], env.n)
        for i in range(self.n_agents)
    ]
    self.critic_optimizers = [Adam(x.parameters(), lr=0.01) for x in self.critics]
    self.actor_optimizers = [Adam(x.parameters(), lr=0.01) for x in self.actors]
    self.actor_targets = deepcopy(self.actors)
    self.critic_targets = deepcopy(self.critics)
    self.device = device
    self.episodes_before_train = episodes_before_train
    self.batch_size = batch_size
    self.GAMMA = 0.95
    self.epsilon = 0.3
    for x in self.actors:
        x.to(device)
    for x in self.critics:
        x.to(device)
    for x in self.actor_targets:
        x.to(device)
    for x in self.critic_targets:
        x.to(device)
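# --- Sketch (added; not part of the original snippet) ---
# The per-agent target networks created above (actor_targets / critic_targets) are usually
# refreshed with a Polyak/soft update after each learning step. A minimal, self-contained
# helper for that; the `tau` value is an assumption, the original keeps it elsewhere.
import torch

def soft_update_all(targets, sources, tau=0.01):
    """Blend every target network towards its online counterpart: theta' <- tau*theta + (1-tau)*theta'."""
    with torch.no_grad():
        for target, source in zip(targets, sources):
            for t_param, s_param in zip(target.parameters(), source.parameters()):
                t_param.data.mul_(1.0 - tau).add_(tau * s_param.data)

# e.g. after each gradient step:
# soft_update_all(self.actor_targets, self.actors, tau=0.01)
# soft_update_all(self.critic_targets, self.critics, tau=0.01)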
class A2C_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, device, seed, LR=5e-4, gamma=0.95,
                 entropy_weight=0.02, actor_network_max_grad_norm=5,
                 critic_network_max_grad_norm=5, nstepqlearning_size=5, gae_lambda=1.0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            LR (float): learning rate
            gamma (float): factor used to discount future values
            entropy_weight (float): weight of the entropy term used with the entropy loss
            actor_network_max_grad_norm (float): threshold used for gradient clipping in the actor model
            critic_network_max_grad_norm (float): threshold used for gradient clipping in the critic model
            nstepqlearning_size (int): number of steps used for the N-step bootstrapping algorithm
            gae_lambda (float): lambda used by GAE as a discount factor to mix every available
                N-step bootstrapping estimate (from 1 to nstepqlearning_size)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.entropy_weight = entropy_weight
        random.seed(seed)
        self.gamma = gamma
        self.actor_network_max_grad_norm = actor_network_max_grad_norm
        self.critic_network_max_grad_norm = critic_network_max_grad_norm
        self.nstepqlearning_size = nstepqlearning_size
        self.gae_lambda = gae_lambda
        self.device = device

        print("----Dumping agent hyperparameters----")
        print("LR: ", LR)
        print("gamma: ", gamma)
        print("actor_network_max_grad_norm: ", self.actor_network_max_grad_norm)
        print("critic_network_max_grad_norm: ", self.critic_network_max_grad_norm)
        print("nstepqlearning_size: ", self.nstepqlearning_size)
        print("gae_lambda: ", self.gae_lambda)
        print("entropy_weight: ", self.entropy_weight)
        print("-------------------------------------")

        self.actor_net = ActorNet(state_size, action_size, device, seed).to(self.device)  # Theta
        self.critic_net = CriticNet(state_size, action_size, seed).to(self.device)  # Theta_v
        self.actor_optimizer = optim.RMSprop(self.actor_net.parameters(), lr=LR)
        self.critic_optimizer = optim.RMSprop(self.critic_net.parameters(), lr=LR)

    def tensor(self, x):
        if isinstance(x, torch.Tensor):
            return x
        x = np.asarray(x, dtype=np.float32)
        x = torch.from_numpy(x).to(self.device)
        return x

    def act(self, state):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor_net.eval()
        self.critic_net.eval()
        (actor_values, log_prob, entropy) = self.actor_net(state)
        critic_values = self.critic_net(state)
        self.actor_net.train()
        self.critic_net.train()
        return actor_values, log_prob, entropy, critic_values

    def train_one_episode(self, env, brain_name):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        num_agents = len(env_info.agents)
        states = env_info.vector_observations  # get the current state (for each agent)
        episode_terminated = False
        scores = np.zeros(num_agents)  # initialize the score (for each agent)

        while not episode_terminated:
            l_states = []
            l_actions = []
            l_rewards = []
            l_masks = []
            l_next_states = []
            l_values = []
            l_log_probs = []
            l_entropy = []
            nstep_memory_size = self.nstepqlearning_size

            for i in range(self.nstepqlearning_size):
                # Get a(t) according to actor policy
                (actions, log_prob, entropy, values) = self.act(states)
                # Put all actions between -1 and 1. (The last activation of the actor is tanh,
                # which puts the output values in this range, but the subsequent sampling can
                # produce values outside of it.)
                actions = np.clip(actions, -1, 1)

                # Perform a(t) in all environments
                env_info = env.step(actions)[brain_name]  # send all actions to the environment

                # get s(t+1), r(t) and wasLastAction(t)
                next_states = env_info.vector_observations  # get next state (for each agent)
                rewards = env_info.rewards  # get reward (for each agent)
                dones = env_info.local_done  # see if episode finished
                masks = 1 - np.asarray(dones, dtype=int)

                l_states.append(states)
                l_actions.append(actions)
                l_rewards.append(rewards)
                l_masks.append(masks)
                l_next_states.append(next_states)
                l_values.append(values)
                l_log_probs.append(log_prob)
                l_entropy.append(entropy)

                # update score
                scores += env_info.rewards  # update the score (for each agent)
                states = next_states  # roll over states to next time step

                if np.any(dones):  # exit loop if episode terminated
                    nstep_memory_size = i + 1
                    episode_terminated = True
                    break

            # get one prediction for the last estimated Q value
            (_, _, _, values) = self.act(states)
            l_values.append(values)  # Add to the list, GAE will use it

            advantages = self.tensor(torch.zeros((num_agents))).to(self.device)
            returns = values.reshape((num_agents,)).to(self.device)  # last estimated value of s(t + nstep_memory_size)
            l_advantages = [None] * nstep_memory_size
            l_rets = [None] * nstep_memory_size
            l_masks = torch.tensor(np.array(l_masks)).to(self.device)
            l_rewards = torch.tensor(np.array(l_rewards)).to(self.device)

            for i in reversed(range(nstep_memory_size)):
                returns = l_rewards[i] + self.gamma * l_masks[i] * returns
                # Normal advantage calculation:
                # advantages = returns - l_values[i].detach().reshape((num_agents,))
                # GAE
                td_error = l_rewards[i] + self.gamma * l_masks[i] * l_values[i + 1] - l_values[i]
                advantages = advantages * self.gae_lambda * self.gamma * l_masks[i] + td_error
                # GAE end
                l_advantages[i] = advantages.detach()
                l_rets[i] = returns.detach()

            # bring the log_probs list to a tensor of shape [nstep_memory_size * num_agents]
            logprobs = torch.cat(l_log_probs).squeeze()
            logprobs = logprobs.reshape((nstep_memory_size * num_agents)).to(self.device)
            ents = torch.cat(l_entropy).squeeze()
            advantages_tensor = torch.cat(l_advantages, dim=0).squeeze().detach().to(self.device)
            policy_loss = -(logprobs * advantages_tensor).mean()
            # entropy: currently it is constant, but it is kept here to make it possible to use
            # different distribution parameters during the training process
            entropy_loss = ents.mean()

            # ==== train Critic ====
            self.critic_optimizer.zero_grad()
            l_rets = torch.cat(l_rets, dim=0).squeeze().detach().to(self.device)
            l_values = torch.cat(l_values[:nstep_memory_size], dim=0).squeeze().to(self.device)
            v = 0.5 * (l_rets - l_values)
            value_loss = v.pow(2).mean()
            value_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.critic_network_max_grad_norm)
            self.critic_optimizer.step()

            # ==== train Actor ====
            self.actor_optimizer.zero_grad()
            # Add the entropy term to the loss function to encourage evenly distributed actions
            (policy_loss - self.entropy_weight * entropy_loss).backward()
            torch.nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.actor_network_max_grad_norm)
            self.actor_optimizer.step()

        return scores
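# --- Sketch (added; not part of the original snippet) ---
# The backward loop above mixes the N-step returns via GAE:
#   delta_t = r_t + gamma * mask_t * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lambda * mask_t * A_{t+1}
# A self-contained version of that recursion on plain numpy arrays, for clarity only:
import numpy as np

def gae_advantages(rewards, values, masks, gamma=0.95, lam=1.0):
    """rewards/masks have shape [T]; values has shape [T+1] (bootstrap value appended)."""
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    gae = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * masks[t] * values[t + 1] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        advantages[t] = gae
    return advantages

# e.g. gae_advantages(np.array([1., 0., 1.]), np.array([0.5, 0.4, 0.6, 0.2]), np.ones(3))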
class Agents():  # based on DQN
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)

        # Actor Network
        self.actor_local = ActorNet(state_size, action_size, seed).to(device)
        self.actor_target = ActorNet(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network
        self.critic_local = CriticNet(state_size, action_size, seed).to(device)
        self.critic_target = CriticNet(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

        # Q-Network (from the DQN base, unused here)
        # self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        # self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        # self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Noise process (instead of epsilon in DQN) - taken from example
        self.noise = G_Noise((num_agents, action_size), seed, sigma=NOISE_SIGMA)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, debugFlag=False):
        # Save experience in replay memory (for N agents)
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i])

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, debugF=debugFlag)

    def act(self, state, noiseFlag=True):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            noiseFlag (bool): whether to add exploration noise
        """
        states = torch.from_numpy(state).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i in range(self.num_agents):
                actions[i, :] = self.actor_local(states[i, :]).cpu().numpy()
        self.actor_local.train()
        if noiseFlag:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

        # Epsilon-greedy action selection (from the DQN base)
        # if random.random() > eps:
        #     return np.argmax(action_values.cpu().data.numpy())
        # else:
        #     return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, debugF=False):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor

        The actor should learn argmax_a Q(s, a); the critic should learn to estimate
        Q(s, a), where a is the action chosen by the actor.
        """
        states, actions, rewards, next_states, dones = experiences

        #### CRITIC LEARN ####
        # calc a_next and Q(s, a)_next
        action_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, action_next)
        # calc estimated Q(s, a) (one-step bootstrapping)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # calc Q(s, a) from critic local (expected)
        Q_local = self.critic_local(states, actions.float())
        critic_loss = F.mse_loss(Q_local, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        #### ACTOR LEARN ####
        actions_predict = self.actor_local(states)
        # Negative sign: the critic's value is high for good actions, and we descend the loss
        actor_loss = -self.critic_local(states, actions_predict).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------- update target networks ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        '''
        # Legacy DQN update kept for reference:
        # Get max predicted Q values (for next states) from target model
        act_next_local = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, act_next_local)
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        if debugF:
            print(Q_targets_next)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)
        '''

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
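# --- Sketch (added; G_Noise is referenced above but not defined in this snippet) ---
# One plausible minimal implementation, assuming it simply draws i.i.d. Gaussian noise of a
# fixed shape; the real class may differ (for example it could decay sigma over time).
import numpy as np

class G_Noise:
    def __init__(self, shape, seed, mu=0.0, sigma=0.1):
        self.shape = shape
        self.mu = mu
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)

    def sample(self):
        """Return Gaussian noise with the configured shape, mean and standard deviation."""
        return self.rng.normal(self.mu, self.sigma, size=self.shape)

    def reset(self):
        """Kept for interface parity with OU-style noise; Gaussian noise is stateless."""
        pass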
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, fc1_units, fc2_units, seed, gamma,
                 lr_actor, lr_critic, tau, buffer_size, batch_size, weight_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            fc1_units (int): number of nodes in layer 1 of neural network
            fc2_units (int): number of nodes in layer 2 of neural network
            seed (int): seed
            gamma (float): discount parameter
            lr_actor (float): learning rate for Actor
            lr_critic (float): learning rate for Critic
            tau (float): interpolation parameter
            buffer_size (int): size of memory buffer
            batch_size (int): number of experiences to sample during learning
            weight_decay (float): weight decay (L2 penalty) parameter
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.tau = tau
        self.batch_size = batch_size

        # Neural Network Params
        self.actor_target = ActorNet(state_size, action_size, fc1_units, fc2_units, seed).to(device)
        self.actor_local = ActorNet(state_size, action_size, fc1_units, fc2_units, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        self.critic_target = CriticNet(state_size, action_size, fc1_units, fc2_units, seed).to(device)
        self.critic_local = CriticNet(state_size, action_size, fc1_units, fc2_units, seed).to(device)
        # Optimize the critic's own parameters, with L2 regularization via weight_decay
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Memory buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if there are enough samples for a batch
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state)
        self.actor_local.train()
        if add_noise:
            action += torch.as_tensor(self.noise.sample()).float().to(device)
        return torch.clamp(action, -1, 1).cpu().data.numpy().tolist()

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        ### UPDATE CRITIC ###
        # Get predicted Q values (for next states) from target model
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute loss (critic)
        Q_expected = self.critic_local(states, actions.float())
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimise the loss (critic)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Use grad clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ### UPDATE ACTOR ###
        # Compute loss (actor): negative of the critic's value of the actor's actions,
        # so minimising the loss maximises the expected Q value
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimise the loss (actor)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # Use grad clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
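# --- Sketch (added; OUNoise is referenced above but not defined in this snippet) ---
# A standard Ornstein-Uhlenbeck process as commonly used for DDPG exploration; the parameter
# defaults are assumptions, the original class may use different ones.
import copy
import random
import numpy as np

class OUNoise:
    """Temporally correlated noise: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state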
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env_name = "BipedalWalkerHardcore-v3"
env = gym.make(env_name).unwrapped

model_dir = "models/"
model_fn = "20201124-080538-checkpoint-4000.pth"
results_dir = "./results/"
if not os.path.exists(results_dir):
    os.mkdir(results_dir)

cs_cnn = [16, 32, 64, 64]
# input is the state
agent = ActorNet(cs_cnn, size=STATE_SPACE_SIZE, c_in=HISTORY_LENGTH,
                 c_out=ACTION_SPACE_SIZE[0]).to(device)

st = torch.load('/'.join([model_dir, model_fn]), map_location=device)['actor']
agent.load_state_dict(st)
for param in agent.parameters():
    param.requires_grad_(False)
agent.eval()

state = FrameHistory(HISTORY_LENGTH)
episode_rewards = []

# run episodes
for i in range(N_EPISODES):
    episode_reward = run_episode(env, agent, state, rendering=RENDER)
    episode_rewards.append(episode_reward)
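# --- Sketch (added; not part of the original snippet) ---
# A minimal way to summarise the evaluation run above and persist it into `results_dir`;
# the output file name is an assumption.
import json
import os

import numpy as np

results = {
    "env": env_name,
    "checkpoint": model_fn,
    "episodes": len(episode_rewards),
    "mean_reward": float(np.mean(episode_rewards)),
    "std_reward": float(np.std(episode_rewards)),
}
with open(os.path.join(results_dir, "eval_" + model_fn.replace(".pth", ".json")), "w") as f:
    json.dump(results, f, indent=2)
print(results)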
timestamp = time.strftime("%Y%m%d-%H%M%S")
model_fn = timestamp + "-checkpoint"
log_dir = "./logs/"
log_fn = timestamp + "-log.txt"
tensorboard_dir = "./runs/"
if not os.path.exists(tensorboard_dir):
    os.mkdir(tensorboard_dir)
if not os.path.exists(model_dir):
    os.mkdir(model_dir)
if not os.path.exists(log_dir):
    os.mkdir(log_dir)

cs_cnn = [16, 32, 64, 64]
cs_fcn = [256, 64]
# input is the state
actor = ActorNet(cs_cnn, size=STATE_SPACE_SIZE, c_in=HISTORY_LENGTH,
                 c_out=ACTION_SPACE_SIZE[0]).to(device)
# input is the concatenation of state and action
critic = CriticNet(cs_cnn, cs_fcn, size_action=ACTION_SPACE_SIZE,
                   size_state=(HISTORY_LENGTH, *STATE_SPACE_SIZE)).to(device)
actor_target = ActorNet(cs_cnn, size=STATE_SPACE_SIZE, c_in=HISTORY_LENGTH,
                        c_out=ACTION_SPACE_SIZE[0]).to(device)
critic_target = CriticNet(cs_cnn, cs_fcn, size_action=ACTION_SPACE_SIZE,
                          size_state=(HISTORY_LENGTH, *STATE_SPACE_SIZE)).to(device)

# actor_target.load_state_dict(actor.state_dict())
# critic_target.load_state_dict(critic.state_dict())
# for param in actor_target.parameters(): param.requires_grad_(False)
# for param in critic_target.parameters(): param.requires_grad_(False)
# actor_target.eval()
# critic_target.eval()

for param in actor_target.parameters():
    param.requires_grad_(False)
for param in critic_target.parameters():
    param.requires_grad_(False)
hard_update(actor_target, actor)
hard_update(critic_target, critic)
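# --- Sketch (added; hard_update is referenced above but not defined in this snippet) ---
# The usual meaning is a full parameter copy from the online network into the target network,
# so both start identical before soft updates take over during training. The (target, source)
# argument order is assumed to match the calls above.
import torch

def hard_update(target, source):
    """Copy every parameter of `source` into `target`."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.copy_(s_param.data)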
class DDPGAgent:
    """A DDPG Agent which interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initializes a DDPG Agent.

        params:
        - state_size (int) : dimension of each state.
        - action_size (int) : dimension of each action.
        - seed (int) : random seed.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.eps = EPSILON_START

        # Setup Actor Network
        self.actor_net = ActorNet(self.state_size, self.action_size, seed).to(device)
        self.target_actor_net = ActorNet(self.state_size, self.action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=LR_ACTOR)

        # Setup Critic Network
        self.critic_net = CriticNet(self.state_size, self.action_size, seed).to(device)
        self.target_critic_net = CriticNet(self.state_size, self.action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), lr=LR_CRITIC)

        # noise process
        self.noise = OUNoise(self.action_size, seed)

        # create replay buffer
        self.buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # timestep counter
        self.tstep = 0

    def step(self, states, actions, rewards, next_states, dones):
        # iterate through the 20 parallel agents
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            # save experiences in replay buffer
            self.buffer.push(state, action, reward, next_state, done)

        # Learn every LEARN_EVERY timesteps
        self.tstep = (self.tstep + 1) % LEARN_EVERY
        if self.tstep == 0:
            # check if enough samples are available in the buffer
            if len(self.buffer) > BATCH_SIZE:
                # Learn for a few iterations
                for _ in range(LEARN_FOR):
                    experiences = self.buffer.sample()
                    self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        """Updates policy and value params using given batch of experience tuples.

        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) = action
            critic_target(state, action) = Q-value

        params:
        - experiences (Tuple[torch.Tensor]) : tuple of (s, a, r, s', done).
        - gamma (float) : discount factor.
        """
        # unpack experiences
        s, a, r, ns, d = experiences

        #################### Update Critic ####################
        # get predicted next-state actions from the target actor
        next_actions = self.target_actor_net(ns)
        # get predicted next-state Q values from the target critic
        next_Q_targets = self.target_critic_net(ns, next_actions)
        # Compute Q targets for current states
        Q_targets = r + (gamma * next_Q_targets * (1 - d))
        # Compute critic loss
        Q_expected = self.critic_net(s, a)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_net.parameters(), 1.0)
        self.critic_optimizer.step()
        #######################################################

        #################### Update Actor #####################
        # compute actor loss
        predicted_actions = self.actor_net(s)
        actor_loss = -self.critic_net(s, predicted_actions).mean()
        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        #######################################################

        #################### Update Target Networks ####################
        self.soft_update(self.critic_net, self.target_critic_net, TAU)
        self.soft_update(self.actor_net, self.target_actor_net, TAU)

        # decay epsilon
        if self.eps > EPSILON_END:
            self.eps *= EPSILON_DECAY
            self.noise.reset()
        else:
            self.eps = EPSILON_END

    def soft_update(self, local, target, tau):
        """Performs a soft update of the target parameters.

        theta_target = tau * theta_local + (1 - tau) * theta_target

        params:
        - tau (float) : interpolation parameter.
        """
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

    def reset(self):
        """Resets the noise process."""
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for a given state as per current policy.

        params:
        - state (array_like) : current state.
        - add_noise (boolean) : flag for adding noise.
        """
        state = torch.from_numpy(state).float().to(device)
        # set actor to eval mode
        self.actor_net.eval()
        with torch.no_grad():
            # get action values
            act_vals = self.actor_net(state).cpu().data.numpy()
        # turn back to train mode
        self.actor_net.train()
        # add noise
        if add_noise:
            act_vals += self.noise.sample() * self.eps
        return np.clip(act_vals, -1, 1)