class DDPG:
    def __init__(self, in_actor, out_actor,
                 in_critic,          # e.g. n_agent * (state_size + action_size)
                 lr_actor=1e-4,
                 lr_critic=1e-3,     # the critic usually learns faster than the actor
                 random_seed=2):
        self.state_size = in_actor
        self.action_size = out_actor
        self.seed = random.seed(random_seed)
        self.params = {"lr_actor": lr_actor, "lr_critic": lr_critic, "optimizer": "adam"}

        self.local_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.target_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=lr_actor)

        # For a single agent, the critic takes the global observation as input and
        # outputs the action-value Q, e.g. global_states = all_states + all_actions.
        self.local_critic = Critic(in_shape=in_critic).to(device)
        self.target_critic = Critic(in_shape=in_critic).to(device)
        self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=lr_critic)

        # Q: should local/target networks start with the same weights, or is it enough
        #    that they synchronize after the first soft update?
        # A: better to hard-copy the weights at the beginning.
        hard_update_A_from_B(self.target_actor, self.local_actor)
        hard_update_A_from_B(self.target_critic, self.local_critic)

        # Noise process
        self.noise = OUNoise(out_actor, scale=1.0)

    def act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # debug noise (Gaussian alternative):
        # noise = torch.from_numpy(noise_scale * 0.5 * np.random.randn(1, self.action_size)).float().to(device)
        # action = self.local_actor(obs) + noise
        action = self.local_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def target_act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # noise = torch.from_numpy(noise_scale * 0.5 * np.random.randn(1, self.action_size)).float().to(device)
        # action = self.target_actor(obs) + noise_scale * noise
        action = self.target_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def reset(self):
        self.noise.reset()
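# The class above relies on two helpers that are not shown in the snippet:
# hard_update_A_from_B and an OUNoise process with a noise() method returning a
# torch tensor. A minimal sketch of both, with the OU hyperparameters (theta, sigma)
# being assumed defaults; several of the agents further below use an equivalent
# noise class exposed through a sample() method instead.
import numpy as np
import torch


def hard_update_A_from_B(A, B):
    """Copy every parameter of network B into network A."""
    for a_param, b_param in zip(A.parameters(), B.parameters()):
        a_param.data.copy_(b_param.data)


class OUNoise:
    """Ornstein-Uhlenbeck process; noise() returns a scaled torch tensor."""

    def __init__(self, action_dimension, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return torch.from_numpy(self.state * self.scale).float()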
class DDPG_agent(nn.Module): def __init__(self, in_actor, in_critic, action_size, num_agents, random_seed): super(DDPG_agent, self).__init__() """init the agent""" self.action_size = action_size self.seed = random_seed # Fully connected actor network self.actor_local = Actor(in_actor, self.action_size, self.seed).to(device) self.actor_target = Actor(in_actor, self.action_size, self.seed).to(device) self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Fully connected critic network self.critic_local = Critic(in_critic, num_agents * self.action_size, self.seed).to(device) self.critic_target = Critic(in_critic, num_agents * self.action_size, self.seed).to(device) self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Ornstein-Uhlenbeck noise process for exploration self.noise = OUNoise((action_size), random_seed) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def target_act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" action = self.actor_target(state) return action def reset(self): """ Resets noise """ self.noise.reset()
class Agent(): def __init__(self, actor_size, action_size, critic_size): super().__init__() gpu = torch.cuda.is_available() if (gpu): print('GPU/CUDA works! Happy fast training :)') torch.cuda.current_device() torch.cuda.empty_cache() self.device = torch.device("cuda") else: print('training on cpu...') self.device = torch.device("cpu") self.actor = Actor(actor_size, action_size).to(self.device) self.actor_target = Actor(actor_size, action_size).to(self.device) self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001) self.critic = Critic(critic_size).to(self.device) self.critic_target = Critic(critic_size).to(self.device) self.critic_optim = optim.Adam(self.critic.parameters(), lr=0.001, weight_decay=0) self.gamma = 0.95 #0.99 self.tau = 0.001 self.noise = OUNoise((action_size), 2) self.target_network_update(self.actor_target, self.actor, 1.0) self.target_network_update(self.critic_target, self.critic, 1.0) def select_actions(self, state): state = torch.from_numpy(state).float().to(self.device).view(1, -1) #print(state.shape) self.actor.eval() with torch.no_grad(): actions = self.actor(state).cpu().data.squeeze(0) self.actor.train() actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def target_network_update(self, target_network, network, tau): for network_param, target_param in zip(network.parameters(), target_network.parameters()): target_param.data.copy_(tau * network_param.data + (1.0 - tau) * target_param.data)
class PPO():
    def __init__(self, state_dim, action_dim):
        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim)
        self.optimizer = torch.optim.Adam(
            itertools.chain(self.actor.parameters(), self.critic.parameters()), LR)

    def _calc_loss(self, state, action, old_log_prob, expected_values, gae):
        new_log_prob, action_distr = self.actor.compute_proba(state, action)
        state_values = self.critic.get_value(state).squeeze(1)
        critic_loss = ((expected_values - state_values) ** 2).mean()
        unclipped_ratio = torch.exp(new_log_prob - old_log_prob)
        clipped_ratio = torch.clamp(unclipped_ratio, 1 - CLIP, 1 + CLIP)
        actor_loss = -torch.min(clipped_ratio * gae, unclipped_ratio * gae).mean()
        entropy_loss = -action_distr.entropy().mean()
        return critic_loss * VALUE_COEFF + actor_loss + entropy_loss * ENTROPY_COEF

    def update(self, trajectories):
        trajectories = map(self._compute_lambda_returns_and_gae, trajectories)
        transitions = sum(trajectories, [])  # Turn a list of trajectories into a list of transitions
        state, action, old_log_prob, target_value, advantage = zip(*transitions)

        state = np.array(state)
        action = np.array(action)
        old_log_prob = np.array(old_log_prob)
        target_value = np.array(target_value)
        advantage = np.array(advantage)
        # Normalize advantages
        advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)

        for _ in range(BATCHES_PER_UPDATE):
            idx = np.random.randint(0, len(transitions), BATCH_SIZE)  # Choose a random batch
            s = torch.from_numpy(state[idx]).float()
            a = torch.from_numpy(action[idx]).float()
            op = torch.from_numpy(old_log_prob[idx]).float()   # Log probability of the action under the old policy
            v = torch.from_numpy(target_value[idx]).float()    # Estimated by lambda-returns
            adv = torch.from_numpy(advantage[idx]).float()     # Estimated by generalized advantage estimation

            loss = self._calc_loss(s, a, op, v, adv)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def _compute_lambda_returns_and_gae(self, trajectory):
        lambda_returns = []
        gae = []
        last_lr = 0.
        last_v = 0.
        for s, _, r, _ in reversed(trajectory):
            ret = r + GAMMA * (last_v * (1 - LAMBDA) + last_lr * LAMBDA)
            last_lr = ret
            last_v = self.get_value(s)
            lambda_returns.append(last_lr)
            gae.append(last_lr - last_v)
        # Each transition contains state, action, old action probability,
        # value estimation and advantage estimation.
        return [(s, a, p, v, adv)
                for (s, a, _, p), v, adv in zip(trajectory, reversed(lambda_returns), reversed(gae))]

    def get_value(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0)
            value = self.critic.get_value(state)
            return value.cpu().item()

    def act(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0)
            action, pure_action, log_prob = self.actor.act(state)
            return action.cpu().numpy()[0], pure_action.cpu().numpy()[0], log_prob.cpu().item()

    def save(self):
        torch.save(self.actor, "agent.pkl")
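# A minimal sketch of how this PPO class might be driven with a classic gym-style
# environment. The transition layout (state, action, reward, log_prob) follows the
# unpacking in _compute_lambda_returns_and_gae above; storing the pre-squash
# "pure" action, the environment name, and the iteration counts are assumptions,
# and the module-level constants (LR, GAMMA, ...) must be defined as in the class.
import gym
import numpy as np


def collect_trajectory(env, ppo, max_steps=1000):
    trajectory, state, done = [], env.reset(), False
    for _ in range(max_steps):
        action, pure_action, log_prob = ppo.act(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, pure_action, reward, log_prob))
        state = next_state
        if done:
            break
    return trajectory


env = gym.make("Pendulum-v1")
ppo = PPO(env.observation_space.shape[0], env.action_space.shape[0])
for iteration in range(100):
    trajectories = [collect_trajectory(env, ppo) for _ in range(16)]
    ppo.update(trajectories)
ppo.save()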
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, num_agents, state_size, action_size, random_seed=2018): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.device = torch.device('cuda' if cuda else 'cpu') # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # # Learn, if enough samples are available in memory # if len(self.memory) > BATCH_SIZE: # experiences = self.memory.sample() # self.learn(experiences, GAMMA) def sampleandlearn(self): ''' Learn from stored experiences ''' if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) # Deactivate gradients and perform forward pass self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: for a in range(self.num_agents): action[a] += self.noise.sample() # Clip action return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
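# Sketch of the ReplayBuffer this agent assumes (constructor signature taken from
# the call above); the internals are illustrative and follow the common
# deque-of-namedtuples pattern, not necessarily the author's implementation.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer of experience tuples; sample() returns torch tensors."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        to_tensor = lambda xs: torch.from_numpy(np.vstack(xs)).float().to(self.device)
        states = to_tensor([e.state for e in experiences])
        actions = to_tensor([e.action for e in experiences])
        rewards = to_tensor([e.reward for e in experiences])
        next_states = to_tensor([e.next_state for e in experiences])
        dones = to_tensor([np.uint8(e.done) for e in experiences])
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)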
states = env_info.vector_observations state_size = states.shape[1] vis = Visdom() win_score = None win_actor_score = None win_critic_loss = None actor = Actor(state_size * 2, action_size * 2).to(device) actor_target = Actor(state_size * 2, action_size * 2).to(device) critic = Critic(state_size * 2, n_action=action_size * 2).to(device) critic_target = Critic(state_size * 2, n_action=action_size * 2).to(device) for target_param, param in zip(critic_target.parameters(), critic.parameters()): target_param.data.copy_(param.data) for target_param, param in zip(actor_target.parameters(), actor.parameters()): target_param.data.copy_(param.data) replay_buffer = ReplayMemory(args.replay_capacity) criterion = nn.MSELoss() optim_critic = torch.optim.Adam(critic.parameters(), lr=args.lr_critic, weight_decay=args.weight_decay_critic) optim_actor = torch.optim.Adam(actor.parameters(), lr=args.lr_actor) loss_critic = [] score_actor = [] score = 0 steps = 0 noise_std = args.noise_std_start for i in range(args.episodes):
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = [ OUNoise(action_size, random_seed, sigma=0.1) for i in range(self.num_agents) ] # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # Make sure target is with the same weight as the source self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) self.t_step = 0 def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done, self.num_agents) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for _ in range(UPDATES_PER_STEP): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: for i in range(self.num_agents): agent_action = action[i] for j in agent_action: j += self.noise[i].sample() return np.clip(action, -1, 1) def reset(self): for i in range(self.num_agents): self.noise[i].reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + ? 
* critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. ?_target = t*?_local + (1 - t)*?_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_update(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
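# Module-level settings this agent expects to find elsewhere in the project;
# the values below are placeholder assumptions for illustration only.
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 256         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # critic L2 weight decay
UPDATE_EVERY = 2         # environment steps between learning phases
UPDATES_PER_STEP = 1     # gradient updates per learning phase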
class PPO():
    def __init__(self, state_dim, action_dim, device):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device
        self.actor = Actor(state_dim, action_dim).to(device)
        self.critic = Critic(state_dim).to(device)
        self.optimizer = torch.optim.Adam(
            itertools.chain(self.actor.parameters(), self.critic.parameters()), LR)

        # A pool of "philosopher" critics trained alongside the main critic and
        # periodically promoted to replace it.
        self.philosophers = list()
        for i in range(P_COUNT):
            self.philosophers.append(Critic(state_dim).to(device))
        self.p_optimizers = [torch.optim.Adam(p.parameters(), lr=P_LR)
                             for p in self.philosophers]
        self.update_cnt = 0

    def _calc_loss(self, state, action, old_log_prob, expected_values, gae):
        new_log_prob, action_distr = self.actor.compute_proba(state, action)
        state_values = self.critic.get_value(state).squeeze(1)
        critic_loss = ((expected_values - state_values) ** 2).mean()
        unclipped_ratio = torch.exp(new_log_prob - old_log_prob)
        clipped_ratio = torch.clamp(unclipped_ratio, 1 - CLIP, 1 + CLIP)
        actor_loss = -torch.min(clipped_ratio * gae, unclipped_ratio * gae).mean()
        entropy_loss = -action_distr.entropy().mean()

        # Each philosopher is regressed toward the current critic's detached value
        # estimates so that it can take the critic's place later.
        p_loss = 0
        for p in self.philosophers:
            p_state_values = p.get_value(state).squeeze(1)
            p_loss += ((p_state_values - state_values.detach()) ** 2).mean()

        return critic_loss * VALUE_COEFF + actor_loss + entropy_loss * ENTROPY_COEF + p_loss

    def update(self, trajectories):
        trajectories = map(self._compute_lambda_returns_and_gae, trajectories)
        transitions = sum(trajectories, [])  # Turn a list of trajectories into a list of transitions
        state, action, old_log_prob, target_value, advantage = zip(*transitions)

        state = torch.from_numpy(np.array(state)).float().to(self.device)
        action = torch.from_numpy(np.array(action)).float().to(self.device)
        old_log_prob = torch.from_numpy(np.array(old_log_prob)).float().to(self.device)
        target_value = torch.from_numpy(np.array(target_value)).float().to(self.device)
        advantage = torch.from_numpy(np.array(advantage)).float().to(self.device)

        for _ in range(BATCHES_PER_UPDATE):
            idx = np.random.randint(0, len(transitions), BATCH_SIZE)
            loss = self._calc_loss(state[idx], action[idx], old_log_prob[idx],
                                   target_value[idx], advantage[idx])
            self.optimizer.zero_grad()
            for p_optimizer in self.p_optimizers:
                p_optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            for p_optimizer in self.p_optimizers:
                p_optimizer.step()

        # Every P_DELAY updates, promote the oldest philosopher to be the new critic
        # and spawn a fresh philosopher at the back of the queue.
        self.update_cnt += 1
        if self.update_cnt % P_DELAY == 0:
            self.critic = self.philosophers[0]
            self.optimizer = self.p_optimizers[0]
            self.philosophers.pop(0)
            self.philosophers.append(Critic(self.state_dim).to(self.device))
            self.p_optimizers.pop(0)
            self.p_optimizers.append(
                torch.optim.Adam(self.philosophers[-1].parameters(), lr=P_LR))

    def _compute_lambda_returns_and_gae(self, trajectory):
        lambda_returns = []
        gae = []
        last_lr = 0.
        last_v = 0.
        for s, _, r, _ in reversed(trajectory):
            ret = r + GAMMA * (last_v * (1 - LAMBDA) + last_lr * LAMBDA)
            last_lr = ret
            last_v = self.get_value(s)
            lambda_returns.append(last_lr)
            gae.append(last_lr - last_v)
        # Each transition contains state, action, old action probability,
        # value estimation and advantage estimation.
        return [(s, a, p, v, adv)
                for (s, a, _, p), v, adv in zip(trajectory, reversed(lambda_returns), reversed(gae))]

    def get_value(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
            value = self.critic.get_value(state)
            return value.cpu().item()

    def act(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
            action, pure_action, log_prob = self.actor.act(state)
            return action.cpu().numpy()[0], pure_action.cpu().numpy()[0], log_prob.cpu().item()

    def save(self):
        torch.save(self.actor, "agent.pkl")
class PPO(): def __init__(self, state_dim, action_dim, num_shared, device): self.state_dim = state_dim self.action_dim = action_dim self.device = device self.actor = Actor(state_dim, action_dim, num_shared).to(device) self.critic = Critic(state_dim, num_shared).to(device) def parameters(self): return itertools.chain(self.actor.parameters(), self.critic.parameters()) def first_parameters(self): return itertools.chain(self.actor.first_parameters(), self.critic.first_parameters()) def shared_parameters(self): return itertools.chain(self.actor.shared_parameters(), self.critic.shared_parameters()) def rest_parameters(self): return itertools.chain(self.actor.rest_parameters(), self.critic.rest_parameters()) def _calc_loss(self, state, action, old_log_prob, expected_values, gae): new_log_prob, action_distr = self.actor.compute_proba(state, action) state_values = self.critic.get_value(state).squeeze(1) critic_loss = ((expected_values - state_values) ** 2).mean() unclipped_ratio = torch.exp(new_log_prob - old_log_prob) clipped_ratio = torch.clamp(unclipped_ratio, 1 - CLIP, 1 + CLIP) actor_loss = -torch.min(clipped_ratio * gae, unclipped_ratio * gae).mean() entropy_loss = -action_distr.entropy().mean() return critic_loss * VALUE_COEFF + actor_loss + entropy_loss * ENTROPY_COEF def update(self, trajectories): trajectories = map(self._compute_lambda_returns_and_gae, trajectories) transitions = sum(trajectories, []) # Turn a list of trajectories into list of transitions state, action, old_log_prob, target_value, advantage = zip(*transitions) state = torch.from_numpy(np.array(state)).float().to(self.device) action = torch.from_numpy(np.array(action)).float().to(self.device) old_log_prob = torch.from_numpy(np.array(old_log_prob)).float().to(self.device) target_value = torch.from_numpy(np.array(target_value)).float().to(self.device) advantage = torch.from_numpy(np.array(advantage)).float().to(self.device) for _ in range(BATCHES_PER_UPDATE): idx = np.random.randint(0, len(transitions), BATCH_SIZE) loss = self._calc_loss(state[idx], action[idx], old_log_prob[idx], target_value[idx], advantage[idx]) # ugly code yeah =) # optimization outside yield loss def _compute_lambda_returns_and_gae(self, trajectory): lambda_returns = [] gae = [] last_lr = 0. last_v = 0. for s, _, r, _ in reversed(trajectory): ret = r + GAMMA * (last_v * (1 - LAMBDA) + last_lr * LAMBDA) last_lr = ret last_v = self.get_value(s) lambda_returns.append(last_lr) gae.append(last_lr - last_v) # Each transition contains state, action, old action probability, value estimation and advantage estimation return [(s, a, p, v, adv) for (s, a, _, p), v, adv in zip(trajectory, reversed(lambda_returns), reversed(gae))] def get_value(self, state): with torch.no_grad(): state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) value = self.critic.get_value(state) return value.cpu().item() def act(self, state): with torch.no_grad(): state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) action, pure_action, log_prob = self.actor.act(state) return action.cpu().numpy()[0], pure_action.cpu().numpy()[0], log_prob.cpu().item() def save(self): torch.save(self.actor, "agent.pkl")
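# The update() above yields per-minibatch losses instead of stepping an optimizer,
# so the optimization happens outside the class. A minimal sketch of one way to
# consume it, assuming a single Adam over agent.parameters(); the placeholder
# sizes and learning rate are assumptions, and the real project may combine losses
# from several agents that share layers before stepping.
import torch

agent = PPO(state_dim=28, action_dim=8, num_shared=2, device="cpu")  # placeholder sizes
optimizer = torch.optim.Adam(agent.parameters(), lr=3e-4)


def run_update(agent, optimizer, trajectories):
    for loss in agent.update(trajectories):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()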
class DDPG: def __init__(self, state_size, action_size, memory_size=int(1e5), # replay buffer size batch_size=128, # minibatch size gamma=0.99, # discount factor tau=1e-3, # for soft update of target parameters update_every=10, lr_actor=1e-4, lr_critic=1e-3, random_seed=2): self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.params = {"lr_actor": lr_actor, "lr_critic": lr_critic, "gamma": gamma, "tau": tau, "memory_size": memory_size, "batch_size": batch_size, "optimizer": "adam"} self.actor_local = Actor(state_size, action_size, seed=random_seed).to(device) self.actor_target = Actor(state_size, action_size, seed=random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) self.critic_local = Critic(state_size, action_size, seed=random_seed).to(device) self.critic_target = Critic(state_size, action_size, seed=random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic) self.memory = ReplayBuffer(action_size, memory_size, batch_size, random_seed) # Noise process self.noise = OUNoise(action_size, random_seed) self.learn_steps = 0 self.update_every = update_every def reset(self): self.noise.reset() def act(self, state, add_noise=True): # for single agent only state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.actor_local.eval() # must set to eval mode, since BatchNorm used with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action.squeeze(), -1, 1) def step(self, state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) if len(self.memory) > self.params["batch_size"]: experiences = self.memory.sample() self.learn(experiences, self.params["gamma"]) def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences # ------------------------------------------ # update critic # ------------------------------------------ # recall DQN # Q[s][a] = Q[s][a] + alpha * (r + gamma * np.max(Q[s_next]) - Q[s][a]) # thus, here # Q_local = Q[s][a] # = critic_local(s, a) # Q_target = r + gamma * np.max(Q[s_next]) # = r + gamma * (critic_target[s_next, actor_target(s_next)]) # # calculate np.max(Q[s_next]) with critic_target[s_next, actor_target(s_next)] # because actor suppose to output action which max Q(s) # # loss = mse(Q_local - Q_target) best_actions = self.actor_target(next_states) # supposed to be best actions, however Q_next_max = self.critic_target(next_states, best_actions) Q_target = rewards + gamma * Q_next_max * (1 - dones) # Q_target_detached = Q_target.detach() Q_local = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_local, Q_target) self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ------------------------------------------ # update critic # ------------------------------------------ # suppose critic(s,a) give us q_max as a baseline or guidance # we want actor(s) to output the right a # which let critic(s,a)->q_max happen # so we want find a_actor to max Q_critic(s, a) # a_actor is function of θ # so the gradient is dQ/da*da/dθ actions_pred = self.actor_local(states) Q_baseline = self.critic_local(states, actions_pred) actor_loss = -Q_baseline.mean() # I think this is a good trick to make loss to scalar # note, gradients from both actor_local and critic_local will be calculated # however we only update actor_local 
self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # if self.learn_steps % self.update_every == 0: self.soft_update(self.critic_local, self.critic_target, self.params["tau"]) self.soft_update(self.actor_local, self.actor_target, self.params["tau"]) self.learn_steps += 1 def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
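# A minimal driver for the single-agent DDPG class above with a classic gym-style
# continuous-control environment; the environment name and episode count are
# placeholders, not the author's setup.
import gym

env = gym.make("Pendulum-v1")
agent = DDPG(state_size=env.observation_space.shape[0],
             action_size=env.action_space.shape[0])

for episode in range(200):
    state = env.reset()
    agent.reset()                    # reset the OU noise process each episode
    episode_return = 0.0
    done = False
    while not done:
        action = agent.act(state, add_noise=True)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store and learn
        state = next_state
        episode_return += reward
    print(f"episode {episode}: return {episode_return:.1f}")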
class Agent:
    def __init__(self, env, hidden_size=256, actor_lr=1e-4, critic_lr=1e-3,
                 gamma=0.99, tau=1e-3, max_memory=int(1e6)):
        obs = env.reset()
        # Goal-conditioned state: 'observation' concatenated with 'desired_goal'
        self.num_states = obs['desired_goal'].shape[0] + obs['observation'].shape[0]
        self.num_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.action_max = env.action_space.high[0]

        self.actor = Actor(self.num_states, hidden_size, self.num_actions)
        self.critic = Critic(self.num_states + self.num_actions, hidden_size, 1)
        self.target_actor = Actor(self.num_states, hidden_size, self.num_actions)
        self.target_critic = Critic(self.num_states + self.num_actions, hidden_size, 1)

        # Start the target networks from the same weights as the main networks
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.experience_replay = ExperienceReplay(max_memory)
        self.critic_loss_func = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

    def get_action(self, state):
        state = Variable(torch.from_numpy(state).float().unsqueeze(0))
        action = self.actor.forward(state)
        action = action.detach().numpy()[0]
        return action

    def update(self, size):
        states, actions, rewards, next_states, _ = self.experience_replay.sample(size)
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)

        with torch.no_grad():
            next_actions = self.target_actor.forward(next_states)
            q_next = self.target_critic.forward(next_states, next_actions).detach()
            target_q = rewards.reshape((-1, 1)) + self.gamma * q_next
            target_q = target_q.detach()
            # Clip the bootstrapped target to the feasible return range
            c = 1 / (1 - self.gamma)
            target_q = torch.clamp(target_q, -c, 0)

        real_q = self.critic.forward(states, actions)
        dif = (target_q - real_q)
        critic_loss = dif.pow(2).mean()

        real_actions = self.actor.forward(states)
        actor_loss = -self.critic.forward(states, real_actions).mean()
        actor_loss += (real_actions / self.action_max).pow(2).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update target networks
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
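# Usage note for the goal-conditioned agent above: observations from goal-based
# environments (e.g. the gym robotics tasks) arrive as dicts, so the flat state
# fed to get_action() has to be assembled first. A sketch; the concatenation order
# (observation first, then desired_goal) is an assumption that must match how the
# agent is trained.
import numpy as np


def flatten_obs(obs):
    return np.concatenate([obs["observation"], obs["desired_goal"]])

# obs = env.reset()
# action = agent.get_action(flatten_obs(obs))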
class DDPG(): """ This is an Individual DDPG Agent """ def __init__(self, state_size, action_size, seed): """ Initialize a DDPG Agent Object :param state_size: dimension of state (input) for this decentralized actor :param action_size: dimension of action (output) for this decentralized actor :param random_seed: random seed """ self.state_size = state_size self.action_size = action_size self.seed = seed self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Hyperparameters self.buffer_size = 100000 self.batch_size = 256 self.gamma = 0.99 self.tau = 0.01 self.lr_actor = 0.0001 self.lr_critic = 0.001 # Setup Networks (Actor: State -> Action, Critic: (States for all agents, Actions for all agents) -> Value) self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(self.device) self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.lr_actor) self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(self.device) self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.lr_critic) # Initialize local and taret networks to start with same parameters self.soft_update(self.actor_local, self.actor_target, tau=1) self.soft_update(self.critic_local, self.critic_target, tau=1) # Noise Setup self.noise = OUNoise(self.action_size, self.seed) # Replay Buffer Setup self.memory = ReplayBuffer(self.buffer_size, self.batch_size) def __str__(self): return "DDPG_Agent" def reset_noise(self): """ resets to noise parameters """ self.noise.reset() def act(self, state, epsilon, add_noise=True): """ Returns actions for given states as per current policy. Policy comes from the actor network. 
:param state: observations for this individual agent :param epsilon: probability of exploration :param add_noise: bool on whether or not to potentially have exploration for action :return: clipped actions """ state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise and epsilon > np.random.random(): actions += self.noise.sample() return np.clip(actions, -1,1) def step(self): if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): """ Update actor and critic networks using a given batch of experiences Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(states) -> actions critic_target(states, actions) -> Q-value :param experiences: tuple of arrays (states, actions, rewards, next_states, dones) sampled from the replay buffer """ states, actions, rewards, next_states, dones = experiences # -------------------- Update Critic -------------------- # # Use target networks for getting next actions and q values and calculate q_targets next_actions = self.actor_target(next_states) next_q_targets = self.critic_target(next_states, next_actions) q_targets = rewards + (self.gamma * next_q_targets * (1 - dones)) # Compute critic loss (Same as DQN Loss) q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(q_expected, q_targets) # Minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # -------------------- Update Actor --------------------- # # Computer actor loss (maximize mean of Q(states,actions)) action_preds = self.actor_local(states) # Optimizer minimizes and we want to maximize so multiply by -1 actor_loss = -1 * self.critic_local(states, action_preds).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ---------------- Update Target Networks ---------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_network, target_network, tau): """ soft update newtwork parametes θ_target = τ*θ_local + (1 - τ)*θ_target :param local_network: PyTorch Network that is always up to date :param target_network: PyTorch Network that is not up to date :param tau: update (interpolation) parameter """ for target_param, local_param in zip(target_network.parameters(), local_network.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class A2C(): def __init__(self, state_dim, action_dim, action_lim, update_type='soft', lr_actor=1e-4, lr_critic=1e-3, tau=1e-3, mem_size=1e6, batch_size=256, gamma=0.99, other_cars=False, ego_dim=None): self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.joint_model = False if len(state_dim) == 3: self.model = ActorCriticCNN(state_dim, action_dim, action_lim) self.model_optim = optim.Adam(self.model.parameters(), lr=lr_actor) self.target_model = ActorCriticCNN(state_dim, action_dim, action_lim) self.target_model.load_state_dict(self.model.state_dict()) self.model.to(self.device) self.target_model.to(self.device) self.joint_model = True else: self.actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim) self.actor_optim = optim.Adam(self.actor.parameters(), lr=lr_actor) self.target_actor = Actor(state_dim, action_dim, action_lim, other_cars=other_cars, ego_dim=ego_dim) self.target_actor.load_state_dict(self.actor.state_dict()) self.target_actor.eval() self.critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim) self.critic_optim = optim.Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1e-2) self.target_critic = Critic(state_dim, action_dim, other_cars=other_cars, ego_dim=ego_dim) self.target_critic.load_state_dict(self.critic.state_dict()) self.target_critic.eval() self.actor.to(self.device) self.target_actor.to(self.device) self.critic.to(self.device) self.target_critic.to(self.device) self.action_lim = action_lim self.tau = tau # hard update if tau is None self.update_type = update_type self.batch_size = batch_size self.gamma = gamma if self.joint_model: mem_size = mem_size//100 self.memory = Memory(int(mem_size), action_dim, state_dim) mu = np.zeros(action_dim) sigma = np.array([0.5, 0.05]) self.noise = OrnsteinUhlenbeckActionNoise(mu, sigma) self.target_noise = OrnsteinUhlenbeckActionNoise(mu, sigma) self.initialised = True self.training = False def select_action(self, obs): with torch.no_grad(): obs = torch.FloatTensor(np.expand_dims(obs, axis=0)).to(self.device) if self.joint_model: action, _ = self.model(obs) action = action.data.cpu().numpy().flatten() else: action = self.actor(obs).data.cpu().numpy().flatten() if self.training: action += self.noise() return action else: return action def append(self, obs0, action, reward, obs1, terminal1): self.memory.append(obs0, action, reward, obs1, terminal1) def reset_noise(self): self.noise.reset() self.target_noise.reset() def train(self): if self.joint_model: self.model.train() self.target_model.train() else: self.actor.train() self.target_actor.train() self.critic.train() self.target_critic.train() self.training = True def eval(self): if self.joint_model: self.model.eval() self.target_model.eval() else: self.actor.eval() self.target_actor.eval() self.critic.eval() self.target_critic.eval() self.training = False def save(self, folder, episode, previous=None, solved=False): filename = lambda type, ep : folder + '%s' % type + \ (not solved) * ('_ep%d' % (ep)) + \ (solved * '_solved') + '.pth' if self.joint_model: torch.save(self.model.state_dict(), filename('model', episode)) torch.save(self.target_model.state_dict(), filename('target_model', episode)) else: torch.save(self.actor.state_dict(), filename('actor', episode)) torch.save(self.target_actor.state_dict(), filename('target_actor', episode)) torch.save(self.critic.state_dict(), filename('critic', episode)) torch.save(self.target_critic.state_dict(), filename('target_critic', episode)) if 
previous is not None and previous > 0: if self.joint_model: os.remove(filename('model', previous)) os.remove(filename('target_model', previous)) else: os.remove(filename('actor', previous)) os.remove(filename('target_actor', previous)) os.remove(filename('critic', previous)) os.remove(filename('target_critic', previous)) def load_actor(self, actor_filepath): qualifier = '_' + actor_filepath.split("_")[-1] folder = actor_filepath[:actor_filepath.rfind("/")+1] filename = lambda type : folder + '%s' % type + qualifier if self.joint_model: self.model.load_state_dict(torch.load(filename('model'), map_location=self.device)) self.target_model.load_state_dict(torch.load(filename('target_model'), map_location=self.device)) else: self.actor.load_state_dict(torch.load(filename('actor'), map_location=self.device)) self.target_actor.load_state_dict(torch.load(filename('target_actor'), map_location=self.device)) def load_all(self, actor_filepath): self.load_actor(actor_filepath) qualifier = '_' + actor_filepath.split("_")[-1] folder = actor_filepath[:actor_filepath.rfind("/")+1] filename = lambda type : folder + '%s' % type + qualifier if not self.joint_model: self.critic.load_state_dict(torch.load(filename('critic'), map_location=self.device)) self.target_critic.load_state_dict(torch.load(filename('target_critic'), map_location=self.device)) def update(self, target_noise=True): try: minibatch = self.memory.sample(self.batch_size) # dict of ndarrays except ValueError as e: print('Replay memory not big enough. Continue.') return None, None states = Variable(torch.FloatTensor(minibatch['obs0'])).to(self.device) actions = Variable(torch.FloatTensor(minibatch['actions'])).to(self.device) rewards = Variable(torch.FloatTensor(minibatch['rewards'])).to(self.device) next_states = Variable(torch.FloatTensor(minibatch['obs1'])).to(self.device) terminals = Variable(torch.FloatTensor(minibatch['terminals1'])).to(self.device) if self.joint_model: target_actions, _ = self.target_model(next_states) if target_noise: for sample in range(target_actions.shape[0]): target_actions[sample] += self.target_noise() target_actions[sample].clamp(-self.action_lim, self.action_lim) _, target_qvals = self.target_model(next_states, target_actions=target_actions) y = rewards + self.gamma * (1 - terminals) * target_qvals _, model_qvals = self.model(states, target_actions=actions) value_loss = F.mse_loss(y, model_qvals) model_actions, _ = self.model(states) _, model_qvals = self.model(states, target_actions=model_actions) action_loss = -model_qvals.mean() self.model_optim.zero_grad() (value_loss + action_loss).backward() self.model_optim.step() else: target_actions = self.target_actor(next_states) if target_noise: for sample in range(target_actions.shape[0]): target_actions[sample] += self.target_noise() target_actions[sample].clamp(-self.action_lim, self.action_lim) target_critic_qvals = self.target_critic(next_states, target_actions) y = rewards + self.gamma * (1 - terminals) * target_critic_qvals # optimise critic critic_qvals = self.critic(states, actions) value_loss = F.mse_loss(y, critic_qvals) self.critic_optim.zero_grad() value_loss.backward() self.critic_optim.step() # optimise actor action_loss = -self.critic(states, self.actor(states)).mean() self.actor_optim.zero_grad() action_loss.backward() self.actor_optim.step() # optimise target networks if self.update_type == 'soft': if self.joint_model: soft_update(self.target_model, self.model, self.tau) else: soft_update(self.target_actor, self.actor, self.tau) 
soft_update(self.target_critic, self.critic, self.tau) else: if self.joint_model: hard_update(self.target_model, self.model) else: hard_update(self.target_actor, self.actor) hard_update(self.target_critic, self.critic) return action_loss.item(), value_loss.item()
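# The update step above calls free functions soft_update / hard_update that are not
# shown in the snippet. These are the conventional definitions, written here as a
# sketch; the argument order (target first, source second) is taken from the calls above.
def soft_update(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


def hard_update(target, source):
    """Copy the source parameters into the target network."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)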
class DDPG(): def __init__(self, env, log_dir, gamma=0.99, batch_size=64, sigma=0.2, batch_norm=True, merge_layer=2, buffer_size=int(1e6), buffer_min=int(1e4), tau=1e-3, Q_wd=1e-2, num_episodes=1000): self.s_dim = env.reset().shape[0] self.a_dim = env.action_space.shape[0] self.env = env self.mu = Actor(self.s_dim, self.a_dim, env.action_space, batch_norm=batch_norm) self.Q = Critic(self.s_dim, self.a_dim, batch_norm=batch_norm, merge_layer=merge_layer) self.targ_mu = copy.deepcopy(self.mu).eval() self.targ_Q = copy.deepcopy(self.Q).eval() self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim), sigma=sigma * torch.ones(self.a_dim)) self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim) self.buffer_min = buffer_min self.mse_fn = torch.nn.MSELoss() self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4) self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=1e-3, weight_decay=Q_wd) self.gamma = gamma self.batch_size = batch_size self.num_episodes = num_episodes self.tau = tau self.log_dir = log_dir self.fill_buffer() #updates the target network to slowly track the main network def track_network(self, target, main): with torch.no_grad(): for pt, pm in zip(target.parameters(), main.parameters()): pt.data.copy_(self.tau * pm.data + (1 - self.tau) * pt.data) # updates the target nets to slowly track the main ones def track_networks(self): self.track_network(self.targ_mu, self.mu) self.track_network(self.targ_Q, self.Q) def run_episode(self): done = False s = torch.tensor(self.env.reset().astype(np.float32), requires_grad=False) t = 0 tot_r = 0 while not done: self.mu = self.mu.eval() a = torch.squeeze(self.mu(s)).detach().numpy() self.mu = self.mu.train() ac_noise = self.noise().detach().numpy() a = a + ac_noise s = s.detach().numpy() s_p, r, done, _ = self.env.step(a) tot_r += r self.buffer.add_tuple(s, a, r, s_p, done) s_batch, a_batch, r_batch, s_p_batch, done_batch = self.buffer.sample( batch_size=self.batch_size) # update critic with torch.no_grad(): q_p_pred = self.targ_Q(s_p_batch, self.targ_mu(s_p_batch)) q_p_pred = torch.squeeze(q_p_pred) y = r_batch + (1.0 - done_batch) * self.gamma * q_p_pred self.Q_optimizer.zero_grad() q_pred = self.Q(s_batch, a_batch) q_pred = torch.squeeze(q_pred) #print(torch.mean(q_pred)) Q_loss = self.mse_fn(q_pred, y) Q_loss.backward(retain_graph=False) self.Q_optimizer.step() # update actor self.mu_optimizer.zero_grad() q_pred_mu = self.Q(s_batch, self.mu(s_batch)) q_pred_mu = torch.squeeze(q_pred_mu) #print(torch.mean(q_pred_mu)) mu_loss = -torch.mean(q_pred_mu) # print(mu_loss) mu_loss.backward(retain_graph=False) #print(torch.sum(self.mu.layers[0].weight.grad)) self.mu_optimizer.step() self.track_networks() s = torch.tensor(s_p.astype(np.float32), requires_grad=False) t += 1 return tot_r, t def train(self): results = [] for i in range(self.num_episodes): r, t = self.run_episode() print('{} reward: {:.2f}, length: {}'.format(i, r, t)) results.append([r, t]) if i % 20 == 0: torch.save(self.mu, self.log_dir + '/models/model_' + str(i)) np.save(self.log_dir + '/results_train.npy', np.array(results)) def train1(self): results = [] for i in range(self.num_episodes): r, t = self.run_episode() print('{} reward: {:.2f}, length: {}'.format(i, r, t)) results.append([r, t]) if i % 20 == 0: torch.save(self.mu, self.log_dir + '/models1/model_' + str(i)) np.save(self.log_dir + '/results_train1.npy', np.array(results)) def train2(self): results = [] for i in range(self.num_episodes): r, t = self.run_episode() print('{} reward: {:.2f}, length: 
{}'.format(i, r, t)) results.append([r, t]) if i % 20 == 0: torch.save(self.mu, self.log_dir + '/models2/model_' + str(i)) np.save(self.log_dir + '/results_train2.npy', np.array(results)) def train3(self): results = [] for i in range(self.num_episodes): r, t = self.run_episode() print('{} reward: {:.2f}, length: {}'.format(i, r, t)) results.append([r, t]) if i % 20 == 0: torch.save(self.mu, self.log_dir + '/models3/model_' + str(i)) np.save(self.log_dir + '/results_train3.npy', np.array(results)) def eval_all(self, model_dir, num_eps=5): results = [] for model_fname in sorted(os.listdir(model_dir), key=lambda x: int(x.split('_')[1])): print(model_fname) mu = torch.load(os.path.join(model_dir, model_fname)) r, t = self.eval(num_eps=num_eps, mu=mu) results.append([r, t]) np.save(self.log_dir + '/results_eval.npy', np.array(results)) def eval(self, num_eps=10, mu=None): if mu == None: mu = self.mu results = [] mu = mu.eval() for i in range(num_eps): r, t = self.run_eval_episode(mu=mu) results.append([r, t]) print('{} reward: {:.2f}, length: {}'.format(i, r, t)) return np.mean(results, axis=0) def run_eval_episode(self, mu=None): if mu == None: mu = self.mu done = False s = torch.tensor(self.env.reset().astype(np.float32), requires_grad=False) tot_r = t = 0 while not done: a = mu(s).view(-1).detach().numpy() s_p, r, done, _ = self.env.step(a) tot_r += r t += 1 s = torch.tensor(s_p.astype(np.float32), requires_grad=False) return tot_r, t def fill_buffer(self): print('Filling buffer') s = torch.tensor(self.env.reset().astype(np.float32), requires_grad=False) while self.buffer.size < self.buffer_min: a = np.random.uniform(self.env.action_space.low, self.env.action_space.high, size=(self.a_dim)) s_p, r, done, _ = self.env.step(a) if done: self.env.reset() self.buffer.add_tuple(s, a, r, s_p, done) s = s_p
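# Sketch of the OrnsteinUhlenbeck noise process the DDPG class above constructs from
# torch tensors (mu, sigma) and calls as self.noise(); theta and dt are assumed
# values, not taken from the original project.
import torch


class OrnsteinUhlenbeck:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.reset()

    def reset(self):
        self.x_prev = torch.zeros_like(self.mu)

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * (self.dt ** 0.5) * torch.randn_like(self.mu))
        self.x_prev = x
        return x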
class PPO(object): def __init__(self, args, env): self.learning_rate = args.learning_rate self.gamma = args.gamma self.lamb = args.lamb self.batch_size = args.batch_size self.step = 0 self.epochs = args.epochs self.actor = Actor() self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.learning_rate) self.critic = Critic() self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.learning_rate) self.env = env self.num_actions = env.num_actions self.num_states = env.num_states self.data = {'step' : [], 'reward' : [], 'losses' : []} def train(self): with torch.no_grad(): #no-grad makes computation faster batch = {'s' : [], 'a' : [], 'r' : [], 'w' : [], 'V_target' : [], 'pi' : []} for i in range(self.batch_size): traj = {'s' : [], 'a' : [], 'r' : [], 'V' : [], 'pi' : []} s = self.env.reset() done = False while done == False: (mu, std) = self.actor(torch.from_numpy(s)) dist = torch.distributions.normal.Normal(mu, std) a = dist.sample().numpy() s1, r, done = self.env.step(a) V = self.critic(torch.from_numpy(s)).item() traj['s'].append(s) traj['a'].append(a) traj['r'].append(r) traj['V'].append(V) traj['pi'].append(dist.log_prob(torch.tensor(a))) s = s1 traj_len = len(traj['r']) r = np.append(traj['r'], 0.) V = np.append(traj['V'], 0.) delta = r[:-1] + (self.gamma * V[1:]) - V[:-1] A = delta.copy() for t in reversed(range(traj_len - 1)): A[t] = A[t] + (self.gamma * self.lamb * A[t + 1]) for t in reversed(range(traj_len)): V[t] = r[t] + (self.gamma * V[t + 1]) V = V[:-1] batch['s'].extend(traj['s']) batch['a'].extend(traj['a']) batch['r'].extend(traj['r']) batch['w'].extend(A) batch['V_target'].extend(V) batch['pi'].extend(traj['pi']) batch['num_steps'] = len(batch['r']) batch['s'] = torch.tensor(batch['s'], requires_grad=False, dtype=torch.double) batch['a'] = torch.tensor(batch['a'], requires_grad=False, dtype=torch.double) batch['r'] = torch.tensor(batch['r'], requires_grad=False, dtype=torch.double) batch['w'] = torch.tensor(batch['w'], requires_grad=False, dtype=torch.double) batch['V_target'] = torch.tensor(batch['V_target'], requires_grad=False, dtype=torch.double) batch['pi'] = torch.tensor(batch['pi'], requires_grad=False, dtype=torch.double) with torch.no_grad(): N = batch['r'].shape[0] / self.batch_size #optimize Actor network for actor_epoch in range(10): self.actor_optimizer.zero_grad() (mu, std) = self.actor(batch['s']) dist = torch.distributions.normal.Normal(mu, std) pi = dist.log_prob(batch['a']).sum(axis=-1) ratio = torch.exp(pi - batch['pi']) surrogate = ratio * batch['w'] clipped = torch.clamp(ratio, min= 1 - 0.2, max = 1 + 0.2) * batch['w'] loss = - torch.mean((torch.min(surrogate, clipped))) loss.backward() self.actor_optimizer.step() #optimize Critic network for critic_epoch in range(10): self.critic_optimizer.zero_grad() V = self.critic(batch['s']) loss = nn.MSELoss()(V.squeeze(1), batch['V_target']) loss.backward() self.critic_optimizer.step() self.data['losses'].append(loss.item()) #logging self.step += batch['r'].shape[0] self.data['step'].append(self.step) self.data['reward'].append(batch['r'].mean() * N) def save_model(self): if not os.path.exists('./model_save/'): os.makedirs('./model_save/') save_dir = './model_save/' + str(self.epochs) + '_epochs.pt' torch.save({'data': self.data, 'actor': self.actor, 'actor_optim': self.actor_optimizer, 'critic_optim': self.critic_optimizer, 'critic': self.critic}, save_dir) def generate_results(self): #LOAD MODEL if not os.path.isdir('./results/'): os.makedirs('./results/') model_steps = 
np.array(self.data['step']) model_rewards = np.array(self.data['reward']) model_losses = np.array(self.data['losses']) #LEARNING CURVES print('Generating Learning Curves...') fig = plt.figure() plt.plot(model_steps, model_rewards) plt.xlabel('Simulation Steps') plt.ylabel('Total Reward') plt.title('Actor Learning Curve') plt.savefig('./results/Actor_Learning_Curve_' + str(self.epochs) + '_Epochs.png') fig = plt.figure() plt.plot(model_steps, model_losses) plt.xlabel('Simulation Steps') plt.ylabel('Critic Loss') plt.title('Critic Learning Curve') plt.savefig('./results/Critic_Learning_Curve_' + str(self.epochs) + '_Epochs.png') #EXAMPLE TRAJECTORY print('Generating Example Trajectory...') s = self.env.reset() # Create dict to store data from simulation data = { 't': [0], 's': [s], 'a': [], 'r': [], } # Simulate until episode is done done = False while not done: (mu, std) = self.actor(torch.from_numpy(s)) dist = torch.distributions.normal.Normal(mu, std) a = dist.sample().numpy() s, r, done = self.env.step(a) data['t'].append(data['t'][-1] + 1) data['s'].append(s) data['a'].append(a) data['r'].append(r) # Parse data from simulation data['s'] = np.array(data['s']) theta = data['s'][:, 0] thetadot = data['s'][:, 1] # Plot data and save to png file fig = plt.figure() plt.plot(data['t'], theta, label='theta') plt.plot(data['t'], thetadot, label='thetadot') plt.legend() plt.savefig('./results/Example_Trajectory_' + str(self.epochs) + '_Epochs.png' ) #ANIMATED TRAJECTORY print('Generating Animated Tragjectory...') filename='./results/Animated_Trajectory_' + str(self.epochs) + '_Epochs.gif' writer='imagemagick' s = self.env.reset() s_traj = [s] done = False while not done: (mu, std) = self.actor(torch.from_numpy(s)) dist = torch.distributions.normal.Normal(mu, std) a = dist.sample().numpy() s, r, done = self.env.step(a) s_traj.append(s) fig = plt.figure(figsize=(5, 4)) ax = fig.add_subplot(111, autoscale_on=False, xlim=(-1.2, 1.2), ylim=(-1.2, 1.2)) ax.set_aspect('equal') ax.grid() line, = ax.plot([], [], 'o-', lw=2) text = ax.set_title('') def animate(i): theta = s_traj[i][0] line.set_data([0, -np.sin(theta)], [0, np.cos(theta)]) text.set_text(f'time = {i * self.env.dt:3.1f}') return line, text anim = animation.FuncAnimation(fig, animate, len(s_traj), interval=(1000 * self.env.dt), blit=True, repeat=False) anim.save(filename, writer=writer, fps=10) plt.close() #POLICY VISUALIZATION print('Generating Policy Visualization...') theta_range = np.linspace(-np.pi, np.pi, 200) theta_dot_range = np.linspace(-self.env.max_thetadot_for_init, self.env.max_thetadot_for_init, 200) policy = np.zeros((len(theta_range), len(theta_dot_range))) for i in range(len(theta_range)): for j in range(len(theta_dot_range)): state = torch.tensor([theta_range[i], theta_dot_range[j]], dtype=torch.float64) (mu, std) = self.actor(state) dist = torch.distributions.normal.Normal(mu, std) a = dist.sample().numpy() policy[i][j] = a fig = plt.figure() plt.imshow(policy, cmap='coolwarm') plt.xlabel('theta dot') plt.ylabel('theta') plt.colorbar() plt.title('Policy Visualization') fig.savefig('./results/Policy_Visualization_' + str(self.epochs) + '_Epochs.png') fig.clf() #VALUE FUNCTION VISUALIZATION print('Generating Value Function Visualization...') theta_range = np.linspace(-np.pi, np.pi, 200) theta_dot_range = np.linspace(-self.env.max_thetadot_for_init, self.env.max_thetadot_for_init, 200) value = np.zeros((len(theta_range), len(theta_dot_range))) for i in range(len(theta_range)): for j in range(len(theta_dot_range)): state = 
torch.tensor([theta_range[i], theta_dot_range[j]], dtype=torch.float64) V = self.critic(state).item() value[len(theta_range)-1-i][j] = V fig = plt.figure() plt.imshow(value, cmap='coolwarm') plt.xlabel('theta dot') plt.ylabel('theta') plt.colorbar() plt.title('Value Function Visualization') fig.savefig('./results/Value_Visualization_' + str(self.epochs) + '_Epochs.png') fig.clf() print('done')
class A2C(): """ Advantage Actor-Critic RL agent. Notes ----- * GPU implementation is still work in progress. * Always uses 2 separate networks for the critic,one that learns from new experience (student/critic) and the other one (critic_target/teacher)that is more conservative and whose weights are updated through an exponential moving average of the weights of the critic, i.e. target.params = (1-tau)*target.params + tau* critic.params * In the case of Monte Carlo estimation the critic_target is never used * Possible to use twin networks for the critic and the critic target for improved stability. Critic target is used for updates of both the actor and the critic and its output is the minimum between the predictions of its two internal networks. """ def __init__(self, observation_space, action_space, lr, gamma, TD=True, discrete=False, project_dim=8, hiddens=[64, 32], twin=False, tau=1., n_steps=1, device='cpu', debug=False): """ Parameters ---------- observation_space: int Number of flattened entries of the state action_space: int Number of (discrete) possible actions to take lr: float in [0,1] Learning rate gamma: float in [0,1] Discount factor TD: bool (default=True) If True, uses Temporal Difference for the critic's estimates Otherwise uses Monte Carlo estimation discrete: bool (default=False) If True, adds an embedding layer both in the actor and the critic networks before processing the state. Should be used if the state is a simple integer in [0, observation_space -1] project_dim: int (default=8) Number of dimensions of the embedding space (e.g. number of dimensions of embedding(state) ). Higher dimensions are more expressive. hiddens: list of int (default = [64,32]) List containing the number of neurons of each linear hidden layer. Same architecture is considered for the actor and the critic, except from the output layer, than in one case has the dimension of the action space and a LogSoftmax activation, in the other outputs a scalar (state value) twin: bool (default=False) Enables twin networks both for critic and critic_target tau: float in [0,1] (default = 1.) Regulates how fast the critic_target gets updates, i.e. what percentage of the weights inherits from the critic. If tau=1., critic and critic_target are identical at every step, if tau=0. critic_target is unchangable. As a default this feature is disabled setting tau = 1, but if one wants to use it a good empirical value is 0.005. n_steps: int (default=1) Number of steps considered in TD update device: str in {'cpu','cuda'} (default='cpu') Implemented, but GPU slower than CPU because it's difficult to optimize a RL agent without a replay buffer, that can be used only in off-policy algorithms. 
""" self.gamma = gamma self.lr = lr self.n_actions = action_space self.discrete = discrete self.TD = TD self.twin = twin self.tau = tau self.n_steps = n_steps self.actor = Actor(observation_space, action_space, discrete, project_dim, hiddens=hiddens) self.critic = Critic(observation_space, discrete, project_dim, twin, hiddens=hiddens) if self.TD: self.critic_trg = Critic(observation_space, discrete, project_dim, twin, target=True, hiddens=hiddens) # Init critic target identical to critic for trg_params, params in zip(self.critic_trg.parameters(), self.critic.parameters()): trg_params.data.copy_(params.data) self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=lr) self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr) self.device = device self.actor.to(self.device) self.critic.to(self.device) if self.TD: self.critic_trg.to(self.device) if debug: print("=" * 10 + " A2C HyperParameters " + "=" * 10) print("Discount factor: ", self.gamma) print("Learning rate: ", self.lr) print("Action space: ", self.n_actions) print("Discrete state space: ", self.discrete) print("Temporal Difference learning: ", self.TD) if self.TD: print("Number of TD steps: ", self.n_steps) print("Twin networks: ", self.twin) print("Update critic target factor: ", self.tau) print("Device used: ", self.device) print("\n\n" + "=" * 10 + " A2C Architecture " + "=" * 10) print("Actor architecture: \n", self.actor) print("Critic architecture: \n", self.critic) print("Critic target architecture: ") if self.TD: print(self.critic_trg) else: print("Not used") def get_action(self, state, return_log=False): log_probs = self.forward(state) dist = torch.exp(log_probs) probs = Categorical(dist) action = probs.sample().item() if return_log: return action, log_probs.view(-1)[action] else: return action def forward(self, state): """ Makes a tensor out of a numpy array state and then forward it with the actor network. 
Parameters ---------- state: If self.discrete is True state.shape = (episode_len,) Otherwise state.shape = (episode_len, observation_space) """ if self.discrete: state = torch.from_numpy(state).to(self.device) else: state = torch.from_numpy(state).float().unsqueeze(0).to( self.device) log_probs = self.actor(state) return log_probs def update(self, *args): if self.TD: critic_loss, actor_loss = self.update_TD(*args) else: critic_loss, actor_loss = self.update_MC(*args) return critic_loss, actor_loss def update_TD(self, rewards, log_probs, states, done, bootstrap=None): ### Compute n-steps rewards, states, discount factors and done mask ### n_step_rewards = self.compute_n_step_rewards(rewards) if debug: print("n_step_rewards.shape: ", n_step_rewards.shape) print("rewards.shape: ", rewards.shape) print("n_step_rewards: ", n_step_rewards) print("rewards: ", rewards) if bootstrap is not None: done[bootstrap] = False if debug: print("done.shape: (before n_steps)", done.shape) print("done: (before n_steps)", done) if self.discrete: old_states = torch.tensor(states[:-1]).to(self.device) new_states, Gamma_V, done = self.compute_n_step_states( states, done) new_states = torch.tensor(new_states).to(self.device) else: old_states = torch.tensor(states[:, :-1]).float().to(self.device) new_states, Gamma_V, done = self.compute_n_step_states( states[0], done) new_states = torch.tensor(new_states).float().unsqueeze(0).to( self.device) if debug: print("done.shape: (after n_steps)", done.shape) print("Gamma_V.shape: ", Gamma_V.shape) print("done: (after n_steps)", done) print("Gamma_V: ", Gamma_V) print("old_states.shape: ", old_states.shape) print("new_states.shape: ", new_states.shape) ### Wrap variables into tensors ### done = torch.LongTensor(done.astype(int)).to(self.device) log_probs = torch.stack(log_probs).to(self.device) n_step_rewards = torch.tensor(n_step_rewards).float().to(self.device) Gamma_V = torch.tensor(Gamma_V).float().to(self.device) ### Update critic and then actor ### critic_loss = self.update_critic_TD(n_step_rewards, new_states, old_states, done, Gamma_V) actor_loss = self.update_actor_TD(n_step_rewards, log_probs, new_states, old_states, done, Gamma_V) return critic_loss, actor_loss def update_critic_TD(self, n_step_rewards, new_states, old_states, done, Gamma_V): # Compute loss with torch.no_grad(): V_trg = self.critic_trg(new_states).squeeze() if debug: print("V_trg.shape (after critic): ", V_trg.shape) V_trg = (1 - done) * Gamma_V * V_trg + n_step_rewards if debug: print("V_trg.shape (after sum): ", V_trg.shape) V_trg = V_trg.squeeze() if debug: print("V_trg.shape (after squeeze): ", V_trg.shape) if self.twin: V1, V2 = self.critic(old_states) loss1 = 0.5 * F.mse_loss(V1.squeeze(), V_trg) loss2 = 0.5 * F.mse_loss(V2.squeeze(), V_trg) loss = loss1 + loss2 else: V = self.critic(old_states).squeeze() loss = F.mse_loss(V, V_trg) # Backpropagate and update self.critic_optim.zero_grad() loss.backward() self.critic_optim.step() # Update critic_target: (1-tau)*old + tau*new for trg_params, params in zip(self.critic_trg.parameters(), self.critic.parameters()): trg_params.data.copy_((1. 
- self.tau) * trg_params.data + self.tau * params.data) return loss.item() def update_actor_TD(self, n_step_rewards, log_probs, new_states, old_states, done, Gamma_V): # Compute gradient if self.twin: V1, V2 = self.critic(old_states) V_pred = torch.min(V1.squeeze(), V2.squeeze()) V1_new, V2_new = self.critic(new_states) V_new = torch.min(V1_new.squeeze(), V2_new.squeeze()) V_trg = (1 - done) * Gamma_V * V_new + n_step_rewards else: V_pred = self.critic(old_states).squeeze() V_trg = (1 - done) * Gamma_V * self.critic( new_states).squeeze() + n_step_rewards A = V_trg - V_pred policy_gradient = -log_probs * A if debug: print("V_trg.shape: ", V_trg.shape) print("V_pred.shape: ", V_pred.shape) print("A.shape: ", A.shape) print("policy_gradient.shape: ", policy_gradient.shape) policy_grad = torch.sum(policy_gradient) # Backpropagate and update self.actor_optim.zero_grad() policy_grad.backward() self.actor_optim.step() return policy_grad.item() def compute_n_step_rewards(self, rewards): """ Computes n-steps discounted reward padding with zeros the last elements of the trajectory. This means that the rewards considered are AT MOST n, but can be less for the last n-1 elements. """ T = len(rewards) # concatenate n_steps zeros to the rewards -> they do not change the cumsum r = np.concatenate((rewards, [0 for _ in range(self.n_steps)])) Gamma = np.array([self.gamma**i for i in range(r.shape[0])]) # reverse everything to use cumsum in right order, then reverse again Gt = np.cumsum(r[::-1] * Gamma[::-1])[::-1] G_nstep = Gt[:T] - Gt[ self.n_steps:] # compute n-steps discounted return Gamma = Gamma[:T] assert len( G_nstep) == T, "Something went wrong computing n-steps reward" n_steps_r = G_nstep / Gamma return n_steps_r def compute_n_step_states(self, states, done): """ Computes n-steps target states (to be used by the critic as target values together with the n-steps discounted reward). For last n-1 elements the target state is the last one available. Adjusts also the `done` mask used for disabling the bootstrapping in the case of terminal states and returns Gamma_V, that are the discount factors for the target state-values, since they are n-steps away (except for the last n-1 states, whose discount is adjusted accordingly). 
Return ------ new_states, Gamma_V, done: arrays with first dimension = len(states)-1 """ # Compute indexes for (at most) n-step away states n_step_idx = np.arange(len(states) - 1) + self.n_steps diff = n_step_idx - len(states) + 1 mask = (diff > 0) n_step_idx[mask] = len(states) - 1 # Compute new states new_states = states[n_step_idx] # Compute discount factors pw = np.array([self.n_steps for _ in range(len(new_states))]) pw[mask] = self.n_steps - diff[mask] Gamma_V = self.gamma**pw # Adjust done mask mask = (diff >= 0) done[mask] = done[-1] return new_states, Gamma_V, done def update_MC(self, rewards, log_probs, states, done, bootstrap=None): ### Compute MC discounted returns ### if bootstrap is not None: if bootstrap[-1] == True: last_state = torch.tensor(states[0, -1, :]).float().to( self.device).view(1, -1) if self.twin: V1, V2 = self.critic(last_state) V_bootstrap = torch.min(V1, V2).cpu().detach().numpy().reshape( 1, ) else: V_bootstrap = self.critic( last_state).cpu().detach().numpy().reshape(1, ) rewards = np.concatenate((rewards, V_bootstrap)) Gamma = np.array([self.gamma**i for i in range(rewards.shape[0])]) # reverse everything to use cumsum in right order, then reverse again Gt = np.cumsum(rewards[::-1] * Gamma[::-1])[::-1] # Rescale so that present reward is never discounted discounted_rewards = Gt / Gamma if bootstrap is not None: if bootstrap[-1] == True: discounted_rewards = discounted_rewards[:-1] # drop last ### Wrap variables into tensors ### dr = torch.tensor(discounted_rewards).float().to(self.device) if self.discrete: old_states = torch.tensor(states[:-1]).to(self.device) new_states = torch.tensor(states[1:]).to(self.device) else: old_states = torch.tensor(states[:, :-1]).float().to(self.device) new_states = torch.tensor(states[:, 1:]).float().to(self.device) done = torch.LongTensor(done.astype(int)).to(self.device) log_probs = torch.stack(log_probs).to(self.device) ### Update critic and then actor ### critic_loss = self.update_critic_MC(dr, old_states) actor_loss = self.update_actor_MC(dr, log_probs, old_states) return critic_loss, actor_loss def update_critic_MC(self, dr, old_states): # Compute loss if self.twin: V1, V2 = self.critic(old_states) V_pred = torch.min(V1.squeeze(), V2.squeeze()) else: V_pred = self.critic(old_states).squeeze() loss = F.mse_loss(V_pred, dr) # Backpropagate and update self.critic_optim.zero_grad() loss.backward() self.critic_optim.step() return loss.item() def update_actor_MC(self, dr, log_probs, old_states): # Compute gradient if self.twin: V1, V2 = self.critic(old_states) V_pred = torch.min(V1.squeeze(), V2.squeeze()) else: V_pred = self.critic(old_states).squeeze() A = dr - V_pred policy_gradient = -log_probs * A policy_grad = torch.sum(policy_gradient) # Backpropagate and update self.actor_optim.zero_grad() policy_grad.backward() self.actor_optim.step() return policy_grad.item()
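# --- Illustrative aside -----------------------------------------------------------------
# Stand-alone check that the reversed-cumsum trick used in compute_n_step_rewards above
# equals a naive n-step discounted sum. The reward values are made up for the example.
import numpy as np

gamma, n_steps = 0.99, 3
rewards = np.array([1.0, 0.0, -1.0, 2.0, 0.5])
T = len(rewards)

# cumsum trick (same idea as compute_n_step_rewards)
r = np.concatenate((rewards, np.zeros(n_steps)))        # zero padding does not change the cumsum
Gamma = gamma ** np.arange(len(r))
Gt = np.cumsum((r * Gamma)[::-1])[::-1]
n_step_fast = (Gt[:T] - Gt[n_steps:]) / Gamma[:T]

# naive reference: at most n rewards, fewer near the end of the trajectory
n_step_naive = np.array([sum(gamma**k * rewards[t + k]
                             for k in range(n_steps) if t + k < T)
                         for t in range(T)])

assert np.allclose(n_step_fast, n_step_naive)
print(n_step_fast)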
class Agent(): """ Interacts with and learns from the environment. """ def __init__(self, state_size, action_size, fc1_units, fc2_units): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = torch.manual_seed(SEED) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, fc1_units, fc2_units).to(device) self.actor_target = Actor(state_size, action_size, fc1_units, fc2_units).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, fc1_units, fc2_units).to(device) self.critic_target = Critic(state_size, action_size, fc1_units, fc2_units).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OrnsteinUhlenbeck(action_size, SEED) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, SEED, device) def step(self, time_step, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" self.memory.add(state, action, reward, next_state, done) # Learn only every N_TIME_STEPS if time_step % N_TIME_STEPS != 0: return # Learn if enough samples are available in replay buffer if len(self.memory) > BATCH_SIZE: for i in range(N_LEARN_UPDATES): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """ Returns actions for given state as per current policy. """ state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def store(self): torch.save(self.actor_local.state_dict(), 'checkpoint_actor.pth') torch.save(self.critic_local.state_dict(), 'checkpoint_critic.pth') def load(self): if os.path.isfile('checkpoint_actor.pth') and os.path.isfile('checkpoint_critic.pth'): print("=> loading checkpoints for Actor and Critic... ") self.actor_local.load_state_dict(torch.load('checkpoint_actor.pth')) self.critic_local.load_state_dict(torch.load('checkpoint_critic.pth')) print("done!") else: print("no checkpoints found for Actor and Critic...")
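# --- Illustrative aside -----------------------------------------------------------------
# Quick self-contained sanity check of the soft update rule used above
# (theta_target = tau*theta_local + (1 - tau)*theta_target): tau=0 leaves the target
# untouched and tau=1 is a hard copy. The two Linear layers are throwaway stand-ins.
import copy
import torch
import torch.nn as nn

def soft_update(local_model, target_model, tau):
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

local, target = nn.Linear(4, 2), nn.Linear(4, 2)
frozen = copy.deepcopy(target)

soft_update(local, target, tau=0.0)      # target should not move
assert all(torch.equal(p, q) for p, q in zip(target.parameters(), frozen.parameters()))

soft_update(local, target, tau=1.0)      # hard copy, as used to initialise the targets
assert all(torch.equal(p, q) for p, q in zip(target.parameters(), local.parameters()))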
class Agent(): def __init__(self, state_size, action_size): super().__init__() gpu = torch.cuda.is_available() if (gpu): print('GPU/CUDA works! Happy fast training :)') torch.cuda.current_device() torch.cuda.empty_cache() self.device = torch.device("cuda") else: print('training on cpu...') self.device = torch.device("cpu") self.actor = Actor(state_size, action_size).to(self.device) self.actor_target = Actor(state_size, action_size).to(self.device) self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.0001) self.critic = Critic(state_size, action_size).to(self.device) self.critic_target = Critic(state_size, action_size).to(self.device) self.critic_optim = optim.Adam(self.critic.parameters(), lr=0.001, weight_decay=0) self.replay_buffer = deque(maxlen=1000000) #1m self.gamma = 0.95 #0.99 self.batch_size = 128 self.tau = 0.001 self.seed = random.seed(2) self.noise = OUNoise((20, action_size), 2) self.target_network_update(self.actor_target, self.actor, 1.0) self.target_network_update(self.critic_target, self.critic, 1.0) def select_actions(self, state): state = torch.from_numpy(state).float().to(self.device) self.actor.eval() with torch.no_grad(): actions = self.actor(state).cpu().data.numpy() self.actor.train() actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def add(self, sars): self.replay_buffer.append(sars) def train(self): if (len(self.replay_buffer) > self.batch_size): states, actions, rewards, next_states, dones = self.sample() next_actions = self.actor_target(next_states) next_state_q_v = self.critic_target(next_states, next_actions) #print(next_state_q_v) q_targets = rewards + (self.gamma * next_state_q_v * (1 - dones)) current_q_v = self.critic(states, actions) critic_loss = F.mse_loss(current_q_v, q_targets) self.critic_optim.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm(self.critic.parameters(), 1) self.critic_optim.step() actions = self.actor(states) actor_loss = -self.critic(states, actions).mean() self.actor_optim.zero_grad() actor_loss.backward() self.actor_optim.step() self.target_network_update(self.actor_target, self.actor, self.tau) self.target_network_update(self.critic_target, self.critic, self.tau) def target_network_update(self, target_network, network, tau): for network_param, target_param in zip(network.parameters(), target_network.parameters()): target_param.data.copy_(tau * network_param.data + (1.0 - tau) * target_param.data) def sample(self): samples = random.sample(self.replay_buffer, k=self.batch_size) states = torch.tensor([s[0] for s in samples]).float().to(self.device) actions = torch.tensor([s[1] for s in samples]).float().to(self.device) rewards = torch.tensor([s[2] for s in samples ]).float().unsqueeze(1).to(self.device) next_states = torch.tensor([s[3] for s in samples]).float().to(self.device) dones = torch.tensor([s[4] for s in samples ]).float().unsqueeze(1).to(self.device) return states, actions, rewards, next_states, dones
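# --- Illustrative aside -----------------------------------------------------------------
# Possible alternative to the list-of-arrays sampling in sample() above: stacking with
# numpy first avoids the slow "tensor from a list of ndarrays" construction. The shapes
# and buffer contents are invented for the sketch, not taken from the original environment.
import random
from collections import deque
import numpy as np
import torch

buffer = deque(maxlen=1000)
for _ in range(256):
    s = np.random.randn(24).astype(np.float32)
    a = np.random.uniform(-1, 1, size=2).astype(np.float32)
    buffer.append((s, a, 0.0, s, False))        # (state, action, reward, next_state, done)

batch = random.sample(buffer, k=128)
states = torch.from_numpy(np.stack([b[0] for b in batch]))
actions = torch.from_numpy(np.stack([b[1] for b in batch]))
rewards = torch.tensor([b[2] for b in batch], dtype=torch.float32).unsqueeze(1)
next_states = torch.from_numpy(np.stack([b[3] for b in batch]))
dones = torch.tensor([float(b[4]) for b in batch]).unsqueeze(1)
print(states.shape, actions.shape, rewards.shape, next_states.shape, dones.shape)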
class PolicyGradEnt(): """ Implements an RL agent with policy gradient method. Notes ----- GPU implementation is just sketched; it works but it's slower than with CPU. """ def __init__(self, observation_space, action_space, lr, gamma, H, discrete=True, project_dim=4, device='cpu'): """ Parameters ---------- observation_space: int Number of flattened entries of the state action_space: int Number of (discrete) possible actions to take """ self.gamma = gamma self.lr = lr self.H = H # entropy coeff self.n_actions = action_space self.discrete = discrete if self.discrete: self.net = Actor(observation_space, action_space, discrete, project_dim) else: self.net = Actor(observation_space, action_space, discrete) self.optim = torch.optim.Adam(self.net.parameters(), lr=self.lr) self.device = device self.net.to(self.device) # move network to device def get_action(self, state, return_log=False): log_probs = self.forward(state) dist = torch.exp(log_probs) probs = Categorical(dist) action = probs.sample().item() if return_log: return action, log_probs.view(-1)[action], dist else: return action def forward(self, state): if self.discrete: state = torch.from_numpy(state).to(self.device) else: state = torch.from_numpy(state).float().unsqueeze(0).to( self.device) return self.net(state) def update(self, rewards, log_probs, distributions): ### Compute MC discounted returns ### Gamma = np.array([self.gamma**i for i in range(rewards.shape[0])]) # reverse everything to use cumsum in right order, then reverse again Gt = np.cumsum(rewards[::-1] * Gamma[::-1])[::-1] # Rescale so that present reward is never discounted discounted_rewards = Gt / Gamma dr = torch.tensor(discounted_rewards).to(self.device) dr = (dr - dr.mean()) / dr.std() policy_gradient = [] for log_prob, Gt in zip(log_probs, dr): policy_gradient.append( -log_prob * Gt) # "-" for minimization instead of maximization distributions = torch.stack(distributions).squeeze() # shape = (T,2) # Compute negative entropy (no - in front) entropy = torch.sum(distributions * torch.log(distributions), axis=1).sum() policy_grad = torch.stack(policy_gradient).sum() loss = policy_grad + self.H * entropy self.optim.zero_grad() loss.backward() self.optim.step() return policy_grad.item()
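# --- Illustrative aside -----------------------------------------------------------------
# Small check that the manual entropy term in PolicyGradEnt.update above matches
# Categorical.entropy from torch.distributions, up to sign. The probabilities are random.
import torch
from torch.distributions import Categorical

probs = torch.softmax(torch.randn(5, 3), dim=1)        # (T, n_actions), rows sum to 1
neg_entropy_manual = torch.sum(probs * torch.log(probs), dim=1).sum()
neg_entropy_dist = -Categorical(probs=probs).entropy().sum()
assert torch.allclose(neg_entropy_manual, neg_entropy_dist, atol=1e-6)
print(neg_entropy_manual.item())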
class DDPG(): def __init__(self, env, log_dir, gamma=0.99, batch_size=64, sigma=0.2, batch_norm=True, merge_layer=2, buffer_size=int(1e6), buffer_min=int(1e4), tau=1e-3, Q_wd=1e-2, num_episodes=1000): self.s_dim = env.reset().shape[0] # self.a_dim = env.action_space.shape[0] self.a_dim = env.action_space2.shape[0] # self.a_dim = 1 self.env = env # self.mu = Actor(self.s_dim, self.a_dim, env.action_space, batch_norm=batch_norm) self.mu = Actor(self.s_dim, self.a_dim, env.action_space2, batch_norm=batch_norm) self.Q = Critic(self.s_dim, self.a_dim, batch_norm=batch_norm, merge_layer=merge_layer) self.targ_mu = copy.deepcopy(self.mu).eval() self.targ_Q = copy.deepcopy(self.Q).eval() self.noise = OrnsteinUhlenbeck(mu=torch.zeros(self.a_dim), sigma=sigma * torch.ones(self.a_dim)) self.buffer = Buffer(buffer_size, self.s_dim, self.a_dim) self.buffer_min = buffer_min self.mse_fn = torch.nn.MSELoss() self.mu_optimizer = torch.optim.Adam(self.mu.parameters(), lr=1e-4) self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=1e-3, weight_decay=Q_wd) self.gamma = gamma self.batch_size = batch_size self.num_episodes = num_episodes self.tau = tau self.log_dir = log_dir self.fill_buffer() #updates the target network to slowly track the main network def track_network(self, target, main): with torch.no_grad(): for pt, pm in zip(target.parameters(), main.parameters()): pt.data.copy_(self.tau * pm.data + (1 - self.tau) * pt.data) # updates the target nets to slowly track the main ones def track_networks(self): self.track_network(self.targ_mu, self.mu) self.track_network(self.targ_Q, self.Q) def run_episode(self): done = False s = torch.tensor(self.env.reset().astype(np.float32), requires_grad=False) t = 0 tot_r = 0 while not done: self.mu = self.mu.eval() # a_ = torch.squeeze(self.mu(s)).detach().numpy() a = torch.squeeze(self.mu(s)).detach().numpy() # print("a {}\n".format(a)) self.mu = self.mu.train() ac_noise = self.noise().detach().numpy() a = a + ac_noise # print("ac_noise {}\n".format(ac_noise)) # print("a+ac_noise {}\n".format(a)) if a < self.env.action_space2.low: a = self.env.action_space2.low elif a > self.env.action_space2.high: a = self.env.action_space2.high s = s.detach().numpy() a_updated = self.LQR(s, a) # s_p, r, done, _ = self.env.step(a) s_p, r, done, _ = self.env.step(a_updated) tot_r += r self.buffer.add_tuple(s, a, r, s_p, done) s_batch, a_batch, r_batch, s_p_batch, done_batch = self.buffer.sample( batch_size=self.batch_size) # update critic with torch.no_grad(): q_p_pred = self.targ_Q(s_p_batch, self.targ_mu(s_p_batch)) q_p_pred = torch.squeeze(q_p_pred) y = r_batch + (1.0 - done_batch) * self.gamma * q_p_pred self.Q_optimizer.zero_grad() q_pred = self.Q(s_batch, a_batch) q_pred = torch.squeeze(q_pred) #print(torch.mean(q_pred)) Q_loss = self.mse_fn(q_pred, y) Q_loss.backward(retain_graph=False) self.Q_optimizer.step() # update actor self.mu_optimizer.zero_grad() q_pred_mu = self.Q(s_batch, self.mu(s_batch)) q_pred_mu = torch.squeeze(q_pred_mu) #print(torch.mean(q_pred_mu)) mu_loss = -torch.mean(q_pred_mu) # print(mu_loss) mu_loss.backward(retain_graph=False) #print(torch.sum(self.mu.layers[0].weight.grad)) self.mu_optimizer.step() self.track_networks() s = torch.tensor(s_p.astype(np.float32), requires_grad=False) t += 1 return tot_r, t def train(self): results = [] for i in range(self.num_episodes): r, t = self.run_episode() print('{} reward: {:.2f}, length: {}'.format(i, r, t)) results.append([r, t]) if i % 10 == 0: torch.save(self.mu, self.log_dir + '/models/model_' + str(i)) 
np.save(self.log_dir + '/results_train.npy', np.array(results)) def train1(self): results = [] for i in range(self.num_episodes): r, t = self.run_episode() print('{} reward: {:.2f}, length: {}'.format(i, r, t)) results.append([r, t]) if i % 10 == 0: torch.save(self.mu, self.log_dir + '/models1/model_' + str(i)) np.save(self.log_dir + '/results_train1.npy', np.array(results)) def train2(self): results = [] for i in range(self.num_episodes): r, t = self.run_episode() print('{} reward: {:.2f}, length: {}'.format(i, r, t)) results.append([r, t]) if i % 10 == 0: torch.save(self.mu, self.log_dir + '/models2/model_' + str(i)) np.save(self.log_dir + '/results_train2.npy', np.array(results)) def train3(self): results = [] for i in range(self.num_episodes): r, t = self.run_episode() print('{} reward: {:.2f}, length: {}'.format(i, r, t)) results.append([r, t]) if i % 10 == 0: torch.save(self.mu, self.log_dir + '/models3/model_' + str(i)) np.save(self.log_dir + '/results_train3.npy', np.array(results)) def eval_all(self, model_dir, num_eps=5): results = [] for model_fname in sorted(os.listdir(model_dir), key=lambda x: int(x.split('_')[1])): print(model_fname) mu = torch.load(os.path.join(model_dir, model_fname)) r, t = self.eval(num_eps=num_eps, mu=mu) results.append([r, t]) np.save(self.log_dir + '/results_eval.npy', np.array(results)) def eval_all1(self, model_dir, num_eps=5): results = [] for model_fname in sorted(os.listdir(model_dir), key=lambda x: int(x.split('_')[1])): print(model_fname) mu = torch.load(os.path.join(model_dir, model_fname)) r, t = self.eval(num_eps=num_eps, mu=mu) results.append([r, t]) np.save(self.log_dir + '/results_eval1.npy', np.array(results)) def eval_all2(self, model_dir, num_eps=5): results = [] for model_fname in sorted(os.listdir(model_dir), key=lambda x: int(x.split('_')[1])): print(model_fname) mu = torch.load(os.path.join(model_dir, model_fname)) r, t = self.eval(num_eps=num_eps, mu=mu) results.append([r, t]) np.save(self.log_dir + '/results_eval2.npy', np.array(results)) def eval_all3(self, model_dir, num_eps=5): results = [] for model_fname in sorted(os.listdir(model_dir), key=lambda x: int(x.split('_')[1])): print(model_fname) mu = torch.load(os.path.join(model_dir, model_fname)) r, t = self.eval(num_eps=num_eps, mu=mu) results.append([r, t]) np.save(self.log_dir + '/results_eval3.npy', np.array(results)) def eval(self, num_eps=10, mu=None): if mu == None: mu = self.mu results = [] mu = mu.eval() for i in range(num_eps): r, t = self.run_eval_episode(mu=mu) results.append([r, t]) print('{} reward: {:.2f}, length: {}'.format(i, r, t)) return np.mean(results, axis=0) def run_eval_episode(self, mu=None): if mu == None: mu = self.mu done = False s = torch.tensor(self.env.reset().astype(np.float32), requires_grad=False) tot_r = t = 0 while not done: a = mu(s).view(-1).detach().numpy() a_updated = self.LQR(s, a) # s_p, r, done, _ = self.env.step(a) s_p, r, done, _ = self.env.step(a_updated) tot_r += r t += 1 s = torch.tensor(s_p.astype(np.float32), requires_grad=False) return tot_r, t def LQR(self, s, a): FPS = 50 SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well VIEWPORT_W = 600 VIEWPORT_H = 400 gravity = 9.8 / FPS / FPS # gravity is enhanced by scaling thrust_main_max = gravity / 0.56 thrust_side_max = thrust_main_max * 0.095 / 0.7 # m/frame^2 # determined by test m_main_inv = thrust_main_max # gravity*0.57 m_side_inv = thrust_side_max # gravity*0.225 a_i_inv = 0.198 / 100 # rad/frame^2 # determined by test # not depend on SCALE 
align = 0.87 # 0.87 = sin30 # target point set x_target = 0 y_target = 0 # the landing point is 0 Vx_target = 0 Vy_target = 0 theta_target = 0 omega_target = 0 if a < self.env.action_space2.low: a = self.env.action_space2.low elif a > self.env.action_space2.high: a = self.env.action_space2.high a_float = float(a) y_target = s[1] * (VIEWPORT_H / SCALE / 2) / a_float # 1.6 succeeds all the times X = np.array([ \ [s[0]*(VIEWPORT_W/SCALE/2)-x_target], \ [s[1]*(VIEWPORT_H/SCALE/2)-y_target], \ [s[2]/(VIEWPORT_W/SCALE/2)-Vx_target], \ [s[3]/(VIEWPORT_H/SCALE/2)-Vy_target], \ [s[4]-theta_target], \ [s[5]/20.0-omega_target]]) # print("X {}\n".format(X)) A = np.array([ \ [0, 0, 1, 0, 0, 0], \ [0, 0, 0, 1, 0, 0], \ [0, 0, 0, 0, -1*gravity, 0], \ [0, 0, 0, 0, 0, 0], \ [0, 0, 0, 0, 0, 1], \ [0, 0, 0, 0, 0, 0]]) B = np.array([ \ [0, 0], \ [0, 0], \ [0, m_side_inv*align], \ [1*m_main_inv, 0], \ [0, 0], \ [0, -1*a_i_inv]]) sigma = np.array([ \ [0], \ [0], \ [0], \ [-1*gravity], \ [0], \ [0]]) # gravity compensation BTB = np.dot(B.T, B) u_sigma = -1 * np.linalg.inv(BTB).dot(B.T).dot(sigma) # print("u_sigma {}\n".format(u_sigma)) # Design of LQR # Solve Riccati equation to find a optimal control input R = np.array([ \ [1, 0], \ [0, 1]]) Q = np.array([ \ [1, 0, 0, 0, 0, 0], \ [0, 1, 0, 0, 0, 0], \ [0, 0, 1, 0, 0, 0], \ [0, 0, 0, 1, 0, 0], \ [0, 0, 0, 0, 100, 0], \ [0, 0, 0, 0, 0, 100]]) # Solving Riccati equation P = sp.linalg.solve_continuous_are(A, B, Q, R) # print("P {}\n".format(P)) # u = -KX # K = R-1*Rt*P K = np.linalg.inv(R).dot(B.T).dot(P) thrust = -1 * np.dot(K, X) + u_sigma BK = np.dot(B, K) A_ = A - BK a_eig = np.linalg.eig(A_) a_sort = np.sort(a_eig[0]) # print("eigen values {}\n".format(a_sort)) # print("thrust {}\n".format(thrust)) # thrust[0] = 0 # thrust[1] = 1 if s[1] < 0.3 / SCALE: thrust[0] = 0 thrust[1] = 0 # conversion to compensate main thruster's tricky thrusting thrust[0] = thrust[0] / 0.5 - 1.0 if self.env.continuous: a_updated = np.array([thrust[0], thrust[1]]) # print("a_updated {}\n".format(a_updated)) # a = (0.5, 0) a_updated = np.clip( a_updated, -1, +1) # if the value is less than 0.5, it's ignored # print("a_updated * {}\n".format(a_updated)) else: print("please change to cts mode") return a_updated def fill_buffer(self): print('Filling buffer') s = torch.tensor(self.env.reset().astype(np.float32), requires_grad=False) temp_number = 0 while self.buffer.size < self.buffer_min: # self.action_space = spaces.Box(-1, +1, (2,), dtype=np.float32) a = np.random.uniform(self.env.action_space2.low, self.env.action_space2.high, size=(self.a_dim)) a_updated = self.LQR(s, a) if temp_number < 3: print("a {}\n".format(a), "actions:", "{} {}".format(a_updated[0], a_updated[1])) # print("a_updated*** {}\n".format(a_updated)) temp_number += 1 # s_p, r, done, _ = self.env.step(a) s_p, r, done, _ = self.env.step(a_updated) if done: self.env.reset() self.buffer.add_tuple(s, a, r, s_p, done) s = s_p
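# --- Illustrative aside -----------------------------------------------------------------
# Minimal LQR example on a double integrator using the same scipy routine as LQR() above:
# solve the continuous-time algebraic Riccati equation and form the gain K = R^-1 B^T P.
# The system matrices and cost weights here are made up for the demonstration.
import numpy as np
import scipy.linalg

A = np.array([[0.0, 1.0],
              [0.0, 0.0]])              # x_dot = v, v_dot = u
B = np.array([[0.0],
              [1.0]])
Q = np.diag([1.0, 1.0])                 # state cost
R = np.array([[1.0]])                   # control cost

P = scipy.linalg.solve_continuous_are(A, B, Q, R)
K = np.linalg.inv(R) @ B.T @ P          # optimal feedback u = -K x
print("K =", K)
print("closed-loop eigenvalues:", np.linalg.eigvals(A - B @ K))   # real parts should be negative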
class DyNODESacAgent(object): """DyNODE-SAC.""" def __init__(self, obs_shape, action_shape, device, model_kind, kind='D', step_MVE=5, hidden_dim=256, discount=0.99, init_temperature=0.01, alpha_lr=1e-3, alpha_beta=0.9, actor_lr=1e-3, actor_beta=0.9, actor_log_std_min=-10, actor_log_std_max=2, critic_lr=1e-3, critic_beta=0.9, critic_tau=0.005, critic_target_update_freq=2, model_lr=1e-3, log_interval=100): self.device = device self.discount = discount self.critic_tau = critic_tau self.critic_target_update_freq = critic_target_update_freq self.log_interval = log_interval self.step_MVE = step_MVE self.model_kind = model_kind self.actor = Actor(obs_shape, action_shape, hidden_dim, actor_log_std_min, actor_log_std_max).to(device) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr, betas=(actor_beta, 0.999)) self.critic = Critic(obs_shape, action_shape, hidden_dim).to(device) self.critic_target = Critic(obs_shape, action_shape, hidden_dim).to(device) self.critic_target.load_state_dict(self.critic.state_dict()) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr, betas=(critic_beta, 0.999)) self.log_alpha = torch.tensor(np.log(init_temperature)).to(device) self.log_alpha.requires_grad = True self.target_entropy = -np.prod(action_shape) # set target entropy to -|A| self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=alpha_lr, betas=(alpha_beta, 0.999)) if self.model_kind == 'dynode_model': self.model = DyNODE(obs_shape, action_shape, hidden_dim_p=200, hidden_dim_r=200).to(device) elif self.model_kind == 'nn_model': self.model = NN_Model(obs_shape, action_shape, hidden_dim_p=200, hidden_dim_r=200, kind=kind).to(device) else: raise ValueError('model is not supported') self.model_optimizer = torch.optim.Adam(self.model.parameters(), lr=model_lr) self.train() self.critic_target.train() def train(self, training=True): self.training = training self.actor.train(training) self.critic.train(training) self.model.train(training) @property def alpha(self): return self.log_alpha.exp() def select_action(self, obs): with torch.no_grad(): obs = torch.FloatTensor(obs).to(self.device) obs = obs.unsqueeze(0) mu, _, _, _ = self.actor(obs, compute_pi=False, compute_log_pi=False) return mu.cpu().data.numpy().flatten() def sample_action(self, obs): with torch.no_grad(): obs = torch.FloatTensor(obs).to(self.device) obs = obs.unsqueeze(0) mu, pi, _, _ = self.actor(obs, compute_log_pi=False) return pi.cpu().data.numpy().flatten() def update_model(self, replay_buffer, L, step): if self.model_kind == 'dynode_model': obs_m, action_m, reward_m, next_obs_m, _ = replay_buffer.sample_dynode() transition_loss, reward_loss = self.model.loss(obs_m, action_m, reward_m, next_obs_m) model_loss = transition_loss + reward_loss elif self.model_kind == 'nn_model': obs, action, reward, next_obs, _ = replay_buffer.sample() transition_loss, reward_loss = self.model.loss(obs, action, reward, next_obs) model_loss = transition_loss + reward_loss else: raise ValueError('model is not supported') # Optimize the Model self.model_optimizer.zero_grad() model_loss.backward() self.model_optimizer.step() if step % self.log_interval == 0: L.log('train/model_loss', model_loss, step) def MVE_prediction(self, replay_buffer, L, step): obs, action, reward, next_obs, not_done = replay_buffer.sample() trajectory = [] next_ob = next_obs with torch.no_grad(): while len(trajectory) < self.step_MVE: ob = next_ob _, act, _, _ = self.actor(ob) rew, next_ob = self.model(ob, act) trajectory.append([ob, act, rew, next_ob])
_, next_action, log_pi, _ = self.actor(next_ob) target_Q1, target_Q2 = self.critic_target(next_ob, next_action) ret = torch.min(target_Q1, target_Q2) - self.alpha.detach() * log_pi critic_loss = 0 for ob, act, rew, _ in reversed(trajectory): current_Q1, current_Q2 = self.critic(ob, act) ret = rew + self.discount * ret # critic_loss = critic_loss + utils.huber(current_Q1 - ret).mean() + utils.huber(current_Q2 - ret).mean() critic_loss = critic_loss + F.mse_loss( current_Q1, ret) + F.mse_loss(current_Q2, ret) current_Q1, current_Q2 = self.critic(obs, action) ret = reward + self.discount * ret # critic_loss = critic_loss + utils.huber(current_Q1 - ret).mean() + utils.huber(current_Q2 - ret).mean() critic_loss = critic_loss + F.mse_loss(current_Q1, ret) + F.mse_loss( current_Q2, ret) critic_loss = critic_loss / (self.step_MVE + 1) self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # actor _, pi, log_pi, log_std = self.actor(obs) actor_Q1, actor_Q2 = self.critic(obs.detach(), pi) actor_Q = torch.min(actor_Q1, actor_Q2) actor_loss = (self.alpha.detach() * log_pi - actor_Q).mean() # optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.log_alpha_optimizer.zero_grad() alpha_loss = (self.alpha * (-log_pi - self.target_entropy).detach()).mean() alpha_loss.backward() self.log_alpha_optimizer.step() def update_critic(self, obs, action, reward, next_obs, not_done, L, step): with torch.no_grad(): _, policy_action, log_pi, _ = self.actor(next_obs) target_Q1, target_Q2 = self.critic_target(next_obs, policy_action) target_V = torch.min(target_Q1, target_Q2) - self.alpha.detach() * log_pi target_Q = reward + (not_done * self.discount * target_V) # get current Q estimates current_Q1, current_Q2 = self.critic(obs, action) critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss( current_Q2, target_Q) if step % self.log_interval == 0: L.log('train_critic/loss', critic_loss, step) # Optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() self.critic.log(L, step) def update_actor_and_alpha(self, obs, L, step): _, pi, log_pi, log_std = self.actor(obs) actor_Q1, actor_Q2 = self.critic(obs, pi) actor_Q = torch.min(actor_Q1, actor_Q2) actor_loss = (self.alpha.detach() * log_pi - actor_Q).mean() if step % self.log_interval == 0: L.log('train_actor/loss', actor_loss, step) L.log('train_actor/target_entropy', self.target_entropy, step) entropy = 0.5 * log_std.shape[1] * ( 1.0 + np.log(2 * np.pi)) + log_std.sum(dim=-1) if step % self.log_interval == 0: L.log('train_actor/entropy', entropy.mean(), step) # optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.actor.log(L, step) self.log_alpha_optimizer.zero_grad() alpha_loss = (self.alpha * (-log_pi - self.target_entropy).detach()).mean() if step % self.log_interval == 0: L.log('train_alpha/loss', alpha_loss, step) L.log('train_alpha/value', self.alpha, step) alpha_loss.backward() self.log_alpha_optimizer.step() def update(self, replay_buffer, L, step): if step < 2000: for _ in range(2): obs, action, reward, next_obs, not_done = replay_buffer.sample( ) self.update_critic(obs, action, reward, next_obs, not_done, L, step) self.update_actor_and_alpha(obs, L, step) if step % self.log_interval == 0: L.log('train/batch_reward', reward.mean(), step) else: obs, action, reward, next_obs, not_done = replay_buffer.sample() if step % self.log_interval == 0: L.log('train/batch_reward', 
reward.mean(), step) self.MVE_prediction(replay_buffer, L, step) self.update_critic(obs, action, reward, next_obs, not_done, L, step) self.update_actor_and_alpha(obs, L, step) if step % self.critic_target_update_freq == 0: utils.soft_update_params(self.critic.Q1, self.critic_target.Q1, self.critic_tau) utils.soft_update_params(self.critic.Q2, self.critic_target.Q2, self.critic_tau) def save(self, model_dir, step): torch.save(self.actor.state_dict(), '%s/actor_%s.pt' % (model_dir, step)) torch.save(self.critic.state_dict(), '%s/critic_%s.pt' % (model_dir, step)) def save_model(self, model_dir, step): torch.save(self.model.state_dict(), '%s/model_%s.pt' % (model_dir, step)) def load(self, model_dir, step): self.actor.load_state_dict( torch.load('%s/actor_%s.pt' % (model_dir, step))) self.critic.load_state_dict( torch.load('%s/critic_%s.pt' % (model_dir, step)))
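# --- Illustrative aside -----------------------------------------------------------------
# Stand-alone sketch of the temperature update used in update_actor_and_alpha above:
# log_alpha is optimised so that the policy entropy is pushed toward the target -|A|.
# The log-probabilities here are random stand-ins for actual actor outputs.
import numpy as np
import torch

action_shape = (2,)
target_entropy = -float(np.prod(action_shape))                    # -|A|
log_alpha = torch.tensor(np.log(0.01), requires_grad=True)
log_alpha_optimizer = torch.optim.Adam([log_alpha], lr=1e-3)

log_pi = torch.randn(64, 1)                                       # fake log pi(a|s) of sampled actions
alpha_loss = (log_alpha.exp() * (-log_pi - target_entropy).detach()).mean()

log_alpha_optimizer.zero_grad()
alpha_loss.backward()
log_alpha_optimizer.step()
print("alpha after one step:", log_alpha.exp().item())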
class A2C_v1(): """ Implements Advantage Actor Critic RL agent. Uses episode trajectories to update. Notes ----- GPU implementation is just sketched; it works but it's slower than with CPU. """ def __init__(self, observation_space, action_space, lr, gamma, device='cpu', discrete=False, project_dim=8): """ Parameters ---------- observation_space: int Number of flattened entries of the state action_space: int Number of (discrete) possible actions to take """ self.gamma = gamma self.lr = lr self.n_actions = action_space self.discrete = discrete if self.discrete: self.actor = DiscreteActor(observation_space, action_space, project_dim) self.critic = DiscreteCritic(observation_space, project_dim) else: self.actor = Actor(observation_space, action_space) self.critic = Critic(observation_space) self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=lr) self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr) self.device = device ### Not implemented ### #self.actor.to(self.device) # move network to device #self.critic.to(self.device) def get_action(self, state, return_log=False): log_probs = self.forward(state) dist = torch.exp(log_probs) probs = Categorical(dist) action = probs.sample().item() if return_log: return action, log_probs.view(-1)[action] else: return action def forward(self, state): if self.discrete: state = torch.from_numpy(state) else: state = torch.from_numpy(state).float().unsqueeze(0) log_probs = self.actor(state) return log_probs def update(self, rewards, log_probs, states, done): # Wrap variables in tensors if self.discrete: old_states = torch.tensor(states[:,:-1]) new_states = torch.tensor(states[:,1:]) else: old_states = torch.tensor(states[:,:-1]).float() new_states = torch.tensor(states[:,1:]).float() done = torch.LongTensor(done.astype(int)) #log_probs = torch.tensor(log_probs.astype(float)) ### ERROR HERE log_probs = torch.stack(log_probs) # Update critic and then actor self.update_critic(rewards, new_states, old_states, done) self.update_actor(rewards, log_probs, new_states, old_states) return def update_critic(self, rewards, new_states, old_states, done): """ Minimize \sum_{t=0}^{T-1}(rewards[t] + gamma V(new_states[t]) - V(old_states[t]) )**2 where V(state) is the prediction of the critic. 
Parameters ---------- rewards: shape (T,) old_states, new_states: shape (T, observation_space) """ rewards = torch.tensor(rewards) #.to(self.device) #print("rewards.shape ", rewards.shape) # Predictions V_pred = self.critic(old_states).squeeze() #print("V_pred.shape ", V_pred.shape) # Targets V_trg = self.critic(new_states).squeeze().detach() #print("V_trg.shape ", V_trg.shape) V_trg = (1-done)*self.gamma*V_trg + rewards #print("V_trg.shape ", V_trg.shape) # MSE loss loss = torch.sum((V_pred - V_trg)**2) # backprop and update self.critic_optim.zero_grad() loss.backward() self.critic_optim.step() return def update_actor(self, rewards, log_probs, new_states, old_states): # Discount factors Gamma = np.array([self.gamma**i for i in range(rewards.shape[1])]).reshape(1,-1) # reverse everything to use cumsum in right order, then reverse again Gt = np.cumsum(rewards[:,::-1]*Gamma[:,::-1], axis=1)[:,::-1] # Rescale so that present reward is never discounted discounted_rewards = Gt/Gamma # Wrap into tensor dr = torch.tensor(discounted_rewards).float() #.to(self.device) #print("dr ", dr.shape) # Get value as baseline V = self.critic(old_states).squeeze() # Compute advantage as total (discounted) return - value A = dr - V # Rescale to unitary variance for a trajectory (axis=1) #A = (A - A.mean(axis=1).unsqueeze(1))/(A.std(axis=1).unsqueeze(1)) #print("A ", A.shape) #print("log_probs ", log_probs.shape) # Compute - gradient policy_gradient = - log_probs*A #print("policy_gradient ", policy_gradient.shape) # Use it as loss policy_grad = torch.sum(policy_gradient) # backprop and update self.actor_optim.zero_grad() policy_grad.backward() self.actor_optim.step() return
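# --- Illustrative aside -----------------------------------------------------------------
# Toy demonstration of the "ERROR HERE" / "THIS DETACHES THE TENSOR" comments above:
# rebuilding log-probs with torch.tensor breaks the graph, while torch.stack keeps
# gradients flowing back into the policy. The tiny linear policy is a stand-in.
import torch
import torch.nn as nn

policy = nn.Linear(4, 3)
states = torch.randn(5, 4)
log_probs = [torch.log_softmax(policy(s), dim=-1)[0] for s in states]

stacked = torch.stack(log_probs)                          # still connected to policy parameters
stacked.sum().backward()
assert policy.weight.grad is not None

detached = torch.tensor([lp.item() for lp in log_probs])  # plain numbers, no graph
assert not detached.requires_grad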
class TD3MultiAgent: def __init__(self): self.max_action = 1 self.policy_freq = 2 self.policy_freq_it = 0 self.batch_size = 512 self.discount = 0.99 self.replay_buffer = int(1e5) self.device = 'cuda' self.state_dim = 24 self.action_dim = 2 self.max_action = 1 self.policy_noise = 0.1 self.agents = 1 self.random_period = 1e4 self.tau = 5e-3 self.replay_buffer = ReplayBuffer(self.replay_buffer) self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device) self.actor_target = Actor(self.state_dim, self.action_dim, self.max_action).to(self.device) self.actor_target.load_state_dict(self.actor.state_dict()) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4) # self.actor.load_state_dict(torch.load('actor2.pth')) # self.actor_target.load_state_dict(torch.load('actor2.pth')) self.noise = OUNoise(2, 32) self.critic = Critic(48, self.action_dim).to(self.device) self.critic_target = Critic(48, self.action_dim).to(self.device) self.critic_target.load_state_dict(self.critic.state_dict()) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4) def select_action_with_noise(self, state, i): ratio = len(self.replay_buffer) / self.random_period if len(self.replay_buffer) > self.random_period: # act with the learned policy once the random warm-up period is over state = torch.FloatTensor(state[i, :]).to(self.device) action = self.actor(state).cpu().data.numpy() if self.policy_noise != 0: action = (action + self.noise.sample()) return action.clip(-self.max_action, self.max_action) else: # pure exploration noise during the warm-up period return self.noise.sample() def step(self, i): if len(self.replay_buffer) > self.random_period / 2: # Sample mini batch s, a, r, s_, d = self.replay_buffer.sample(self.batch_size) state = torch.FloatTensor(s[:, i, :]).to(self.device) action = torch.FloatTensor(a[:, i, :]).to(self.device) next_state = torch.FloatTensor(s_[:, i, :]).to(self.device) a_state = torch.FloatTensor(s).to(self.device).reshape(-1, 48) a_action = torch.FloatTensor(a).to(self.device).reshape(-1, 4) a_next_state = torch.FloatTensor(s_).to(self.device).reshape(-1, 48) done = torch.FloatTensor(1 - d[:, i]).to(self.device) # mask is 1 while the episode continues reward = torch.FloatTensor(r[:, i]).to(self.device) # Select action with the actor target and apply clipped noise noise = torch.FloatTensor(a[:, i, :]).data.normal_(0, self.policy_noise).to(self.device) noise = noise.clamp(-0.1, 0.1) # clip the target-policy smoothing noise (TD3 noise clip)
next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action) # Compute the target Q value target_Q1, target_Q2 = self.critic_target(a_next_state, next_action) target_Q = torch.min(target_Q1, target_Q2) target_Q = reward.reshape(-1,1) + (done.reshape(-1,1) * self.discount * target_Q).detach() # Get current Q estimates current_Q1, current_Q2 = self.critic(a_state, action) # Compute critic loss critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) # Optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Delayed policy updates if self.policy_freq_it % self.policy_freq == 0: # Compute actor loss actor_loss = -self.critic.Q1(a_state, self.actor(state)).mean() # Optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update the frozen target models for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) self.policy_freq_it += 1 return True def reset(self): self.policy_freq_it = 0 self.noise.reset()
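# --- Illustrative aside -----------------------------------------------------------------
# Shape-level sketch of the TD3 target computed in step() above: target-policy smoothing
# with clipped noise plus the clipped double-Q minimum. All tensors are random stand-ins
# with assumed shapes; no actor or critic networks are involved.
import torch

batch, action_dim = 8, 2
max_action, policy_noise, noise_clip, discount = 1.0, 0.1, 0.1, 0.99

next_action = torch.rand(batch, action_dim) * 2 - 1                    # stand-in for actor_target(s')
noise = (torch.randn(batch, action_dim) * policy_noise).clamp(-noise_clip, noise_clip)
next_action = (next_action + noise).clamp(-max_action, max_action)

target_Q1, target_Q2 = torch.randn(batch, 1), torch.randn(batch, 1)    # stand-ins for critic_target outputs
reward, not_done = torch.randn(batch, 1), torch.ones(batch, 1)
target_Q = reward + not_done * discount * torch.min(target_Q1, target_Q2)
print(target_Q.shape)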
class Agent(): ''' Interacts with and learns from the environment ''' def __init__(self, num_agents, state_size, action_size, random_seed=2018): self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.device = torch.device('cuda' if cuda else 'cpu') self.update = UPDATE_EVERY self.updates = NUMBER_OF_UPDATES # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device) def step(self, state, action, reward, next_state, done, timestep): ''' Save experience in replay memory, and use random sample from buffer to learn ''' # Save experience into memory __for each agent__ for i in range(self.num_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) # If we are in the timestep to update if timestep % self.update == 0: # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: # Do learning "updates" times for _ in range(self.updates): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_noise=True): ''' Returns actions for given state as per current policy ''' states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) # Deactivate gradients and perform forward pass self.actor_local.eval() with torch.no_grad(): for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() if add_noise: for a in range(self.num_agents): actions[a, :] += self.noise.sample() # Clip action return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): ''' Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples ''' states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models # Dimensions actions_next = self.actor_target(next_states) # (BSx2) Q_targets_next = self.critic_target(next_states, actions_next) # # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local( states, actions_pred).mean() # Average over the minibatch # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): ''' Soft update model parameters ''' for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
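# --- Illustrative aside -----------------------------------------------------------------
# Minimal Ornstein-Uhlenbeck noise sketch consistent with how OUNoise is used above
# (sample()/reset()); mu, theta and sigma are common default choices, not values taken
# from the original utils module.
import numpy as np

class OUNoiseSketch:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        # restart the process from its long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # mean-reverting drift toward mu plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.randn(len(self.state))
        self.state = self.state + dx
        return self.state

noise = OUNoiseSketch(size=2, seed=2)
print([noise.sample() for _ in range(3)])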
class A2C_v0(): # DOES NOT WORK STILL """ Implements Advantage Actor Critic RL agent. Updates to be executed step by step. Notes ----- GPU implementation is just sketched; it works but it's slower than with CPU. """ def __init__(self, observation_space, action_space, lr_actor, lr_critic, gamma, device='cpu', discrete=False, project_dim=8): """ Parameters ---------- observation_space: int Number of flattened entries of the state action_space: int Number of (discrete) possible actions to take """ self.gamma = gamma self.n_actions = action_space self.discrete = discrete if self.discrete: self.actor = DiscreteActor(observation_space, action_space, project_dim) self.critic = DiscreteCritic(observation_space, project_dim) else: self.actor = Actor(observation_space, action_space) self.critic = Critic(observation_space) self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=lr_actor) self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=lr_critic) self.device = device ### Not implemented ### #self.actor.to(self.device) #self.critic.to(self.device) def get_action(self, state, return_log=False): log_probs = self.forward(state) dist = torch.exp(log_probs) probs = Categorical(dist) action = probs.sample().item() if return_log: return action, log_probs.view(-1)[action] else: return action def forward(self, state): if self.discrete: state = torch.from_numpy(state) else: state = torch.from_numpy(state).float().unsqueeze(0) log_probs = self.actor(state) return log_probs def update(self, reward, log_prob, state, new_state, done): # Wrap variables in tensors reward = torch.tensor(reward) if self.discrete: old_state = torch.tensor(state).unsqueeze(0) new_state = torch.tensor(new_state).unsqueeze(0) else: old_state = torch.tensor(state).float().unsqueeze(0) new_state = torch.tensor(new_state).float().unsqueeze(0) #log_prob = torch.tensor([log_prob]) # THIS DETACHES THE TENSOR!! log_prob = log_prob.view(1,1) # Update critic and then actor self.update_critic(reward, new_state, old_state, done) self.update_actor(reward, log_prob, new_state, old_state, done) return def update_critic(self, reward, new_state, old_state, done): # Predictions V_pred = self.critic(old_state).squeeze() #print("V_pred ", V_pred) # Targets V_trg = self.critic(new_state).squeeze() #print("V_trg (net) ", V_trg) # done = 1 if new_state is a terminal state V_trg = (1-done)*self.gamma*V_trg + reward V_trg = V_trg.detach() #print("V_trg (+r) ", V_trg) # MSE loss loss = (V_pred - V_trg).pow(2).sum() #print("loss ", loss) # backprop and update self.critic_optim.zero_grad() loss.backward() self.critic_optim.step() return def update_actor(self, reward, log_prob, new_state, old_state, done): # compute advantage A = (1-done)*self.gamma*self.critic(new_state).squeeze() + reward - self.critic(old_state).squeeze() #print("Advantage ", A) # compute gradient policy_gradient = - log_prob*A #print("policy_gradient ", policy_gradient) # backprop and update self.actor_optim.zero_grad() policy_gradient.backward() self.actor_optim.step() return
class DDPG(): """ Deep Deterministic Policy Gradients Agent used to interaction with and learn from an environment """ def __init__(self, state_size: int, action_size: int, num_agents: int, epsilon, random_seed: int): """ Initialize a DDPG Agent Object :param state_size: dimension of state (input) :param action_size: dimension of action (output) :param num_agents: number of concurrent agents in the environment :param epsilon: initial value of epsilon for exploration :param random_seed: random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.t_step = 0 # Hyperparameters self.buffer_size = 1000000 self.batch_size = 128 self.update_every = 10 self.num_updates = 10 self.gamma = 0.99 self.tau = 0.001 self.lr_actor = 0.0001 self.lr_critic = 0.001 self.weight_decay = 0 self.epsilon = epsilon self.epsilon_decay = 0.97 self.epsilon_min = 0.005 # Networks (Actor: State -> Action, Critic: (State,Action) -> Value) self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(self.device) self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(self.device) self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay) # Initialize actor and critic networks to start with same parameters self.soft_update(self.actor_local, self.actor_target, tau=1) self.soft_update(self.critic_local, self.critic_target, tau=1) # Noise Setup self.noise = OUNoise(self.action_size, random_seed) # Replay Buffer Setup self.memory = ReplayBuffer(self.buffer_size, self.batch_size) def __str__(self): return "DDPG_Agent" def train(self, env, brain_name, num_episodes=200, max_time=1000, print_every=10): """ Interacts with and learns from a given Unity Environment :param env: Unity Environment the agents is trying to learn :param brain_name: Brain for Environment :param num_episodes: Number of episodes to train :param max_time: How long each episode runs for :param print_every: How often in episodes to print a running average :return: Returns episodes scores and 100 episode averages as lists """ # --------- Set Everything up --------# scores = [] avg_scores = [] scores_deque = deque(maxlen=print_every) # -------- Simulation Loop --------# for episode_num in range(1, num_episodes + 1): # Reset everything env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations episode_scores = np.zeros(self.num_agents) self.reset_noise() # Run the episode for t in range(max_time): actions = self.act(states, self.epsilon) env_info = env.step(actions)[brain_name] next_states, rewards, dones = env_info.vector_observations, env_info.rewards, env_info.local_done self.step(states, actions, rewards, next_states, dones) episode_scores += rewards states = next_states if np.any(dones): break # -------- Episode Finished ---------# self.epsilon *= self.epsilon_decay self.epsilon = max(self.epsilon, self.epsilon_min) scores.append(np.mean(episode_scores)) scores_deque.append(np.mean(episode_scores)) avg_scores.append(np.mean(scores_deque)) if episode_num % print_every == 0: print( f'Episode: {episode_num} \tAverage 
Score: {round(np.mean(scores_deque), 2)}' ) torch.save( self.actor_local.state_dict(), f'{PATH}\checkpoints\{self.__str__()}_Actor_Multiple.pth') torch.save( self.critic_local.state_dict(), f'{PATH}\checkpoints\{self.__str__()}_Critic_Multiple.pth') # -------- All Episodes finished Save parameters and scores --------# # Save Model Parameters torch.save(self.actor_local.state_dict(), f'{PATH}\checkpoints\{self.__str__()}_Actor_Multiple.pth') torch.save(self.critic_local.state_dict(), f'{PATH}\checkpoints\{self.__str__()}_Critic_Multiple.pth') # Save mean score per episode (of the 20 agents) f = open(f'{PATH}\scores\{self.__str__()}_Multiple_Scores.txt', 'w') scores_string = "\n".join([str(score) for score in scores]) f.write(scores_string) f.close() # Save average scores for 100 window average f = open(f'{PATH}\scores\{self.__str__()}_Multiple_AvgScores.txt', 'w') avgScores_string = "\n".join([str(score) for score in avg_scores]) f.write(avgScores_string) f.close() return scores, avg_scores def step(self, states, actions, rewards, next_states, dones): """ what the agent needs to do for every time step that occurs in the environment. Takes in a (s,a,r,s',d) tuple and saves it to memeory and learns from experiences. Note: this is not the same as a step in the environment. Step is only called once per environment time step. :param states: array of states agent used to select actions :param actions: array of actions taken by agents :param rewards: array of rewards for last action taken in environment :param next_states: array of next states after actions were taken :param dones: array of bools representing if environment is finished or not """ # Save experienced in replay memory for agent_num in range(self.num_agents): self.memory.add(states[agent_num], actions[agent_num], rewards[agent_num], next_states[agent_num], dones[agent_num]) # Learn "num_updates" times every "update_every" time step self.t_step += 1 if len(self.memory ) > self.batch_size and self.t_step % self.update_every == 0: self.t_step = 0 for _ in range(self.num_updates): experiences = self.memory.sample() self.learn(experiences) def act(self, states, epsilon, add_noise=True): """ Returns actions for given states as per current policy. Policy comes from the actor network. 
:param states: array of states from the environment :param epsilon: probability of exploration :param add_noise: whether exploration noise may be added to the action :return: clipped actions """ states = torch.from_numpy(states).float().to(self.device) self.actor_local.eval() # Sets to eval mode (no gradients) with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() # Sets to train mode (gradients back on) if add_noise and epsilon > np.random.random(): actions += [self.noise.sample() for _ in range(self.num_agents)] return np.clip(actions, -1, 1) def reset_noise(self): """ Resets the noise process """ self.noise.reset() def learn(self, experiences): """ Update actor and critic networks using a given batch of experiences Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(states) -> actions critic_target(states, actions) -> Q-value :param experiences: tuple of arrays (states, actions, rewards, next_states, dones) sampled from the replay buffer """ states, actions, rewards, next_states, dones = experiences # -------------------- Update Critic -------------------- # # Use target networks for getting next actions and q values and calculate q_targets next_actions = self.actor_target(next_states) next_q_targets = self.critic_target(next_states, next_actions) q_targets = rewards + (self.gamma * next_q_targets * (1 - dones)) # Compute critic loss (Same as DQN Loss) q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(q_expected, q_targets) # Minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # -------------------- Update Actor --------------------- # # Compute actor loss (maximize mean of Q(states,actions)) action_preds = self.actor_local(states) # Optimizer minimizes and we want to maximize so multiply by -1 actor_loss = -1 * self.critic_local(states, action_preds).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() #---------------- Update Target Networks ---------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_network, target_network, tau): """ Soft update network parameters θ_target = τ*θ_local + (1 - τ)*θ_target :param local_network: PyTorch Network that is always up to date :param target_network: PyTorch Network that is not up to date :param tau: update (interpolation) parameter """ for target_param, local_param in zip(target_network.parameters(), local_network.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
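# --- Illustrative aside -----------------------------------------------------------------
# Tiny illustration of the exploration schedule used in train() above: epsilon decays
# geometrically per episode and is floored at epsilon_min. The decay rate and floor are
# taken from __init__; the starting value of 1.0 is an assumption for the example.
epsilon, epsilon_decay, epsilon_min = 1.0, 0.97, 0.005
schedule = []
for episode in range(200):
    schedule.append(epsilon)
    epsilon = max(epsilon * epsilon_decay, epsilon_min)
print(schedule[0], round(schedule[50], 3), schedule[-1])   # 1.0, ~0.218, 0.005 (floored)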