import random

import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable

# Assumes Actor, Critic, and ReplayMemory are defined elsewhere in this project.


class DDPG:
    def __init__(self, gamma, memory, s, a, tau, learningRate=1e-3,
                 criticpath=None, actorpath=None):
        self.gamma = gamma
        self.memory = ReplayMemory(memory)
        self.actor = Actor(state=s, actions=a)
        self.critic = Critic(state=s, actions=a)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))
        self.targetActor = Actor(state=s, actions=a)
        self.targetActor.load_state_dict(self.actor.state_dict())
        self.targetCritic = Critic(state=s, actions=a)
        self.targetCritic.load_state_dict(self.critic.state_dict())
        self.tau = tau
        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(), learningRate)
        # More a dimensionality thing.
        self.state = s
        self.action = a
        self.OUarray = np.zeros((1000, self.action), dtype="f")
        self.step = 0

    def processNoise(self):
        # This should be something more eloquent...
        ret = torch.rand(self.action)
        for i in range(self.action):
            r = random.random()
            if r <= .33:
                ret[i] = ret[i]
            elif .33 < r <= .66:
                ret[i] = 0
            else:
                ret[i] = -ret[i]
        return ret

    def OUprocess(self, sigma, theta, mu):
        # Define model parameters.
        t_0 = 0
        t_end = 10
        length = 1000
        y = np.zeros((length, self.action), dtype="f")
        t = np.linspace(t_0, t_end, length)  # define time axis
        dt = np.mean(np.diff(t))
        drift = lambda y, t: theta * (mu - y)  # define drift term
        diffusion = lambda y, t: sigma         # define diffusion term
        # Solve the SDE with Euler-Maruyama, one OU column per action dimension.
        for j in range(self.action):
            y[0][j] = np.random.normal(loc=0.0, scale=1.0)  # initial condition
            # Define the noise process.
            noise = np.random.normal(loc=0.0, scale=1.0, size=length) * np.sqrt(dt)
            for i in range(1, length):
                y[i][j] = (y[i - 1][j]
                           + drift(y[i - 1][j], i * dt) * dt
                           + diffusion(y[i - 1][j], i * dt) * noise[i])
        self.OUarray = y

    def selectAction(self, state):
        # Remember: `state` must be wrapped as an autograd Variable.
        ret = self.targetActor(Variable(state)).data
        ret = ret + torch.from_numpy(self.OUarray[self.step])
        self.step += 1
        return torch.clamp(ret, 0.0, 1.0)

    def addToMemory(self, state, action, reward, stateprime):
        self.memory.push(state, action, reward, stateprime)

    def primedToLearn(self):
        return self.memory.isFull()

    def PerformUpdate(self, batchsize):
        # Mildly important, per https://github.com/vy007vikas/PyTorch-ActorCriticRL:
        # the criterion on the actor is sum(-Q(s, a)), presumably over the batch.
        self.actorOptimizer.zero_grad()
        self.criticOptimizer.zero_grad()
        batch = self.memory.batch(batchsize)
        Q = torch.zeros(len(batch), self.state + self.action)
        Qprime = torch.zeros(len(batch), self.state + self.action)
        rewards = torch.zeros(len(batch), 1)
        # This loop generates the (s, a) and (s', a') inputs for the whole batch.
        i = 0
        for sample in batch:
            Q[i, :] = torch.cat((sample['s'], sample['a']))
            transition = self.targetActor(
                Variable(sample['sprime'], volatile=True)).data
            Qprime[i, :] = torch.cat((sample['sprime'], transition), dim=0)
            rewards[i, 0] = sample['r'][0]
            i += 1
        # Critic update
        Qprime = self.gamma * self.targetCritic(Variable(Qprime)).data + rewards
        Qprime = Variable(Qprime)
        Q = self.critic(Variable(Q))
        criterion = torch.nn.MSELoss()
        loss = criterion(Q, Qprime)
        loss.backward()
        self.criticOptimizer.step()
        # Actor update
        self.actorOptimizer.zero_grad()
        S = torch.zeros(len(batch), self.state)
        i = 0
        for sample in batch:
            S[i, :] = sample['s']
            i += 1
        A = self.actor(Variable(S))
        loss = -1 * torch.sum(self.critic(torch.cat((Variable(S), A), dim=1)))
        loss.backward()
        self.actorOptimizer.step()

    def UpdateTargetNetworks(self):
        criticDict = self.critic.state_dict()
        tCriticDict = self.targetCritic.state_dict()
        for param in criticDict.keys():
            tCriticDict[param] = (tCriticDict[param] * (1 - self.tau)
                                  + criticDict[param] * self.tau)
        actorDict = self.actor.state_dict()
        tActorDict = self.targetActor.state_dict()
        for param in actorDict.keys():
            tActorDict[param] = (tActorDict[param] * (1 - self.tau)
                                 + actorDict[param] * self.tau)
        self.targetCritic.load_state_dict(tCriticDict)
        self.targetActor.load_state_dict(tActorDict)

    def saveActorCritic(self):
        torch.save(self.critic.state_dict(), './critic')
        torch.save(self.actor.state_dict(), './actor')
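
# OUprocess above precomputes a fixed-length Ornstein-Uhlenbeck trajectory. Below is
# a minimal self-contained sketch of the same Euler-Maruyama discretization,
#   x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1),
# generated on demand instead of in advance. The class name and the parameter
# defaults here are illustrative, not from the original source.

import numpy as np


class OUNoiseSketch:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = mu * np.ones(size)
        self.theta, self.sigma, self.dt = theta, sigma, dt
        self.reset()

    def reset(self):
        self.x = np.copy(self.mu)

    def sample(self):
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.standard_normal(len(self.x)))
        self.x = self.x + dx
        return self.x


# Usage: noise = OUNoiseSketch(size=2); perturbed = action + noise.sample()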
class Agent():
    '''This agent interacts with the environment to learn a policy that yields the
    highest cumulative reward. The agent uses the Deep Deterministic Policy
    Gradient algorithm.'''

    def __init__(self, state_size, action_size, seed=0):
        '''Initialize the Agent.

        Parameters
        ----------
        state_size : int
            The dimension of each state.
        action_size : int
            The dimension of each action.
        seed : int
            The random seed used to generate random numbers.
        '''
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)
        # The actor gives the best action for a given state.
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        # The critic evaluates the action.
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LEARNING_RATE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LEARNING_RATE,
                                           weight_decay=WEIGHT_DECAY)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Noise
        self.noise = OUNoise(action_size, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        '''Instructs the agent to take a step in the environment.

        Executes each time the agent takes a step in the environment. The observed
        (state, action, reward, next_state, done) tuple is saved in the replay
        buffer. Once enough experiences have been captured the model is trained.

        Parameters
        ----------
        state : array_like
            The current state.
        action : int
            The action that was taken.
        reward : float
            The reward that was received.
        next_state : array_like
            The next state.
        done : boolean
            True if the episode is completed, else False.
        '''
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.train_model_parameters(experiences)

    def get_action(self, state, epsilon=0, add_noise=True):
        '''Gets the action for the given state under the current policy.

        To explore in the continuous action space, noise is added to the action.

        Parameters
        ----------
        state : array_like
            The current state.
        epsilon : float
            The epsilon value used for epsilon-greedy action selection.
        add_noise : boolean
            Add noise to the action to encourage exploration.

        Returns
        -------
        action : array_like
            The action to take. Each value is between -1 and 1.
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def train_model_parameters(self, experiences):
        '''Update the model parameters using the given batch of experience tuples.

        The models are trained via the actor-critic paradigm. The next action is
        obtained from the target actor and passed to the target critic to obtain
        the Q-value of the next state. The target Q-value for the current state is
        then computed via the Bellman equation, and the local critic is regressed
        toward it. The local actor predicts actions for the current states, and its
        loss is the negative mean of the local critic's value for those actions.

        Parameters
        ----------
        experiences : Tuple[torch.Variable]
            A named tuple of state, action, reward, next_state and done.
        '''
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        next_actions = self.actor_target(next_states)
        Q_next_states = self.critic_target(next_states, next_actions)
        Q_states = rewards + GAMMA * Q_next_states * (1 - dones)
        Q_states_estimated = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_states_estimated, Q_states)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self._update_model_parameters(self.critic_local, self.critic_target)
        self._update_model_parameters(self.actor_local, self.actor_target)

    def _update_model_parameters(self, local_network, target_network):
        '''Move the target network parameters toward the learned local parameters.

        The target parameters are moved a fraction TAU of the way toward the
        learned local parameters. This helps reduce harmful correlations by moving
        the target gradually instead of copying it outright.
        '''
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1 - TAU) * target_param.data)
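
# The critic update above regresses Q(s, a) toward r + gamma * Q'(s', mu'(s')) * (1 - done).
# A tensor-level sketch of that target with toy numbers (values are illustrative,
# not from the source):

import torch

rewards = torch.tensor([[1.0], [0.5]])
q_next = torch.tensor([[2.0], [3.0]])   # critic_target(next_states, next_actions)
dones = torch.tensor([[0.0], [1.0]])    # terminal transitions contribute no bootstrap
gamma = 0.99
q_targets = rewards + gamma * q_next * (1 - dones)
# -> [[2.98], [0.50]]: the terminal row falls back to the immediate reward.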
class Policy:
    def save_policy(self, save_name):
        self.actor.save_model(save_name=save_name)

    def load_policy(self, load_name):
        self.actor.load_model(load_name=load_name)

    def demonstrate(self, ep_count=1):
        env = gym.make(self.envid)
        with torch.no_grad():
            for e in range(ep_count):
                done = False
                ob = env.reset()
                while not done:
                    observation = ob[None]
                    action = self.sample_action(observations=observation)
                    action = action[0]
                    # Advance the observation, otherwise the loop replays the
                    # initial state forever.
                    ob, r, done, _ = env.step(action)
                    env.render()
        env.close()

    def weights_init(self, m):
        if hasattr(m, 'weight'):
            torch.nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0)

    def __init__(self, env_id="LunarLanderContinuous-v2"):
        self.envid = env_id
        env = gym.make(env_id)
        state_size = np.prod(list(env.observation_space.shape))
        action_size = np.prod(list(env.action_space.shape))
        # The max and min here are a shortcut because all action dimensions share
        # the same range in this environment; ideally, clamping should support a
        # different range per dimension.
        self.low_action = env.action_space.low.max()
        self.high_action = env.action_space.high.min()
        self.actor = Actor(n_ip=state_size, n_op=action_size)
        self.critic = Critic(n_ip=state_size)
        self.actor.apply(self.weights_init)
        self.critic.apply(self.weights_init)
        if torch.cuda.is_available():
            self.use_gpu = True
            self.device = torch.device("cuda")
        else:
            self.use_gpu = False
            self.device = torch.device("cpu")
        # Keep the networks on the same device the observations are sent to.
        self.actor.to(self.device)
        self.critic.to(self.device)

    def sample_action(self, observations):
        with torch.no_grad():
            ob = torch.Tensor(observations).to(self.device)
            m, s = self.actor(ob)
            chnk = len(m[0])  # the size of the action space
            m = m.cpu().flatten().float()
            s = s.cpu().flatten().float()
            samples = torch.normal(mean=m, std=s)
            samples = torch.clamp(samples, min=self.low_action,
                                  max=self.high_action)
            sampled_action = samples.reshape(-1, chnk).numpy()
        return sampled_action

    def get_log_prob(self, mean, standard_deviation, actions):
        m = mean
        s = standard_deviation
        log_prob = torch.distributions.Normal(loc=m, scale=s).log_prob(actions)
        log_prob = log_prob.sum(-1)
        return log_prob

    def improve_critic(self, data_loader, lr=0.001, batch_size=128, iterations=1):
        total_loss = 0
        total_len = 0
        optimizer = optim.Adam(self.critic.parameters(), lr=lr)
        loader = dataloader.DataLoader(data_loader, batch_size=batch_size,
                                       shuffle=True)
        for e in range(iterations):
            for states, targets in loader:
                total_len += len(targets)
                # The targets here should be normalized.
                # zero_grad per batch, so gradients don't accumulate across batches.
                optimizer.zero_grad()
                prediction = self.critic(states)
                loss = nn.functional.mse_loss(prediction, targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
        avg_loss = total_loss / total_len
        return avg_loss

    def improve_actor(self, data_loader, lr=0.001, batch_size=128, iterations=1):
        optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        loader = dataloader.DataLoader(data_loader, batch_size=batch_size,
                                       shuffle=True)
        for e in range(iterations):
            for states, actions, values in loader:
                optimizer.zero_grad()
                m, s = self.actor(states)
                lp = self.get_log_prob(mean=m, standard_deviation=s,
                                       actions=actions)
                loss = torch.sum(-(lp * values))
                loss.backward()
                optimizer.step()
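
# get_log_prob and improve_actor above implement a score-function update:
# loss = -sum(log pi(a|s) * value). A self-contained sketch of that computation
# with torch.distributions (shapes and values are toy, not from the source):

import torch

mean = torch.zeros(4, 2, requires_grad=True)      # actor output: mean per action dim
std = torch.ones(4, 2)                            # actor output: standard deviation
actions = torch.randn(4, 2)                       # actions that were actually taken
advantages = torch.tensor([1.0, -0.5, 0.2, 0.8])  # critic-derived weights

log_prob = torch.distributions.Normal(mean, std).log_prob(actions).sum(-1)
loss = -(log_prob * advantages).sum()
loss.backward()  # pushes probability mass toward high-advantage actions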
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() logging.warning(action) return np.clip(action, 0.0000001, 7.0) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
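
# soft_update above implements theta_target <- tau * theta_local + (1 - tau) * theta_target.
# A self-contained sketch of the same rule on two tiny modules (the modules and
# tau value are illustrative only):

import torch.nn as nn

local, target, tau = nn.Linear(2, 2), nn.Linear(2, 2), 0.05
for t_param, l_param in zip(target.parameters(), local.parameters()):
    t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)
# Repeated every step, the target tracks the local network with a lag of roughly 1/tau updates.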
class Agent():
    def __init__(self, params):
        self.action_size = params['action_size']
        self.state_size = params['state_size']
        self.num_agents = params['num_agents']
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__update_every = params['update_every']
        self.__save_to = params['save_to']
        self.__memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.__lr = params['lr']
        self.noise_type = params['noise_type']

        actor_params = dict()
        actor_params['arch_params_actor'] = params['arch_params_actor']
        actor_params['action_size'] = self.action_size
        actor_params['state_size'] = self.state_size
        actor_params['eps'] = params['eps']
        actor_params['eps_decay'] = params['eps_decay']
        actor_params['eps_min'] = params['min_eps']
        actor_params['noise_type'] = params['noise_type']
        self.actor = Actor(actor_params)
        self.actor_target = Actor(actor_params)
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.__lr)
        self.scheduler_actor = optim.lr_scheduler.StepLR(self.optimizer_actor,
                                                         step_size=100,
                                                         gamma=0.95)

        critic_params = dict()
        critic_params['arch_params_critic'] = params['arch_params_critic']
        critic_params['action_size'] = self.action_size
        critic_params['state_size'] = self.state_size
        self.critic = Critic(critic_params)
        self.critic_target = Critic(critic_params)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=self.__lr)
        self.scheduler_critic = optim.lr_scheduler.StepLR(self.optimizer_critic,
                                                          step_size=100,
                                                          gamma=0.95)
        self.__t = 0

    def memorize_experience(self, state, action, reward, next_state, done):
        self.__memory.add(state, action.detach(), reward, next_state, done)
        self.__t = (self.__t + 1)

    def learn_from_past_experiences(self):
        if self.__t % self.__update_every == 0:
            if len(self.__memory) > self.batch_size:
                experiences = self.__memory.sample()
                self.update_actor_critic(experiences)

    def choose_action(self, state):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        state = torch.from_numpy(state.astype(np.float32)).to(device)
        action, action_perturbed = self.actor(state)
        return action, action_perturbed

    def update_actor_critic(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        next_actions, next_actions_perturbed = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        # If done == True, the bootstrap term is zero.
        Q_targets = rewards + (self.__gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic(states, actions)
        loss_func = nn.MSELoss()
        loss_critic = loss_func(Q_expected, Q_targets.detach())
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        # self.scheduler_critic.step()
        self.optimizer_critic.step()

        # New predicted actions, not the ones stored in the buffer.
        predicted_actions, predicted_actions_perturbed = self.actor(states)
        if self.noise_type == 'parameter':
            # If the distance between predicted_actions and
            # predicted_actions_perturbed is too big (>= 0.15), shrink the noise;
            # otherwise grow it.
            if (predicted_actions -
                    predicted_actions_perturbed).pow(2).mean() >= 0.15:
                self.actor.eps /= 1.01
                self.actor_target.eps /= 1.01
            else:
                self.actor.eps *= 1.01
                self.actor_target.eps *= 1.01
        loss_actor = -self.critic(states, predicted_actions).mean()
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        # self.scheduler_actor.step()
        self.optimizer_actor.step()

        self.soft_update(self.critic, self.critic_target)
        self.soft_update(self.actor, self.actor_target)

    def update_eps(self):
        self.actor.eps = max(self.actor.eps * self.actor.eps_decay,
                             self.actor.eps_min)
        self.actor_target.eps = max(
            self.actor_target.eps * self.actor_target.eps_decay,
            self.actor_target.eps_min)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.__tau * local_param.data +
                                    (1.0 - self.__tau) * target_param.data)

    def save_weights(self, save_to):
        actor_params_and_state_dict = {
            'actor_params': self.actor.actor_params,
            'state_dict': self.actor.state_dict()
        }
        critic_params_and_state_dict = {
            'critic_params': self.critic.critic_params,
            'state_dict': self.critic.state_dict()
        }
        file = dict()
        file['critic_params_and_state_dict'] = critic_params_and_state_dict
        file['actor_params_and_state_dict'] = actor_params_and_state_dict
        torch.save(file, open(save_to, 'wb'))

    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)
        critic_params_and_state_dict = checkpoint['critic_params_and_state_dict']
        actor_params_and_state_dict = checkpoint['actor_params_and_state_dict']
        self.actor = Actor(actor_params_and_state_dict['actor_params'])
        self.actor.load_state_dict(actor_params_and_state_dict['state_dict'])
        self.critic = Critic(critic_params_and_state_dict['critic_params'])
        self.critic.load_state_dict(critic_params_and_state_dict['state_dict'])
        return self
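
# update_actor_critic above adapts the parameter-noise scale eps by comparing the
# perturbed and unperturbed policies, in the spirit of Plappert et al., "Parameter
# Space Noise for Exploration". A minimal sketch of that rule on plain tensors
# (the threshold and factor mirror the code above; the tensors are toy values):

import torch

eps = 0.1
actions = torch.tensor([[0.2, -0.1]])
actions_perturbed = torch.tensor([[0.6, 0.3]])
distance = (actions - actions_perturbed).pow(2).mean()
if distance >= 0.15:
    eps /= 1.01  # policies diverged too much: shrink the weight noise
else:
    eps *= 1.01  # policies too similar: grow the weight noise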
class ActorCritic:
    def __init__(self, state_dim, action_dim, memory, load):
        self.memory = memory
        self.noise = OrnsteinUhlenbeckActionNoise(action_dim)

        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim, action_dim)
        self.target_actor = Actor(state_dim, action_dim)
        self.target_critic = Critic(state_dim, action_dim)
        self.critic.cuda()
        self.actor.cuda()
        self.target_critic.cuda()
        self.target_actor.cuda()

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                LEARNING_RATE)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 LEARNING_RATE)
        self.loss_funct = nn.SmoothL1Loss()

        if load != 0:
            self.load_models(load)  # load the model

        # Target and trained networks are the same when initializing
        self.net_update(self.target_actor, self.actor, True)
        self.net_update(self.target_critic, self.critic, True)

    # Predict an action with or without noise depending on the "train" flag
    def get_action(self, state, train):
        state = Variable(torch.from_numpy(np.float32(state)).type(torch.cuda.FloatTensor))
        action = self.actor.forward(state).detach().cpu().numpy()
        if train:
            noise = np.float32(self.noise.sample())
            return action + noise
        return action

    # Run the optimization:
    #   get the predicted action for the next state from the target actor,
    #   use it to predict that action's value with the target critic,
    #   use the predicted value to update the critic, then the actor,
    #   and soft-update the target networks to mirror the progress.
    def optimize(self):
        state, action, reward, next_state = self.memory.sample(BATCH_SIZE)
        state = Variable(torch.from_numpy(np.float32(state)).type(torch.cuda.FloatTensor))
        action = Variable(torch.from_numpy(np.float32(action)).type(torch.cuda.FloatTensor))
        reward = Variable(torch.from_numpy(np.float32(reward)).type(torch.cuda.FloatTensor))
        next_state = Variable(torch.from_numpy(np.float32(next_state)).type(torch.cuda.FloatTensor))

        next_action = self.target_actor.forward(next_state).detach()
        target = reward + GAMMA * torch.squeeze(
            self.target_critic.forward(next_state, next_action).detach())
        prediction = torch.squeeze(self.critic.forward(state, action))
        loss_critic = self.loss_funct(prediction, target)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        action = self.actor.forward(state)
        loss_actor = -1 * torch.sum(self.critic.forward(state, action))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        self.net_update(self.target_actor, self.actor, False)
        self.net_update(self.target_critic, self.critic, False)

    # Apply a soft or hard update to the target network
    def net_update(self, target, source, hard):
        degree = 1
        if not hard:
            degree = TAU
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - degree) +
                                    param.data * degree)

    # Store the models
    def save_models(self, episode):
        torch.save(self.target_actor.state_dict(),
                   'Models/' + str(episode) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   'Models/' + str(episode) + '_critic.pt')

    # Load the models
    def load_models(self, episode):
        self.actor.load_state_dict(torch.load('Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(torch.load('Models/' + str(episode) + '_critic.pt'))
        self.net_update(self.target_actor, self.actor, True)
        self.net_update(self.target_critic, self.critic, True)
        print('Models loaded successfully')
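
# ActorCritic above trains its critic with SmoothL1Loss (the Huber loss) rather
# than MSE. Huber is quadratic near zero but linear for large residuals, so a
# single outlier TD error produces a bounded gradient. A quick comparison with
# toy values (illustrative only):

import torch
import torch.nn as nn

pred = torch.tensor([0.0, 0.0, 0.0])
target = torch.tensor([0.1, 0.5, 10.0])  # one outlier TD target
print(nn.MSELoss()(pred, target))        # ~33.42: dominated by the outlier
print(nn.SmoothL1Loss()(pred, target))   # ~3.21: outlier contributes only linearly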
class DDPG_Agent():
    def __init__(self, state_size, action_size, num_agents):
        """
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 1
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        # Noise process
        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

    def step(self, states, actions, rewards, next_states, dones):
        """
        Add the experiences to the replay buffer, then sample randomly from that
        buffer to learn. (The random sampling breaks the correlation between
        sequential experiences.)
        """
        # Save experience
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for the given states."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                # Populate the list of actions one state at a time
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # We add noise for exploration purposes
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ### Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Calculate Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Gradient clipping stabilizes learning
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ### Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, regular_model, target_model, tau):
        """
        regular_model: the most up-to-date model, the one used for training
        target_model: the more stable model; the regular model's weights are
            copied toward it
        tau (float): interpolation parameter
        """
        for target_param, regular_param in zip(target_model.parameters(),
                                               regular_model.parameters()):
            target_param.data.copy_(tau * regular_param.data +
                                    (1.0 - tau) * target_param.data)
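
# DDPG_Agent above clips the critic's gradient norm to 1 before stepping, which
# bounds the size of any single update. A self-contained sketch of that call
# (the module and loss here are illustrative):

import torch
import torch.nn as nn

net = nn.Linear(4, 1)
loss = net(torch.randn(8, 4)).pow(2).mean() * 1e3  # deliberately large loss
loss.backward()
total_norm = torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
# total_norm is the pre-clip norm; afterwards the global gradient norm is at most 1.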
class TD3:
    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds

        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = copy.deepcopy(self.eval_actor_net)
        self.target_actor_net.eval()

        self.eval_critic_net1 = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net1)
        self.eval_critic_net1.train()
        self.eval_critic_net2 = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net2)
        self.eval_critic_net2.train()
        self.target_critic_net1 = copy.deepcopy(self.eval_critic_net1)
        self.target_critic_net1.eval()
        self.target_critic_net2 = copy.deepcopy(self.eval_critic_net2)
        self.target_critic_net2.eval()

        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU
        # We need a good teacher, so the critic should learn faster than the actor.
        self.optimizer_actor = torch.optim.Adam(self.eval_actor_net.parameters(),
                                                Config.LR_ACTOR, (0.9, 0.99))
        # A single optimizer over both critics would also work:
        # torch.optim.Adam([{'params': self.eval_critic_net1.parameters()},
        #                   {'params': self.eval_critic_net2.parameters()}],
        #                  Config.LR_CRITIC, (0.9, 0.99))
        self.optimizer_critic1 = torch.optim.Adam(
            self.eval_critic_net1.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.optimizer_critic2 = torch.optim.Adam(
            self.eval_critic_net2.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.gamma = Config.REWARD_DECAY
        self.policy_noise_clip = Config.POLICY_NOISE_CLIP
        self.policy_delay = Config.DELAY_POLICY_UPDATE_ITER
        self.learn_iter = 0

    def load_weights(self, net):
        # Keys in net.state_dict() look like 'layers.1.weight'.
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_transition(self, s, a, r, s_):
        self.memory.store([s, a, r, s_])

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        action = self.eval_actor_net(s).detach().squeeze(dim=0)
        return action

    def learn(self):
        self.learn_iter += 1

        batch_data = self.memory.sample(self.batch_size)
        s0, a0, r1, s1 = zip(*batch_data)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.float).view(self.batch_size,
                                                      len(self.action_bounds))
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        # Input (s, a), output q.
        q_s0_a0_1 = self.eval_critic_net1(s0, a0)
        q_s0_a0_2 = self.eval_critic_net2(s0, a0)

        # Select the next action according to the target policy and add clipped
        # noise (target policy smoothing), then clamp to the action bounds.
        noise = (torch.randn_like(a0) * self.policy_noise_clip * 2).clamp(
            -self.policy_noise_clip, self.policy_noise_clip)
        a1 = self.target_actor_net(s1).detach() + noise
        action_bound = self.action_bounds.expand_as(a1)
        a1[a1 < -action_bound] = -action_bound[a1 < -action_bound]
        a1[a1 > action_bound] = action_bound[a1 > action_bound]

        # Clipped double-Q: bootstrap from the smaller of the two target critics.
        q_s1_a1_1 = self.target_critic_net1(s1, a1).detach()
        q_s1_a1_2 = self.target_critic_net2(s1, a1).detach()
        q_s1_a1 = torch.min(q_s1_a1_1, q_s1_a1_2)
        q_target = r1 + self.gamma * q_s1_a1

        # Critic learning step:
        # loss = (Q(st, at) - (rt + gamma * Q'(st+1, mu'(st+1))))**2.
        # Here a0 is the stored batch action; pulling the eval critics' Q toward
        # q_target makes the evaluation more accurate.
        loss_critic = nn.MSELoss()(q_s0_a0_1, q_target) + nn.MSELoss()(
            q_s0_a0_2, q_target)
        self.optimizer_critic1.zero_grad()
        self.optimizer_critic2.zero_grad()
        loss_critic.backward()
        self.optimizer_critic1.step()
        self.optimizer_critic2.step()

        loss_actor = 0
        # Actor learning step (see https://zhuanlan.zhihu.com/p/84321382).
        # Delayed policy updates: the actor moves once per policy_delay critic steps.
        if self.learn_iter % self.policy_delay == 0:
            actor_a = self.eval_actor_net(s0)
            critic_q = self.eval_critic_net1(s0, actor_a)
            # loss = -q = -ce(s, ae(s)): the better the action, the higher its Q,
            # so minimizing -Q improves the actor.
            loss_actor = -torch.mean(critic_q)
            self.optimizer_actor.zero_grad()
            loss_actor.backward()
            self.optimizer_actor.step()

            # Update the frozen target models.
            for param, target_param in zip(self.eval_critic_net1.parameters(),
                                           self.target_critic_net1.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.eval_critic_net2.parameters(),
                                           self.target_critic_net2.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.eval_actor_net.parameters(),
                                           self.target_actor_net.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss) + 1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
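
# The TD3 target above combines target policy smoothing with clipped double-Q:
#   y = r + gamma * min(Q1'(s', a~), Q2'(s', a~)),  a~ = clip(mu'(s') + clipped noise).
# A tensor-level sketch of that target (all values are toy; the noise shape
# mirrors the code above):

import torch

r = torch.tensor([[1.0]])
q1_next = torch.tensor([[5.0]])
q2_next = torch.tensor([[4.0]])  # the smaller estimate wins
gamma, noise_clip, bound = 0.99, 0.5, 2.0

a_next = torch.tensor([[1.9]])
noise = (torch.randn_like(a_next) * noise_clip * 2).clamp(-noise_clip, noise_clip)
a_next = (a_next + noise).clamp(-bound, bound)  # smoothed, then clipped to bounds

y = r + gamma * torch.min(q1_next, q2_next)  # -> 1 + 0.99 * 4 = 4.96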
class Agent: """ Interacts with and learns from the environment. """ def __init__(self, state_size, action_size, num_agents, random_seed): """ Initialize an Agent Params ====== state_size (int): state dimension action_size (int): action dimension num_agents (int): simultaneous running agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents random.seed(random_seed) # Actor Network and its target network self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network and its target network self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise object self.noise = OUNoise((num_agents, action_size), random_seed) # Replay Memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, EXPERIENCES_PER_SAMPLING, device, random_seed) # Initialize time step (for updating every UPDATE_NN_EVERY steps) self.t_step_nn = 0 # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps) self.t_step_mem_par = 0 # Initialize time step (for updating every UPDATE_MEM_EVERY steps) self.t_step_mem = 0 def step(self, state, action, reward, next_state, done): """ Save experience in replay memory, and use prioritized sample from buffer to learn. """ # Save memory for i in range(self.num_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) # Learn every UPDATE_NN_EVERY time steps. self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY if self.t_step_mem_par == 0: self.memory.update_parameters() if self.t_step_nn == 0: # Learn from memory if enough samples exist if self.memory.experience_count > EXPERIENCES_PER_SAMPLING: experiences = self.memory.sample() self.learn(experiences, GAMMA) if self.t_step_mem == 0: self.memory.update_memory_sampling() def act(self, states, add_noise=True): """ Returns actions for given state as per current policy. """ states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for i, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[i, :] = action self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """ Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, indices = experiences # update Critic # Get next predicted state, actions, and Q values actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current state Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute Critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Update Actor actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update target networks self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # Update priorities delta = abs(Q_targets - Q_expected).detach().numpy() self.memory.update_priorities(delta, indices) @staticmethod def soft_update(local_model, target_model, tau): """ Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_model_param, local_model_param in zip( target_model.parameters(), local_model.parameters()): target_model_param.data.copy_(tau * local_model_param.data + (1. - tau) * target_model_param.data)
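
# learn() above feeds |TD error| back to the buffer as the sampling priority.
# A minimal sketch of turning TD errors into sampling probabilities under
# proportional prioritization, p_i ** alpha / sum(p ** alpha); the alpha and
# epsilon values here are illustrative, not from the source:

import numpy as np

td_errors = np.array([0.1, 2.0, 0.5])
alpha, eps = 0.6, 1e-5
priorities = (np.abs(td_errors) + eps) ** alpha
probs = priorities / priorities.sum()  # large-error transitions are replayed more often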
class AsyncDDPG(object):
    def __init__(self, gamma, s, a, learningRate=1e-3, criticpath=None,
                 actorpath=None):
        self.gamma = gamma
        self.actor = Actor(state=s, actions=a, hidden1=180, hidden2=87)
        self.critic = Critic(state=s, actions=a, hidden1=250, hidden2=100)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))
        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(), learningRate)
        # More a dimensionality thing.
        self.state = s
        self.action = a
        self.count = 0

    def PerformUpdate(self, batchsize, target):
        # Mildly important, per https://github.com/vy007vikas/PyTorch-ActorCriticRL:
        # the criterion on the actor is sum(-Q(s, a)), presumably over the batch.
        self.actorOptimizer.zero_grad()
        self.criticOptimizer.zero_grad()
        batch = target.getBatchMemory(batchsize)
        Q = torch.zeros(len(batch), self.state + self.action)
        Qprime = torch.zeros(len(batch), self.state + self.action)
        rewards = torch.zeros(len(batch), 1)
        # This loop generates the (s, a) and (s', a') inputs for the whole batch.
        i = 0
        for sample in batch:
            Q[i, :] = torch.cat((sample['s'], sample['a']))
            transition = target.targetActor(
                Variable(sample['sprime'], volatile=True)).data
            Qprime[i, :] = torch.cat((sample['sprime'], transition), dim=0)
            rewards[i, 0] = sample['r'][0]
            i += 1
        # Critic update
        Qprime = self.gamma * target.targetCritic(Variable(Qprime)).data + rewards
        Qprime = Variable(Qprime)
        Q = self.critic(Variable(Q))
        criterion = torch.nn.MSELoss()
        loss = criterion(Q, Qprime)
        loss.backward()
        self.criticOptimizer.step()
        # Actor update
        self.actorOptimizer.zero_grad()
        S = torch.zeros(len(batch), self.state)
        i = 0
        for sample in batch:
            S[i, :] = sample['s']
            i += 1
        A = self.actor(Variable(S))
        loss = -1 * torch.sum(self.critic(torch.cat((Variable(S), A), dim=1)))
        loss.backward()
        self.actorOptimizer.step()

    def getActor(self):
        return self.actor

    def getCritic(self):
        return self.critic

    def ProduceTargetActorCritic(self, memory=2000, tau=.25, epsilon=.5):
        print(self.count)
        self.count += 1
        s = self.state
        a = self.action
        return TargetActorCritic(self.actor, self.critic, memory, s, a, tau,
                                 epsilon=epsilon)

    def saveActorCritic(self):
        torch.save(self.critic.state_dict(), './AsyncCritic')
        torch.save(self.actor.state_dict(), './AsyncActor')
class Actor_Critic:
    def __init__(self, n_features, actions=None, is_continues=None):
        self.actions = actions
        self.is_continues = is_continues
        self.actor_net = Actor(n_features, actions=actions,
                               is_continues=is_continues)
        self.critic_net = Critic(n_features)
        self.load_weights(self.actor_net)
        self.load_weights(self.critic_net)
        # We need a good teacher, so the critic should learn faster than the actor.
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                                Config.LR_ACTOR, (0.9, 0.99))
        self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(),
                                                 Config.LR_CRITIC, (0.9, 0.99))
        self.gamma = Config.REWARD_DECAY
        # Trajectory buffers (kept separate from self.actions, the action spec,
        # so storing a trajectory does not clobber it).
        self.states = []
        self.taken_actions = []
        self.rewards = []

    def load_weights(self, net):
        # Keys in net.state_dict() look like 'layers.1.weight'.
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_trajectory(self, s, a, r):
        self.states.append(s)
        self.taken_actions.append(a)
        self.rewards.append(r)

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        if self.is_continues:
            mu, sigma = self.actor_net(s)
            mu, sigma = mu.detach().squeeze(), sigma.detach().squeeze()
            normal_dist = torch.distributions.Normal(mu * 2, sigma + 0.1)
            action = torch.clamp(normal_dist.sample((1, )),
                                 min=-self.actions[0], max=self.actions[0])
        else:
            # Probability of each action.
            actions_probs = F.softmax(self.actor_net(s).detach(), dim=1)
            # Sample an action according to those probabilities.
            action = random.choices(range(actions_probs.size(1)),
                                    weights=actions_probs.squeeze(0))[0]
        return action

    def learn(self, s, a, r, s_):
        # Note: this update assumes the continuous-action case.
        s = torch.from_numpy(s).unsqueeze(dim=0).float()
        s_ = torch.from_numpy(s_).unsqueeze(dim=0).float()
        r = torch.tensor(r)
        a = torch.tensor(a).unsqueeze(dim=0)
        V_st = self.critic_net(s).squeeze(dim=0)
        V_st_ = self.critic_net(s_).squeeze(dim=0)
        # td_error = Q(st, at) - V(st), with Q(st, at) = r + gamma * V(st+1),
        # so td_error = r + gamma * V(st+1) - V(st).
        td_error = r + self.gamma * V_st_ - V_st
        loss_critic = td_error**2
        # Critic learning step.
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()
        # Actor learning step: maximize log-prob weighted by the TD error
        # (the minus sign turns gradient ascent into a loss to minimize).
        mu, sigma = self.actor_net(s)
        mu, sigma = mu.squeeze(), sigma.squeeze()
        normal_dist = torch.distributions.Normal(mu * 2, sigma + 0.1)
        log_prob = normal_dist.log_prob(a)
        loss_actor = -torch.sum(log_prob * td_error.detach())
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()
        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss) + 1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
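
# chose_action above samples discrete actions from the softmax of the actor's
# logits. A self-contained sketch of that branch (the logits are toy values):

import random
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 0.5, 0.1]])
probs = F.softmax(logits, dim=1)
action = random.choices(range(probs.size(1)), weights=probs.squeeze(0).tolist())[0]
# action is 0 most of the time, matching the largest logit.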
class Actor_Critic:
    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds
        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = Actor(n_features, action_bounds)
        self.target_actor_net.eval()
        self.eval_critic_net = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net)
        self.eval_critic_net.train()
        self.target_critic_net = Critic(n_features, action_bounds)
        self.target_critic_net.eval()
        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU
        # We need a good teacher, so the critic should learn faster than the actor.
        self.optimizer_actor = torch.optim.Adam(self.eval_actor_net.parameters(),
                                                Config.LR_ACTOR, (0.9, 0.99))
        self.optimizer_critic = torch.optim.Adam(self.eval_critic_net.parameters(),
                                                 Config.LR_CRITIC, (0.9, 0.99))
        self.gamma = Config.REWARD_DECAY

    def load_weights(self, net):
        # Keys in net.state_dict() look like 'layers.1.weight'.
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_transition(self, s, a, r, s_):
        self.memory.store([s, a, r, s_])

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        action = self.eval_actor_net(s).detach().squeeze(dim=0)
        return action

    def learn(self):
        # Soft-update the target networks in place:
        # theta_target <- tau * theta_eval + (1 - tau) * theta_target.
        # (state_dict() returns references to the parameter tensors, so copy_
        # mutates them directly.)
        for k, v in self.eval_critic_net.state_dict().items():
            self.target_critic_net.state_dict()[k].copy_(
                self.tau * v +
                (1 - self.tau) * self.target_critic_net.state_dict()[k])
        for k, v in self.eval_actor_net.state_dict().items():
            self.target_actor_net.state_dict()[k].copy_(
                self.tau * v +
                (1 - self.tau) * self.target_actor_net.state_dict()[k])

        batch_data = self.memory.sample(self.batch_size)
        s0, a0, r1, s1 = zip(*batch_data)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.float).view(self.batch_size,
                                                      len(self.action_bounds))
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        # Input (s, a), output q.
        q_s0_a0 = self.eval_critic_net(s0, a0)
        # Input (s', a'), output q' for the target: get a' from the target actor.
        a1 = self.target_actor_net(s1).detach()
        q_s1_a1 = self.target_critic_net(s1, a1).detach()
        q_target = r1 + self.gamma * q_s1_a1

        # Critic learning step:
        # loss = (Q(st, at) - (rt + gamma * Q'(st+1, mu'(st+1))))**2.
        # Here a0 is the stored batch action; pulling the eval critic's Q toward
        # q_target makes the evaluation more accurate.
        loss_critic = nn.MSELoss()(q_s0_a0, q_target)
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        # Actor learning step (see https://zhuanlan.zhihu.com/p/84321382):
        # loss = -q = -ce(s, ae(s)); the better the action, the higher its Q,
        # so minimizing -Q improves the actor.
        actor_a = self.eval_actor_net(s0)
        critic_q = self.eval_critic_net(s0, actor_a)
        loss_actor = -torch.mean(critic_q)
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()
        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss) + 1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
class DDPG_Agent():
    def __init__(self, state_size, action_size, num_agents):
        """
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 10
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        # Make sure the actor target network has the same weight values as the
        # local network
        for target, local in zip(self.actor_target.parameters(),
                                 self.actor_local.parameters()):
            target.data.copy_(local.data)

        # Critic Network (w/ Target Network); sized for the joint observation and
        # action of all agents (centralized critic)
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        # Single-agent alternative:
        # self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        # self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        # self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
        #                                    lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Make sure the critic target network has the same weight values as the
        # local network
        for target, local in zip(self.critic_target.parameters(),
                                 self.critic_local.parameters()):
            target.data.copy_(local.data)

        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, noise=0.0):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if ADD_NOISE:
            action += self.noise.sample() * noise
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        # Used only for DDPG (use maddpg.maddpg_learn() for MADDPG)
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
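
# DDPG_Agent above sizes its critic for state_size * num_agents and
# action_size * num_agents, i.e. a centralized critic over the joint observation
# and action, MADDPG-style. A sketch of how the joint input is formed (the shapes
# here are illustrative):

import torch

num_agents, state_size, action_size, batch = 2, 3, 1, 4
states = torch.randn(batch, num_agents, state_size)
actions = torch.randn(batch, num_agents, action_size)
joint_states = states.reshape(batch, num_agents * state_size)     # (4, 6)
joint_actions = actions.reshape(batch, num_agents * action_size)  # (4, 2)
# critic_local(joint_states, joint_actions) would then score the joint behaviour.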
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from Actor import Actor
from Critic import Critic

device = 'cuda' if torch.cuda.is_available() else 'cpu'

env = gym.make("Pendulum-v0")
actor = Actor(env.observation_space.shape[0], env.action_space.shape[0]).to(device)
critic = Critic(env.observation_space.shape[0]).to(device)

criterion = nn.MSELoss().to(device)
Actor_Optimizer = optim.SGD(actor.parameters(), lr=1e-4)
Critic_Optimizer = optim.SGD(critic.parameters(), lr=1e-4)

for episode in range(5):
    state = env.reset()
    returns = np.zeros(1000)
    for step in range(1000):
        # Minimal sketch of one interaction and update step, assuming Actor maps
        # a state to an action tensor and Critic maps a state to a scalar V(s).
        state_t = torch.tensor(state, dtype=torch.float32, device=device)
        with torch.no_grad():
            action = actor(state_t).cpu().numpy()
        action = np.clip(action, env.action_space.low, env.action_space.high)
        next_state, reward, done, _ = env.step(action)
        returns[step] = reward

        # Critic update: regress V(s) toward the one-step TD target r + γ·V(s').
        next_state_t = torch.tensor(next_state, dtype=torch.float32, device=device)
        with torch.no_grad():
            td_target = reward + 0.99 * critic(next_state_t) * (1 - float(done))
        critic_loss = criterion(critic(state_t), td_target)
        Critic_Optimizer.zero_grad()
        critic_loss.backward()
        Critic_Optimizer.step()

        # An actor update is omitted here: with a state-only critic it depends on
        # whether Actor is stochastic (policy gradient) or deterministic.
        state = next_state
        if done:
            break