# Shared imports for the implementations below; Actor, Critic, ReplayMemory and
# the other network/buffer classes are defined elsewhere in the repo.
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable


class DDPG:
    def __init__(self, gamma, memory, s, a, tau, learningRate=1e-3,
                 criticpath=None, actorpath=None):
        self.gamma = gamma
        self.memory = ReplayMemory(memory)
        self.actor = Actor(state=s, actions=a)
        self.critic = Critic(state=s, actions=a)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))
        self.targetActor = Actor(state=s, actions=a)
        self.targetActor.load_state_dict(self.actor.state_dict())
        self.targetCritic = Critic(state=s, actions=a)
        self.targetCritic.load_state_dict(self.critic.state_dict())
        self.tau = tau
        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(), learningRate)
        # State/action sizes are kept mainly for dimensionality bookkeeping.
        self.state = s
        self.action = a
        self.OUarray = np.zeros((1000, self.action), dtype="f")
        self.step = 0

    def processNoise(self):
        # Simple uniform exploration noise: each component is kept, zeroed,
        # or negated with roughly equal probability.
        ret = torch.rand(self.action)
        for i in range(self.action):
            r = random.random()
            if r <= .33:
                pass                    # keep the sampled value
            elif .33 < r <= .66:
                ret[i] = 0
            else:
                ret[i] = -ret[i]
        return ret

    def OUprocess(self, sigma, theta, mu):
        # Pre-compute an Ornstein-Uhlenbeck noise trajectory for each action dimension.
        t_0 = 0
        t_end = 10
        length = 1000
        y = np.zeros((length, self.action), dtype="f")
        t = np.linspace(t_0, t_end, length)          # time axis
        dt = np.mean(np.diff(t))
        drift = lambda y, t: theta * (mu - y)        # drift term
        diffusion = lambda y, t: sigma               # diffusion term
        # Solve the SDE with Euler-Maruyama, one column per action dimension.
        for j in range(self.action):
            y[0][j] = np.random.normal(loc=0.0, scale=1.0)   # initial condition
            noise = np.random.normal(loc=0.0, scale=1.0, size=length) * np.sqrt(dt)
            for i in range(1, length):
                y[i][j] = (y[i - 1][j]
                           + drift(y[i - 1][j], i * dt) * dt
                           + diffusion(y[i - 1][j], i * dt) * noise[i])
        self.OUarray = y

    def selectAction(self, state):
        # state is expected to be a torch tensor of size self.state.
        ret = self.targetActor(Variable(state)).data
        ret = ret + torch.from_numpy(self.OUarray[self.step])
        self.step += 1
        return torch.clamp(ret, 0.0, 1.0)

    def addToMemory(self, state, action, reward, stateprime):
        self.memory.push(state, action, reward, stateprime)

    def primedToLearn(self):
        return self.memory.isFull()

    def PerformUpdate(self, batchsize):
        # Per https://github.com/vy007vikas/PyTorch-ActorCriticRL the actor
        # criterion is sum(-Q(s, a)), taken over the batch.
        self.actorOptimizer.zero_grad()
        self.criticOptimizer.zero_grad()
        batch = self.memory.batch(batchsize)
        Q = torch.zeros(len(batch), self.state + self.action)
        Qprime = torch.zeros(len(batch), self.state + self.action)
        rewards = torch.zeros(len(batch), 1)
        # Build the (s, a) and (s', mu'(s')) inputs for the whole batch.
        i = 0
        for sample in batch:
            Q[i, :] = torch.cat((sample['s'], sample['a']))
            transition = self.targetActor(Variable(sample['sprime'], volatile=True)).data
            Qprime[i, :] = torch.cat((sample['sprime'], transition), dim=0)
            rewards[i, 0] = sample['r'][0]
            i += 1
        # Critic update: y = r + gamma * Q'(s', mu'(s'))
        Qprime = self.gamma * self.targetCritic(Variable(Qprime)).data + rewards
        Qprime = Variable(Qprime)
        Q = self.critic(Variable(Q))
        criterion = torch.nn.MSELoss()
        loss = criterion(Q, Qprime)
        loss.backward()
        self.criticOptimizer.step()
        # Actor update: maximize Q(s, mu(s)) by minimizing -sum Q(s, mu(s)).
        self.actorOptimizer.zero_grad()
        S = torch.zeros(len(batch), self.state)
        i = 0
        for sample in batch:
            S[i, :] = sample['s']
            i += 1
        A = self.actor(Variable(S))
        loss = -1 * torch.sum(self.critic(torch.cat((Variable(S), A), dim=1)))
        loss.backward()
        self.actorOptimizer.step()

    def UpdateTargetNetworks(self):
        # Soft (Polyak) update of both target networks.
        criticDict = self.critic.state_dict()
        tCriticDict = self.targetCritic.state_dict()
        for param in criticDict.keys():
            tCriticDict[param] = tCriticDict[param] * (1 - self.tau) + criticDict[param] * self.tau
        actorDict = self.actor.state_dict()
        tActorDict = self.targetActor.state_dict()
        for param in actorDict.keys():
            tActorDict[param] = tActorDict[param] * (1 - self.tau) + actorDict[param] * self.tau
        self.targetCritic.load_state_dict(tCriticDict)
        self.targetActor.load_state_dict(tActorDict)

    def saveActorCritic(self):
        torch.save(self.critic.state_dict(), './critic')
        torch.save(self.actor.state_dict(), './actor')

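# Illustrative sketch (not part of the original code): a minimal training loop
# wiring DDPG together. It assumes a Gym-style `env` whose observations and
# rewards convert cleanly to float tensors; `state_dim`, `action_dim` and the
# hyperparameter values below are hypothetical placeholders.
def train_ddpg_sketch(env, state_dim, action_dim, episodes=100):
    agent = DDPG(gamma=0.99, memory=10000, s=state_dim, a=action_dim, tau=0.01)
    for _ in range(episodes):
        agent.OUprocess(sigma=0.2, theta=0.15, mu=0.0)   # fresh noise trajectory
        agent.step = 0                                   # restart the noise index
        state = torch.FloatTensor(env.reset())
        for _ in range(1000):
            action = agent.selectAction(state)
            next_obs, reward, done, _ = env.step(action.numpy())
            next_state = torch.FloatTensor(next_obs)
            agent.addToMemory(state, action, torch.FloatTensor([reward]), next_state)
            if agent.primedToLearn():
                agent.PerformUpdate(batchsize=64)
                agent.UpdateTargetNetworks()
            state = next_state
            if done:
                break
    agent.saveActorCritic()
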
class ActorCritic:
    # LEARNING_RATE, BATCH_SIZE, GAMMA and TAU are module-level hyperparameter constants.
    def __init__(self, state_dim, action_dim, memory, load):
        self.memory = memory
        self.noise = OrnsteinUhlenbeckActionNoise(action_dim)
        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim, action_dim)
        self.target_actor = Actor(state_dim, action_dim)
        self.target_critic = Critic(state_dim, action_dim)
        self.critic.cuda()
        self.actor.cuda()
        self.target_critic.cuda()
        self.target_actor.cuda()
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), LEARNING_RATE)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), LEARNING_RATE)
        self.loss_funct = nn.SmoothL1Loss()
        if load != 0:
            self.load_models(load)   # load a saved model
        # Target and trained networks are identical at initialization.
        self.net_update(self.target_actor, self.actor, True)
        self.net_update(self.target_critic, self.critic, True)

    # Predict an action, with or without noise depending on the "train" flag.
    def get_action(self, state, train):
        state = Variable(torch.from_numpy(np.float32(state)).type(torch.cuda.FloatTensor))
        action = self.actor.forward(state).detach().cpu().numpy()
        if train:
            noise = np.float32(self.noise.sample())
            return action + noise
        return action

    # Run the optimization:
    #   1. Predict an action for the next state with the target actor.
    #   2. Evaluate that action with the target critic to form the TD target.
    #   3. Update the critic, then the actor.
    #   4. Soft-update the target networks to track the trained ones.
    def optimize(self):
        state, action, reward, next_state = self.memory.sample(BATCH_SIZE)
        state = Variable(torch.from_numpy(np.float32(state)).type(torch.cuda.FloatTensor))
        action = Variable(torch.from_numpy(np.float32(action)).type(torch.cuda.FloatTensor))
        reward = Variable(torch.from_numpy(np.float32(reward)).type(torch.cuda.FloatTensor))
        next_state = Variable(torch.from_numpy(np.float32(next_state)).type(torch.cuda.FloatTensor))
        next_action = self.target_actor.forward(next_state).detach()
        target = reward + GAMMA * torch.squeeze(self.target_critic.forward(next_state, next_action).detach())
        prediction = torch.squeeze(self.critic.forward(state, action))
        loss_critic = self.loss_funct(prediction, target)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()
        action = self.actor.forward(state)
        loss_actor = -1 * torch.sum(self.critic.forward(state, action))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()
        self.net_update(self.target_actor, self.actor, False)
        self.net_update(self.target_critic, self.critic, False)

    # Apply a soft (Polyak) or hard update to the target network.
    def net_update(self, target, source, hard):
        degree = 1 if hard else TAU
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - degree) + param.data * degree)

    # Store the models.
    def save_models(self, episode):
        torch.save(self.target_actor.state_dict(), 'Models/' + str(episode) + '_actor.pt')
        torch.save(self.target_critic.state_dict(), 'Models/' + str(episode) + '_critic.pt')

    # Load the models.
    def load_models(self, episode):
        self.actor.load_state_dict(torch.load('Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(torch.load('Models/' + str(episode) + '_critic.pt'))
        self.net_update(self.target_actor, self.actor, True)
        self.net_update(self.target_critic, self.critic, True)
        print('Models loaded successfully')

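# The OrnsteinUhlenbeckActionNoise used above is not defined in this file.
# A minimal sketch of a conventional implementation, using the standard
# discretization dX = theta * (mu - X) + sigma * dW; the parameter defaults
# are illustrative, not the original author's values.
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.X = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Reset the process to its mean, e.g. at the start of an episode.
        self.X = np.ones(self.action_dim) * self.mu

    def sample(self):
        # One Euler step of the OU process; returns the current noise vector.
        dx = self.theta * (self.mu - self.X) + self.sigma * np.random.randn(self.action_dim)
        self.X = self.X + dx
        return self.X
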
class Agent():
    def __init__(self, params):
        self.action_size = params['action_size']
        self.state_size = params['state_size']
        self.num_agents = params['num_agents']
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__update_every = params['update_every']
        self.__save_to = params['save_to']
        self.__memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.__lr = params['lr']
        self.noise_type = params['noise_type']

        actor_params = dict()
        actor_params['arch_params_actor'] = params['arch_params_actor']
        actor_params['action_size'] = self.action_size
        actor_params['state_size'] = self.state_size
        actor_params['eps'] = params['eps']
        actor_params['eps_decay'] = params['eps_decay']
        actor_params['eps_min'] = params['min_eps']
        actor_params['noise_type'] = params['noise_type']
        self.actor = Actor(actor_params)
        self.actor_target = Actor(actor_params)
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=self.__lr)
        self.scheduler_actor = optim.lr_scheduler.StepLR(self.optimizer_actor, step_size=100, gamma=0.95)

        critic_params = dict()
        critic_params['arch_params_critic'] = params['arch_params_critic']
        critic_params['action_size'] = self.action_size
        critic_params['state_size'] = self.state_size
        self.critic = Critic(critic_params)
        self.critic_target = Critic(critic_params)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=self.__lr)
        self.scheduler_critic = optim.lr_scheduler.StepLR(self.optimizer_critic, step_size=100, gamma=0.95)
        self.__t = 0

    def memorize_experience(self, state, action, reward, next_state, done):
        self.__memory.add(state, action.detach(), reward, next_state, done)
        self.__t += 1

    def learn_from_past_experiences(self):
        if self.__t % self.__update_every == 0:
            if len(self.__memory) > self.batch_size:
                experiences = self.__memory.sample()
                self.update_actor_critic(experiences)

    def choose_action(self, state):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        state = torch.from_numpy(state.astype(dtype=np.float32)).to(device)
        action, action_perturbed = self.actor(state)
        return action, action_perturbed

    def update_actor_critic(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Critic update: y = r + gamma * Q_target(s', mu_target(s')) * (1 - done),
        # so the bootstrap term vanishes for terminal transitions.
        next_actions, next_actions_perturbed = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards + (self.__gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic(states, actions)
        loss_func = nn.MSELoss()
        loss_critic = loss_func(Q_expected, Q_targets.detach())
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        # self.scheduler_critic.step()
        self.optimizer_critic.step()

        # Actor update uses freshly predicted actions, not the ones stored in the buffer.
        predicted_actions, predicted_actions_perturbed = self.actor(states)
        if self.noise_type == 'parameter':
            # Adapt the parameter-noise scale: shrink it when the perturbed and
            # unperturbed actions drift too far apart (mean squared distance >= 0.15),
            # grow it otherwise.
            if (predicted_actions - predicted_actions_perturbed).pow(2).mean() >= 0.15:
                self.actor.eps /= 1.01
                self.actor_target.eps /= 1.01
            else:
                self.actor.eps *= 1.01
                self.actor_target.eps *= 1.01
        loss_actor = -self.critic(states, predicted_actions).mean()
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        # self.scheduler_actor.step()
        self.optimizer_actor.step()

        self.soft_update(self.critic, self.critic_target)
        self.soft_update(self.actor, self.actor_target)

    def update_eps(self):
        self.actor.eps = max(self.actor.eps * self.actor.eps_decay, self.actor.eps_min)
        self.actor_target.eps = max(self.actor_target.eps * self.actor_target.eps_decay,
                                    self.actor_target.eps_min)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.__tau * local_param.data + (1.0 - self.__tau) * target_param.data)

    def save_weights(self, save_to):
        actor_params_and_state_dict = {
            'actor_params': self.actor.actor_params,
            'state_dict': self.actor.state_dict()
        }
        critic_params_and_state_dict = {
            'critic_params': self.critic.critic_params,
            'state_dict': self.critic.state_dict()
        }
        file = dict()
        file['critic_params_and_state_dict'] = critic_params_and_state_dict
        file['actor_params_and_state_dict'] = actor_params_and_state_dict
        torch.save(file, open(save_to, 'wb'))

    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)
        critic_params_and_state_dict = checkpoint['critic_params_and_state_dict']
        actor_params_and_state_dict = checkpoint['actor_params_and_state_dict']
        self.actor = Actor(actor_params_and_state_dict['actor_params'])
        self.actor.load_state_dict(actor_params_and_state_dict['state_dict'])
        self.critic = Critic(critic_params_and_state_dict['critic_params'])
        self.critic.load_state_dict(critic_params_and_state_dict['state_dict'])
        return self

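# The ReplayBuffer used by Agent is not shown here. Below is a minimal sketch
# of a uniform-sampling buffer compatible with the calls above (add / __len__ /
# sample returning batched float tensors); the tensor-conversion details are
# assumptions, not the original implementation.
from collections import deque

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def __len__(self):
        return len(self.memory)

    def sample(self):
        # Uniformly sample a batch and convert each field to a float32 tensor.
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.as_tensor(np.array(states), dtype=torch.float32)
        actions = torch.stack([torch.as_tensor(a, dtype=torch.float32) for a in actions])
        rewards = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(1)
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
        dones = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(1)
        return states, actions, rewards, next_states, dones
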
class AsyncDDPG(object):
    def __init__(self, gamma, s, a, learningRate=1e-3, criticpath=None, actorpath=None):
        self.gamma = gamma
        self.actor = Actor(state=s, actions=a, hidden1=180, hidden2=87)
        self.critic = Critic(state=s, actions=a, hidden1=250, hidden2=100)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))
        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(), learningRate)
        # State/action sizes are kept mainly for dimensionality bookkeeping.
        self.state = s
        self.action = a
        self.count = 0

    def PerformUpdate(self, batchsize, target):
        # Per https://github.com/vy007vikas/PyTorch-ActorCriticRL the actor
        # criterion is sum(-Q(s, a)), taken over the batch. The batch and the
        # target networks come from the shared `target` worker object.
        self.actorOptimizer.zero_grad()
        self.criticOptimizer.zero_grad()
        batch = target.getBatchMemory(batchsize)
        Q = torch.zeros(len(batch), self.state + self.action)
        Qprime = torch.zeros(len(batch), self.state + self.action)
        rewards = torch.zeros(len(batch), 1)
        # Build the (s, a) and (s', mu'(s')) inputs for the whole batch.
        i = 0
        for sample in batch:
            Q[i, :] = torch.cat((sample['s'], sample['a']))
            transition = target.targetActor(Variable(sample['sprime'], volatile=True)).data
            Qprime[i, :] = torch.cat((sample['sprime'], transition), dim=0)
            rewards[i, 0] = sample['r'][0]
            i += 1
        # Critic update: y = r + gamma * Q'(s', mu'(s'))
        Qprime = self.gamma * target.targetCritic(Variable(Qprime)).data + rewards
        Qprime = Variable(Qprime)
        Q = self.critic(Variable(Q))
        criterion = torch.nn.MSELoss()
        loss = criterion(Q, Qprime)
        loss.backward()
        self.criticOptimizer.step()
        # Actor update: maximize Q(s, mu(s)) by minimizing -sum Q(s, mu(s)).
        self.actorOptimizer.zero_grad()
        S = torch.zeros(len(batch), self.state)
        i = 0
        for sample in batch:
            S[i, :] = sample['s']
            i += 1
        A = self.actor(Variable(S))
        loss = -1 * torch.sum(self.critic(torch.cat((Variable(S), A), dim=1)))
        loss.backward()
        self.actorOptimizer.step()

    def getActor(self):
        return self.actor

    def getCritic(self):
        return self.critic

    def ProduceTargetActorCritic(self, memory=2000, tau=.25, epsilon=.5):
        print(self.count)
        self.count += 1
        s = self.state
        a = self.action
        return TargetActorCritic(self.actor, self.critic, memory, s, a, tau, epsilon=epsilon)

    def saveActorCritic(self):
        torch.save(self.critic.state_dict(), './AsyncCritic')
        torch.save(self.actor.state_dict(), './AsyncActor')

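# Illustrative sketch (assumption, not the original training code): how one
# asynchronous worker might drive the shared AsyncDDPG learner. The per-worker
# TargetActorCritic is assumed to expose selectAction/addToMemory/primedToLearn
# mirroring the DDPG class above, plus getBatchMemory as used in PerformUpdate;
# `env` is a hypothetical Gym-style environment.
def async_worker_sketch(learner, env, steps=10000, batchsize=64):
    target = learner.ProduceTargetActorCritic(memory=2000, tau=0.25, epsilon=0.5)
    state = torch.FloatTensor(env.reset())
    for _ in range(steps):
        action = target.selectAction(state)                     # assumed worker-side method
        next_obs, reward, done, _ = env.step(action.numpy())
        next_state = torch.FloatTensor(next_obs)
        target.addToMemory(state, action, torch.FloatTensor([reward]), next_state)
        if target.primedToLearn():
            learner.PerformUpdate(batchsize, target)             # gradient step on the shared networks
        state = next_state if not done else torch.FloatTensor(env.reset())
    learner.saveActorCritic()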