def store_experience(self, state, action, nextState, reward, info):

    if self.experienceProcessor is not None:
        state, action, nextState, reward = self.experienceProcessor(state, action, nextState, reward, info)

    # caution: using multiple-step forward return can increase variance
    if self.nStepForward > 1:

        # if this is the final state, we do additional backups to increase useful learning experience
        if nextState is None:
            transitions = []
            transitions.append(Transition(state, action, nextState, reward))
            R = reward
            while len(self.nStepBuffer) > 0:
                state, action, next_state, reward_old = self.nStepBuffer.pop(0)
                R = reward_old + self.gamma * R
                transNew = Transition(state, action, None, R)
                transitions.append(transNew)
            for tran in transitions:
                if self.priorityMemoryOption:
                    self.memory.store(tran)
                else:
                    self.memory.push(tran)
        else:
            # otherwise we calculate the normal n-step return
            self.nStepBuffer.append((state, action, nextState, reward))

            if len(self.nStepBuffer) < self.nStepForward:
                return

            R = sum([self.nStepBuffer[i][3] * (self.gamma ** i) for i in range(self.nStepForward)])

            state, action, _, _ = self.nStepBuffer.pop(0)

            transition = Transition(state, action, nextState, R)

            if self.priorityMemoryOption:
                self.memory.store(transition)
            else:
                self.memory.push(transition)
    else:
        # one-step transition
        transition = Transition(state, action, nextState, reward)
        if self.priorityMemoryOption:
            self.memory.store(transition)
        else:
            self.memory.push(transition)
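# A minimal, standalone sketch (not part of the agent class above) of the truncated
# n-step return computed in store_experience, assuming a list of buffered rewards
# ordered from oldest to newest and a discount factor gamma:
# R = sum_i gamma**i * rewards[i].
def n_step_return(rewards, gamma):
    # discounted sum of the buffered rewards
    return sum(r * (gamma ** i) for i, r in enumerate(rewards))

# example: n_step_return([1.0, 0.0, 2.0], 0.9) == 1.0 + 0.9 * 0.0 + 0.81 * 2.0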
def prepare_minibatch(self, state, action, nextState, reward, info):
    # first store the experience in memory
    self.store_experience(state, action, nextState, reward, info)

    if len(self.memory) < self.trainBatchSize:
        return

    transitions_raw = self.memory.sample(self.trainBatchSize)
    transitions = Transition(*zip(*transitions_raw))

    action = torch.tensor(transitions.action, device=self.device, dtype=torch.float32)  # shape(batch, numActions)
    reward = torch.tensor(transitions.reward, device=self.device, dtype=torch.float32)  # shape(batch)

    # for some envs, the output state requires further processing before being fed to the neural network
    if self.stateProcessor is not None:
        state, _ = self.stateProcessor(transitions.state, self.device)
        nonFinalNextState, nonFinalMask = self.stateProcessor(transitions.next_state, self.device)
    else:
        state = torch.tensor(transitions.state, device=self.device, dtype=torch.float32)
        # use a bool mask for indexing (consistent with prepare_minibatch below)
        nonFinalMask = torch.tensor(tuple(map(lambda s: s is not None, transitions.next_state)),
                                    device=self.device, dtype=torch.bool)
        nonFinalNextState = torch.tensor([s for s in transitions.next_state if s is not None],
                                         device=self.device, dtype=torch.float32)

    return state, nonFinalMask, nonFinalNextState, action, reward
def prepare_minibatch(self, transitions_raw):
    '''
    Do some preprocessing work on transitions_raw:
        order the data
        convert the transition list to torch tensors
    Uses tricks from
    https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
    https://stackoverflow.com/questions/19339/transpose-unzip-function-inverse-of-zip/19343#19343
    '''
    transitions = Transition(*zip(*transitions_raw))
    action = torch.tensor(transitions.action, device=self.device, dtype=torch.long).unsqueeze(-1)  # shape(batch, 1)
    reward = torch.tensor(transitions.reward, device=self.device, dtype=torch.float32).unsqueeze(-1)  # shape(batch, 1)

    # for some envs, the output state requires further processing before being fed to the neural network
    if self.stateProcessor is not None:
        state, _ = self.stateProcessor(transitions.state, self.device)
        nonFinalNextState, nonFinalMask = self.stateProcessor(transitions.next_state, self.device)
    else:
        state = torch.tensor(transitions.state, device=self.device, dtype=torch.float32)
        nonFinalMask = torch.tensor(tuple(map(lambda s: s is not None, transitions.next_state)),
                                    device=self.device, dtype=torch.bool)
        nonFinalNextState = torch.tensor([s for s in transitions.next_state if s is not None],
                                         device=self.device, dtype=torch.float32)

    return state, nonFinalMask, nonFinalNextState, action, reward
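# Usage sketch for prepare_minibatch, with hypothetical policyNet/targetNet modules:
# the bool mask writes bootstrapped Q-values only for non-terminal next states, so
# terminal states keep QNext = 0 (the trick from the PyTorch DQN tutorial linked above).
def dqn_targets(policyNet, targetNet, minibatch, gamma, device):
    state, nonFinalMask, nonFinalNextState, action, reward = minibatch
    QValues = policyNet(state).gather(1, action)                      # shape(batch, 1)
    QNext = torch.zeros(reward.shape[0], device=device)               # shape(batch)
    QNext[nonFinalMask] = targetNet(nonFinalNextState).max(1)[0].detach()
    targetValues = reward + gamma * QNext.unsqueeze(-1)               # shape(batch, 1)
    return QValues, targetValues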
def sample(self, batch_size):
    finish = random.sample(range(0, len(self.memory)), batch_size)
    begin = [x - self.seq_length for x in finish]
    samples = []
    for start, end in zip(begin, finish):
        # correct for sampling near the beginning of the buffer
        # final is a list
        final = self.memory[max(start + 1, 0):end + 1]

        # correct for sampling across episodes:
        # remove experiences that belong to the previous episode
        # (an episode boundary is a terminal transition whose next_state is None)
        for i in range(len(final) - 2, -1, -1):
            if final[i][2] is None:
                final = final[i + 1:]
                break

        # pad the beginning of sequences that end early
        while len(final) < self.seq_length:
            dummyTransition = Transition(np.zeros_like(self.memory[0][0]), 0,
                                         np.zeros_like(self.memory[0][2]), 0)
            final = [dummyTransition] + final
        samples += final

    # returns a flattened version
    return samples
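# A small usage sketch (hypothetical helper), under the assumption that sample() returns
# batch_size * seq_length transitions laid out sequence by sequence: regroup the
# flattened list into fixed-length sequences before building tensors.
def regroup_sequences(flat_samples, seq_length):
    # split the flat list into consecutive chunks of length seq_length
    return [flat_samples[i:i + seq_length] for i in range(0, len(flat_samples), seq_length)]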
def store_experience(self, states, actions, nextStates, reward, info):
    transitions = [Transition(states[n], actions[n], nextStates[n], reward)
                   for n in range(self.numAgents)]
    self.memory.push(transitions)
def prepare_minibatch(self, transitions_raw, n):
    transitions = Transition(*zip(*transitions_raw))
    action = torch.tensor(transitions.action, device=self.device, dtype=torch.long).unsqueeze(-1)  # shape(batch, 1)
    reward = torch.tensor(transitions.reward, device=self.device, dtype=torch.float32).unsqueeze(-1)  # shape(batch, 1)

    # for some envs, the output state requires further processing before being fed to the neural network
    if self.stateProcessors is not None:
        state, _ = self.stateProcessors[n](transitions.state, self.device)
        nonFinalNextState, nonFinalMask = self.stateProcessors[n](transitions.next_state, self.device)
    else:
        state = torch.tensor(transitions.state, device=self.device, dtype=torch.float32)
        nonFinalMask = torch.tensor(tuple(map(lambda s: s is not None, transitions.next_state)),
                                    device=self.device, dtype=torch.bool)
        nonFinalNextState = torch.tensor([s for s in transitions.next_state if s is not None],
                                         device=self.device, dtype=torch.float32)

    return state, nonFinalMask, nonFinalNextState, action, reward
def process_experienceAugmentation(self, state, action, nextState, reward, info):
    if self.globalStepCount % self.experienceAugmentationFreq == 0:
        state_Augs, action_Augs, nextState_Augs, reward_Augs = self.env.getExperienceAugmentation(
            state, action, nextState, reward, info)
        for i in range(len(state_Augs)):
            transition = Transition(state_Augs[i], action_Augs[i], nextState_Augs[i], reward_Augs[i])
            self.memory.push(transition)
def store_experience(self, states, actions, nextStates, rewards, infos):
    for i in range(len(states)):
        # if the episode ended due to the step limit, we should not store the experience
        # because of the vectorized env setup
        if not infos[i]['endBeforeDone']:
            transition = Transition(states[i], actions[i], nextStates[i], rewards[i])
            self.memory.push(transition)
            if self.successRepeat and nextStates[i] is None:
                for _ in range(self.successRepeatTime):
                    self.memory.push(transition)
def process_hindSightExperience(self, state, action, nextState, reward, info):
    if nextState is not None and self.globalStepCount % self.hindSightERFreq == 0:
        stateNew, actionNew, nextStateNew, rewardNew = self.env.getHindSightExperience(
            state, action, nextState, info)
        if stateNew is not None:
            transition = Transition(stateNew, actionNew, nextStateNew, rewardNew)
            self.memory.push(transition)
    if self.experienceAugmentation:
        self.process_experienceAugmentation(state, action, nextState, reward, info)
def store_experience(self, state, action, nextState, reward, info):
    if self.experienceProcessor is not None:
        state, action, nextState, reward = self.experienceProcessor(state, action, nextState, reward, info)

    timeStep = state['timeStep']
    transition = Transition(state, action, nextState, reward)
    self.memories[timeStep].push(transition)

    if self.experienceAugmentation:
        self.process_experienceAugmentation(state, action, nextState, reward, info)

    if self.hindSightER:
        self.process_hindSightExperience(state, action, nextState, reward, info)
def push(self, *args):
    """Saves a transition."""
    if len(args) == 1 and isinstance(args[0], Transition):
        transition = args[0]
    else:
        transition = Transition(*args)

    # if it is a terminal state
    if transition.next_state is None:
        if len(self.terminalMemory) < self.capacity:
            self.terminalMemory.append(None)
        self.terminalMemory[self.positionTwo] = transition
        self.positionTwo = (self.positionTwo + 1) % self.capacity

        count = 1
        R = transition.reward
        for trans in self.nStepBuffer[::-1]:
            # if nStepBackup is zero, then there is no backup
            if count > self.nStepBackup:
                break
            R = trans.reward + self.gamma * R
            transNew = Transition(trans.state, trans.action, None, R)
            if len(self.terminalMemory) < self.capacity:
                self.terminalMemory.append(None)
            self.terminalMemory[self.positionTwo] = transNew
            self.positionTwo = (self.positionTwo + 1) % self.capacity
            count += 1
        self.nStepBuffer.clear()
    else:
        # non-terminal state
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.nStepBuffer.append(transition)
        # overwrite the earliest experience
        self.memory[self.positionOne] = transition
        self.positionOne = (self.positionOne + 1) % self.capacity
def update_net(self, state, action, nextState, reward):
    # first store the experience in memory
    self.store_experience(state, action, nextState, reward)

    if len(self.memory) < self.trainBatchSize:
        return

    transitions_raw = self.memory.sample(self.trainBatchSize)
    transitions = Transition(*zip(*transitions_raw))

    # for some envs, the output state requires further processing before being fed to the neural network
    if self.stateProcessor is not None:
        state = self.stateProcessor(transitions.state)
        nextState = self.stateProcessor(transitions.next_state)
    else:
        state = torch.tensor(transitions.state, dtype=torch.float32)
        nextState = torch.tensor(transitions.next_state, dtype=torch.float32)

    action = torch.tensor(transitions.action, dtype=torch.long).unsqueeze(-1)  # shape(batch, 1)
    reward = torch.tensor(transitions.reward, dtype=torch.float32).unsqueeze(-1)  # shape(batch, 1)

    batchSize = reward.shape[0]

    QValues = self.policyNet(state).gather(1, action)
    # note that here we use policyNet for the target value
    QNext = self.policyNet(nextState).detach()
    targetValues = reward + self.gamma * QNext.max(dim=1)[0].unsqueeze(-1)
    loss = torch.mean(self.netLossFunc(QValues, targetValues))

    self.optimizer.zero_grad()
    loss.backward()
    # apply the gradient update
    self.optimizer.step()

    # for lp, gp in zip(self.localNet.parameters(), self.globalNet.parameters()):
    #     gp._grad = lp._grad
    #
    # if self.netGradClip is not None:
    #     torch.nn.utils.clip_grad_norm_(self.policyNet.parameters(), self.netGradClip)
    #
    # # global net update
    # self.globalOptimizer.step()
    #
    # # update local net
    # self.localNet.load_state_dict(self.globalNet.state_dict())

    if self.globalStepCount % self.lossRecordStep == 0:
        # record the scalar loss value
        self.losses.append([self.globalStepCount, self.epIdx, loss.item()])
def update_net_and_sync(self, state, action, nextState, reward):
    self.store_experience(state, action, nextState, reward)

    if self.priorityMemoryOption:
        if len(self.memory) < self.config['memoryCapacity']:
            return
    else:
        if len(self.memory) < self.trainBatchSize:
            return

    if self.totalStep % self.updateGlobalFrequency == 0:
        transitions_raw = self.memory.sample(self.trainBatchSize)
        transitions = Transition(*zip(*transitions_raw))
        action = torch.tensor(transitions.action, device=self.device, dtype=torch.long).unsqueeze(-1)  # shape(batch, 1)
        reward = torch.tensor(transitions.reward, device=self.device, dtype=torch.float32).unsqueeze(-1)  # shape(batch, 1)
        batchSize = reward.shape[0]

        # for some envs, the output state requires further processing before being fed to the neural network
        if self.stateProcessor is not None:
            state, _ = self.stateProcessor(transitions.state, self.device)
            nonFinalNextState, nonFinalMask = self.stateProcessor(transitions.next_state, self.device)
        else:
            state = torch.tensor(transitions.state, device=self.device, dtype=torch.float32)
            nonFinalMask = torch.tensor([s is not None for s in transitions.next_state],
                                        device=self.device, dtype=torch.bool)
            nonFinalNextState = torch.tensor([s for s in transitions.next_state if s is not None],
                                             device=self.device, dtype=torch.float32)

        if self.synchLock:

            self.lock.acquire()
            QValues = self.globalPolicyNet(state).gather(1, action)

            if self.netUpdateOption == 'targetNet':
                # detach because we do not want gradients to flow from the target values to the net parameters
                QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32)
                QNext[nonFinalMask] = self.globalTargetNet(nonFinalNextState).max(1)[0].detach()
                targetValues = reward + self.gamma * QNext.unsqueeze(-1)
            if self.netUpdateOption == 'policyNet':
                raise NotImplementedError
                targetValues = reward + self.gamma * torch.max(self.globalPolicyNet(nextState).detach(), dim=1)[0].unsqueeze(-1)
            if self.netUpdateOption == 'doubleQ':
                # select the optimal action from the policy net
                with torch.no_grad():
                    batchAction = self.globalPolicyNet(nonFinalNextState).max(dim=1)[1].unsqueeze(-1)
                    QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32).unsqueeze(-1)
                    QNext[nonFinalMask] = self.globalTargetNet(nonFinalNextState).gather(1, batchAction)
                    targetValues = reward + self.gamma * QNext

            loss = self.netLossFunc(QValues, targetValues)

            self.globalOptimizer.zero_grad()
            loss.backward()

            if self.netGradClip is not None:
                torch.nn.utils.clip_grad_norm_(self.globalPolicyNet.parameters(), self.netGradClip)

            # global net update
            self.globalOptimizer.step()

            # update local net
            self.localNet.load_state_dict(self.globalPolicyNet.state_dict())

            self.lock.release()

        else:
            # update local net
            self.localNet.load_state_dict(self.globalPolicyNet.state_dict())

            QValues = self.localNet(state).gather(1, action)

            if self.netUpdateOption == 'targetNet':
                # detach because we do not want gradients to flow from the target values to the net parameters
                QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32)
                QNext[nonFinalMask] = self.globalTargetNet(nonFinalNextState).max(1)[0].detach()
                targetValues = reward + self.gamma * QNext.unsqueeze(-1)
            if self.netUpdateOption == 'policyNet':
                raise NotImplementedError
                targetValues = reward + self.gamma * torch.max(self.globalPolicyNet(nextState).detach(), dim=1)[0].unsqueeze(-1)
            if self.netUpdateOption == 'doubleQ':
                # select the optimal action from the policy net
                with torch.no_grad():
                    batchAction = self.localNet(nonFinalNextState).max(dim=1)[1].unsqueeze(-1)
                    QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32).unsqueeze(-1)
                    QNext[nonFinalMask] = self.globalTargetNet(nonFinalNextState).gather(1, batchAction)
                    targetValues = reward + self.gamma * QNext

            loss = self.netLossFunc(QValues, targetValues)
            loss.backward()

            self.lock.acquire()
            self.globalOptimizer.zero_grad()

            # copy local gradients to the global net
            for lp, gp in zip(self.localNet.parameters(), self.globalPolicyNet.parameters()):
                if self.device == 'cpu':
                    gp._grad = lp._grad
                else:
                    gp._grad = lp._grad.cpu()

            if self.netGradClip is not None:
                torch.nn.utils.clip_grad_norm_(self.globalPolicyNet.parameters(), self.netGradClip)

            # global net update
            self.globalOptimizer.step()
            self.lock.release()

            # update local net
            self.localNet.load_state_dict(self.globalPolicyNet.state_dict())
def update_net(self, state, action, nextState, reward, info):

    # state, nonFinalMask, nonFinalNextState, action, reward = self.prepare_minibatch(state, action, nextState, reward, info)

    self.store_experience(state, action, nextState, reward, info)

    if len(self.memory) < self.trainBatchSize:
        return

    transitions_raw = self.memory.sample(self.trainBatchSize)
    transitions = Transition(*zip(*transitions_raw))
    action = torch.tensor(transitions.action, device=self.device, dtype=torch.float32)  # shape(batch, numActions)
    reward = torch.tensor(transitions.reward, device=self.device, dtype=torch.float32)  # shape(batch)

    # for some envs, the output state requires further processing before being fed to the neural network
    if self.stateProcessor is not None:
        state, _ = self.stateProcessor(transitions.state, self.device)
        nonFinalNextState, nonFinalMask = self.stateProcessor(transitions.next_state, self.device)
    else:
        state = torch.tensor(transitions.state, device=self.device, dtype=torch.float32)
        nonFinalMask = torch.tensor(tuple(map(lambda s: s is not None, transitions.next_state)),
                                    device=self.device, dtype=torch.bool)
        nonFinalNextState = torch.tensor([s for s in transitions.next_state if s is not None],
                                         device=self.device, dtype=torch.float32)

    batchSize = reward.shape[0]

    # Critic loss
    QValuesOne = self.criticNetOne.forward(state, action).squeeze()
    QValuesTwo = self.criticNetTwo.forward(state, action).squeeze()

    # target policy smoothing: add noise to the target action
    actionNoise = torch.randn((nonFinalNextState.shape[0], self.numAction),
                              dtype=torch.float32, device=self.device)
    next_actions = self.actorNet_target.forward(nonFinalNextState) + actionNoise * self.policySmoothNoise
    # next_actions = self.actorNet_target.forward(nonFinalNextState)

    QNext = torch.zeros(batchSize, device=self.device, dtype=torch.float32)
    QNextCriticOne = self.criticNet_targetOne.forward(nonFinalNextState, next_actions.detach()).squeeze()
    QNextCriticTwo = self.criticNet_targetTwo.forward(nonFinalNextState, next_actions.detach()).squeeze()

    # clipped double-Q: take the minimum of the two target critics
    QNext[nonFinalMask] = torch.min(QNextCriticOne, QNextCriticTwo)

    targetValues = reward + self.gamma * QNext

    criticOne_loss = self.netLossFunc(QValuesOne, targetValues)
    criticTwo_loss = self.netLossFunc(QValuesTwo, targetValues)

    self.criticOne_optimizer.zero_grad()
    self.criticTwo_optimizer.zero_grad()

    # https://jdhao.github.io/2017/11/12/pytorch-computation-graph/
    criticOne_loss.backward(retain_graph=True)
    criticTwo_loss.backward()

    if self.netGradClip is not None:
        torch.nn.utils.clip_grad_norm_(self.criticNetOne.parameters(), self.netGradClip)
        torch.nn.utils.clip_grad_norm_(self.criticNetTwo.parameters(), self.netGradClip)

    self.criticOne_optimizer.step()
    self.criticTwo_optimizer.step()

    # delayed policy update
    if self.learnStepCounter % self.policyUpdateFreq == 0:
        # Actor loss
        # we try to maximize the critic's output (which is the state-action value)
        policy_loss = -self.criticNetOne.forward(state, self.actorNet.forward(state)).mean()

        # update networks
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        if self.netGradClip is not None:
            torch.nn.utils.clip_grad_norm_(self.actorNet.parameters(), self.netGradClip)
        self.actor_optimizer.step()

        if self.globalStepCount % self.lossRecordStep == 0:
            self.losses.append([self.globalStepCount, self.epIdx,
                                criticOne_loss.item(), criticTwo_loss.item(), policy_loss.item()])

        # update target networks with a soft (Polyak) update
        for target_param, param in zip(self.actorNet_target.parameters(), self.actorNet.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

        for target_param, param in zip(self.criticNet_targetOne.parameters(), self.criticNetOne.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

        for target_param, param in zip(self.criticNet_targetTwo.parameters(), self.criticNetTwo.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

    self.learnStepCounter += 1
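# A reusable helper sketch (hypothetical, not part of the agent class above) for the
# Polyak soft update applied to the target networks in update_net:
# theta_target <- tau * theta + (1 - tau) * theta_target.
def soft_update(targetNet, sourceNet, tau):
    for target_param, param in zip(targetNet.parameters(), sourceNet.parameters()):
        target_param.data.copy_(param.data * tau + target_param.data * (1.0 - tau))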
def store_experience(self, states, actions, nextStates, rewards, infos):
    for i in range(len(states)):
        transition = Transition(states[i], actions[i], nextStates[i], rewards[i])
        self.memory.push(transition)
from Agents.Core.ReplayMemory import ReplayMemory, Transition
#from ..Agents.Core.ReplayMemory import ReplayMemory, Transition
import torch

tran1 = Transition(1, 1, 1, 1)
tran2 = Transition(2, 2, 2, 2)
memory = ReplayMemory(10)
memory.push(tran1)
memory.push(tran2)
memory.push(3, 3, 3, 3)
print(memory)
memory.write_to_text('memoryOut.txt')

toTensor = memory.totensor()
toTensor2 = torch.tensor(memory.sample(2))

for i in range(5, 50):
    tran = Transition(i, i, i, i)
    memory.push(tran)

print(memory)
memory.clear()
print(memory)

print(toTensor)
print(toTensor2)
from Agents.Core.ReplayMemory import ReplayMemory, Transition
#from ..Agents.Core.ReplayMemory import ReplayMemory, Transition
import torch
import numpy as np
import pickle

state1 = np.random.rand(5, 5)
state2 = np.random.rand(5, 5)
state3 = np.random.rand(5, 5)
state4 = np.random.rand(5, 5)

tran1 = Transition(state1, 1, state2, 1)
tran2 = Transition(state3, 2, state4, 2)
memory = ReplayMemory(10)
memory.push(tran1)
memory.push(tran2)
print(memory)

with open('memory.pickle', 'wb') as file:
    pickle.dump(memory, file)

with open('memory.pickle', 'rb') as file:
    memory2 = pickle.load(file)

print(memory2)
def store_experience(self, state, action, nextState, reward, info):
    if self.experienceProcessor is not None:
        state, action, nextState, reward = self.experienceProcessor(state, action, nextState, reward, info)
    transition = Transition(state, action, nextState, reward)
    self.memory.push(transition)