class WeightBasedExpReplay(object):
    """Proportional prioritized experience replay.

    Experiences are sampled with probability proportional to their
    (epsilon-shifted, alpha-annealed) priority weight. A sum tree
    supports prefix-sum sampling and a heap tracks the current max
    priority so fresh experiences are inserted at max weight.
    """

    def __init__(self, maxSize, alpha=0.6, epsilon=0.000001):
        self.maxSize = maxSize
        self.buffer = Buffer(self.maxSize)
        self.sumTree = SumTree(self.maxSize)
        self.weights = {}  # buffer slot -> current priority weight
        # BUG FIX: was hard-coded to 0.6, silently ignoring the alpha
        # argument passed by the caller.
        self.alpha = alpha
        self.curSize = 0
        self.epsilon = epsilon
        self.heap = Heap()

    def addExperience(self, experience):
        """Insert a new experience at the current max priority so it is
        guaranteed to be sampled at least once soon."""
        weight = self.heap.getMaxPriority()
        index = self.buffer.getPointer()
        self.buffer.insert(experience)
        # The sum tree stores deltas, so subtract whatever weight the
        # slot being overwritten previously carried.
        prevWeight = self.weights.get(index, 0)
        diffWeight = weight - prevWeight
        self.weights[index] = weight
        self.sumTree.insert(diffWeight, index)
        self.heap.add(index, weight)
        self.curSize = min(self.curSize + 1, self.maxSize)

    def modifyExperience(self, weight, index):
        """Update the priority of the experience at buffer slot ``index``.

        The raw weight (e.g. a TD error) is shifted by epsilon so zero-error
        transitions keep a nonzero sampling chance, then annealed by alpha.
        """
        weight = (weight + self.epsilon) ** self.alpha
        prevWeight = self.weights.get(index, 0)
        diffWeight = weight - prevWeight
        self.weights[index] = weight
        self.sumTree.insert(diffWeight, index)
        self.heap.add(index, weight)

    def sample(self, samplesAmount):
        """Stratified sample: one uniform draw per equal-mass segment.

        Returns (experiences, normalized weights, buffer indices) as
        numpy arrays.
        """
        # Hoisted out of the loop: getAllSum() is invariant during sampling.
        totalSum = self.sumTree.getAllSum()
        startPoints = np.linspace(0, totalSum, samplesAmount + 1).tolist()
        expList = []
        weightList = []
        indexList = []
        for a in range(len(startPoints) - 1):
            sampledNum = np.random.uniform(startPoints[a], startPoints[a + 1])
            retrIndex = self.sumTree.search(sampledNum)
            expList.append(self.buffer.getItem(retrIndex))
            weightList.append(self.weights[retrIndex] / totalSum)
            indexList.append(retrIndex)
        return np.asarray(expList), np.asarray(weightList), np.asarray(indexList)

    def getMaxPriority(self):
        """Max priority in the heap; float max when the heap is empty so the
        very first experiences are inserted at top priority."""
        if self.heap.size == 0:
            return sys.float_info.max
        return self.heap.p2w[1]
class ExperienceReplay(object):
    """Uniform (non-prioritized) experience replay over a fixed-capacity buffer."""

    def __init__(self, maxSize):
        self.maxSize = maxSize
        self.buffer = Buffer(self.maxSize)
        self.curSize = 0

    def addExperience(self, *experience):
        """Wrap the given fields in a Transition and store it; the tracked
        size saturates at capacity."""
        self.buffer.insert(Transition(*experience))
        self.curSize = min(self.curSize + 1, self.maxSize)

    def sample(self, samplesAmount):
        """Return ``samplesAmount`` distinct experiences drawn uniformly."""
        picks = np.random.choice(self.curSize, samplesAmount, replace=False).tolist()
        return [self.buffer.getItem(p) for p in picks]
class Agent():
    """DQN agent holding an evaluation network and a periodically synced
    target network, with a replay buffer for experience sampling."""

    def __init__(self, Env_dim, Nb_action):
        self.memory = Buffer(Memory_size)
        self.eval_nn = Network(Env_dim, Nb_action)
        self.target_nn = Network(Env_dim, Nb_action)
        self.optimizer = torch.optim.Adam(self.eval_nn.parameters(),
                                          lr=Learning_rate)
        self.criterion = nn.MSELoss(reduction='sum')
        self.counter = 0
        # BUG FIX: the original assigned the layer objects themselves
        # (self.target_nn.fc1 = self.eval_nn.fc1, ...), which ALIASES the
        # layers — the "target" network would then always equal the eval
        # network, defeating the purpose of a frozen target. Copy the
        # parameter values instead.
        self.target_nn.load_state_dict(self.eval_nn.state_dict())

    def choose_action(self, s):
        """Return the eval network's Q-values for state ``s`` (detached)."""
        s = torch.unsqueeze(torch.FloatTensor(s), 0)
        return self.eval_nn(s)[0].detach()  # ae(s)

    def getSample(self):
        """Draw a training batch from the replay buffer."""
        return self.memory.sample(Batch_size)

    def optimize_model(self, file):
        """Run one optimization pass over a sampled batch.

        Every ``Refresh_gap`` calls, checkpoints the eval network to
        ``file`` and syncs the target network's parameters.
        """
        if self.memory.get_nb_elements() < Batch_size:
            return
        batch = self.memory.sample(Batch_size)
        for s, a, s_, r, done in batch:
            qValue = self.eval_nn(torch.tensor(s).float())[a]
            # Bellman target from the frozen target network; no gradient
            # must flow through it.
            with torch.no_grad():
                qNext = self.target_nn(torch.tensor(s_).float())
                target = r + Gamma * torch.max(qNext) * (1 - done)
            # BUG FIX: the original computed JO = (q - y)**2 and then passed
            # JO to MSELoss as the *target*, i.e. it minimized
            # (q - (q - y)^2)^2 instead of the TD error (q - y)^2.
            loss = self.criterion(qValue, target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        self.counter += 1
        if self.counter % Refresh_gap == 0:
            torch.save(self.eval_nn, file)
            self.target_nn.load_state_dict(self.eval_nn.state_dict())

    def store_transition(self, value):
        """Push one transition into the replay buffer."""
        self.memory.insert(value)
class ExperienceReplay(object):
    """Uniform-sampling experience replay.

    ``alpha`` is accepted only for interface parity with the prioritized
    replay variants and is not used.
    """

    def __init__(self, maxSize, alpha=0.6):
        self.maxSize = maxSize
        self.buffer = Buffer(self.maxSize)
        self.curSize = 0

    def addExperience(self, experience):
        """Store one experience; the tracked size saturates at capacity."""
        self.buffer.insert(experience)
        self.curSize = min(self.curSize + 1, self.maxSize)

    def sample(self, samplesAmount):
        """Draw ``samplesAmount`` distinct experiences uniformly at random.

        Returns (experiences, importance weights, None); weights are uniform
        so callers can treat this like the prioritized replays.
        """
        picks = np.random.choice(self.curSize, samplesAmount, replace=False).tolist()
        chosen = [self.buffer.getItem(p) for p in picks]
        uniform = [1.0 / samplesAmount] * len(picks)
        return np.asarray(chosen), uniform, None
class RankBasedExpReplay(object):
    """Rank-based prioritized experience replay.

    Priorities live in a max-heap; sampling is stratified over the
    rank-based power-law distribution (1/rank)^alpha, with segment
    boundaries cached and recomputed only when alpha or the replay
    size changes.
    """

    def __init__(self, maxSize, alpha=0.6):
        self.maxSize = maxSize
        self.buffer = Buffer(self.maxSize)
        self.heap = Heap()
        self.weights = None
        # Cache-invalidation flags: segment boundaries are recomputed only
        # when alpha or the replay size has changed since the last sample().
        self.prevAlpha = alpha
        self.prevSize = 0
        # Current alpha and experience-replay size.
        self.alpha = alpha
        self.curSize = 0
        # Cached segment end points (ranks) for stratified sampling.
        self.endPoints = []

    def addExperience(self, experience):
        """Insert at the current max priority so new experiences are
        sampled at least once soon."""
        index = self.buffer.getPointer()
        self.buffer.insert(experience)
        weight = self.heap.getMaxPriority()
        self.heap.add(index, weight)
        self.curSize = self.heap.size

    def modifyExperience(self, weight, index):
        """Re-prioritize the experience identified by ``index``."""
        self.heap.add(index, weight)
        self.curSize = self.heap.size

    def sample(self, samplesAmount):
        """Stratified sample: one uniform draw per rank segment.

        Returns (experiences, normalized weights, rank indices) as
        numpy arrays.
        """
        if (self.prevAlpha != self.alpha) or (self.prevSize != self.curSize):
            self.endPoints, self.weights = self.computeBoundaries(
                self.alpha, self.curSize, samplesAmount)
            self.prevAlpha = self.alpha
            self.prevSize = self.curSize
        totalWeights = sum(self.weights)
        startPoint = 0
        expList = []
        weightList = []
        indexList = []
        for a in self.endPoints:
            end = a + 1
            # Pick a rank uniformly inside this segment.
            sampledNum = np.random.randint(end - startPoint, size=1)[0]
            retrIndex = startPoint + sampledNum
            startPoint = end
            expList.append(self.buffer.getItem(self.heap.getIndex(retrIndex)))
            weightList.append(self.weights[retrIndex] / totalWeights)
            indexList.append(retrIndex)
        return np.asarray(expList), np.asarray(weightList), np.asarray(indexList)

    def computeBoundaries(self, alpha, curSize, samplesAmount):
        """Split ranks 0..curSize-1 into ``samplesAmount`` segments of
        roughly equal total mass under the (1/(rank+1))^alpha distribution.

        Returns (segment end ranks, per-rank weights).
        """
        weights = [(1.0 / (rank + 1)) ** alpha for rank in range(curSize)]
        stops = np.linspace(0, sum(weights), samplesAmount + 1).tolist()
        del stops[0]  # the first boundary (0) is implicit
        curSum = 0
        curFounded = 0
        curStop = -1
        results = []
        for w in weights:
            curSum += w
            curStop += 1
            if curSum >= stops[curFounded]:
                results.append(curStop)
                curFounded += 1
        return results, weights

    def rebalance(self):
        """Drain the heap in priority order and re-insert every entry to
        restore strict heap ordering after many in-place updates."""
        indexList = []
        weightList = []
        while self.heap.size != 0:
            maxIndex = self.heap.p2i[1]
            maxWeight = self.heap.p2w[1]
            indexList.append(maxIndex)
            weightList.append(maxWeight)
            self.heap.delete(maxIndex)
        # BUG FIX: the original called self.add(...), a method that does not
        # exist on this class (AttributeError at runtime); re-insert the
        # saved entries directly into the heap.
        for idx, w in zip(indexList, weightList):
            self.heap.add(idx, w)

    def getMaxPriority(self):
        """Max priority in the heap; float max when empty so the first
        experiences are inserted at top priority."""
        if self.heap.size == 0:
            return sys.float_info.max
        return self.heap.p2w[1]