# Assumed module imports for the methods below (the originals were not shown);
# skMSE is taken to be sklearn's mean_squared_error.
import copy
import random
from operator import itemgetter

import numpy as np
import torch
from sklearn.metrics import mean_squared_error as skMSE


def buildTrainData(self, currState, nextState, reward, done, action):
    # one forward pass yields Q-values for both the current and the next state
    states = np.asarray([currState, nextState])
    q = self.model.predict(np.reshape(self.nState(states), (-1, 2)))
    self.qValues = q[0]
    qVal = q[1]
    qMax = np.max(qVal)
    Y = copy.deepcopy(self.qValues)
    if done:
        y = reward
    else:
        y = reward + self.discount * qMax
    # TODO: verify Y is a copy rather than a reference to self.qValues and
    # that the target replaces properly; the initial-epoch loss should be
    # the largest
    Y[action] = y
    self.trainX.append(self.nState(currState))
    self.trainY.append(Y)
    return skMSE(Y, self.qValues)
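# Quick demonstration of the copy-vs-reference concern flagged in the TODO
# above: toy numbers, a sketch rather than the author's data. Editing a
# reference to the prediction silently zeroes the training signal.
import copy
import numpy as np

qValues = np.array([0.1, 0.2, 0.3])
reward, discount, qMax, action = 1.0, 0.95, 2.0, 1

Y = copy.deepcopy(qValues)              # true copy: qValues stays intact
Y[action] = reward + discount * qMax
print(((Y - qValues) ** 2).mean())      # non-zero MSE, as expected

Y_ref = qValues                         # reference: the bug the TODO warns about
Y_ref[action] = reward + discount * qMax
print(((Y_ref - qValues) ** 2).mean())  # 0.0 -- the loss signal is destroyed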
def buildMiniBatchTrainData(self):
    if len(self.curState) > self.batchSize:
        ndxs = random.sample(range(len(self.curState)), self.batchSize)
    else:
        ndxs = range(len(self.curState))
    c = itemgetter(*ndxs)(self.curState)
    n = itemgetter(*ndxs)(self.nxtState)
    r = np.asanyarray(np.array(itemgetter(*ndxs)(self.rwdList)))
    d = np.asanyarray(np.array(itemgetter(*ndxs)(self.doneList)))
    a_ = np.array(itemgetter(*ndxs)(self.actnList))
    aTemp = np.vstack((np.array(range(len(a_))), a_))
    a = np.asanyarray(aTemp)
    # send next states and current states together in a single inference pass
    X = torch.stack(n + c)
    self.model.eval()
    qVal = self.model(X.float()).cpu().detach().numpy()
    # split the output back into next-state and current-state Q-values;
    # len(ndxs), not self.batchSize, is the correct split point when the
    # memory holds fewer than batchSize samples
    hIndx = len(ndxs)
    qVal_n = qVal[:hIndx]
    qMax_n = np.max(qVal_n, axis=1)
    qVal_c = qVal[hIndx:]
    Y = copy.deepcopy(qVal_c)
    y = np.zeros(r.shape)
    ndx = np.where(d == True)
    y[ndx] = r[ndx]
    ndx = np.where(d == False)
    y[ndx] = r[ndx] + self.discount * qMax_n[ndx]
    Y[a[0], a[1]] = y
    self.trainX = X[hIndx:]
    self.trainY = torch.from_numpy(Y).to(self.device)
    return skMSE(Y, qVal_c)
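# Sketch of the stack-once / split-after batching trick used above, with a
# toy linear model. The 2-feature state and the 3-action head are assumptions
# for illustration; only the batching pattern mirrors the method.
import torch

model = torch.nn.Linear(2, 3)                 # 2 state features -> 3 Q-values
nxt = [torch.rand(2) for _ in range(8)]       # "next" states
cur = [torch.rand(2) for _ in range(8)]       # "current" states

X = torch.stack(nxt + cur)                    # one batch, one forward pass
with torch.no_grad():
    q = model(X)
hIndx = len(nxt)
qVal_n, qVal_c = q[:hIndx], q[hIndx:]         # split back into next / current
print(qVal_n.shape, qVal_c.shape)             # torch.Size([8, 3]) twice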
def buildMiniBatchTrainData(self):
    c, n, r, d, a = [], [], [], [], []
    if len(self.replayMemory) > self.batchSize:
        minibatch = random.sample(self.replayMemory, self.batchSize)
    else:
        minibatch = self.replayMemory
    for ndx, [currState, nextState, reward, done, action] in enumerate(minibatch):
        c.append(currState)
        n.append(nextState)
        r.append(reward)
        d.append(done)
        # remember (row, action) pairs to index the target matrix later
        a.append([ndx, action])
    c = np.asanyarray(c)
    n = np.asanyarray(n)
    r = np.asanyarray(r)
    d = np.asanyarray(d)
    a = np.asanyarray(a).T
    self.model.eval()
    # Q-values for the next states give the bootstrap term of the target
    X = torch.from_numpy(np.reshape(self.nState(n), (-1, 2))).to(self.device)
    qVal_n = self.model(X.float()).cpu().detach().numpy()
    qMax_n = np.max(qVal_n, axis=1)
    # Q-values for the current states are the network's current predictions
    X = torch.from_numpy(np.reshape(self.nState(c), (-1, 2))).to(self.device)
    qVal_c = self.model(X.float()).cpu().detach().numpy()
    Y = copy.deepcopy(qVal_c)
    # Bellman targets: reward for terminal transitions,
    # reward + discounted max next-state Q-value otherwise
    y = np.zeros(r.shape)
    ndx = np.where(d == True)
    y[ndx] = r[ndx]
    ndx = np.where(d == False)
    y[ndx] = r[ndx] + self.discount * qMax_n[ndx]
    Y[a[0], a[1]] = y
    self.trainX = c
    self.trainY = Y
    return skMSE(Y, qVal_c)
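# Sketch of how self.replayMemory might be filled, assuming a bounded deque
# of [currState, nextState, reward, done, action] records in the exact field
# order the loop above unpacks; the maxlen is an illustrative choice.
from collections import deque

replayMemory = deque(maxlen=10000)

def remember(currState, nextState, reward, done, action):
    replayMemory.append([currState, nextState, reward, done, action])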
def buildMiniBatchTrainData(self):
    if len(self.curState) > self.batchSize:
        ndxs = random.sample(range(len(self.curState)), self.batchSize)
    else:
        ndxs = range(len(self.curState))
    bSize = len(ndxs)
    c = np.asanyarray(np.array(itemgetter(*ndxs)(self.curState)))
    n = np.asanyarray(np.array(itemgetter(*ndxs)(self.nxtState)))
    r = np.asanyarray(np.array(itemgetter(*ndxs)(self.rwdList)))
    d = np.asanyarray(np.array(itemgetter(*ndxs)(self.doneList)))
    a_ = np.array(itemgetter(*ndxs)(self.actnList))
    aTemp = np.vstack((np.array(range(len(a_))), a_))
    a = np.asanyarray(aTemp)
    self.model.eval()
    X = torch.from_numpy(np.reshape(n, (bSize, -1))).to(self.device)
    qVal_n = self.model(X.float()).cpu().detach().numpy()
    qMax_n = np.max(qVal_n, axis=1)
    X = torch.from_numpy(np.reshape(c, (bSize, -1))).to(self.device)
    qVal_c = self.model(X.float()).cpu().detach().numpy()
    Y = copy.deepcopy(qVal_c)
    y = np.zeros(r.shape)
    ndx = np.where(d == True)
    y[ndx] = r[ndx]
    ndx = np.where(d == False)
    y[ndx] = r[ndx] + self.discount * qMax_n[ndx]
    Y[a[0], a[1]] = y
    self.trainX = c
    self.trainY = Y
    return skMSE(Y, qVal_c)
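# Standalone check of the vectorized Bellman-target construction shared by
# the minibatch builders: a toy batch of 4 transitions over 3 actions. The
# numbers are made up; this sketches the technique, not the author's data.
import copy
import numpy as np

discount = 0.95
qVal_c = np.array([[0.1, 0.2, 0.3],
                   [0.4, 0.5, 0.6],
                   [0.7, 0.8, 0.9],
                   [0.2, 0.1, 0.0]])
qMax_n = np.array([1.0, 2.0, 3.0, 4.0])
r = np.array([0.0, 1.0, -1.0, 5.0])
d = np.array([False, False, True, False])
a = np.vstack((np.arange(4), np.array([2, 0, 1, 2])))  # (row, action) index

Y = copy.deepcopy(qVal_c)
y = np.zeros(r.shape)
ndx = np.where(d)
y[ndx] = r[ndx]                           # terminal: target is the raw reward
ndx = np.where(~d)
y[ndx] = r[ndx] + discount * qMax_n[ndx]  # otherwise: bootstrapped target
Y[a[0], a[1]] = y                         # overwrite only the taken actions
print(Y)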