def generate(self, s, a):
    """Sample one step of the generative model from state s under action a.

    Returns [sprime, z, reward]: a sampled successor state, the index of the
    most likely observation model at that successor, and the reward of taking
    action a in the *current* state s.
    """
    # Successor state: Gaussian centered at s shifted by the action's nominal
    # displacement, with the model's process noise covariance.
    motion = GM()
    motion.addG(
        Gaussian((np.array(s) + np.array(self.model.delA[a])).tolist(),
                 self.model.delAVar, 1))
    sprime = motion.sample(1)[0]

    # Observation: argmax over the observation-model likelihoods at sprime.
    likelihoods = [pz.pointEval(sprime) for pz in self.model.pz]
    z = likelihoods.index(max(likelihoods))

    # Reward is evaluated at the pre-transition state.
    reward = self.model.r[a].pointEval(s)

    return [sprime, z, reward]
def simulate(self, steps, initState=None):
    """Roll the discrete model forward and sample an observation each step.

    Args:
        steps: number of transitions to simulate.
        initState: starting state; defaults to the model's first state.

    Returns:
        (states, obs): states holds steps+1 entries (initState included);
        obs holds one sampled observation per transition.
    """
    if initState is None:
        initState = self.states[0]
    states = [initState]
    obs = []
    for _ in range(steps):
        # Sample the successor from the transition distribution.
        # Fix: draw an *index* with np.random.choice rather than passing the
        # keys themselves — np.random.choice coerces its first argument into
        # a numpy array, which fails for non-scalar state labels and returns
        # numpy scalar types instead of the original key objects. Indexing
        # keeps the sampled state's original type; the distribution is the same.
        keys, vals = zip(*self.Tprob[states[-1]].items())
        states.append(keys[np.random.choice(len(keys), p=vals)])

        # Sample an observation from the new state's observation Gaussian.
        newGM = GM()
        newGM.addG(self.Oprob[self.states.index(states[-1])])
        obs.append(newGM.sample(1)[0])
    return states, obs
def getRolloutReward(self, s, d=1):
    """Estimate the value of state s by a depth-d random-action rollout.

    Actions are drawn uniformly at random; the state is propagated through
    the noisy transition model after each step.
    """
    total = 0
    for _ in range(d):
        a = np.random.randint(0, self.model.acts)
        # NOTE(review): discount is applied as a constant factor each step
        # rather than discount**step — confirm this matches the intended
        # discounted-return definition.
        total += self.model.discount * self.model.r[a].pointEval(s)

        # Propagate: sample the next state from a Gaussian centered at the
        # action-shifted state with the model's process noise.
        stepGM = GM()
        stepGM.addG(
            Gaussian((np.array(s) + np.array(self.model.delA[a])).tolist(),
                     self.model.delAVar, 1))
        s = stepGM.sample(1)[0]
    return total
def lwisUpdate(self, prior, softClass, numSamples, inverse=False,
               weightSamples=500):
    """Likelihood-weighted importance-sampling update of a Gaussian prior.

    Draws numSamples from a unit-weight copy of the prior, weights each
    sample by (prior density * class likelihood / proposal density), and
    fits a single Gaussian to the weighted sample moments. The posterior
    mixture weight is the prior weight scaled by the average class
    likelihood under the prior (estimated by sampling).

    Args:
        prior: Gaussian prior component (mean, var, weight).
        softClass: softmax class index passed to self.pointEvalND.
        numSamples: number of importance samples for the moment estimates.
        inverse: if True, weight by (1 - likelihood) instead of likelihood.
        weightSamples: sample count for the mixture-weight estimate
            (previously a hard-coded 500; default preserves old behavior).

    Returns:
        Gaussian approximating the (unnormalized) posterior component.
    """
    # Proposal q: unit-weight copy of the prior. Target p keeps prior.weight.
    q = GM()
    q.addG(Gaussian(prior.mean, prior.var, 1))
    p = GM()
    p.addG(prior)

    samples = q.sample(numSamples)
    w = np.zeros(numSamples)
    for i in range(numSamples):
        like = self.pointEvalND(softClass, samples[i])
        if inverse:
            like = 1 - like
        w[i] = p.pointEval(samples[i]) * like / q.pointEval(samples[i])
    w = w / np.sum(w)  # normalize importance weights

    # Weighted moments, vectorized:
    #   muHat  = sum_i w_i x_i
    #   varHat = sum_i w_i x_i x_i^T - muHat muHat^T
    dim = len(prior.mean)
    X = np.asarray(samples, dtype=float).reshape(numSamples, dim)
    muHatArr = w @ X
    varHatArr = (X * w[:, None]).T @ X - np.outer(muHatArr, muHatArr)

    muHat = muHatArr.tolist()
    varHat = varHatArr.tolist()
    # Collapse to scalars in the 1-D case, matching the Gaussian convention.
    if dim == 1:
        muHat = muHat[0]
    if len(prior.var) == 1:
        varHat = varHat[0][0]

    # Mixture weight: average class likelihood under the prior, combined
    # with the prior weight in log space (equivalent to a direct product;
    # kept in log form as the original did).
    tmp = GM()
    tmp.addG(Gaussian(prior.mean, prior.var, 1))
    tmpSamps = tmp.sample(weightSamples)
    probs = [self.pointEvalND(softClass, samp) for samp in tmpSamps]
    if inverse:
        probs = [1 - pr for pr in probs]
    avgLike = sum(probs) / weightSamples
    weight = np.exp(np.log(prior.weight) + np.log(avgLike))

    return Gaussian(muHat, varHat, weight)
def testMCTSSim2D():
    """Run repeated 2D MCTS (POMCP) simulations and plot the mean
    accumulated reward over time with a +/- 1 sigma band."""
    trails = 10
    trailLength = 100
    allReward = np.zeros(shape=(trails, trailLength)).tolist()
    random = False  # set True to baseline against uniformly random actions

    for count in range(trails):
        totalReward = 0
        solver = OnlineSolver()
        # True state starts at a random integer point; belief is a
        # unit-covariance Gaussian centered on it.
        x = [np.random.randint(-5, 5), np.random.randint(-5, 5)]
        b = GM()
        b.addG(Gaussian(x, [[1, 0], [0, 1]], 1))

        for step in range(trailLength):
            if random:
                act = np.random.randint(0, 5)
            else:
                [act, u] = solver.MCTS(b, 2)
            totalReward += solver.model.r[act].pointEval(x)

            # Propagate the true state through the noisy transition model.
            motion = GM()
            motion.addG(
                Gaussian((np.array(x) + np.array(solver.model.delA[act])).tolist(),
                         solver.model.delAVar, 1))
            x = motion.sample(1)[0]

            # Most likely observation at the new true state.
            ztrial = [pz.pointEval(x) for pz in solver.model.pz]
            z = ztrial.index(max(ztrial))

            b = solver.beliefUpdate(b, act, z, solver.model)

            if not random:
                # Descend the search tree into the child matching (act, z)
                # and detach it as the new root.
                childName = solver.T.name + str(act) + str(z)
                solver.T = [
                    node for node in PreOrderIter(
                        solver.T, filter_=lambda n: n.name == childName)
                ][0]
                solver.T.parent = None

            allReward[count][step] = totalReward
        print(allReward[count][-1])

    # Mean accumulated reward at each time step across trials.
    averageAllReward = [0] * trailLength
    for i in range(trails):
        for j in range(trailLength):
            averageAllReward[j] += allReward[i][j] / trails

    # Per-step population standard deviation across trials.
    allSigma = [0] * trailLength
    for j in range(trailLength):
        spread = 0
        for i in range(trails):
            spread += (allReward[i][j] - averageAllReward[j])**2
        allSigma[j] = np.sqrt(spread / trails)

    upperBound = [m + s for m, s in zip(averageAllReward, allSigma)]
    lowerBound = [m - s for m, s in zip(averageAllReward, allSigma)]

    xAxis = [i for i in range(trailLength)]
    plt.figure()
    plt.plot(xAxis, averageAllReward, 'g')
    plt.plot(xAxis, upperBound, 'g--')
    plt.plot(xAxis, lowerBound, 'g--')
    plt.fill_between(xAxis, lowerBound, upperBound, color='g', alpha=0.25)
    plt.xlabel('Time Step')
    plt.ylabel('Accumlated Reward')
    plt.title('Average Accumulated Rewards over Time for: ' + str(trails) +
              ' simulations')
    plt.show()