def unitTest(cls): print "Testing ValueIteration" np.random.seed(0) from system import System from networks import genGridNetwork from model import PJ system = System(genGridNetwork((3,3)),PJ()) numNodes = system.network.numNodes numTrt = Agent.numTrt(system) numValidTrt = Agent.numValidTrt(numNodes,numTrt) v = ValueIteration.solve(dc(system)) q = PolicyIteration.solve(dc(system)) q = util.unflattenQ(q,numNodes,numValidTrt) vChk = [max(i) for i in q] for i in zip(v,vChk): print "% 12.6f % 10.6f" % i
def unitTest(cls): print "Testing PolicyIteration" np.random.seed(0) from networks import genGridNetwork from model import PJ from copy import deepcopy from runners import vPiS system = systems.System(genGridNetwork((2, 2)), PJ()) numNodes = system.network.numNodes p = np.array(cls.calcP(system)) r = np.array(cls.calcR(system)) one = np.ones((p.shape[1],)) pRowsum = np.dot(p, one) ## check numerical ranges on p tol = 1e-8 if np.amin(p) < 0: raise ValueError("P has negative values") if np.amax(p) > 1.0: raise ValueError("P has values greater than 1") if abs(max(pRowsum) - 1) > tol or abs(min(pRowsum) - 1) > tol: raise ValueError("Not all row sums for P are 1.0") numTrt = agents.Agent.numTrt(system) numValidTrt = agents.Agent.numValidTrt(numNodes, numTrt) q = util.unflattenQ(np.random.randint(numNodes, size=(1 << numNodes) * numValidTrt), numNodes, numValidTrt) pi = cls.piForPolicy(util.q2Policy(q), system) one = np.ones((pi.shape[1],)) piRowSum = pi * one ## check numerical ranges on pi if pi.max() < 0: raise ValueError("Pi has some negative values") if pi.min() > 0: raise ValueError("Pi has values greater than 1") if abs(np.amin(piRowSum) - 1) > tol or abs(np.amax(piRowSum) - 1): raise ValueError("Rows of pi do not sum to 1") ## make sure random agent estimates worse Q-values gamma = 0.9 randPol = [range(numValidTrt) for i in range(1 << numNodes)] vRa = PolicyIteration2.vForPolicy(randPol, system, gamma=gamma).tolist() polOpt = PolicyIteration2.solve(deepcopy(system), gamma=gamma) vOpt = PolicyIteration2.vForPolicy(polOpt, system, gamma=gamma).tolist() cnt = sum(i > j for i, j in zip(vRa, vOpt)) if cnt > 0: raise ValueError("Random Agent does better " + "than optimal V-function %d times" % cnt) ## check that gamma = 0 results in a v-function equal to ## expected immediate rewards gamma = 0.0 polOpt = PolicyIteration2.solve(deepcopy(system), gamma=gamma) vOpt = PolicyIteration2.vForPolicy(polOpt, system, gamma=gamma) pi = cls.piForPolicy(polOpt, system) if np.linalg.norm(pi.dot(r) - vOpt, 2) > 1e-10: raise ValueError("Gamma = 0 did not result in expected " + "immediate rewards") ## check analytical values with simulated values gamma = 0.5 polOpt = PolicyIteration2.solve(deepcopy(system), gamma=gamma) vOpt = PolicyIteration2.vForPolicy(polOpt, system, gamma=gamma) agentOpt = agents.PolAgent.fromPol(polOpt) diff = 0.0 for s in range(1 << numNodes): print "Checking state % 4d" % s val = vPiS(s, system, agentOpt, gamma=gamma, finalT=10, reps=1000) diff += abs(vOpt[s] - val) diff /= float(1 << numNodes) # print "diff from sim: % 10.6f" % diff if diff > 0.05: raise ValueError("V values differ from sim by %f" % diff)