import numpy as np

def test():
    # Self-check: run LSPI on the chainwalk domain (assumes LSPI is defined
    # in this module, as it is called without an import).
    try:
        from gridworld.chainwalk import Chainwalk
    except ImportError:
        print("Unable to import Chainwalk for test!")
        return
    cw = Chainwalk()
    trace = cw.trace()
    zeros = np.zeros(cw.nfeatures())
    w = LSPI(trace, 0.0001, cw, zeros, show=True)
    print(w)
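# A minimal usage sketch for the test above (the __main__ guard is an
# illustrative addition, not part of the original module):
if __name__ == "__main__":
    test()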
    pi = [gw.linear_policy(w0, s) for s in range(gw.nstates)]
    gw.set_arrows(pi)
    gw.background()
    gw.mainloop()

if test_sarsa:
    gw = GridworldGui(nrows=9, ncols=9, endstates=[0], walls=[])
    learner = Sarsa(8, 81, 0.5, 0.9, 0.9, 0.1)
    learner.learn(10000, gw, verbose=True)
    pi = [learner.best(s) for s in range(gw.nstates)]
    gw.set_arrows(pi)
    gw.background()
    gw.mainloop()

if test_chainwalk:
    cw = Chainwalk()
    t = cw.trace(1000)
    policy0 = np.zeros(cw.nfeatures())
    print(LSTDQ(t, cw, policy0))

if test_scale:
    gw = GridworldGui(nrows=64, ncols=64, size=8, endstates=[0], walls=[])
    try:
        # Reuse a cached trace if one exists; pickle files need binary mode.
        t = pickle.load(open("scale_trace.pck", "rb"))
    except IOError:
        t = gw.trace(100000)  # , show=False
        pickle.dump(t, open("scale_trace.pck", "wb"), pickle.HIGHEST_PROTOCOL)
    policy0 = np.zeros(gw.nfeatures())
    # Alternative single-iteration run with the sparse "alt" solver:
    # w0, weights0 = LSPI(t, 0.005, gw, policy0, maxiter=1, method="alt",
    #                     debug=False, timer=True, show=False, format="csr")
    w0, weights0 = LSPI(t, 0.005, gw, policy0, maxiter=10, method="parallel",
                        debug=False, timer=True, show=True, ncpus=6)
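    # Hedged sketch: the policy learned in the scale test can be rendered
    # with the same linear_policy / set_arrows pattern used for the smaller
    # grids above (assuming GridworldGui scales to the 64x64 layout):
    pi = [gw.linear_policy(w0, s) for s in range(gw.nstates)]
    gw.set_arrows(pi)
    gw.background()
    gw.mainloop()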
"""
Author: Jeremy M. Stober
Program: TD_EXAMPLE.PY
Date: Friday, February 24 2012
Description: Examples using TD algorithms to learn value functions.
"""

from gridworld.boyan import Boyan
from gridworld.chainwalk import Chainwalk
from cartpole import CartPole
from td import TD, TDQ, TDQCmac, SarsaCmac, Sarsa, ActorCritic, ActorCriticCmac

# A simple environment.
env = Boyan()
learner = TD(13, 0.1, 1.0, 0.8)
learner.learn(1000, env, env.random_policy)
print(learner.V)

env = Chainwalk()
learnerq = TDQ(2, 4, 0.1, 0.9, 0.8)  # constructed but not trained in this example

env = CartPole()
# learnerq = SarsaCmac(2, 0.01, 0.95, 0.9, 0.01)
# learnerq = Sarsa(2, 170, 0.001, 0.95, 0.5, 0.01)
# learnerq = ActorCritic(2, 162, 0.5, 0.5, 0.95, 0.8, 0.9)
# From an old Sutton paper -- seems to work quite well.
learnerq = ActorCriticCmac(2, 0.5, 1.0, 0.95, 0.8, 0.9)
# Clearly does some learning, but not nearly as well; the policy is not as stable.
learnerq.learn(1000, env)
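# Hedged sketch: inspecting individual state values after TD learning.
# Assuming learner.V is an indexable array over the 13 Boyan chain states
# (only the whole vector is printed above), a single entry can be read as:
print("V(0) = %s" % learner.V[0])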