def test_gapworld():
    # Register the avatar first
    vgdl.registry.register_class(RightMovingJumpingAvatar)

    game = load_gapworld_game_and_level()
    env = VGDLPybrainEnvironment(game, GapworldObserver(game))
    task = VGDLPybrainTask(env)

    mapper = vgdl.mdp.MDPConverter(task)
    T, R = mapper.convert_task_to_mdp()

    print('Known states:')
    print(mapper.get_observations())
    for action_i in range(T.shape[0]):
        print('Action {}:'.format(env.action_set[action_i]))
        print(T[action_i])
    print('Rewards:')
    print(R)

    from pybrain.rl.learners.modelbased import policyIteration, trueValues
    # policy is S x A
    policy, optimal_T = policyIteration(T, R, discountFactor=.9)
    # So this seems wrong whether we allow transitions from absorbing states
    # or not, but it's a good indication
    V = trueValues(optimal_T, R, discountFactor=.9)

    print('Optimal policy:')
    print(policy)

    import ipdb
    ipdb.set_trace()
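
# A minimal standalone sketch of the conventions assumed above: policyIteration
# is given one S x S row-stochastic transition matrix per action plus a length-S
# reward vector, and returns the S x A policy together with the transition
# matrix induced by that policy. The toy 3-state chain and the helper name
# `_toy_mdp_example` are illustrative assumptions, not part of the gapworld test.
def _toy_mdp_example():
    import numpy as np
    from pybrain.rl.learners.modelbased import policyIteration, trueValues

    # Action 0 moves right along the chain, action 1 moves back to the start;
    # state 2 is absorbing.
    Ts = [np.array([[0., 1., 0.],
                    [0., 0., 1.],
                    [0., 0., 1.]]),
          np.array([[1., 0., 0.],
                    [1., 0., 0.],
                    [0., 0., 1.]])]
    R = np.array([0., 0., 1.])  # reward only in the absorbing state

    policy, T_pi = policyIteration(Ts, R, discountFactor=.9)
    print(policy)                                   # S x A, should prefer action 0
    print(trueValues(T_pi, R, discountFactor=.9))   # discounted state values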
def plotLSPIValues(gametype, layout, discountFactor=0.9, useTD=False, showValue=False):
    # build the game
    g = VGDLParser().parseGame(gametype)
    g.buildLevel(layout)

    # transform into an MDP and the mapping to observations
    C = MDPconverter(g)
    Ts, R, fMap = C.convert()

    # find the best least-squares approximation to the policy,
    # given only observations, not the state information
    if useTD:
        # state-based
        _, Tlspi = LSTD_PI_policy(fMap, Ts, R, discountFactor=discountFactor)
    else:
        # state-action-based
        _, Tlspi = LSPI_policy(fMap, Ts, R, discountFactor=discountFactor)

    # evaluate the policy
    Vlspi = trueValues(Tlspi, R, discountFactor=discountFactor)

    # plot those values
    featurePlot((g.width, g.height), C.states, Vlspi)

    if showValue:
        # expected discounted reward at initial state
        Vinit = Vlspi[C.initIndex()]
        pylab.xlabel("V0=%.4f" % Vinit)
def plotOptimalValues(gametype, layout, discountFactor=0.9, showValue=False):
    # build the game
    g = VGDLParser().parseGame(gametype)
    g.buildLevel(layout)

    # transform into an MDP
    C = MDPconverter(g)
    Ts, R, _ = C.convert()

    # find the optimal policy
    _, Topt = policyIteration(Ts, R, discountFactor=discountFactor)

    # evaluate the policy
    Vopt = trueValues(Topt, R, discountFactor=discountFactor)

    # plot those values
    featurePlot((g.width, g.height), C.states, Vopt, plotdirections=True)

    if showValue:
        # expected discounted reward at initial state
        Vinit = Vopt[C.initIndex()]
        pylab.xlabel("V0=%.4f" % Vinit)
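
# A minimal usage sketch for the two plotting helpers above, assuming the maze
# definitions `polarmaze_game` and `maze_level_1` from examples.gridphysics.mazes
# are available (an assumption) and that pylab is importable; `_example_value_plots`
# is a hypothetical helper, not part of the original module.
def _example_value_plots():
    import pylab
    from examples.gridphysics.mazes import polarmaze_game, maze_level_1

    # LSPI values from observation features vs. exact values on the state MDP
    pylab.subplot(1, 2, 1)
    plotLSPIValues(polarmaze_game, maze_level_1, discountFactor=0.9, showValue=True)
    pylab.subplot(1, 2, 2)
    plotOptimalValues(polarmaze_game, maze_level_1, discountFactor=0.9, showValue=True)
    pylab.show()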