def test_gapworld():
    # Register the avatar class before the game is parsed.
    vgdl.registry.register_class(RightMovingJumpingAvatar)

    game = load_gapworld_game_and_level()
    env = VGDLPybrainEnvironment(game, GapworldObserver(game))
    task = VGDLPybrainTask(env)

    # Enumerate the reachable states and build the MDP tensors.
    # T is indexed by action first: T[a] is that action's transition matrix.
    mapper = vgdl.mdp.MDPConverter(task)
    T, R = mapper.convert_task_to_mdp()

    print('Known states:')
    print(mapper.get_observations())
    for action_i in range(T.shape[0]):
        print('Action {}:'.format(env.action_set[action_i]))
        print(T[action_i])
    print('Rewards:')
    print(R)

    from pybrain.rl.learners.modelbased import policyIteration, trueValues
    # policy is S x A
    policy, optimal_T = policyIteration(T, R, discountFactor=.9)
    # This looks wrong whether or not we allow transitions out of
    # absorbing states, but it is a useful indication.
    V = trueValues(optimal_T, R, discountFactor=.9)

    print('Optimal policy:')
    print(policy)
    print('State values:')
    print(V)
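# For intuition, a minimal self-contained sketch of the policy-evaluation
# step that trueValues performs conceptually: solve the Bellman equation
# V = R + gamma * Topt @ V for the transition matrix Topt induced by a fixed
# policy. Assumptions, not the library's verified conventions: Topt is a
# row-stochastic S x S matrix and R is a per-state reward vector.
import numpy as np

def evaluate_policy_sketch(Topt, R, discountFactor=0.9):
    # (I - gamma * Topt) V = R has a unique solution for gamma < 1.
    n = np.asarray(Topt).shape[0]
    return np.linalg.solve(np.eye(n) - discountFactor * np.asarray(Topt),
                           np.asarray(R))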
def buildOptimal(game_env, discountFactor=0.99):
    """ Given a game, find the optimal (state-based) policy and
    return an agent that plays accordingly. """
    from vgdl.mdpmap import MDPconverter
    C = MDPconverter(env=game_env)
    Ts, R, _ = C.convert()
    policy, _ = policyIteration(Ts, R, discountFactor=discountFactor)
    game_env.reset()

    def stateIndex(*_):
        # Map the environment's current state to its index in the
        # converter's state list; the policy is indexed by that row.
        return C.states.index(game_env.getState())

    return PolicyDrivenAgent(policy, stateIndex)
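# A hypothetical usage sketch for buildOptimal. The reset/performAction
# methods on the environment and the agent's getAction method are
# assumptions about the surrounding interface layer, not verified APIs.
def runOptimalAgent(game_env, steps=100):
    agent = buildOptimal(game_env)  # buildOptimal also resets the environment
    for _ in range(steps):
        # The agent looks up the greedy action for the current state index
        # and the environment executes it.
        game_env.performAction(agent.getAction())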
def plotOptimalValues(gametype, layout, discountFactor=0.9, showValue=False):
    # build the game
    g = VGDLParser().parseGame(gametype)
    g.buildLevel(layout)
    # transform into an MDP
    C = MDPconverter(g)
    Ts, R, _ = C.convert()
    # find the optimal policy
    _, Topt = policyIteration(Ts, R, discountFactor=discountFactor)
    # evaluate the policy
    Vopt = trueValues(Topt, R, discountFactor=discountFactor)
    # plot those values
    featurePlot((g.width, g.height), C.states, Vopt, plotdirections=True)
    if showValue:
        # expected discounted reward at the initial state
        Vinit = Vopt[C.initIndex()]
        pylab.xlabel("V0=%.4f" % Vinit)
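# A usage sketch for plotOptimalValues. `game_source` and `level_source` are
# placeholders for a VGDL game description string and an ASCII level layout
# (e.g. one of the maze examples shipped with the repo); they are not
# defined here.
def showOptimalValues(game_source, level_source):
    # Plot the optimal state values and display V0, the expected
    # discounted return from the initial state.
    plotOptimalValues(game_source, level_source,
                      discountFactor=0.9, showValue=True)
    pylab.show()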