########################### import valueIterationAgents, qlearningAgents a = None if opts.agent == 'value': a = valueIterationAgents.ValueIterationAgent(mdp, opts.discount, opts.iters) elif opts.agent == 'q': #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp) gridWorldEnv = GridworldEnvironment(mdp) actionFn = lambda state: mdp.getPossibleActions(state) qLearnOpts = {'gamma': opts.discount, 'alpha': opts.learningRate, 'epsilon': opts.epsilon, 'actionFn': actionFn} a = qlearningAgents.QLearningAgent(**qLearnOpts) elif opts.agent == 'random': # # No reason to use the random agent without episodes if opts.episodes == 0: opts.episodes = 10 class RandomAgent: def getAction(self, state): return random.choice(mdp.getPossibleActions(state)) def getValue(self, state): return 0.0 def getQValue(self, state, action): return 0.0 def getPolicy(self, state): "NOTE: 'random' is a special policy value; don't use it in your code." return 'random' def update(self, state, action, nextState, reward):
def main(myargs):
    """Run a Gridworld session configured by an argument string.

    The string is split on whitespace and installed as ``sys.argv`` before
    ``parseOptions()`` is called, so it plays the role of a command line.
    The function then builds the MDP and environment, starts a text or
    graphical display, creates the requested agent, optionally shows the
    value-iteration results, runs ``opts.episodes`` episodes and, for the
    Q-learning agent, shows the learned Q-values and values.

    Args:
        myargs: Whitespace-separated option string.

    Raises:
        ValueError: If ``opts.agent`` is not 'value', 'q' or 'random' and
            manual control was not requested.
        SystemExit: If a KeyboardInterrupt arrives while the display is
            starting or while values are being shown.
    """
    sys.argv = myargs.split()
    opts = parseOptions()

    ###########################
    # GET THE GRIDWORLD
    ###########################

    if opts.grid == 'VerticalBridgeGrid':
        opts.gridSize = 120

    import gridworld
    # Grid factories are looked up by name: "get" + <grid name>.
    mdpFunction = getattr(gridworld, "get" + opts.grid)
    mdp = mdpFunction()
    mdp.setLivingReward(opts.livingReward)
    mdp.setNoise(opts.noise)
    env = gridworld.GridworldEnvironment(mdp)

    ###########################
    # GET THE DISPLAY ADAPTER
    ###########################

    import textGridworldDisplay
    display = textGridworldDisplay.TextGridworldDisplay(mdp)
    if not opts.textDisplay:
        import graphicsGridworldDisplay
        display = graphicsGridworldDisplay.GraphicsGridworldDisplay(
            mdp, opts.gridSize, opts.speed)
    try:
        display.start()
    except KeyboardInterrupt:
        sys.exit(0)

    ###########################
    # GET THE AGENT
    ###########################

    import valueIterationAgents
    import qlearningAgents

    a = None
    if opts.agent == 'value':
        a = valueIterationAgents.ValueIterationAgent(mdp, opts.discount,
                                                     opts.iters)
    elif opts.agent == 'q':
        actionFn = lambda state: mdp.getPossibleActions(state)
        qLearnOpts = {
            'gamma': opts.discount,
            'alpha': opts.learningRate,
            'epsilon': opts.epsilon,
            'actionFn': actionFn,
        }
        a = qlearningAgents.QLearningAgent(**qLearnOpts)
    elif opts.agent == 'random':
        # No reason to use the random agent without episodes
        if opts.episodes == 0:
            opts.episodes = 10

        class RandomAgent:
            def getAction(self, state):
                return random.choice(mdp.getPossibleActions(state))

            def getValue(self, state):
                return 0.0

            def getQValue(self, state, action):
                return 0.0

            def getPolicy(self, state):
                "NOTE: 'random' is a special policy value; don't use it in your code."
                return 'random'

            def update(self, state, action, nextState, reward):
                pass

        a = RandomAgent()
    elif not opts.manual:
        # Raising a bare string is a TypeError in Python 3; raise a real
        # exception so the message actually reaches the user.
        raise ValueError('Unknown agent type: ' + str(opts.agent))

    ###########################
    # RUN EPISODES
    ###########################

    # DISPLAY Q/V VALUES BEFORE SIMULATION OF EPISODES
    try:
        if not opts.manual and opts.agent == 'value':
            if opts.valueSteps:
                # Show the value function after each intermediate number of
                # iterations by building a fresh agent for every count.
                for i in range(opts.iters):
                    tempAgent = valueIterationAgents.ValueIterationAgent(
                        mdp, opts.discount, i)
                    display.displayValues(
                        tempAgent,
                        message="VALUES AFTER " + str(i) + " ITERATIONS")
                    display.pause()

            display.displayValues(
                a, message="VALUES AFTER " + str(opts.iters) + " ITERATIONS")
            display.pause()
            display.displayQValues(
                a, message="Q-VALUES AFTER " + str(opts.iters) + " ITERATIONS")
            display.pause()
    except KeyboardInterrupt:
        sys.exit(0)

    # FIGURE OUT WHAT TO DISPLAY EACH TIME STEP (IF ANYTHING)
    displayCallback = lambda x: None
    if not opts.quiet:
        if opts.manual and opts.agent is None:
            displayCallback = lambda state: display.displayNullValues(state)
        elif opts.agent in ('random', 'value'):
            displayCallback = lambda state: display.displayValues(
                a, state, "CURRENT VALUES")
        elif opts.agent == 'q':
            displayCallback = lambda state: display.displayQValues(
                a, state, "CURRENT Q-VALUES")

    if opts.quiet:
        messageCallback = lambda x: None
    else:
        messageCallback = lambda x: printString(x)

    # FIGURE OUT WHETHER TO WAIT FOR A KEY PRESS AFTER EACH TIME STEP
    pauseCallback = lambda: None
    if opts.pause:
        pauseCallback = lambda: display.pause()

    # FIGURE OUT WHETHER THE USER WANTS MANUAL CONTROL (FOR DEBUGGING AND DEMOS)
    if opts.manual:
        decisionCallback = lambda state: getUserAction(
            state, mdp.getPossibleActions)
    else:
        decisionCallback = a.getAction

    # RUN EPISODES
    if opts.episodes > 0:
        print()
        print("RUNNING", opts.episodes, "EPISODES")
        print()

    returns = 0
    for episode in range(1, opts.episodes + 1):
        returns += runEpisode(a, env, opts.discount, decisionCallback,
                              displayCallback, messageCallback, pauseCallback,
                              episode)

    if opts.episodes > 0:
        print()
        print("AVERAGE RETURNS FROM START STATE: " +
              str((returns + 0.0) / opts.episodes))
        print()
        print()

    # DISPLAY POST-LEARNING VALUES / Q-VALUES
    if opts.agent == 'q' and not opts.manual:
        try:
            display.displayQValues(
                a, message="Q-VALUES AFTER " + str(opts.episodes) + " EPISODES")
            display.pause()
            display.displayValues(
                a, message="VALUES AFTER " + str(opts.episodes) + " EPISODES")
            display.pause()
        except KeyboardInterrupt:
            sys.exit(0)
#lims = (1.5, 2.55, .26, 3.2) num_intervals = 10 ndgrid = MultiDimGrid.MultiDimGrid([ MultiDimGrid.SingleDimGrid(high=lim, low=-lim, num_intervals=num_intervals) for lim in lims ]) gymUtil.transformEnvState(env, lambda x: ndgrid.discretize(x)) # Define agent #env = gym.make('Acrobot-v0') kwargs = { 'epsilon': 0.01, 'gamma': 1, 'alpha': 0.2, } agent = qlearningAgents.QLearningAgent(**kwargs) agent.getLegalActions = lambda x: range(env.action_space.n) def make_experiment(): kwargs = { 'agent': agent, 'environment': env, 'numPolicyChecks': numPolicyChecks, 'numEpisodesPerCheck': numEpisodesPerCheck, 'numTrainEpisodes': numTrainEpisodes, } for key, value in sorted(kwargs.items(), key=lambda x: x[0]): logging.info('\t%s = %s' % (str(key, ), str(value))) experiment = Experiment.Experiment(**kwargs)
def __init__(self,
             grid_name='DiscountGrid',
             discount=0.9,
             learning_rate=0.5,
             living_reward=0.0,
             noise=0.2,
             epsilon=0.3,
             display_speed=0.5,
             grid_size=150,
             text_only=False,
             n_episodes=100,
             agent_window_size=1,
             agent_max_n_experiences=1000,
             is_use_q_agent=False):
    """Set up the gridworld, its display, and the learning agent.

    Args:
        grid_name: Suffix appended to "get" to find the grid factory in the
            ``gridworld`` module.
        discount: Stored on the instance and passed to the agent as 'gamma'.
        learning_rate: Passed to the agent as 'alpha'.
        living_reward: Forwarded to ``mdp.setLivingReward``.
        noise: Forwarded to ``mdp.setNoise``.
        epsilon: Passed to the agent as 'epsilon'.
        display_speed: Stored on the instance and given to the graphical
            display.
        grid_size: Given to the graphical display.
        text_only: When true, the text display is kept and the graphical
            display is never imported.
        n_episodes: Stored on the instance.
        agent_window_size: ``window_size`` for the TamerQAgent.
        agent_max_n_experiences: ``max_n_experiences`` for the TamerQAgent.
        is_use_q_agent: When true, a plain QLearningAgent is created and no
            user input module is built; otherwise a TamerQAgent is used.
    """
    self.text_only = text_only
    self.display_speed = display_speed
    self.n_episodes = n_episodes
    self.discount = discount

    # --- input module: only needed when the Tamer agent is in use ---
    self.user_input_module = (
        None if is_use_q_agent else user_input.UserInputModule())

    # --- gridworld ---
    # noinspection PyUnresolvedReferences
    import gridworld
    self.mdp = mdp = getattr(gridworld, "get" + grid_name)()
    mdp.setLivingReward(living_reward)
    mdp.setNoise(noise)
    self.env = gridworld.GridworldEnvironment(mdp)

    # --- display adapter: text first, replaced by graphics if allowed ---
    import textGridworldDisplay
    self.display = textGridworldDisplay.TextGridworldDisplay(mdp)
    if not text_only:
        import graphicsGridworldDisplay
        self.display = graphicsGridworldDisplay.GraphicsGridworldDisplay(
            mdp, grid_size, display_speed)
    try:
        self.display.start()
    except KeyboardInterrupt:
        sys.exit(0)

    # --- agent ---
    import qlearningAgents
    self.gridWorldEnv = GridworldEnvironment(mdp)

    def legal_actions(state):
        # Looks up self.mdp at call time, like the lambda it replaces.
        return self.mdp.getPossibleActions(state)

    agent_kwargs = dict(
        gamma=discount,
        alpha=learning_rate,
        epsilon=epsilon,
        actionFn=legal_actions,
    )
    if not is_use_q_agent:
        self.agent = qlearningAgents.TamerQAgent(
            max_n_experiences=agent_max_n_experiences,
            window_size=agent_window_size,
            **agent_kwargs)
    else:
        self.agent = qlearningAgents.QLearningAgent(**agent_kwargs)
if opts.agent == 'value': a = valueIterationAgents.ValueIterationAgent(mdp, opts.discount, opts.iters) elif opts.agent == 'q': #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp) gridWorldEnv = GridworldEnvironment(mdp) actionFn = lambda state: mdp.getPossibleActions(state) qLearnOpts = { 'gamma': opts.discount, 'alpha': opts.learningRate, 'epsilon': opts.epsilon, 'actionFn': actionFn, 'numTraining': opts.episodes } a = qlearningAgents.QLearningAgent(env, **qLearnOpts) elif opts.agent == 'random': # # No reason to use the random agent without episodes if opts.episodes == 0: opts.episodes = 10 class RandomAgent: def getAction(self, state): return random.choice(mdp.getPossibleActions(state)) def getValue(self, state): return 0.0 def getQValue(self, state, action): return 0.0