def initExperiment(alg, optimistic=True):
    env = Maze(envmatrix, (7, 7))

    # create task
    task = MDPMazeTask(env)

    # create value table and initialize it (optimistically with ones, or with zeros)
    table = ActionValueTable(81, 4)
    if optimistic:
        table.initialize(1.)
    else:
        table.initialize(0.)

    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    learner = alg()

    # standard exploration is e-greedy, but a different type can be chosen as well
    # learner.explorer = BoltzmannExplorer()

    agent = LearningAgent(table, learner)
    agent.batchMode = False

    experiment = Experiment(task, agent)
    experiment.allRewards = []
    return experiment
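Not part of the original snippet: a minimal sketch of driving this factory, assuming Q is importable from pybrain.rl.learners as in the other snippets here. Since the agent is put in online mode (batchMode = False), a driver only needs to run interactions; the counts are arbitrary.

experiment = initExperiment(Q, optimistic=True)
for _ in range(100):
    # batchMode is False, so the Q-update happens online inside each interaction
    experiment.doInteractions(100)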
def testMaze():
    # simplified version of the reinforcement learning tutorial example
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 0, 0, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1]])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(3):
        experiment.doInteractions(40)

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right,
    # so flip the matrix upside down to match the NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
class SpadesPlayer:
    def __init__(self, game_deck, game_env):
        self.gameDeck = game_deck
        self.hand = SpadesDeckTest.SpadesDeckTest.draw_hand(self.gameDeck)
        self.gamesWon = 0
        self.gamesTied = 0
        self.av_table = ActionValueTable(4, 1)
        self.av_table.initialize(0.0)
        self.env = game_env
        self.task = SpadesTask.SpadesTask(game_env)
        self.agent = None
        self.learner = None

    def get_value(self):
        return self.hand

    def play_card(self, cardindex):
        print cardindex
        retCard = copy.copy(self.hand[cardindex])
        self.hand.remove(self.hand[cardindex])
        return retCard

    def get_new_hand(self):
        self.hand = SpadesDeckTest.SpadesDeckTest.draw_hand(self.gameDeck)
def run_bbox(verbose=False):
    n_features = n_actions = max_time = -1

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()

    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print av_table._params

    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)
    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()

    bbox.finish(verbose=1)
def q_learning_table():
    controller = ActionValueTable(36, 4)
    learner = Q()
    controller.initialize(1.)
    agent = LearningAgent(controller, learner)

    score_list = []
    turn_list = []
    # +100 to match the training budget of the neural-network version
    for i in range(600):
        print_state(agent.module.getValue, 'table')
        score, turn = play(agent, 'table')
        score_list.append(score)
        turn_list.append(turn)
        agent.learn()
        agent.reset()
        print i, int(numpy.mean(score_list)), max(score_list), score, turn

    with open('./agent.dump', 'w') as f:
        pickle.dump(agent, f)
    with open('./score.dump', 'w') as f:
        pickle.dump([score_list, turn_list], f)
def createExperimentInstance():
    gymRawEnv = gym.make('MountainCarContinuous-v0')

    cartPositionGroup = Digitizer.buildBins(-1.2, 0.6, 16)
    cartVelocityGroup = Digitizer.buildBins(-0.07, 0.07, 4)
    actionDedigitizer = Digitizer.build(-1.0, 1.0, 5, True)
    # print("Cart position bins:", cartPositionGroup)
    # print("Cart velocity bins:", cartVelocityGroup)
    # print("Cart force bins:", actionDedigitizer.bins, actionDedigitizer.possibleValues())

    observationDigitizer = ArrayDigitizer([cartPositionGroup, cartVelocityGroup])
    transformation = EnvTransformation(observationDigitizer, actionDedigitizer)

    task = GymTask.createTask(gymRawEnv)
    env = task.env
    env.setTransformation(transformation)
    # env.setCumulativeRewardMode()

    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    ## alpha -- learning rate (preference of new information)
    ## gamma -- discount factor (importance of future reward)

    # create value table and initialize with zeros
    table = ActionValueTable(observationDigitizer.states, actionDedigitizer.states)
    table.initialize(0.0)
    # table.initialize( np.random.rand( table.paramdim ) )

    agent = createAgent(table)
    experiment = Experiment(task, agent)
    experiment = ProcessExperiment(experiment, doSingleExperiment)
    return experiment
def createExperimentInstance():
    gymRawEnv = gym.make('MountainCar-v0')

    cartPositionGroup = Digitizer.buildBins(-1.2, 0.6, 16)
    cartVelocityGroup = Digitizer.buildBins(-0.07, 0.07, 16)
    # print("Cart position bins:", cartPositionGroup)
    # print("Cart velocity bins:", cartVelocityGroup)

    observationDigitizer = ArrayDigitizer([cartPositionGroup, cartVelocityGroup])
    transformation = EnvTransformation(observationDigitizer)

    task = GymTask.createTask(gymRawEnv)
    env = task.env
    env.setTransformation(transformation)
    # env.setCumulativeRewardMode()

    # create value table and initialize with zeros
    table = ActionValueTable(observationDigitizer.states, env.numActions)
    table.initialize(0.0)
    # table.initialize( np.random.rand( table.paramdim ) )

    agent = createAgent(table)
    experiment = Experiment(task, agent)
    experiment = ProcessExperiment(experiment, ExperimentIteration())
    return experiment
def setup_RL():
    # create the maze with walls (1)
    envmatrix = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    env = Maze(envmatrix, (7, 7))

    # create task
    task = MDPMazeTask(env)

    # create value table and initialize with zeros
    table = ActionValueTable(81, 4)
    table.initialize(0.)

    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    # learner = Q()
    learner = SARSA()

    # create agent
    agent = LearningAgent(table, learner)

    # create experiment
    experiment = Experiment(task, agent)
    return experiment, agent, table
def load_AV_Table(self):
    load_D = loadData(self._filename)
    if load_D[1]:
        self._av_table = load_D[0]
        print "Found Table!"
    else:
        self._av_table = ActionValueTable(self._number_of_states, self._actions)
        self._av_table.initialize(0.0)
        print "No training for this format. Creating new AV table"
class IntelligentAgent(Agent, LearningAgent):
    """An agent that learns through a value-based RL algorithm."""

    def __init__(self, name, num_states, num_actions,
                 epsilon=0.3, gamma=0.99, alpha=0.95):
        self.controller = ActionValueTable(num_states, num_actions)
        self.controller.initialize(np.random.rand(num_states * num_actions))
        self.learner = Q(gamma=gamma, alpha=alpha)
        self.learner.batchMode = False
        self.learner.explorer.epsilon = epsilon
        LearningAgent.__init__(self, self.controller, self.learner)
        Agent.__init__(self, name)

    def choose_action(self):
        return self.getAction()[0]
def initExperiment(learnalg='Q', history=None, binEdges='10s',
                   scriptfile='./rlRunExperiment_v2.pl',
                   resetscript='./rlResetExperiment.pl'):
    if binEdges == '10s':
        centerBinEdges = centerBinEdges_10s
    elif binEdges == '30s':
        centerBinEdges = centerBinEdges_30s
    elif binEdges == 'lessperturbed':
        centerBinEdges = centerBinEdges_10s_lessperturbed
    elif binEdges is None:
        centerBinEdges = None
    else:
        raise Exception("No bins for given binEdges setting")

    env = OmnetEnvironment(centerBinEdges, scriptfile, resetscript)
    if history is not None:
        env.data = history['data']

    task = OmnetTask(env, centerBinEdges)
    if history is not None:
        task.allrewards = history['rewards']

    if learnalg == 'Q':
        nstates = env.numSensorBins ** env.numSensors
        if history is None:
            av_table = ActionValueTable(nstates, env.numActions)
            av_table.initialize(1.)
        else:
            av_table = history['av_table']
        learner = Q(0.1, 0.9)  # alpha, gamma
        learner._setExplorer(EpsilonGreedyExplorer(0.05))  # epsilon
    elif learnalg == 'NFQ':
        av_table = ActionValueNetwork(env.numSensors, env.numActions)
        learner = NFQ()
    else:
        raise Exception("learnalg unknown")

    agent = LearningAgent(av_table, learner)

    experiment = Experiment(task, agent)
    if history is None:
        experiment.nruns = 0
    else:
        experiment.nruns = history['nruns']
    return experiment
def get_discrete_task_agent(generators, market, nStates, nOffer, markups,
                            withholds, maxSteps, learner, Pd0=None, Pd_min=0.0):
    """ Returns a tuple of task and agent for the given learner.
    """
    env = pyreto.discrete.MarketEnvironment(generators, market,
                                            numStates=nStates,
                                            numOffbids=nOffer,
                                            markups=markups,
                                            withholds=withholds,
                                            Pd0=Pd0, Pd_min=Pd_min)
    task = pyreto.discrete.ProfitTask(env, maxSteps=maxSteps)
    nActions = len(env._allActions)
    module = ActionValueTable(numStates=nStates, numActions=nActions)
    agent = LearningAgent(module, learner)
    return task, agent
def __init__(self, event_queue_name, hub_queue_name):
    super().__init__()

    # create environment
    self.conn = boto.sqs.connect_to_region(constants.REGION)
    self.event_queue = self.conn.get_queue(event_queue_name)
    self.event_queue.set_message_class(MHMessage)
    self.env = DogEnv(DogEnv.ALL_QUIET, DogEnv.ALL_QUIET,
                      self.event_queue, hub_queue_name)
    self.env.delay = (self.episodes == 1)

    # create task
    self.task = QuietDogTask(self.env)

    # create value table and initialize with ones
    # TODO: Get number of states from DogEnv
    self.table = ActionValueTable(2 * 5 * 4, 5 * 4)
    self.table.initialize(1.)

    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    self.learner = SARSA()

    # standard exploration is e-greedy, but a different type can be chosen as well
    self.learner.explorer = BoltzmannExplorer()

    # create agent
    self.agent = DogAgent(self.table, self.learner)

    # create experiment
    self.experiment = Experiment(self.task, self.agent)
def __init__(self, name, clientID, sensorHandle, bodyHandle):
    ''' Constructor '''
    self.resetParameters()
    controller = ActionValueTable(150, 5)  # pyBrain
    controller.initialize(1.)              # pyBrain
    learner = Q()                          # pyBrain
    self.__mind = AgentMind(controller, learner)  # with pyBrain
    self.__controller = controller
    self.__name = name
    self.__clientID = clientID          # client ID of the Dummy object
    self.__sensorHandle = sensorHandle  # proximity sensor handle of the V-Rep agent
    self.__bodyHandle = bodyHandle      # BubbleRob body handle
    self.__mind.setInput("name", name)
    self.__pybrainEnvironment = LocomotionEnvironment()
    self.__pybrainTask = LocomotionTask(self.__pybrainEnvironment)
def maze():
    # import sys, time
    pylab.gray()
    pylab.ion()

    # The goal appears to be in the upper right
    structure = [
        "!!!!!!!!!!",
        "! ! ! ! !",
        "! !! ! ! !",
        "! ! !",
        "! !!!!!! !",
        "! ! ! !",
        "! ! !!!! !",
        "! !",
        "! !!!!! !",
        "! ! !",
        "!!!!!!!!!!",
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(100):
        experiment.doInteractions(100)
        agent.learn()
        agent.reset()
        # 4 actions, 81 locations/states (9x9 grid)
        # max(1) gives/plots the biggest objective function value for that square
        pylab.pcolor(controller.params.reshape(81, 4).max(1).reshape(9, 9))
        pylab.draw()

    # (0, 0) is upper left and (0, N) is upper right,
    # so flip the matrix upside down to match the NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print("Maze map:")
    print("\n".join("".join(row) for row in maze))
    print("Greedy policy:")
    print("\n".join("".join(row) for row in greedy_policy))
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list('!!!!!!!!!!'),
        list('! ! ! ! !'),
        list('! !! ! ! !'),
        list('! ! !'),
        list('! !!!!!! !'),
        list('! ! ! !'),
        list('! ! !!!! !'),
        list('! !'),
        list('! !!!!! !'),
        list('! ! !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right,
    # so flip the matrix upside down to match the NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
def initialize(self, grid):
    """ Initializes all the (s, a) pairs with the no-traffic travel time. """
    # not every action is possible from every state
    ActionValueTable.initialize(self, float("-inf"))

    for node, time in grid.all_shortest_path_lengths():
        in_edges = grid.grid.in_edges([node])
        for edge in in_edges:
            for period in xrange(const.PERIODS):
                # state involves the node previous to the current node
                s = task.get_state(g.node_number(edge[0]), period)
                a = g.action(edge)
                q = -time - grid.grid.get_edge_data(*edge)["weight"]
                self.updateValue(s, a, q)

    # Q(s_final, a) for all actions is 0
    for p in xrange(const.PERIODS):
        s = task.get_state(const.NODES - 1, p)
        for a in xrange(const.POSSIBLE_ACTIONS):
            self.updateValue(s, a, 0)
class RL:
    def __init__(self):
        self.av_table = ActionValueTable(4, 5)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        rassh.core.constants.rl_params = self.av_table.params.reshape(4, 5)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()
def createExperimentInstance():
    gymRawEnv = gym.make('Taxi-v2')

    transformation = EnvTransformation()

    task = GymTask.createTask(gymRawEnv)
    env = task.env
    env.setTransformation(transformation)
    ## env.setCumulativeRewardMode()

    ## create value table and initialize with zeros
    table = ActionValueTable(env.numStates, env.numActions)
    # table = ActionValueTableWrapper(table)
    table.initialize(0.0)
    # table.initialize( np.random.rand( table.paramdim ) )

    agent = createAgent(table)
    experiment = Experiment(task, agent)
    experiment = ProcessExperiment(experiment, experimentIteration)
    return experiment
class RL:
    def __init__(self):
        self.av_table = ActionValueTable(2, 3)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        kippo.core.constants.rl_params = self.av_table.params.reshape(2, 3)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()
def testValueBased(self):
    """ Test value-based learner. """
    mkt = SmartMarket(self.case)
    exp = MarketExperiment([], [], mkt)
    for g in self.case.generators:
        env = DiscreteMarketEnvironment([g], mkt)
        dim_state, num_actions = (10, 10)
        exp.tasks.append(ProfitTask(env, dim_state, num_actions))
        module = ActionValueTable(dim_state, num_actions)
        module.initialize(1.0)
        # module = ActionValueNetwork(dimState=1, numActions=4)
        learner = SARSA()  # Q() QLambda()
        # learner.explorer = BoltzmannExplorer()  # default is e-greedy
        exp.agents.append(LearningAgent(module, learner))

    for _ in range(1000):
        exp.doInteractions(24)  # interact with the env in batch mode
        for agent in exp.agents:
            agent.learn()
            agent.reset()
def __init__(self):
    self.interactionscount = 0

    # define action-value table
    controller = ActionValueTable(DerivedConstants.NUM_STATES,
                                  DerivedConstants.NUM_ACTIONS)
    controller.initialize(INITIAL_ACTION_VALUE_TABLE_VALUE)

    # define Q-learning agent
    learner = Q(ALPHA, GAMMA)
    learner._setExplorer(EpsilonGreedyExplorer(EPSILON))
    self.agent = LearningAgent(controller, learner)

    # define the environment
    self.environment = BeaverEnv()

    # define the task
    self.task = BeaverTask(self.environment)

    # finally, define experiment
    self.experiment = Experiment(self.task, self.agent)
def explore_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list("!!!!!!!!!!"),
        list("! ! ! ! !"),
        list("! !! ! ! !"),
        list("! ! !"),
        list("! !!!!!! !"),
        list("! ! ! !"),
        list("! ! !!!! !"),
        list("! !"),
        list("! !!!!! !"),
        list("! ! !"),
        list("!!!!!!!!!!"),
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right,
    # so flip the matrix upside down to match the NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print("Maze map:")
    print("\n".join("".join(row) for row in maze))
    print("Greedy policy:")
    print("\n".join("".join(row) for row in greedy_policy))
    assert "\n".join("".join(row) for row in greedy_policy) == "NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN"
def createExperimentInstance():
    gymRawEnv = gym.make('FrozenLake-v0')

    transformation = EnvTransformation()

    task = GymTask.createTask(gymRawEnv)
    env = task.env
    env.setTransformation(transformation)
    ## env.setCumulativeRewardMode()

    # create value table and initialize with zeros
    table = ActionValueTable(gymRawEnv.observation_space.n,
                             gymRawEnv.action_space.n)
    table.initialize(0.0)
    # table.initialize( np.random.rand( table.paramdim ) )

    agent = createAgent(table)
    experiment = Experiment(task, agent)
    iterator = ExperimentIteration()
    quality = QualityFunctor()
    experiment = ProcessExperiment(experiment, iterator, quality)
    return experiment
def runMainProg():
    # define action value table
    av_table = ActionValueTable(32, 2)
    av_table.initialize(0.)
    for i in range(0, 32):
        print "The AV Value At ", i, " is: ", av_table.getActionValues(i)

    # define Q-learning agent
    learner = Q(0.5, 0.0)
    learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, learner)

    # define a blackjack deck
    theDeck = BlackjackCardDeck()

    # define the environment
    env = BlackjackEnv(theDeck)
    env.createHand()

    # define a dealer
    dealer = BlackjackDealer(theDeck)

    # define the task
    task = BlackjackTask(env)

    # define the experiment
    experiment = Experiment(task, agent)

    # run the game
    for i in range(0, 10000):
        playGame(dealer, task, env, experiment, agent)

    print "Games Agent Won: ", GamesAgentWon
    print "Games Dealer Won: ", GamesDealerWon
    print "Games Tied: ", GamesTied
    print "Total Games Played: ", TotalGames
    for i in range(0, 32):
        print "The AV Value At ", i, " is: ", av_table.getActionValues(i)
def run():
    """
    Number of states: current value 0-20.
    Number of actions: Stand=0, Hit=1.
    """
    # define action-value table
    av_table = ActionValueTable(MAX_VAL, MIN_VAL)
    av_table.initialize(0.)

    # define Q-learning agent
    q_learner = Q(Q_ALPHA, Q_GAMMA)
    q_learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, q_learner)

    # define the environment
    env = BlackjackEnv()

    # define the task
    task = BlackjackTask(env, verbosity=VERBOSE)

    # finally, define experiment
    experiment = Experiment(task, agent)

    # ready to go, start the process
    for _ in range(NB_ITERATION):
        experiment.doInteractions(1)
        if task.lastreward != 0:
            if VERBOSE:
                print "Agent learn"
            agent.learn()

    print '|First State|Choice 0 (Stand)|Choice 1 (Hit)|Relative value of Standing over Hitting|'
    print '|:-------:|:-------|:-----|:-----|'
    for i in range(MAX_VAL):
        print '| %s | %s | %s | %s |' % (
            (i + 1),
            av_table.getActionValues(i)[0],
            av_table.getActionValues(i)[1],
            av_table.getActionValues(i)[0] - av_table.getActionValues(i)[1])
def initData(targetPlatform):
    global numActions, numStates, table, trainDataFile, tableFile

    if targetPlatform not in supportedPlatforms:
        sys.stderr.write("------------------------------------------\n")
        sys.stderr.write("ERROR: target platform '%s' not supported by RL in training set\n"
                         % (targetPlatform))
        sys.stderr.write("------------------------------------------\n")
        # sys.stderr.write("\n\n%s\n\n" % targetPlatform)

    defaultTarget = "maxj"
    if targetPlatform == "none":
        targetPlatform = defaultTarget

    replaceStr = "_%s.txt" % (targetPlatform)
    trainDataFile = trainDataFile.replace(".txt", replaceStr)
    tableFile = tableFile.replace(".txt", replaceStr)

    readTrainData(trainDataPath + trainDataFile)
    readActionValueTable(trainDataPath + tableFile)

    numActions = transitionTable.shape[1]
    numStates = transitionTable.shape[0]

    # create value table and initialize it from the stored action-value table
    table = ActionValueTable(numStates, numActions)
    # print(actionValueTable)
    for i in range(transitionTable.shape[0]):
        for j in range(transitionTable.shape[1]):
            table._params[i * transitionTable.shape[1] + j] = \
                actionValueTable[i * transitionTable.shape[1] + j]
def __init__(self):
    ActionValueTable.__init__(self, const.STATES, const.POSSIBLE_ACTIONS)
from pybrain.rl.experiments import Experiment
from pybrain.rl.explorers import EpsilonGreedyExplorer
from pacmanTask import PacmanTask
from pacmanAgent import PacmanAgent
from runPacman import RunPacman
from ghost import Ghost
from pacmanEnvironment import Environment

###############################################################
# The main function that begins running our Pacman-In-AI game #
###############################################################
if __name__ == "__main__":

    # Initialize our Action-Environment-Reward Table
    controller = ActionValueTable(196, 4)
    controller.initialize(0.)

    # Initialize Reinforcement Learning
    learner = Q(0.5, 0.0)
    learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    # Setup the PyBrain and PyGame Environments
    environment = Environment()
    game = RunPacman(environment)

    # Create the Task for the Pac-Man Agent to Accomplish and initialize the first Action
    task = PacmanTask(environment, game)
    task.performAction(np.array([1]))
pylab.ion()
pylab.hot()
pylab.show()

with CurrentController(3) as control:
    environment = ControllerEnvironment(control)
    task = MdpRedCubeTask(environment, False)
    control.cubes_x = 2
    control.cubes_y = 3
    control.cubes_size = 4
    task.max_samples = 500
    actions = len(environment.actions)

    actionValueNetwork = ActionValueTable(task.outdim, task.indim)
    actionValueNetwork.stdParams = 0.0001
    actionValueNetwork.randomize()
    # actionValueNetwork = ActionValueNetwork(task.outdim, task.indim)
    # if os.path.isfile("q/q_train.npy"):
    #     actionValueNetwork.param = np.load("q/q_train.npy")
    # else:
    actionValueNetwork.initialize(0.0001)
    # if os.path.isfile("nfq.xml"):
    actionValueNetwork.network = NetworkReader.readFrom('nfq.xml')

    pylab.pcolor(actionValueNetwork.params.reshape(32, actions).max(1).reshape(8, 4).T)
    pylab.pause(0.01)

    learner = Q()
    agent = LearningAgent(actionValueNetwork, learner)
    experiment = Experiment(task, agent)

    start = time()
def getObservation(self):
    return self.env.getSensors()


if __name__ == "__main__":
    # testing the environment and task
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.learners import Q
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.explorers import EpsilonGreedyExplorer

    env = Chain()
    controller = ActionValueTable(env.outdim, env.indim)
    controller.initialize(1.)
    # controller.initialize(0.)

    # learner = Q(0.5, 0.8)  # alpha 0.5, gamma 0.8
    learner = Q()  # default alpha 0.5, gamma 0.99
    # learner._setExplorer(EpsilonGreedyExplorer(0.5))
    agent = LearningAgent(controller, learner)

    task = ChainTask(env)
    exp = Experiment(task, agent)

    reward = 0
    xs = []
    ys = []
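The snippet breaks off right after the bookkeeping lists are created. A plausible continuation, not from the source: run interaction batches and record the table's best value for the first chain state as a crude learning curve (the batch and episode counts are arbitrary).

    for episode in range(100):
        exp.doInteractions(10)   # act in the chain environment
        agent.learn()            # Q-update from the collected transitions
        agent.reset()            # clear the agent's history before the next batch
        xs.append(episode)
        ys.append(max(controller.getActionValues(0)))  # best Q-value in state 0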
def run(learning_rounds, test_rounds, player1_learn_file, player2_learn_file,
        player1_test_file, player2_test_file, alpha, gamma, epsilon, logs,
        interactive_test):
    """ Runs a learning process with the given parameters, then tests the
        agent's performance by playing the given number of test games and
        returns the percentage of games won.
    """
    # define the environment
    env = CowboyEnv(player1_learn_file, player2_learn_file,
                    player1_test_file, player2_test_file)

    # define the task
    task = CowboyTask(env)

    av_table = ActionValueTable(env.outdim, env.indim)
    av_table.initialize(0.)

    # define Q-learning agent
    learner = Q(alpha, gamma)
    learner._setExplorer(EpsilonGreedyExplorer(epsilon))
    agent = LearningAgent(av_table, learner)

    # finally, define experiment
    experiment = Experiment(task, agent)

    def play_one_game(learn):
        """ Orders the agent to play a single game and learn from it.
            Returns the number of rounds played.
        """
        # do interactions until the game finishes
        rounds_played = 0
        while not env.game_finished():
            experiment.doInteractions(1)
            if learn:
                agent.learn()
                agent.reset()
            rounds_played += 1
        env.reset()
        return rounds_played

    env.toggle_logs(False)

    # learn for the given number of rounds
    round_counter = 0
    while round_counter < learning_rounds:
        round_counter += play_one_game(True)
        if logs:
            sys.stdout.write("Learning progress: %d%% \r"
                             % (round_counter * 100.0 / learning_rounds))
            sys.stdout.flush()

    # test for the given number of rounds
    env.toggle_test(True)
    round_counter = 0
    game_counter = 0
    score = 0
    if interactive_test:
        env.toggle_logs(True)
    while round_counter < test_rounds:
        round_counter += play_one_game(False)
        game_counter += 1
        score += env.agent_score()
        if interactive_test:
            print("Testing progress: %d%%" % (round_counter * 100.0 / learning_rounds))
            raw_input('Score: {0} ->'.format(score))
        elif logs:
            if learning_rounds > 0:
                sys.stdout.write("Testing progress: %d%% \r"
                                 % (round_counter * 100.0 / learning_rounds))
                sys.stdout.flush()
    if logs:
        sys.stdout.write("                              \r")
        sys.stdout.flush()

    return score * 100.0 / game_counter
def table_print(table, nstates):
    print '\n'.join(
        str(get_color(i, nstates)) + str(a)
        for i, a in enumerate(np.array_split(table, nstates))
    )


################################################################################
### main

if __name__ == '__main__':
    world = WorldInteraction()

    predTable = ActionValueTable(
        PredatorInteraction.NSTATES,
        len(PredatorInteraction.ACTIONS)
    )
    predTable.initialize(0.)
    predLearner = Q(ALPHA, GAMMA)
    predLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
    predAgent = LearningAgent(predTable, predLearner)
    predEnv = PredatorEnvironment(world)
    predTask = PredatorTask(predEnv)
    predExp = Experiment(predTask, predAgent)

    try:
        for t in xrange(MAX_TIME):
            print 't = %d' % t
            world.t = t
def table_print(table, nstates):
    print '\n'.join(
        str(get_color(i, nstates)) + str(a)
        for i, a in enumerate(np.array_split(table, nstates))
    )


################################################################################
### main

if __name__ == '__main__':
    world = WorldInteraction()

    predTable = ActionValueTable(
        PredatorInteraction.NSTATES,
        len(PredatorInteraction.ACTIONS)
    )
    predTable.initialize(0.)
    predLearner = Q(ALPHA, GAMMA)
    predLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
    predAgent = LearningAgent(predTable, predLearner)
    predEnv = PredatorEnvironment(world)
    predTask = PredatorTask(predEnv)
    predExp = Experiment(predTask, predAgent)

    mimicTable = ActionValueTable(
        MimicryPreyInteraction.NSTATES,
        len(MimicryPreyInteraction.ACTIONS)
    )
def Py_Brain():
    ############################
    # pybrain
    ############################
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg
    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task
    import pylab

    # pylab.gray()
    # pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])
    num_states = int(structure.shape[0] * structure.shape[1])
    SQRT = int(math.sqrt(num_states))

    # print structure.item((1, 3))
    # environment = Maze(structure, (7, 7))  # second parameter is the goal field tuple
    environment = Maze(structure, (1, 3))    # second parameter is the goal field tuple
    print type(environment)
    print environment

    # The standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4)  # [N, S, E, W]
    controller.initialize(1)
    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    # while True:
    for x in range(4):
        print x
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()
        pylab.pcolor(controller.params.reshape(num_states, 4).max(1).reshape(SQRT, SQRT))
        pylab.draw()
        # pylab.show()

    name = 'MAZE'
    plt.savefig(str(name) + '_PLOT.png')
    plt.close()
a learner, which updates the controller parameters according to the interaction it had with the world, and an explorer, which adds some explorative behaviour to the actions. All standard agents already have a default explorer, so we don't need to take care of that in this tutorial.

The controller in PyBrain is a module that takes states as input and transforms them into actions. For value-based methods, like the Q-learning algorithm we will use here, we need a module that implements the ActionValueInterface. There are currently two modules in PyBrain that do this: the ActionValueTable for discrete actions and the ActionValueNetwork for continuous actions. Our maze uses discrete actions, so we need a table:

"""
controller = ActionValueTable(81, 4)
controller.initialize(1.)
"""

The table needs the number of states and actions as parameters. The standard maze environment comes with the following 4 actions: north, east, south, west. Then we initialize the table with 1 everywhere. This is not always necessary but will help it converge faster, because unvisited state-action pairs have a promising positive value and will be preferred over visited ones that didn't lead to the goal.

Each agent also has a learner component. Several classes of RL learners are currently implemented in PyBrain: black-box optimizers, direct search methods, and value-based learners. Classical reinforcement learning mostly consists of value-based learning, of which one of the most well-known algorithms is Q-learning. Let's now create
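The passage breaks off at this point; in the PyBrain tutorial flow it follows, the next step is constructing the learner and wrapping both into an agent. A minimal continuation consistent with the surrounding snippets:

"""
learner = Q()
agent = LearningAgent(controller, learner)
"""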
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q
from pybrain.rl.experiments import Experiment

# create a 2D two-room gridworld
structure = array([[1, 1, 1, 1, 1, 1, 1],
                   [1, 0, 0, 1, 0, 0, 1],
                   [1, 0, 0, 0, 0, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1]])

# initialize an agent doing Q-learning
controller = ActionValueTable(49, 4)
controller.initialize(0.)
learner = Q()
agent = LearningAgent(controller, learner)

while True:
    # place a random goal for each walk
    [i, j] = structure.shape
    goal = (randint(0, i - 1), randint(0, j - 1))
    # place the goal in a field which is not a wall
    while structure[goal] != 0:
        goal = (randint(0, i - 1), randint(0, j - 1))
                      [1, 0, 0, 1, 0, 0, 0, 0, 1],
                      [1, 0, 0, 1, 0, 0, 1, 0, 1],
                      [1, 0, 0, 1, 0, 0, 1, 0, 1],
                      [1, 0, 0, 1, 0, 1, 1, 0, 1],
                      [1, 0, 0, 0, 0, 0, 1, 0, 1],
                      [1, 1, 1, 1, 1, 1, 1, 0, 1],
                      [1, 0, 0, 0, 0, 0, 0, 0, 1],
                      [1, 1, 1, 1, 1, 1, 1, 1, 1]])
env = Maze(envmatrix, (7, 7))

# create task
task = MDPMazeTask(env)

# create value table and initialize with ones
table = ActionValueTable(81, 4)
table.initialize(1.)

# create agent with controller and learner - use SARSA(), Q() or QLambda() here
learner = SARSA()
# standard exploration is e-greedy, but a different type can be chosen as well
# learner.explorer = BoltzmannExplorer()

# create agent
agent = LearningAgent(table, learner)

# create experiment
experiment = Experiment(task, agent)

# prepare plotting
explorative behaviour to the actions. All standard agents already have a default explorer, so we don't need to take care of that in this tutorial.

The controller in PyBrain is a module that takes states as input and transforms them into actions. For value-based methods, like the Q-learning algorithm we will use here, we need a module that implements the ActionValueInterface. There are currently two modules in PyBrain that do this: the ActionValueTable for discrete actions and the ActionValueNetwork for continuous actions. Our maze uses discrete actions, so we need a table (note: I will need to use a continuous-action network).

"""
controller = ActionValueTable(16, 3)
controller.initialize(0.0020)
"""

The table needs the number of states and actions as parameters. The standard market environment comes with the following 3 actions: long, short and wait. Then we initialize the table with the minimum gap everywhere. This is not always necessary but will help it converge faster, because unvisited state-action pairs have a promising positive value and will be preferred over visited ones that didn't lead to the goal.

Each agent also has a learner component. Several classes of RL learners are currently implemented in PyBrain: black-box optimizers, direct search methods, and value-based learners. Classical reinforcement learning mostly consists of value-based learning, of which one of the most well-known algorithms is Q-learning.
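As in the maze version of this passage, the natural next step is to attach a learner and agent to the controller above. A sketch consistent with the other snippets in this section (the exploration epsilon is a placeholder, not a value from the source):

"""
learner = Q()
learner._setExplorer(EpsilonGreedyExplorer(0.1))
agent = LearningAgent(controller, learner)
"""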
pylab.gray()
pylab.ion()

structure = array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                   [1, 0, 0, 1, 0, 0, 0, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 1, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 0, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 1, 1]])
environment = Maze(structure, (7, 7))

controller = ActionValueTable(81, 4)
controller.initialize(1.)

learner = Q()
agent = LearningAgent(controller, learner)

task = MDPMazeTask(environment)
experiment = Experiment(task, agent)

experiment.doInteractions(100)
agent.learn()
agent.reset()

pylab.pcolor(controller.params.reshape(81, 4).max(1).reshape(9, 9))
pylab.draw()
""" Reinforcement Learning to learn xor function """ # generic import import numpy as np import random # pybrain import from pybrain import SigmoidLayer, LinearLayer from pybrain.rl.explorers import EpsilonGreedyExplorer from pybrain.rl.agents import LearningAgent from pybrain.rl.learners import Q from pybrain.rl.learners.valuebased import ActionValueTable # The parameters of your algorithm av_table = ActionValueTable(4, 2) av_table.initialize(0.) # For Action Value Table learner = Q(0.5, 0.0) # define Q-learning agent learner._setExplorer(EpsilonGreedyExplorer(0.0)) agent = LearningAgent(av_table, learner) for x in xrange(1,100): # The training listxor = random.choice([[0, 0],[0, 1], [1, 0], [1, 1]]) qstate = listxor[0] + listxor[1]*2 resultxor = listxor[0]^listxor[1] agent.integrateObservation([qstate]) action = agent.getAction()
from blackjackenv import BlackjackEnv
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q
from pybrain.rl.experiments import Experiment
from pybrain.rl.explorers import EpsilonGreedyExplorer

# define action-value table
# number of states: current value 1-21
# number of actions: Stand=0, Hit=1
av_table = ActionValueTable(21, 2)
av_table.initialize(0.)

# define Q-learning agent
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# define the environment
env = BlackjackEnv()

# define the task
task = BlackjackTask(env)

# finally, define experiment
experiment = Experiment(task, agent)
from pybrain.rl.environments.blackjackenv import BlackjackEnv
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q
from pybrain.rl.experiments import Experiment
from pybrain.rl.explorers import EpsilonGreedyExplorer

# define action-value table
# number of states: current value 1-21
# number of actions: Stand=0, Hit=1
av_table = ActionValueTable(21, 2)
av_table.initialize(0.)

# define Q-learning agent
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# define the environment
env = BlackjackEnv()

# define the task
task = BlackjackTask(env)

# finally, define experiment
experiment = Experiment(task, agent)
# number of actions:
#   3 - the number of action values the environment accepts: Forward, Backward and Snooze

states = 165   # has to match class Env(Environment) - outdim in environment_01.py
actions = 2    # has to match class Env(Environment) - indim in environment_01.py

try:
    # open the action-value table from the .csv file
    arr = np.loadtxt('/home/pi/Desktop/ray_bot/ray_bot2.csv', delimiter=';')
except Exception as e:
    # print e
    # if the file does not exist (i.e. the first run), create and
    # initialize the table with zeros
    arr = np.zeros((states, actions))

av_table = ActionValueTable(states, actions)
av_table.initialize(arr.flatten())

# define Q-learning agent
learner = Q(0.1, 0.5)
learner._setExplorer(EpsilonGreedyExplorer(0.5))
agent = LearningAgent(av_table, learner)

# define the environment
env = Env()

# define the task
task = Task(env)

# define experiment
experiment = Experiment(task, agent)
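The load path above implies a matching save step; a sketch under the assumption that the same file and layout are reused between runs (the path is the one hard-coded above):

# persist the learned action-value table so the next run resumes from it;
# the reshape mirrors the flatten() used when loading
np.savetxt('/home/pi/Desktop/ray_bot/ray_bot2.csv',
           av_table.params.reshape(states, actions), delimiter=';')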
from pybrain.rl.experiments import Experiment
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.rl.environments import Task, EpisodicTask

warnings.filterwarnings("ignore")

# create the maze with walls (1)
envmatrix = array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                   [1, 0, 0, 1, 0, 0, 0, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 0, 1, 0, 1],
                   [1, 0, 0, 1, 0, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 1, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 0, 1],
                   [1, 0, 0, 0, 0, 0, 0, 0, 1],
                   [1, 1, 1, 1, 1, 1, 1, 1, 1]])
env = Maze(envmatrix, (7, 7))

# create task
task = MDPMazeTask(env)

# create value table and initialize with ones
table = ActionValueTable(81, 4)
table.initialize(1.)

# create agent with controller and learner - use SARSA(), Q() or QLambda() here
# learner = Q()
learner = SARSA()
# standard exploration is e-greedy, but a different type can be chosen as well
# learner.explorer = BoltzmannExplorer()

# create agent
agent = LearningAgent(table, learner)

# create experiment
# experiment = Experiment(task, agent)
experiment = EpisodicExperiment(task, agent)

# prepare plotting
pylab.gray()
pylab.ion()

for i in range(50):