def testNet(learner, moduleNet, env, maxPlaneStartDist, stepSize, numAngs, thermRadius):
    # Turn off exploration
    from pybrain.rl.explorers.discrete.egreedy import EpsilonGreedyExplorer
    learner._setExplorer(EpsilonGreedyExplorer(0))
    agent = LearningAgent(moduleNet, learner)

    # Move the plane back to the start by resetting the environment
    env = contEnv.contThermEnvironment(maxPlaneStartDist, stepSize, numAngs, thermRadius)

    from simpleThermalTask import SimpThermTask
    task = SimpThermTask(env)

    from pybrain.rl.experiments import Experiment
    experiment = Experiment(task, agent)

    # Have the plane move 100 times, and plot the position of the plane
    # (hopefully it moves to the high reward area)
    testIter = 100
    trainResults = [env.distPlane()]
    for i in range(testIter):
        experiment.doInteractions(1)
        trainResults.append(env.distPlane())

    # Plot the test results
    import matplotlib.pyplot as plt
    plt.figure(1)
    plt.plot(trainResults, 'o')
    plt.ylabel('Distance from center of thermal')
    plt.xlabel('Interaction iteration')
    plt.title('Test Results for Neural Fitted Q Learner')
    plt.show()
def initExperiment(alg, optimistic=True):
    env = Maze(envmatrix, (7, 7))

    # create task
    task = MDPMazeTask(env)

    # create value table and initialize with ones
    table = ActionValueTable(81, 4)
    if optimistic:
        table.initialize(1.)
    else:
        table.initialize(0.)

    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    learner = alg()

    # standard exploration is e-greedy, but a different type can be chosen as well
    # learner.explorer = BoltzmannExplorer()
    agent = LearningAgent(table, learner)
    agent.batchMode = False

    experiment = Experiment(task, agent)
    experiment.allRewards = []
    return experiment
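# A minimal usage sketch for initExperiment() above. The run loop itself is
# an assumption (not part of the original snippet); it mirrors the standard
# PyBrain pattern used throughout this collection: interact, learn, reset.
from pybrain.rl.learners import SARSA

exp = initExperiment(SARSA, optimistic=True)
for _ in range(100):
    exp.doInteractions(100)
    exp.agent.learn()
    exp.agent.reset()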
def __init__(self, task, agent):
    Experiment.__init__(self, task, agent)
    agent.learner.explorer.experiment = self
    # agent.learner.module.getValue()
    self.screen = pygame.display.set_mode(((xsize + 2) * MAGNIFY, (ysize + 2) * MAGNIFY))
    pygame.display.set_caption('Policy Visualizer')
    self.clock = pygame.time.Clock()
    self.screenBuffer = pygame.Surface(self.screen.get_size())
    self.screenBuffer = self.screenBuffer.convert()
    self.screenBuffer.fill((64, 64, 64))  # Dark Gray
    self.bombImage = pygame.image.load("bomb_image.png")
    self.bombImage = pygame.transform.scale(self.bombImage, (MAGNIFY - 2, MAGNIFY - 2))
    self.isPaused = False
    self.isCrashed = False
    self.speed = 10
    self.num = 0
    self.robotXA = -1
    self.robotYA = -1
    self.bomb_counter = 0
    self.count = 0
    self.acc_reward = 0
    self.collect_data = False
    if collect_data_file is not None:
        self.collect_data = True
        self.collect_episode_data_file = open(collect_data_file + "_episodelen.data", "w")
        self.collect_reward_data_file = open(collect_data_file + "_avg_reward.data", "w")
def run_bbox(verbose=False):
    n_features = n_actions = max_time = -1

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()

    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print av_table._params

    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)

    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()

    bbox.finish(verbose=1)
def testMaze():
    # simplified version of the reinforcement learning tutorial example
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 0, 0, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1]])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)
    for i in range(3):
        experiment.doInteractions(40)
    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
def learn(self, number_of_iterations):
    learner = Q(0.2, 0.8)
    task = CartMovingTask(self.environment)
    self.controller = ActionValueTable(
        reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)),
        self.force_granularity
    )
    self.controller.initialize(1.0)
    agent = LearningAgent(self.controller, learner)
    experiment = Experiment(task, agent)
    for i in range(number_of_iterations):
        experiment.doInteractions(1)
        agent.learn()
        agent.reset()
    with open("test.pcl", "w+") as f:
        pickle.dump(self.controller, f)
def initExperiment(learnalg='Q', history=None, binEdges='10s',
                   scriptfile='./rlRunExperiment_v2.pl',
                   resetscript='./rlResetExperiment.pl'):
    if binEdges == '10s':
        centerBinEdges = centerBinEdges_10s
    elif binEdges == '30s':
        centerBinEdges = centerBinEdges_30s
    elif binEdges == 'lessperturbed':
        centerBinEdges = centerBinEdges_10s_lessperturbed
    elif binEdges is None:
        centerBinEdges = None
    else:
        raise Exception("No bins for given binEdges setting")

    env = OmnetEnvironment(centerBinEdges, scriptfile, resetscript)
    if history is not None:
        env.data = history['data']

    task = OmnetTask(env, centerBinEdges)
    if history is not None:
        task.allrewards = history['rewards']

    if learnalg == 'Q':
        nstates = env.numSensorBins ** env.numSensors
        if history is None:
            av_table = ActionValueTable(nstates, env.numActions)
            av_table.initialize(1.)
        else:
            av_table = history['av_table']
        learner = Q(0.1, 0.9)  # alpha, gamma
        learner._setExplorer(EpsilonGreedyExplorer(0.05))  # epsilon
    elif learnalg == 'NFQ':
        av_table = ActionValueNetwork(env.numSensors, env.numActions)
        learner = NFQ()
    else:
        raise Exception("learnalg unknown")

    agent = LearningAgent(av_table, learner)

    experiment = Experiment(task, agent)
    if history is None:
        experiment.nruns = 0
    else:
        experiment.nruns = history['nruns']
    return experiment
def setup_RL():
    # create the maze with walls (1)
    envmatrix = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    env = Maze(envmatrix, (7, 7))
    # create task
    task = MDPMazeTask(env)
    # create value table and initialize with zeros
    table = ActionValueTable(81, 4)
    table.initialize(0.)
    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    # learner = Q()
    learner = SARSA()
    # create agent
    agent = LearningAgent(table, learner)
    # create experiment
    experiment = Experiment(task, agent)
    return experiment, agent, table
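# Hedged usage sketch for setup_RL(). The training loop itself is not part of
# the original snippet; it follows the same batch pattern as the other maze
# examples in this collection.
experiment, agent, table = setup_RL()
for _ in range(100):
    experiment.doInteractions(100)
    agent.learn()
    agent.reset()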
def createExperimentInstance():
    gymRawEnv = gym.make('MountainCarContinuous-v0')

    cartPositionGroup = Digitizer.buildBins(-1.2, 0.6, 16)
    cartVelocityGroup = Digitizer.buildBins(-0.07, 0.07, 4)
    actionDedigitizer = Digitizer.build(-1.0, 1.0, 5, True)

    # print("Cart position bins:", cartPositionGroup)
    # print("Cart velocity bins:", cartVelocityGroup)
    # print("Cart force bins:", actionDedigitizer.bins, actionDedigitizer.possibleValues())

    observationDigitizer = ArrayDigitizer([cartPositionGroup, cartVelocityGroup])
    transformation = EnvTransformation(observationDigitizer, actionDedigitizer)

    task = GymTask.createTask(gymRawEnv)
    env = task.env
    env.setTransformation(transformation)
    # env.setCumulativeRewardMode()

    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    ## alpha -- learning rate (preference of new information)
    ## gamma -- discount factor (importance of future reward)

    # create value table and initialize with zeros
    table = ActionValueTable(observationDigitizer.states, actionDedigitizer.states)
    table.initialize(0.0)
    # table.initialize( np.random.rand( table.paramdim ) )

    agent = createAgent(table)
    experiment = Experiment(task, agent)
    experiment = ProcessExperiment(experiment, doSingleExperiment)
    return experiment
def createExperimentInstance():
    gymRawEnv = gym.make('MountainCar-v0')

    cartPositionGroup = Digitizer.buildBins(-1.2, 0.6, 16)
    cartVelocityGroup = Digitizer.buildBins(-0.07, 0.07, 16)

    # print("Cart position bins:", cartPositionGroup)
    # print("Cart velocity bins:", cartVelocityGroup)

    observationDigitizer = ArrayDigitizer([cartPositionGroup, cartVelocityGroup])
    transformation = EnvTransformation(observationDigitizer)

    task = GymTask.createTask(gymRawEnv)
    env = task.env
    env.setTransformation(transformation)
    # env.setCumulativeRewardMode()

    # create value table and initialize with zeros
    table = ActionValueTable(observationDigitizer.states, env.numActions)
    table.initialize(0.0)
    # table.initialize( np.random.rand( table.paramdim ) )

    agent = createAgent(table)
    experiment = Experiment(task, agent)
    experiment = ProcessExperiment(experiment, ExperimentIteration())
    return experiment
def maze():
    # import sys, time
    pylab.gray()
    pylab.ion()
    # The goal appears to be in the upper right
    structure = [
        "!!!!!!!!!!",
        "! ! ! ! !",
        "! !! ! ! !",
        "! ! !",
        "! !!!!!! !",
        "! ! ! !",
        "! ! !!!! !",
        "! !",
        "! !!!!! !",
        "! ! !",
        "!!!!!!!!!!",
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)
    for i in range(100):
        experiment.doInteractions(100)
        agent.learn()
        agent.reset()
        # 4 actions, one state per maze cell
        # max(1) gives/plots the biggest objective function value for that square
        # (the original hard-coded reshape(81, 4) and reshape(9, 9) here, which
        # does not match this maze's dimensions; shape.prod() and *shape do)
        pylab.pcolor(controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape))
        pylab.draw()
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print("Maze map:")
    print("\n".join("".join(row) for row in maze))
    print("Greedy policy:")
    print("\n".join("".join(row) for row in greedy_policy))
def main():
    rospy.init_node("lauron_reinforcement_learning")
    environment = RLEnvironment()
    dim_state = environment.joint_states.shape[0]
    num_actions = len(environment.actions)
    controller = ActionValueNetwork(dim_state, num_actions)
    learner = SARSA()
    agent = LearningAgent(controller, learner)
    task = RLTask(environment)
    experiment = Experiment(task, agent)

    episode_counter = 0
    while True:
        print("Training episode {}".format(episode_counter))
        experiment.doInteractions(NUM_INTERACTIONS)
        agent.learn()
        agent.reset()
        episode_counter += 1
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list('!!!!!!!!!!'),
        list('! ! ! ! !'),
        list('! !! ! ! !'),
        list('! ! !'),
        list('! !!!!!! !'),
        list('! ! ! !'),
        list('! ! !!!! !'),
        list('! !'),
        list('! !!!!! !'),
        list('! ! !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)
    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()
    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
def run():
    """
    number of states is:
        current value: 0-20
    number of actions:
        Stand=0, Hit=1
    """
    # define action value table
    av_table = ActionValueTable(MAX_VAL, MIN_VAL)
    av_table.initialize(0.)

    # define Q-learning agent
    q_learner = Q(Q_ALPHA, Q_GAMMA)
    q_learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, q_learner)

    # define the environment
    env = BlackjackEnv()

    # define the task
    task = BlackjackTask(env, verbosity=VERBOSE)

    # finally, define experiment
    experiment = Experiment(task, agent)

    # ready to go, start the process
    for _ in range(NB_ITERATION):
        experiment.doInteractions(1)
        if task.lastreward != 0:
            if VERBOSE:
                print "Agent learn"
            agent.learn()

    print '|First State|Choice 0 (Stand)|Choice 1 (Hit)|Relative value of Standing over Hitting|'
    print '|:-------:|:-------|:-----|:-----|'
    for i in range(MAX_VAL):
        print '| %s | %s | %s | %s |' % (
            (i + 1),
            av_table.getActionValues(i)[0],
            av_table.getActionValues(i)[1],
            av_table.getActionValues(i)[0] - av_table.getActionValues(i)[1]
        )
class RL:
    def __init__(self):
        self.av_table = ActionValueTable(4, 5)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        rassh.core.constants.rl_params = self.av_table.params.reshape(4, 5)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()
class RL:
    def __init__(self):
        self.av_table = ActionValueTable(2, 3)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        kippo.core.constants.rl_params = self.av_table.params.reshape(2, 3)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()
def _oneInteraction(self):
    resetInThisRound = False
    old = (self.XA, self.switch_state)
    (self.XA, self.switch_state) = reverseStateMapper[level.state]
    payoff = stateToRewardMapper[level.state]
    self.acc_reward += payoff * 10
    if self.collect_data:
        self.count += 1
        if payoff > 0:
            self.collect_episode_data_file.write(str(self.count) + "\n")
            self.count = 0
        if self.stepid % interval == 0:
            self.collect_reward_data_file.write(str(self.acc_reward / float(interval)) + "\n")
            self.acc_reward = 0
    if self.stepid % 100000 == 0:
        pass
    if self.stepid % interval == 0:
        sys.stdout.write("\033[K")
        sys.stdout.write(
            "[{2}{3}] ({0}/{1}) | alpha = {4} | epsilon = {5}\n".format(
                self.stepid, MAX_STEPS,
                '#' * int(math.floor(self.stepid / float(MAX_STEPS) * 20)),
                ' ' * int((20 - math.floor(self.stepid / float(MAX_STEPS) * 20))),
                learner.alpha, learner.explorer.exploration))
        sys.stdout.write("\033[F")
    if self.stepid >= MAX_STEPS:
        print("\nSimulation done!")
        sys.exit()
    if payoff > 0:
        # episode done
        if save_file is not None:
            controller.params.reshape(controller.numRows,
                                      controller.numColumns).tofile(save_file)
    learner.alpha *= 0.999999
    learner.explorer.exploration *= 0.999999
    if level.state == errorState:
        level.reset()
        self.isCrashed = False
    if not self.isPaused:
        return Experiment._oneInteraction(self)
    else:
        return self.stepid
def __init__(self, task, agent):
    Experiment.__init__(self, task, agent)
    agent.learner.explorer.experiment = self
    self.isPaused = False
    self.isCrashed = False
    self.speed = 10
    self.num = 0
    self.XA = 50
    self.switch_state = 1
    self.count = 0
    self.acc_reward = 0
    self.collect_data = False
    if collect_data_file is not None:
        self.collect_data = True
        self.collect_episode_data_file = open(collect_data_file + "_episodelen.data", "w")
        self.collect_reward_data_file = open(collect_data_file + "_avg_reward.data", "w")
def explore_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list("!!!!!!!!!!"),
        list("! ! ! ! !"),
        list("! !! ! ! !"),
        list("! ! !"),
        list("! !!!!!! !"),
        list("! ! ! !"),
        list("! ! !!!! !"),
        list("! !"),
        list("! !!!!! !"),
        list("! ! !"),
        list("!!!!!!!!!!"),
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)
    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()
    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print("Maze map:")
    print("\n".join("".join(row) for row in maze))
    print("Greedy policy:")
    print("\n".join("".join(row) for row in greedy_policy))
    assert "\n".join("".join(row) for row in greedy_policy) == "NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN"
class PlayYourCardsRight(Feature):
    def __init__(self, text_to_speech, speech_to_text):
        Feature.__init__(self)

        # setup AV Table
        self.av_table = GameTable(13, 2)
        if not self.av_table.loadParameters():
            self.av_table.initialize(0.)

        # setup a Q-Learning agent
        learner = Q(0.5, 0.0)
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)

        # setup game interaction
        self.game_interaction = GameInteraction(text_to_speech, speech_to_text)

        # setup environment
        environment = GameEnvironment(self.game_interaction)

        # setup task
        task = GameTask(environment, self.game_interaction)

        # setup experiment
        self.experiment = Experiment(task, self.agent)

    @property
    def is_speaking(self):
        return self.game_interaction.is_speaking

    def _thread(self, args):
        # let's play our cards right!
        while not self.is_stop:
            self.experiment.doInteractions(1)
            self.agent.learn()
            self.av_table.saveParameters()
def createExperimentInstance():
    gymRawEnv = gym.make('Taxi-v2')

    transformation = EnvTransformation()

    task = GymTask.createTask(gymRawEnv)
    env = task.env
    env.setTransformation(transformation)
    ## env.setCumulativeRewardMode()

    ## create value table and initialize with zeros
    table = ActionValueTable(env.numStates, env.numActions)
    # table = ActionValueTableWrapper(table)
    table.initialize(0.0)
    # table.initialize( np.random.rand( table.paramdim ) )

    agent = createAgent(table)
    experiment = Experiment(task, agent)
    experiment = ProcessExperiment(experiment, experimentIteration)
    return experiment
def createExperimentInstance():
    gymRawEnv = gym.make('FrozenLake-v0')

    transformation = EnvTransformation()

    task = GymTask.createTask(gymRawEnv)
    env = task.env
    env.setTransformation(transformation)
    ## env.setCumulativeRewardMode()

    # create value table and initialize with zeros
    table = ActionValueTable(gymRawEnv.observation_space.n, gymRawEnv.action_space.n)
    table.initialize(0.0)
    # table.initialize( np.random.rand( table.paramdim ) )

    agent = createAgent(table)
    experiment = Experiment(task, agent)
    iterator = ExperimentIteration()
    quality = QualityFunctor()
    experiment = ProcessExperiment(experiment, iterator, quality)
    return experiment
class RlOp(threading.Thread):
    episodes = 1
    epilen = 200

    def __init__(self, event_queue_name, hub_queue_name):
        super().__init__()

        # create environment
        self.conn = boto.sqs.connect_to_region(constants.REGION)
        self.event_queue = self.conn.get_queue(event_queue_name)
        self.event_queue.set_message_class(MHMessage)
        self.env = DogEnv(DogEnv.ALL_QUIET, DogEnv.ALL_QUIET, self.event_queue, hub_queue_name)
        self.env.delay = (self.episodes == 1)

        # create task
        self.task = QuietDogTask(self.env)

        # create value table and initialize with ones
        # TODO: Get number of states from DogEnv
        self.table = ActionValueTable(2 * 5 * 4, 5 * 4)
        self.table.initialize(1.)

        # create agent with controller and learner - use SARSA(), Q() or QLambda() here
        self.learner = SARSA()
        # standard exploration is e-greedy, but a different type can be chosen as well
        self.learner.explorer = BoltzmannExplorer()

        # create agent
        self.agent = DogAgent(self.table, self.learner)

        # create experiment
        self.experiment = Experiment(self.task, self.agent)

    def run(self):
        self.call_run()

    def call_run(self):
        print('RlOp: running')

        # prepare plotting
        pylab.gray()
        pylab.ion()

        for i in range(1000):
            # interact with the environment (here in batch mode)
            self.experiment.doInteractions(100)
            self.agent.learn()
            self.agent.reset()

            results0 = self.table.params.reshape(2, 4, 5, 20)[0]
            results1 = self.table.params.reshape(2, 4, 5, 20)[1]
            pp.pprint(results0.argmax(2))
            pp.pprint(results1.argmax(2))

            # and draw the table
            #ar = self.table.params.reshape(2, 5, 4, 5, 4)
            #for state1 in range(len(constants.SOUNDS)):
            #    for state2 in range(4):
            #        pylab.pcolor(ar[1][state1][state2])
            #        pylab.draw()

        results0 = self.table.params.reshape(2, 4, 5, 20)[0]
        results1 = self.table.params.reshape(2, 4, 5, 20)[1]
        while True:
            time.sleep(60)
            pp.pprint(results0.argmax(2))
            pp.pprint(results1.argmax(2))
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import Experiment
from pybrain.rl.explorers import EpsilonGreedyExplorer

env = Chain()
controller = ActionValueTable(env.outdim, env.indim)
controller.initialize(1.)
# controller.initialize(0.)

# learner = Q(0.5, 0.8)  # alpha 0.5, gamma 0.8
learner = Q()  # default alpha 0.5, gamma 0.99
# learner._setExplorer(EpsilonGreedyExplorer(0.5))
agent = LearningAgent(controller, learner)

task = ChainTask(env)
exp = Experiment(task, agent)

reward = 0
xs = []
ys = []

import matplotlib.pyplot as plt

for i in xrange(5000):
    exp.doInteractions(1)
    agent.learn()
    reward += agent.lastreward
    if i % 100 == 0:
        xs.append(i)
        ys.append(reward)
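# Hedged completion (not in the original snippet): the xs/ys lists and the
# matplotlib import above suggest the cumulative-reward curve was plotted next.
plt.plot(xs, ys)
plt.xlabel('interactions')
plt.ylabel('cumulative reward')
plt.show()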
from ObjectLocalizerEnvironment import ObjectLocalizerEnvironment
from DeepQNetwork import DeepQNetwork
from DeepQLearning import DeepQLearning
from MDPObjectLocalizerTask import MDPObjectLocalizerTask
from ObjectLocalizationAgent import ObjectLocalizationAgent

print 'Starting Environment'
epsilon = 1.0
environment = ObjectLocalizerEnvironment(config.get('imageDir'),
                                         config.get('candidatesFile'), 'Training')
print 'Initializing DeepQNetwork'
controller = DeepQNetwork()
controller.setEpsilonGreedy(epsilon)
print 'Initializing Q Learner'
learner = DeepQLearning()
print 'Preparing Agent'
agent = ObjectLocalizationAgent(controller, learner)
print 'Configuring Task'
task = MDPObjectLocalizerTask(environment, config.get('groundTruth'))
print 'Setting up Experiment'
experiment = Experiment(task, agent)
i = 0
print 'Main Loop'
while i < config.geti('maximumEpochs'):
    print 'Epoch', i, '(epsilon:{:5.3f})'.format(epsilon)
    experiment.doInteractions(int(config.get('numInteractions')))
    agent.learn()
    agent.reset()
    i += 1
    epsilon = adjustEpsilon(config.geti('maximumEpochs'), i, epsilon)
    controller.setEpsilonGreedy(epsilon)
[0.3, 0.5, 0.2]])

env = BanditEnvironment(payouts, distrib)
task = BanditTask(env)
table = PropensityTable(payouts.shape[0])
table.initialize(500.0)

#learner = RothErev(experimentation=0.55, recency=0.3)
learner = VariantRothErev(experimentation=0.65, recency=0.3)
learner.explorer = BoltzmannExplorer(tau=100.0, decay=0.9995)
agent = LearningAgent(table, learner)
experiment = Experiment(task, agent)

epis = int(1e1)
batch = 2
avgRewards = scipy.zeros(epis)
allActions = scipy.zeros(epis * batch)
c = 0
for i in range(epis):
    experiment.doInteractions(batch)
    avgRewards[i] = scipy.mean(agent.history["reward"])
    allActions[c:c + batch] = agent.history["action"].flatten() + 1
    agent.learn()
    agent.reset()
    c += batch
import numpy

env = HitTheGoalEnv(5)
task = HitTheGoalTask(env, [5, 0, 0])
net = buildNetwork(2, 1, bias=False)

# create agent with controller and learner (and its options)
#agent = OptimizationAgent(net, CMAES())
#agent.learner.setEvaluator(task, agent.module)
agent = LearningAgent(net, Reinforce())
#agent.learner.explorer = EpsilonGreedyExplorer(0.0)
#agent.learner._setExplorer(EpsilonGreedyExplorer(0.0))
#agent.learner.explorer.sigma = [0.1]
#print agent.learner.explorer.sigma
#exit()

experiment = Experiment(task, agent)
itr = 0
#task.performAction(numpy.array([36]))
while True:
    #print itr
    # agent.learner.maxEvaluations += 1
    #agent.learner.learn()
    experiment.doInteractions(50)
    agent.learn()
    agent.reset()
    task.reset()
    # env.reset()
    # itr = itr + 1
table = ActionValueTable(matrix_size, 2)
#table = ActionValueTable(matrix_size, matrix_size)
table.initialize(1.)

# create agent with controller and learner - use SARSA(), Q() or QLambda() here
learner = Q()

# standard exploration is e-greedy, but a different type can be chosen as well
# learner.explorer = BoltzmannExplorer()

# create agent
agent = LearningAgent(table, learner)

# create experiment
experiment = Experiment(task, agent)

# prepare plotting
pylab.gray()
pylab.ion()

#for i in range(100):
while True:
    # interact with the environment (here in batch mode)
    experiment.doInteractions(matrix_size)
    agent.learn()
    agent.reset()

    # and draw the table
    print table.params.reshape(matrix_size, 2)
    #print table.params.reshape(matrix_size, matrix_size)
world = WorldInteraction()

predTable = ActionValueTable(
    PredatorInteraction.NSTATES,
    len(PredatorInteraction.ACTIONS)
)
predTable.initialize(0.)
predLearner = Q(ALPHA, GAMMA)
predLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
predAgent = LearningAgent(predTable, predLearner)
predEnv = PredatorEnvironment(world)
predTask = PredatorTask(predEnv)
predExp = Experiment(predTask, predAgent)

mimicTable = ActionValueTable(
    MimicryPreyInteraction.NSTATES,
    len(MimicryPreyInteraction.ACTIONS)
)
mimicTable.initialize(0.)
mimicLearner = Q(ALPHA, GAMMA)
mimicLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
mimicAgent = LearningAgent(mimicTable, mimicLearner)
mimicEnv = MimicryPreyEnvironment(world)
mimicTask = MimicryPreyTask(mimicEnv)
mimicExp = Experiment(mimicTask, mimicAgent)
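# Hedged sketch (not part of the original snippet): one way to drive both
# experiments on the shared world clock, stepping predator and mimicry prey
# together. A predator-only variant of this loop appears later in this
# collection, where MAX_TIME is defined alongside ALPHA/GAMMA/EPSILON.
for t in xrange(MAX_TIME):
    world.t = t
    predExp.doInteractions(1)
    mimicExp.doInteractions(1)
    predAgent.learn()
    mimicAgent.learn()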
import sys, time
from scipy import *
from pybrain.rl.environments import Task
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.environments.mazes import Maze, MDPMazeTask
from pybrain.rl.experiments import Experiment
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, SARSA

var_structure_arr_ = array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                            [1, 0, 0, 1, 0, 0, 0, 0, 1],
                            [1, 0, 0, 1, 0, 0, 1, 0, 1],
                            [1, 0, 0, 1, 0, 0, 1, 0, 1],
                            [1, 0, 0, 1, 0, 1, 1, 0, 1],
                            [1, 0, 0, 0, 0, 0, 1, 0, 1],
                            [1, 1, 1, 1, 1, 1, 1, 0, 1],
                            [1, 0, 0, 0, 0, 0, 0, 0, 1],
                            [1, 1, 1, 1, 1, 1, 1, 1, 1]])

var_controller_ = ActionValueTable(81, 4)
var_controller_.initialize(1.0)
var_learner_ = Q()
var_Agent_ = LearningAgent(var_controller_, var_learner_)
# build the maze environment from the structure array and hand it to the task
# (the original passed the abstract Task class here, which cannot work)
var_environment_ = Maze(var_structure_arr_, (7, 7))
var_task_ = MDPMazeTask(var_environment_)
experiment = Experiment(var_task_, var_Agent_)
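# Hedged sketch (not in the original): running the experiment with the usual
# PyBrain batch pattern of interactions, learning, and reset.
for _ in range(50):
    experiment.doInteractions(100)
    var_Agent_.learn()
    var_Agent_.reset()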
def _oneInteraction(self):
    global draw
    resetInThisRound = False

    # Process events
    for event in pygame.event.get():
        if event.type == pygame.locals.QUIT or (
                event.type == pygame.locals.KEYDOWN
                and event.key in [pygame.locals.K_ESCAPE, pygame.locals.K_q]):
            return
        if (event.type == pygame.locals.KEYDOWN
                and event.key == pygame.locals.K_SPACE):
            print len(controller.params)
            print controller.params.reshape(controller.numRows, controller.numColumns)
            controller.params.reshape(controller.numRows,
                                      controller.numColumns).tofile("test.table")
            self.isPaused = not self.isPaused
        if (event.type == pygame.locals.KEYDOWN
                and event.key == pygame.locals.K_r):
            resetInThisRound = True
        if (event.type == pygame.locals.KEYDOWN
                and event.key == pygame.locals.K_PLUS):
            self.speed += 1
        if (event.type == pygame.locals.KEYDOWN
                and event.key == pygame.locals.K_MINUS):
            self.speed = max(self.speed - 1, 1)
        if (event.type == pygame.locals.KEYDOWN
                and event.key == pygame.locals.K_d):
            draw = not draw

    # if self.isCrashed:
    #     self.isCrashed = False
    #     level.reset()

    # Update
    if resetInThisRound:
        print "reset"
        level.reset()
    old = (self.robotXA, self.robotYA)
    (self.robotXA, self.robotYA, csf, payoff) = reverseStateMapper[level.state]
    if not self.isCrashed and enemies_enabled:
        enemy_handler.update(old)
        for e in enemy_handler.getEnemyPositions():
            if (self.robotXA, self.robotYA) == e:
                self.isCrashed = True
                level.penalty += 1
                self.acc_reward -= 1
                if shield_options > 0 and not args.huge_neg_reward:
                    print "Shields are not allowed to make errors!"
                    exit()
                break
    if (self.robotXA + 1, self.robotYA + 1) in bombs:
        self.bomb_counter += 1
        if self.bomb_counter == 4:
            self.isCrashed = True
            level.penalty += 1
            self.acc_reward -= 1
            if shield_options > 0 and not args.huge_neg_reward:
                print "Shields are not allowed to make errors!"
                exit()
    else:
        self.bomb_counter = 0

    if draw:
        q_max = 0
        for state in range(len(reverseStateMapper) - 1):
            q_max = max(q_max, max(controller.getActionValues(state)))

        # Draw Field
        for x in xrange(0, xsize):
            for y in xrange(0, ysize):
                paletteColor = imageData[y * xsize + x]
                color = palette[paletteColor * 3:paletteColor * 3 + 3]
                pygame.draw.rect(self.screenBuffer, color,
                                 ((x + 1) * MAGNIFY, (y + 1) * MAGNIFY, MAGNIFY, MAGNIFY), 0)

        # Draw boundary
        if self.robotXA == -1 or self.isCrashed:
            boundaryColor = (255, 0, 0)
        else:
            boundaryColor = (64, 64, 64)
        pygame.draw.rect(self.screenBuffer, boundaryColor,
                         (0, 0, MAGNIFY * (xsize + 2), MAGNIFY), 0)
        pygame.draw.rect(self.screenBuffer, boundaryColor,
                         (0, MAGNIFY, MAGNIFY, MAGNIFY * (ysize + 1)), 0)
        pygame.draw.rect(self.screenBuffer, boundaryColor,
                         (MAGNIFY * (xsize + 1), MAGNIFY, MAGNIFY, MAGNIFY * (ysize + 1)), 0)
        pygame.draw.rect(self.screenBuffer, boundaryColor,
                         (MAGNIFY, MAGNIFY * (ysize + 1), MAGNIFY * xsize, MAGNIFY), 0)
        # pygame.draw.rect(screenBuffer, boundaryColor, (0, 0, MAGNIFY*(xsize+2), MAGNIFY), 0)

        # Draw cell frames
        for x in xrange(0, xsize):
            for y in xrange(0, ysize):
                pygame.draw.rect(self.screenBuffer, (0, 0, 0),
                                 ((x + 1) * MAGNIFY, (y + 1) * MAGNIFY, MAGNIFY, MAGNIFY), 1)
                if (x + 1, y + 1) in bombs:
                    self.screenBuffer.blit(self.bombImage,
                                           ((x + 1) * MAGNIFY + 1, (y + 1) * MAGNIFY + 1))
        pygame.draw.rect(self.screenBuffer, (0, 0, 0),
                         (MAGNIFY - 1, MAGNIFY - 1, MAGNIFY * xsize + 2, MAGNIFY * ysize + 2), 1)

        # Draw "Good" Robot
        if self.robotXA != -1:
            pygame.draw.circle(self.screenBuffer, (192, 32, 32),
                               ((self.robotXA + 1) * MAGNIFY + MAGNIFY / 2,
                                (self.robotYA + 1) * MAGNIFY + MAGNIFY / 2),
                               MAGNIFY / 3 - 2, 0)
            pygame.draw.circle(self.screenBuffer, (255, 255, 255),
                               ((self.robotXA + 1) * MAGNIFY + MAGNIFY / 2,
                                (self.robotYA + 1) * MAGNIFY + MAGNIFY / 2),
                               MAGNIFY / 3 - 1, 1)
            pygame.draw.circle(self.screenBuffer, (0, 0, 0),
                               ((self.robotXA + 1) * MAGNIFY + MAGNIFY / 2,
                                (self.robotYA + 1) * MAGNIFY + MAGNIFY / 2),
                               MAGNIFY / 3, 1)

        # Draw "Bad" Robots
        if enemies_enabled:
            for (e_x, e_y) in enemy_handler.getEnemyPositions():
                pygame.draw.circle(self.screenBuffer, (32, 32, 192),
                                   ((e_x + 1) * MAGNIFY + MAGNIFY / 2,
                                    (e_y + 1) * MAGNIFY + MAGNIFY / 2),
                                   MAGNIFY / 3 - 2, 0)
                pygame.draw.circle(self.screenBuffer, (255, 255, 255),
                                   ((e_x + 1) * MAGNIFY + MAGNIFY / 2,
                                    (e_y + 1) * MAGNIFY + MAGNIFY / 2),
                                   MAGNIFY / 3 - 1, 1)
                pygame.draw.circle(self.screenBuffer, (0, 0, 0),
                                   ((e_x + 1) * MAGNIFY + MAGNIFY / 2,
                                    (e_y + 1) * MAGNIFY + MAGNIFY / 2),
                                   MAGNIFY / 3, 1)

        # zone_width = danger_zone[-1][0] - danger_zone[0][0] + 1
        # zone_height = danger_zone[-1][1] - danger_zone[0][1] + 1
        # pygame.draw.rect(screenBuffer, (200, 200, 0), (MAGNIFY*(danger_zone[0][0]+1), MAGNIFY*(danger_zone[0][1]+1), MAGNIFY*zone_width, MAGNIFY*zone_height), 5)

        # Flip!
        self.screen.blit(self.screenBuffer, (0, 0))
        pygame.display.flip()

    # Make the transition
    if not self.isPaused:
        # Done
        self.clock.tick(self.speed)
    else:
        self.clock.tick(3)

    self.acc_reward += payoff * 10
    if self.collect_data:
        self.count += 1
        if payoff > 0:
            self.collect_episode_data_file.write(str(self.count) + "\n")
            self.count = 0
        if self.stepid % 100 == 0:
            self.collect_reward_data_file.write(str(self.acc_reward / 100.) + "\n")
            self.acc_reward = 0
    if self.stepid % 100000 == 0:
        pass
        # print learner.alpha
        # print learner.explorer.exploration
        # print self.stepid
        # raw_input()
    if self.stepid % 100 == 0:
        sys.stdout.write("\033[K")
        sys.stdout.write(
            "[{2}{3}] ({0}/{1}) | alpha = {4} | epsilon = {5}\n".format(
                self.stepid, MAX_STEPS,
                '#' * int(math.floor(self.stepid / float(MAX_STEPS) * 20)),
                ' ' * int((20 - math.floor(self.stepid / float(MAX_STEPS) * 20))),
                learner.alpha, learner.explorer.exploration))
        sys.stdout.write("\033[F")
    if self.stepid >= MAX_STEPS:
        print "\nSimulation done!"
        sys.exit()
    if payoff > 0:
        # episode done
        if save_file is not None:
            controller.params.reshape(controller.numRows,
                                      controller.numColumns).tofile(save_file)
    learner.alpha *= 1.  # 0.999
    learner.explorer.exploration *= 1.  # 0.999
    self.isCrashed = False
    if not self.isPaused:
        return Experiment._oneInteraction(self)
    else:
        return self.stepid
class ReinforcementLearningRunner():
    def __init__(self, mode):
        self.mode = mode
        cu.mem('Reinforcement Learning Started')
        self.environment = RegionFilteringEnvironment(config.get(mode + 'Database'), mode)
        self.controller = QNetwork()
        cu.mem('QNetwork controller created')
        self.learner = None
        self.agent = RegionFilteringAgent(self.controller, self.learner)
        self.task = RegionFilteringTask(self.environment, config.get(mode + 'GroundTruth'))
        self.experiment = Experiment(self.task, self.agent)

    def runEpoch(self, interactions, maxImgs):
        img = 0
        s = cu.tic()
        while img < maxImgs:
            self.experiment.doInteractions(interactions)
            self.agent.learn()
            self.agent.reset()
            self.environment.loadNextEpisode()
            img += 1
        s = cu.toc('Run epoch with ' + str(maxImgs) + ' episodes', s)

    def run(self):
        if self.mode == 'train':
            self.agent.persistMemory = True
            self.agent.startReplayMemory(len(self.environment.db.images),
                                         config.geti('trainInteractions'),
                                         config.geti('stateFeatures'))
            self.train()
        elif self.mode == 'test':
            self.agent.persistMemory = False
            self.test()

    def train(self):
        interactions = config.geti('trainInteractions')
        minEpsilon = config.getf('minTrainingEpsilon')
        epochSize = len(self.environment.db.images) / 2
        epsilon = 1.0
        self.controller.setEpsilonGreedy(epsilon)
        print 'Epoch 0: Exploration'
        self.runEpoch(interactions, len(self.environment.db.images))
        self.learner = QLearning()
        self.agent.learner = self.learner
        epoch = 1
        egEpochs = config.geti('epsilonGreedyEpochs')
        while epoch <= egEpochs:
            epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
            if epsilon < minEpsilon:
                epsilon = minEpsilon
            self.controller.setEpsilonGreedy(epsilon)
            print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            epoch += 1
        epoch = 1
        maxEpochs = config.geti('exploitLearningEpochs')
        while epoch <= maxEpochs:
            print 'Epoch', epoch + egEpochs, '(exploitation mode: epsilon={:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            epoch += 1

    def test(self):
        interactions = config.geti('testInteractions')
        self.controller.setEpsilonGreedy(config.getf('testEpsilon'))
        self.runEpoch(interactions, len(self.environment.db.images))
class BoxSearchRunner():
    def __init__(self, mode):
        self.mode = mode
        cu.mem('Reinforcement Learning Started')
        self.environment = BoxSearchEnvironment(config.get(mode + 'Database'), mode,
                                                config.get(mode + 'GroundTruth'))
        self.controller = QNetwork()
        cu.mem('QNetwork controller created')
        self.learner = None
        self.agent = BoxSearchAgent(self.controller, self.learner)
        self.task = BoxSearchTask(self.environment, config.get(mode + 'GroundTruth'))
        self.experiment = Experiment(self.task, self.agent)

    def runEpoch(self, interactions, maxImgs):
        img = 0
        s = cu.tic()
        while img < maxImgs:
            k = 0
            while not self.environment.episodeDone and k < interactions:
                self.experiment._oneInteraction()
                k += 1
            self.agent.learn()
            self.agent.reset()
            self.environment.loadNextEpisode()
            img += 1
        s = cu.toc('Run epoch with ' + str(maxImgs) + ' episodes', s)

    def run(self):
        if self.mode == 'train':
            self.agent.persistMemory = True
            self.agent.startReplayMemory(len(self.environment.imageList),
                                         config.geti('trainInteractions'))
            #self.agent.assignPriorMemory(self.environment.priorMemory)
            self.train()
        elif self.mode == 'test':
            self.agent.persistMemory = False
            self.test()

    def train(self):
        networkFile = config.get('networkDir') + config.get('snapshotPrefix') + \
            '_iter_' + config.get('trainingIterationsPerBatch') + '.caffemodel'
        interactions = config.geti('trainInteractions')
        minEpsilon = config.getf('minTrainingEpsilon')
        epochSize = len(self.environment.imageList) / 1
        epsilon = 1.0
        self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
        epoch = 1
        exEpochs = config.geti('explorationEpochs')
        while epoch <= exEpochs:
            s = cu.tic()
            print 'Epoch', epoch, ': Exploration (epsilon=1.0)'
            self.runEpoch(interactions, len(self.environment.imageList))
            self.task.flushStats()
            s = cu.toc('Epoch done in ', s)
            epoch += 1
        self.learner = QLearning()
        self.agent.learner = self.learner
        egEpochs = config.geti('epsilonGreedyEpochs')
        while epoch <= egEpochs + exEpochs:
            s = cu.tic()
            epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
            if epsilon < minEpsilon:
                epsilon = minEpsilon
            self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
            print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            self.task.flushStats()
            self.doValidation(epoch)
            s = cu.toc('Epoch done in ', s)
            epoch += 1
        maxEpochs = config.geti('exploitLearningEpochs') + exEpochs + egEpochs
        while epoch <= maxEpochs:
            s = cu.tic()
            print 'Epoch', epoch, '(exploitation mode: epsilon={:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            self.task.flushStats()
            self.doValidation(epoch)
            s = cu.toc('Epoch done in ', s)
            shutil.copy(networkFile, networkFile + '.' + str(epoch))
            epoch += 1

    def test(self):
        interactions = config.geti('testInteractions')
        self.controller.setEpsilonGreedy(config.getf('testEpsilon'))
        self.runEpoch(interactions, len(self.environment.imageList))

    def doValidation(self, epoch):
        if epoch % config.geti('validationEpochs') != 0:
            return
        auxRL = BoxSearchRunner('test')
        auxRL.run()
        indexType = config.get('evaluationIndexType')
        category = config.get('category')
        if indexType == 'pascal':
            categories, catIndex = bse.get20Categories()
        elif indexType == 'relations':
            categories, catIndex = bse.getCategories()
        elif indexType == 'finetunedRelations':
            categories, catIndex = bse.getRelationCategories()
        catI = categories.index(category)
        scoredDetections = bse.loadScores(config.get('testMemory'), catI)
        groundTruthFile = config.get('testGroundTruth')
        ps, rs = bse.evaluateCategory(scoredDetections, 'scores', groundTruthFile)
        pl, rl = bse.evaluateCategory(scoredDetections, 'landmarks', groundTruthFile)
        line = lambda x, y, z: x + '\t{:5.3f}\t{:5.3f}\n'.format(y, z)
        print line('Validation Scores:', ps, rs)
        print line('Validation Landmarks:', pl, rl)
class Player:
    def __init__(self):
        self.environment = GameEnv()
        av_table = ActionValueTable(self.environment.outdim, self.environment.indim)
        av_table.initialize(0.)
        # todo: save & restore agents state
        learner = Q()
        learner._setExplorer(EpsilonGreedyExplorer())
        agent = LearningAgent(av_table, learner)
        self.agent = agent
        self.task = GameTask(self.environment)
        self.experiment = Experiment(self.task, self.agent)

    def name(self, index):
        self.me = index
        [self.opp1, self.opp2] = [i for i in range(3) if i != self.me]

    def hand(self, card):
        self.environment.reset()
        self.environment.setHand(card)
        self.environment.setStack(300)

    def bet1(self, min):
        self.environment.setPhase('bet-1')
        self.environment.setMinBet(min)
        self.experiment.doInteractions(1)
        bet = self.environment.getTranslatedAction()
        return bet

    def bet1_info(self, bets):
        opp1_bet = bets[self.opp1]
        opp2_bet = bets[self.opp2]
        self.environment.setOpponentsBets(opp1_bet, opp2_bet)

    def call1(self, current_bet):
        self.environment.setPhase('call-1')
        self.environment.setToCall(current_bet)
        self.experiment.doInteractions(1)
        is_calling = self.environment.getTranslatedAction()
        return is_calling

    def call1_info(self, in_game):
        opp1_in_game = in_game[self.opp1]
        opp2_in_game = in_game[self.opp2]
        self.environment.setOpponentsFolded(not opp1_in_game, not opp2_in_game)

    def bet2(self, min):
        self.environment.setPhase('bet-2')
        self.environment.setMinBet(min)
        self.experiment.doInteractions(1)
        bet = self.environment.getTranslatedAction()
        return bet

    def bet2_info(self, bets):
        opp1_bet = bets[self.opp1]
        opp2_bet = bets[self.opp2]
        self.environment.setOpponentsBets(opp1_bet, opp2_bet)

    def call2(self, current_bet):
        # note: the original reuses phase 'call-1' here, which looks like a
        # copy-paste slip given the method name; kept as-is
        self.environment.setPhase('call-1')
        self.environment.setToCall(current_bet)
        self.experiment.doInteractions(1)
        is_calling = self.environment.getTranslatedAction()
        return is_calling

    def call2_info(self, in_game):
        opp1_in_game = in_game[self.opp1]
        opp2_in_game = in_game[self.opp2]

    def showdown(self, hand):
        opp1_hand = hand[self.opp1]
        opp2_hand = hand[self.opp2]

    def result(self, winnings):
        my_winnings = winnings[self.me]
        opp1_winnings = winnings[self.opp1]
        opp2_winnings = winnings[self.opp2]
        self.environment.setPhase('results')
        self.task.setWinnings(my_winnings)
        self.experiment.doInteractions(1)
        self.agent.learn()
        self.agent.reset()
def Py_Brain():
    ############################
    # pybrain
    ############################
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg
    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task
    import pylab
    #pylab.gray()
    #pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])
    num_states = int(structure.shape[0] * structure.shape[1])
    SQRT = int(math.sqrt(num_states))
    #print structure.item((1, 3))

    #environment = Maze(structure, (7, 7))  # second parameter is goal field tuple
    environment = Maze(structure, (1, 3))  # second parameter is goal field tuple
    print type(environment)
    print environment

    # Standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4)  # [N, S, E, W]
    controller.initialize(1)
    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    #while True:
    for x in range(4):
        print x
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()
        pylab.pcolor(controller.params.reshape(num_states, 4).max(1).reshape(SQRT, SQRT))
        pylab.draw()
    #pylab.show()

    name = 'MAZE'
    plt.savefig(str(name) + '_PLOT.png')
    plt.close()
world = WorldInteraction()

predTable = ActionValueTable(
    PredatorInteraction.NSTATES,
    len(PredatorInteraction.ACTIONS)
)
predTable.initialize(0.)
predLearner = Q(ALPHA, GAMMA)
predLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
predAgent = LearningAgent(predTable, predLearner)
predEnv = PredatorEnvironment(world)
predTask = PredatorTask(predEnv)
predExp = Experiment(predTask, predAgent)

try:
    for t in xrange(MAX_TIME):
        print 't = %d' % t
        world.t = t
        predExp.doInteractions(1)
        predAgent.learn()
        print 'Colors vs. Q-table:'
        table_print(predTable._params, PredatorInteraction.NSTATES)
        print
except KeyboardInterrupt:
    pass
finally:
    pass  # body truncated in the original snippet
import pickle
import time

# Create environment
sub_env = Environment(20, 20)
world = World(sub_env)

# Brain for the animat; we have already trained the data
f = open('neuro.net', 'r')
trained_net = pickle.load(f)
brain = BrainController(trained_net)

# Learning method we use
#learner = PolicyGradientLearner()
learner = ENAC()
learner._setLearningRate(0.2)

# Create an animat
animat = StupidAnimat(trained_net, learner, sub_env)

# Establish a task
task = InteractTask(world, animat)
brain.validate_net()
experiment = Experiment(task, animat)

while True:
    experiment.doInteractions(10000)
    animat.learn()
    animat.reset()
    brain.validate_net()
    time.sleep(3)
class BoxSearchRunner():
    def __init__(self, mode):
        self.mode = mode
        cu.mem('Reinforcement Learning Started')
        self.environment = BoxSearchEnvironment(config.get(mode + 'Database'), mode,
                                                config.get(mode + 'GroundTruth'))
        self.controller = QNetwork()
        cu.mem('QNetwork controller created')
        self.learner = None
        self.agent = BoxSearchAgent(self.controller, self.learner)
        self.task = BoxSearchTask(self.environment, config.get(mode + 'GroundTruth'))
        self.experiment = Experiment(self.task, self.agent)

    def runEpoch(self, interactions, maxImgs):
        img = 0
        s = cu.tic()
        while img < maxImgs:
            k = 0
            while not self.environment.episodeDone and k < interactions:
                self.experiment._oneInteraction()
                k += 1
            self.agent.learn()
            self.agent.reset()
            self.environment.loadNextEpisode()
            img += 1
        s = cu.toc('Run epoch with ' + str(maxImgs) + ' episodes', s)

    def run(self):
        if self.mode == 'train':
            self.agent.persistMemory = True
            self.agent.startReplayMemory(len(self.environment.imageList),
                                         config.geti('trainInteractions'))
            self.train()
        elif self.mode == 'test':
            self.agent.persistMemory = False
            self.test()

    def train(self):
        networkFile = config.get('networkDir') + config.get('snapshotPrefix') + \
            '_iter_' + config.get('trainingIterationsPerBatch') + '.caffemodel'
        interactions = config.geti('trainInteractions')
        minEpsilon = config.getf('minTrainingEpsilon')
        epochSize = len(self.environment.imageList) / 1
        epsilon = 1.0
        self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
        epoch = 1
        exEpochs = config.geti('explorationEpochs')
        while epoch <= exEpochs:
            s = cu.tic()
            print 'Epoch', epoch, ': Exploration (epsilon=1.0)'
            self.runEpoch(interactions, len(self.environment.imageList))
            self.task.flushStats()
            self.doValidation(epoch)
            s = cu.toc('Epoch done in ', s)
            epoch += 1
        self.learner = QLearning()
        self.agent.learner = self.learner
        egEpochs = config.geti('epsilonGreedyEpochs')
        while epoch <= egEpochs + exEpochs:
            s = cu.tic()
            epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
            if epsilon < minEpsilon:
                epsilon = minEpsilon
            self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
            print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            self.task.flushStats()
            self.doValidation(epoch)
            s = cu.toc('Epoch done in ', s)
            epoch += 1
        maxEpochs = config.geti('exploitLearningEpochs') + exEpochs + egEpochs
        while epoch <= maxEpochs:
            s = cu.tic()
            print 'Epoch', epoch, '(exploitation mode: epsilon={:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            self.task.flushStats()
            self.doValidation(epoch)
            s = cu.toc('Epoch done in ', s)
            shutil.copy(networkFile, networkFile + '.' + str(epoch))
            epoch += 1

    def test(self):
        interactions = config.geti('testInteractions')
        self.controller.setEpsilonGreedy(config.getf('testEpsilon'))
        self.runEpoch(interactions, len(self.environment.imageList))

    def doValidation(self, epoch):
        if epoch % config.geti('validationEpochs') != 0:
            return
        auxRL = BoxSearchRunner('test')
        auxRL.run()
        indexType = config.get('evaluationIndexType')
        category = config.get('category')
        if indexType == 'pascal':
            categories, catIndex = bse.get20Categories()
        elif indexType == 'relations':
            categories, catIndex = bse.getCategories()
        elif indexType == 'finetunedRelations':
            categories, catIndex = bse.getRelationCategories()
        if category in categories:
            catI = categories.index(category)
        else:
            catI = -1
        scoredDetections = bse.loadScores(config.get('testMemory'), catI)
        groundTruthFile = config.get('testGroundTruth')
        #ps, rs = bse.evaluateCategory(scoredDetections, 'scores', groundTruthFile)
        pl, rl = bse.evaluateCategory(scoredDetections, 'landmarks', groundTruthFile)
        line = lambda x, y, z: x + '\t{:5.3f}\t{:5.3f}\n'.format(y, z)
        #print line('Validation Scores:', ps, rs)
        print line('Validation Landmarks:', pl, rl)
# Initialize Reinforcement Learning
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(controller, learner)

# Setup the PyBrain and PyGame Environments
environment = Environment()
game = RunPacman(environment)

# Create the Task for the Pac-Man Agent to Accomplish and initialize the first Action
task = PacmanTask(environment, game)
task.performAction(np.array([1]))

# The Experiment is the PyBrain link between the task to be completed and the agent completing it
experiment = Experiment(task, agent)
currentGame = 1

# Continue to loop program until the 'X' on the GUI is clicked
while True:
    # Allow the agent to interact with the environment (move in a direction), then learn from it
    experiment.doInteractions(1)
    agent.learn()

    # Check if the current Pac-Man game ended and a new one needs to start
    if game.wonGame == 1 or game.wonGame == -1:
        currentGame += 1
        # Store the information the agent has learned in long term memory,
        # Clear the short term memory to reduce any chance of overfitting,
# define action-value table
# number of states is:
#     current value: 1-21
# number of actions:
#     Stand=0, Hit=1
av_table = ActionValueTable(21, 2)
av_table.initialize(0.)

# define Q-learning agent
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# define the environment
env = BlackjackEnv()

# define the task
task = BlackjackTask(env)

# finally, define experiment
experiment = Experiment(task, agent)

# ready to go, start the process
while True:
    experiment.doInteractions(1)
    agent.learn()
    agent.reset()
table.initialize(0.0)
# table.initialize( np.random.rand( table.paramdim ) )

# create agent with controller and learner - use SARSA(), Q() or QLambda() here
## alpha -- learning rate (preference of new information)
## gamma -- discount factor (importance of future reward)
# learner = Q(0.5, 0.99)
learner = SARSA(0.5, 0.99)
# learner = QLambda(0.5, 0.99, 0.9)
explorer = learner.explorer
explorer.decay = 0.999992

agent = LearningAgent(table, learner)

experiment = Experiment(task, agent)

## prevents "ImportError: sys.meta_path is None, Python is likely shutting down"
atexit.register(task.close)

render_demo = False
render_steps = False

imax = 7000
period_print = 100
eval_periods = 100

print("\nStarting")

total_reward = 0
period_reward = 0
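# Hedged continuation (assumption, not in the original snippet): a training
# loop consistent with the imax/period_print settings and the total_reward/
# period_reward counters initialized above.
for i in range(1, imax + 1):
    experiment.doInteractions(1)
    agent.learn()
    total_reward += agent.lastreward
    period_reward += agent.lastreward
    if i % period_print == 0:
        print("iteration %6d: mean reward over period %.3f"
              % (i, period_reward / float(period_print)))
        period_reward = 0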
# create value table and initialize with ones
table = ActionValueTable(numStates, numActions)
table.initialize(1.)

# create agent with controller and learner - use SARSA(), Q() or QLambda() here
# learner = QLambda()
learner = SARSA()
# learner = Q()

# standard exploration is e-greedy, but a different type can be chosen as well
# learner.explorer = BoltzmannExplorer()

# create agent
agent = LearningAgent(table, learner)

# create experiment
experiment = Experiment(task, agent)

# prepare plotting
# pylab.gray()
# pylab.ion()

# Learning phase
# Num iterations used for PROHA Workshop preliminary evaluation
# numIterations = 1600
numIterations = 1500
numInteractions = 600
# Num iterations used for PROHA and PROLE slides
# numIterations = 10
# numInteractions = 3
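# Hedged sketch (assumption): the learning loop these constants configure,
# following the interact/learn/reset batch pattern used throughout this collection.
for _ in range(numIterations):
    experiment.doInteractions(numInteractions)
    agent.learn()
    agent.reset()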