def testMaze():
    # simplified version of the reinforcement learning tutorial example
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 0, 0, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1]])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(3):
        experiment.doInteractions(40)

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'

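# For reference, a minimal self-contained version of the batch loop shared by
# the examples on this page (interact -> learn -> reset). The 5x5 maze is the
# one from testMaze() above; the iteration counts are arbitrary choices, not
# taken from any one example.
import numpy as np
from pybrain.rl.environments.mazes import Maze, MDPMazeTask
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.learners import Q
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import Experiment

structure = np.array([[1, 1, 1, 1, 1],
                      [1, 0, 0, 0, 1],
                      [1, 0, 1, 0, 1],
                      [1, 0, 1, 0, 1],
                      [1, 1, 1, 1, 1]])
environment = Maze(structure, (3, 3))             # goal cell as (row, col)
controller = ActionValueTable(structure.size, 4)  # one row per cell, 4 actions (NESW)
controller.initialize(1.)
agent = LearningAgent(controller, Q())
experiment = Experiment(MDPMazeTask(environment), agent)

for _ in range(50):
    experiment.doInteractions(40)  # gather a batch of experience
    agent.learn()                  # update the Q-table from the stored history
    agent.reset()                  # clear the history before the next batch
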
def testNet(learner, moduleNet, env, maxPlaneStartDist, stepSize, numAngs, thermRadius):
    # Turn off exploration
    from pybrain.rl.explorers.discrete.egreedy import EpsilonGreedyExplorer
    learner._setExplorer(EpsilonGreedyExplorer(0))
    agent = LearningAgent(moduleNet, learner)

    # Move the plane back to the start by resetting the environment
    env = contEnv.contThermEnvironment(maxPlaneStartDist, stepSize, numAngs, thermRadius)
    from simpleThermalTask import SimpThermTask
    task = SimpThermTask(env)
    from pybrain.rl.experiments import Experiment
    experiment = Experiment(task, agent)

    # Have the plane move 100 times, and plot its position (hopefully it moves to the high-reward area)
    testIter = 100
    trainResults = [env.distPlane()]
    for i in range(testIter):
        experiment.doInteractions(1)
        trainResults.append(env.distPlane())

    # Plot the results
    import matplotlib.pyplot as plt
    plt.figure(1)
    plt.plot(trainResults, 'o')
    plt.ylabel('Distance from center of thermal')
    plt.xlabel('Interaction iteration')
    plt.title('Test Results for Neural Fitted Q Learner')
    plt.show()

def run_bbox(verbose=False):
    n_features = n_actions = max_time = -1

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()

    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print(av_table._params)

    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)

    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()

    bbox.finish(verbose=1)

def learn(self, number_of_iterations):
    learner = Q(0.2, 0.8)
    task = CartMovingTask(self.environment)
    # number of states is the product of the sizes of all discretized ranges
    self.controller = ActionValueTable(
        reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)),
        self.force_granularity)
    self.controller.initialize(1.)
    agent = LearningAgent(self.controller, learner)
    experiment = Experiment(task, agent)
    for i in range(number_of_iterations):
        experiment.doInteractions(1)
        agent.learn()
        agent.reset()
    with open("test.pcl", "wb") as f:  # binary mode, as pickle requires
        pickle.dump(self.controller, f)

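# Counterpart sketch (assumed, not part of the original): reloading the
# pickled controller written by learn() above; "test.pcl" is the file name
# used there.
import pickle

with open("test.pcl", "rb") as f:
    controller = pickle.load(f)
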
def maze():
    # import sys, time
    pylab.gray()
    pylab.ion()
    # The goal appears to be in the upper right.
    # NOTE: interior blanks in these rows are approximate; the source listing
    # collapsed runs of spaces, so each row is padded back to the 10-column
    # width the border rows imply.
    structure = [
        "!!!!!!!!!!",
        "! !  ! ! !",
        "! !! ! ! !",
        "!    !   !",
        "! !!!!!! !",
        "! !    ! !",
        "! ! !!!! !",
        "!        !",
        "! !!!!!  !",
        "!   !    !",
        "!!!!!!!!!!",
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(100):
        experiment.doInteractions(100)
        agent.learn()
        agent.reset()

        # 4 actions, shape.prod() = 110 locations/states (11x10 grid);
        # max(1) gives/plots the biggest objective function value for each square
        pylab.pcolor(controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape))
        pylab.draw()

    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print("Maze map:")
    print("\n".join("".join(row) for row in maze))
    print("Greedy policy:")
    print("\n".join("".join(row) for row in greedy_policy))

def main():
    rospy.init_node("lauron_reinforcement_learning")
    environment = RLEnvironment()
    dim_state = environment.joint_states.shape[0]
    num_actions = len(environment.actions)
    controller = ActionValueNetwork(dim_state, num_actions)
    learner = SARSA()
    agent = LearningAgent(controller, learner)
    task = RLTask(environment)
    experiment = Experiment(task, agent)
    episode_counter = 0
    while True:
        print("Training episode {}".format(episode_counter))
        experiment.doInteractions(NUM_INTERACTIONS)
        agent.learn()
        agent.reset()
        episode_counter += 1

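# Side note (a sketch, not from the original): once trained, the greedy action
# for a state vector can be queried from an ActionValueNetwork directly. This
# reuses dim_state and controller from main() above; the zero vector is only a
# placeholder input of the right dimension.
import numpy as np

state = np.zeros(dim_state)
best_action = controller.getMaxAction(state)
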
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    # NOTE: interior blanks in these rows are approximate; the source listing
    # collapsed runs of spaces, so each row is padded back to the 10-column
    # width the border rows imply.
    structure = [
        list('!!!!!!!!!!'),
        list('! !  ! ! !'),
        list('! !! ! ! !'),
        list('!    !   !'),
        list('! !!!!!! !'),
        list('! !    ! !'),
        list('! ! !!!! !'),
        list('!        !'),
        list('! !!!!!  !'),
        list('!   !    !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    # sanity check: the greedy policy covers the full maze grid
    # (the exact greedy string varies from run to run, so it is not asserted here)
    assert greedy_policy.shape == tuple(shape)

def run():
    """
    number of states is:
        current value: 0-20
    number of actions:
        Stand=0, Hit=1
    """
    # define action value table
    av_table = ActionValueTable(MAX_VAL, MIN_VAL)
    av_table.initialize(0.)

    # define Q-learning agent
    q_learner = Q(Q_ALPHA, Q_GAMMA)
    q_learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, q_learner)

    # define the environment
    env = BlackjackEnv()

    # define the task
    task = BlackjackTask(env, verbosity=VERBOSE)

    # finally, define experiment
    experiment = Experiment(task, agent)

    # ready to go, start the process
    for _ in range(NB_ITERATION):
        experiment.doInteractions(1)
        if task.lastreward != 0:
            if VERBOSE:
                print("Agent learn")
            agent.learn()

    print('|First State|Choice 0 (Stand)|Choice 1 (Hit)|Relative value of Standing over Hitting|')
    print('|:-------:|:-------|:-----|:-----|')
    for i in range(MAX_VAL):
        print('| %s | %s | %s | %s |' % (
            (i + 1),
            av_table.getActionValues(i)[0],
            av_table.getActionValues(i)[1],
            av_table.getActionValues(i)[0] - av_table.getActionValues(i)[1]
        ))

class RL:
    def __init__(self):
        self.av_table = ActionValueTable(4, 5)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        rassh.core.constants.rl_params = self.av_table.params.reshape(4, 5)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()

class RL:
    def __init__(self):
        self.av_table = ActionValueTable(2, 3)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        kippo.core.constants.rl_params = self.av_table.params.reshape(2, 3)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()

class PlayYourCardsRight(Feature):
    def __init__(self, text_to_speech, speech_to_text):
        Feature.__init__(self)

        # setup AV Table
        self.av_table = GameTable(13, 2)
        if self.av_table.loadParameters() == False:
            self.av_table.initialize(0.)

        # setup a Q-Learning agent
        learner = Q(0.5, 0.0)
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)

        # setup game interaction
        self.game_interaction = GameInteraction(text_to_speech, speech_to_text)

        # setup environment
        environment = GameEnvironment(self.game_interaction)

        # setup task
        task = GameTask(environment, self.game_interaction)

        # setup experiment
        self.experiment = Experiment(task, self.agent)

    @property
    def is_speaking(self):
        return self.game_interaction.is_speaking

    def _thread(self, args):
        # let's play our cards right!
        while not self.is_stop:
            self.experiment.doInteractions(1)
            self.agent.learn()
            self.av_table.saveParameters()

table = PropensityTable(payouts.shape[0])
table.initialize(500.0)

# learner = RothErev(experimentation=0.55, recency=0.3)
learner = VariantRothErev(experimentation=0.65, recency=0.3)
learner.explorer = BoltzmannExplorer(tau=100.0, decay=0.9995)
agent = LearningAgent(table, learner)
experiment = Experiment(task, agent)

epis = int(1e1)
batch = 2
avgRewards = scipy.zeros(epis)
allActions = scipy.zeros(epis * batch)
c = 0
for i in range(epis):
    experiment.doInteractions(batch)
    avgRewards[i] = scipy.mean(agent.history["reward"])
    allActions[c:c + batch] = agent.history["action"].flatten() + 1
    agent.learn()
    agent.reset()
    c += batch

pylab.figure(figsize=(16, 6))
# pylab.plot(avgRewards)
pylab.plot(allActions)
pylab.show()

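# Aside (a standalone sketch, independent of the PyBrain class): the Boltzmann
# explorer used above draws actions with probability roughly proportional to
# exp(value / tau), with tau decaying over time. The selection rule itself, in
# plain numpy:
import numpy as np

def boltzmann_choice(values, tau):
    # subtract the max before exponentiating for numerical stability
    p = np.exp((values - values.max()) / tau)
    p /= p.sum()
    return np.random.choice(len(values), p=p)
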
# controller.initialize(0.)

# learner = Q(0.5, 0.8)  # alpha 0.5, gamma 0.8
learner = Q()  # default alpha 0.5, gamma 0.99
# learner._setExplorer(EpsilonGreedyExplorer(0.5))
agent = LearningAgent(controller, learner)

task = ChainTask(env)
exp = Experiment(task, agent)

reward = 0
xs = []
ys = []
import matplotlib.pyplot as plt
for i in range(5000):
    exp.doInteractions(1)
    agent.learn()
    reward += agent.lastreward
    if i % 100 == 0:
        xs.append(i)
        ys.append(reward)
        print(i)
        # print(learner.laststate, learner.lastaction, learner.lastreward)
        # print(controller.params.reshape(5, 2))

print("TOTAL REWARD:", reward)
print(ys)

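# The matplotlib import in the snippet above is never used; a short sketch
# that actually plots the cumulative-reward curve collected in xs/ys:
plt.plot(xs, ys)
plt.xlabel('interaction')
plt.ylabel('cumulative reward')
plt.show()
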
from ObjectLocalizerEnvironment import ObjectLocalizerEnvironment
from DeepQNetwork import DeepQNetwork
from DeepQLearning import DeepQLearning
from MDPObjectLocalizerTask import MDPObjectLocalizerTask
from ObjectLocalizationAgent import ObjectLocalizationAgent

print('Starting Environment')
epsilon = 1.0
environment = ObjectLocalizerEnvironment(config.get('imageDir'), config.get('candidatesFile'), 'Training')
print('Initializing DeepQNetwork')
controller = DeepQNetwork()
controller.setEpsilonGreedy(epsilon)
print('Initializing Q Learner')
learner = DeepQLearning()
print('Preparing Agent')
agent = ObjectLocalizationAgent(controller, learner)
print('Configuring Task')
task = MDPObjectLocalizerTask(environment, config.get('groundTruth'))
print('Setting up Experiment')
experiment = Experiment(task, agent)

i = 0
print('Main Loop')
while i < config.geti('maximumEpochs'):
    print('Epoch', i, '(epsilon:{:5.3f})'.format(epsilon))
    experiment.doInteractions(int(config.get('numInteractions')))
    agent.learn()
    agent.reset()
    i += 1
    epsilon = adjustEpsilon(config.geti('maximumEpochs'), i, epsilon)
    controller.setEpsilonGreedy(epsilon)

# pylab.gray()
# pylab.ion()

# Learning phase
# Num iterations used for PROHA Workshop preliminary evaluation
# numIterations = 1600
numIterations = 1500
numInteractions = 600
# Num iterations used for PROHA and PROLE slides
# numIterations = 10
# numInteractions = 3

for i in range(numIterations):
    # interact with the environment (here in batch mode)
    experiment.doInteractions(numInteractions)
    agent.learn()
    agent.reset()

    # # and draw the table
    # # pylab.pcolor(table.params.reshape(numStates, numActions).max(1).reshape(numStates, numStates))
    # # # pylab.savefig('myfilename_%2d.png' % (i))
    # # pylab.show(block=True)
    # # print(table.params.reshape(numStates, numActions).max(1).reshape(numStates, 1))
    # print("\nIteration: %d" % (i))
    # print(table.params.reshape(numStates, numActions))
    # # print("-------------------------------------------------")

# exit(0)
# print(table.params.reshape(numStates, numActions))

# Set up the PyBrain and PyGame environments
environment = Environment()
game = RunPacman(environment)

# Create the task for the Pac-Man agent to accomplish and initialize the first action
task = PacmanTask(environment, game)
task.performAction(np.array([1]))

# The Experiment is the PyBrain link between the task to be completed and the agent completing it
experiment = Experiment(task, agent)

currentGame = 1
# Continue to loop until the 'X' on the GUI is clicked
while True:
    # Let the agent interact with the environment (move in a direction), then learn from it
    experiment.doInteractions(1)
    agent.learn()

    # Check whether the current Pac-Man game ended and a new one needs to start
    if game.wonGame == 1 or game.wonGame == -1:
        currentGame += 1
        # Store the information the agent has learned in long-term memory,
        # clear the short-term memory to reduce any chance of overfitting,
        # and reset the Pac-Man game and the environment for the next game
        agent.reset()
        environment.resetMap()
        game.__init__(environment)

)
predTable.initialize(0.)
predLearner = Q(ALPHA, GAMMA)
predLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
predAgent = LearningAgent(predTable, predLearner)
predEnv = PredatorEnvironment(world)
predTask = PredatorTask(predEnv)
predExp = Experiment(predTask, predAgent)

try:
    for t in range(MAX_TIME):
        print('t = %d' % t)
        world.t = t
        predExp.doInteractions(1)
        predAgent.learn()
        print('Colors vs. Q-table:')
        table_print(predTable._params, PredatorInteraction.NSTATES)
        print()
except KeyboardInterrupt:
    pass
finally:
    print('Background: %s' % BKGD_COLOR)
    print('Colors vs. Final Q-table:')
    table_print(predTable._params, PredatorInteraction.NSTATES)
    print()

counts = {'ate': {}, 'poison': 0, 'death': 0, 'poisondeath': 0, 'rejected': {}}

class ReinforcementLearningRunner():
    def __init__(self, mode):
        self.mode = mode
        cu.mem('Reinforcement Learning Started')
        self.environment = RegionFilteringEnvironment(config.get(mode + 'Database'), mode)
        self.controller = QNetwork()
        cu.mem('QNetwork controller created')
        self.learner = None
        self.agent = RegionFilteringAgent(self.controller, self.learner)
        self.task = RegionFilteringTask(self.environment, config.get(mode + 'GroundTruth'))
        self.experiment = Experiment(self.task, self.agent)

    def runEpoch(self, interactions, maxImgs):
        img = 0
        s = cu.tic()
        while img < maxImgs:
            self.experiment.doInteractions(interactions)
            self.agent.learn()
            self.agent.reset()
            self.environment.loadNextEpisode()
            img += 1
        s = cu.toc('Run epoch with ' + str(maxImgs) + ' episodes', s)

    def run(self):
        if self.mode == 'train':
            self.agent.persistMemory = True
            self.agent.startReplayMemory(len(self.environment.db.images),
                                         config.geti('trainInteractions'),
                                         config.geti('stateFeatures'))
            self.train()
        elif self.mode == 'test':
            self.agent.persistMemory = False
            self.test()

    def train(self):
        interactions = config.geti('trainInteractions')
        minEpsilon = config.getf('minTrainingEpsilon')
        epochSize = len(self.environment.db.images) / 2
        epsilon = 1.0
        self.controller.setEpsilonGreedy(epsilon)
        print('Epoch 0: Exploration')
        self.runEpoch(interactions, len(self.environment.db.images))
        self.learner = QLearning()
        self.agent.learner = self.learner
        epoch = 1
        egEpochs = config.geti('epsilonGreedyEpochs')
        while epoch <= egEpochs:
            epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
            if epsilon < minEpsilon:
                epsilon = minEpsilon
            self.controller.setEpsilonGreedy(epsilon)
            print('Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon))
            self.runEpoch(interactions, epochSize)
            epoch += 1
        epoch = 1
        maxEpochs = config.geti('exploitLearningEpochs')
        while epoch <= maxEpochs:
            print('Epoch', epoch + egEpochs, '(exploitation mode: epsilon={:5.3f})'.format(epsilon))
            self.runEpoch(interactions, epochSize)
            epoch += 1

    def test(self):
        interactions = config.geti('testInteractions')
        self.controller.setEpsilonGreedy(config.getf('testEpsilon'))
        self.runEpoch(interactions, len(self.environment.db.images))

class Player:
    def __init__(self):
        self.environment = GameEnv()
        av_table = ActionValueTable(self.environment.outdim, self.environment.indim)
        av_table.initialize(0.)
        # todo: save & restore agent's state
        learner = Q()
        learner._setExplorer(EpsilonGreedyExplorer())
        agent = LearningAgent(av_table, learner)
        self.agent = agent
        self.task = GameTask(self.environment)
        self.experiment = Experiment(self.task, self.agent)

    def name(self, index):
        self.me = index
        [self.opp1, self.opp2] = [i for i in range(3) if i != self.me]

    def hand(self, card):
        self.environment.reset()
        self.environment.setHand(card)
        self.environment.setStack(300)

    def bet1(self, min):
        self.environment.setPhase('bet-1')
        self.environment.setMinBet(min)
        self.experiment.doInteractions(1)
        bet = self.environment.getTranslatedAction()
        return bet

    def bet1_info(self, bets):
        opp1_bet = bets[self.opp1]
        opp2_bet = bets[self.opp2]
        self.environment.setOpponentsBets(opp1_bet, opp2_bet)

    def call1(self, current_bet):
        self.environment.setPhase('call-1')
        self.environment.setToCall(current_bet)
        self.experiment.doInteractions(1)
        is_calling = self.environment.getTranslatedAction()
        return is_calling

    def call1_info(self, in_game):
        opp1_in_game = in_game[self.opp1]
        opp2_in_game = in_game[self.opp2]
        self.environment.setOpponentsFolded(not opp1_in_game, not opp2_in_game)

    def bet2(self, min):
        self.environment.setPhase('bet-2')
        self.environment.setMinBet(min)
        self.experiment.doInteractions(1)
        bet = self.environment.getTranslatedAction()
        return bet

    def bet2_info(self, bets):
        opp1_bet = bets[self.opp1]
        opp2_bet = bets[self.opp2]
        self.environment.setOpponentsBets(opp1_bet, opp2_bet)

    def call2(self, current_bet):
        self.environment.setPhase('call-1')
        self.environment.setToCall(current_bet)
        self.experiment.doInteractions(1)
        is_calling = self.environment.getTranslatedAction()
        return is_calling

    def call2_info(self, in_game):
        opp1_in_game = in_game[self.opp1]
        opp2_in_game = in_game[self.opp2]

    def showdown(self, hand):
        opp1_hand = hand[self.opp1]
        opp2_hand = hand[self.opp2]

    def result(self, winnings):
        my_winnings = winnings[self.me]
        opp1_winnings = winnings[self.opp1]
        opp2_winnings = winnings[self.opp2]
        self.environment.setPhase('results')
        self.task.setWinnings(my_winnings)
        self.experiment.doInteractions(1)
        self.agent.learn()
        self.agent.reset()

)
mimicTable.initialize(0.)
mimicLearner = Q(ALPHA, GAMMA)
mimicLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
mimicAgent = LearningAgent(mimicTable, mimicLearner)
mimicEnv = MimicryPreyEnvironment(world)
mimicTask = MimicryPreyTask(mimicEnv)
mimicExp = Experiment(mimicTask, mimicAgent)

try:
    for t in range(MAX_TIME):
        print('t = %d' % t)
        world.t = t
        predExp.doInteractions(1)
        predAgent.learn()
        mimicExp.doInteractions(1)
        mimicAgent.learn()
        print('Mimicker Colors vs. Q-table:')
        table_print(mimicTable._params, MimicryPreyInteraction.NSTATES)
        print('Predator Colors vs. Q-table:')
        table_print(predTable._params, PredatorInteraction.NSTATES)
        print()
except KeyboardInterrupt:
    pass
finally:
    print('Background: %s' % BKGD_COLOR)
    print('Predator Colors vs. Final Q-table:')

import numpy

env = HitTheGoalEnv(5)
task = HitTheGoalTask(env, [5, 0, 0])
net = buildNetwork(2, 1, bias=False)

# create agent with controller and learner (and its options)
# agent = OptimizationAgent(net, CMAES())
# agent.learner.setEvaluator(task, agent.module)
agent = LearningAgent(net, Reinforce())
# agent.learner.explorer = EpsilonGreedyExplorer(0.0)
# agent.learner._setExplorer(EpsilonGreedyExplorer(0.0))
# agent.learner.explorer.sigma = [0.1]
# print(agent.learner.explorer.sigma)
# exit()

experiment = Experiment(task, agent)
itr = 0
# task.performAction(numpy.array([36]))
while True:
    # print(itr)
    # agent.learner.maxEvaluations += 1
    # agent.learner.learn()
    experiment.doInteractions(50)
    agent.learn()
    agent.reset()
    task.reset()
    # env.reset()
    # itr = itr + 1

def Py_Brain():
    ############################
    # pybrain
    ############################
    import math
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg
    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task
    import pylab

    # pylab.gray()
    # pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])
    num_states = int(structure.shape[0] * structure.shape[1])
    SQRT = int(math.sqrt(num_states))
    # print(structure.item((1, 3)))

    # environment = Maze(structure, (7, 7))  # second parameter is the goal field tuple
    environment = Maze(structure, (1, 3))  # second parameter is the goal field tuple
    print(type(environment))
    print(environment)

    # The standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4)  # [N, S, E, W]
    controller.initialize(1)
    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)  # no-op check left in the original
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    # while True:
    for x in range(4):
        print(x)
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()
        pylab.pcolor(controller.params.reshape(num_states, 4).max(1).reshape(SQRT, SQRT))
        pylab.draw()
        # pylab.show()

    name = 'MAZE'
    plt.savefig(str(name) + '_PLOT.png')
    plt.close()

# set the task
task = charge_opt(environment, 0.8, 0.01)

# do the experiment
number_of_runs = 20

# change the reward and run the whole experiment
task.change_reward(1, 0.0)

# create the experiment
experiment = Experiment(task, agent)
k = 0
while k < number_of_runs:
    experiment.doInteractions(96)
    agent.learn()
    agent.reset()
    # log some data from the first and last run
    if k == 0:  # if it is the first run
        first_run_time2 = environment.log_time
        first_run_soc2 = environment.log_soc
        first_run_volt2 = environment.log_volt
    # if k == number_of_runs - 1:  # if it is the last run
    #     last_run_time2 = environment.log_time
    #     last_run_soc2 = environment.log_soc
    #     last_run_volt2 = environment.log_volt
    environment.reset()
    k += 1

agent.learning = False  # to keep it from exploring

learner = Q()
agent = LearningAgent(actionValueNetwork, learner)
experiment = Experiment(task, agent)

start = time()
i = 0
while True:
    for state in range(control.get_randomize_states()):
        control.randomize(state)
        task.reset()
        print("run %d" % i)
        experiment.doInteractions(1000)
        agent.learn()
        agent.reset()
        with open('q/rewards.csv', 'a') as f:
            f.write("%f,%d\n" % (task.getTotalReward(), time() - start))
        # print("learn")
        # agent.learn()
        # agent.reset()
        # control.pause()
        pylab.pcolor(actionValueNetwork.params.reshape(32, actions).max(1).reshape(8, 4).T)
        pylab.pause(0.01)
    if (i % 20) == 0:
        print("save network")

exit("quiting") def start(unused_addr, args, message): print("RL starting") while True: experiment.doInteractions(6) # make a number of interaction in-between learning agent.learn() agent.reset() if __name__ == "__main__": # dispatch osc messages disp = dispatcher.Dispatcher() disp.map("/test", print) # dumb input message disp.map("/quit", self_quit, "ok") disp.map("/start", start, "ok") #disp.map("/iterate", self_quit, "ok") #disp.map("/reset", reset, "ok") server = osc_server.ThreadingOSCUDPServer( ("127.0.0.1", listening_port), disp) print("Serving on {}".format(server.server_address)) #server.serve_forever() while True: experiment.doInteractions(6) # make a number of interaction in-between learning agent.learn() agent.reset()
# define action-value table
# number of states:
#   current value: 1-21
# number of actions:
#   Stand=0, Hit=1
av_table = ActionValueTable(21, 2)
av_table.initialize(0.)

# define Q-learning agent
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# define the environment
env = BlackjackEnv()

# define the task
task = BlackjackTask(env)

# finally, define experiment
experiment = Experiment(task, agent)

# ready to go, start the process
while True:
    experiment.doInteractions(1)
    agent.learn()
    agent.reset()

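# Follow-up sketch (not part of the original): reading the learned greedy
# policy back out of the table above; getActionValues(state) returns that
# state's row of Q-values, one entry per action.
for state in range(21):
    q_stand, q_hit = av_table.getActionValues(state)
    print(state + 1, 'Stand' if q_stand >= q_hit else 'Hit')
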
import pickle
import time

# Create environment
sub_env = Environment(20, 20)
world = World(sub_env)

# Brain for the animat; we have already trained the data
f = open('neuro.net', 'rb')  # binary mode, as pickle requires
trained_net = pickle.load(f)
brain = BrainController(trained_net)

# Learning method we use
# learner = PolicyGradientLearner()
learner = ENAC()
learner._setLearningRate(0.2)

# Create an animat
animat = StupidAnimat(trained_net, learner, sub_env)

# Establish a task
task = InteractTask(world, animat)
brain.validate_net()
experiment = Experiment(task, animat)

while True:
    experiment.doInteractions(10000)
    animat.learn()
    animat.reset()
    brain.validate_net()
    time.sleep(3)

# standard exploration is e-greedy, but a different type can be chosen as well
# learner.explorer = BoltzmannExplorer()

# create agent
agent = LearningAgent(table, learner)

# create experiment
experiment = Experiment(task, agent)

# prepare plotting
pylab.gray()
pylab.ion()

# for i in range(100):
while True:
    # interact with the environment (here in batch mode)
    experiment.doInteractions(matrix_size)
    agent.learn()
    agent.reset()

    # and draw the table
    print(table.params.reshape(matrix_size, 2))
    # print(table.params.reshape(matrix_size, matrix_size))
    pylab.pcolor(table.params.reshape(matrix_size, 2).max(1).reshape(matrix_size, 1))
    # pylab.pcolor(table.params.reshape(matrix_size, matrix_size).max(1).reshape(matrix_size, 1))
    pylab.draw()
    pylab.ion()
    pylab.show()

print("training complete")

class RlOp(threading.Thread):
    episodes = 1
    epilen = 200

    def __init__(self, event_queue_name, hub_queue_name):
        super().__init__()

        # create environment
        self.conn = boto.sqs.connect_to_region(constants.REGION)
        self.event_queue = self.conn.get_queue(event_queue_name)
        self.event_queue.set_message_class(MHMessage)
        self.env = DogEnv(DogEnv.ALL_QUIET, DogEnv.ALL_QUIET, self.event_queue, hub_queue_name)
        self.env.delay = (self.episodes == 1)

        # create task
        self.task = QuietDogTask(self.env)

        # create value table and initialize with ones
        # TODO: Get number of states from DogEnv
        self.table = ActionValueTable(2 * 5 * 4, 5 * 4)
        self.table.initialize(1.)

        # create agent with controller and learner - use SARSA(), Q() or QLambda() here
        self.learner = SARSA()

        # standard exploration is e-greedy, but a different type can be chosen as well
        self.learner.explorer = BoltzmannExplorer()

        # create agent
        self.agent = DogAgent(self.table, self.learner)

        # create experiment
        self.experiment = Experiment(self.task, self.agent)

    def run(self):
        self.call_run()

    def call_run(self):
        print('RlOp: running')

        # prepare plotting
        pylab.gray()
        pylab.ion()

        for i in range(1000):
            # interact with the environment (here in batch mode)
            self.experiment.doInteractions(100)
            self.agent.learn()
            self.agent.reset()

            results0 = self.table.params.reshape(2, 4, 5, 20)[0]
            results1 = self.table.params.reshape(2, 4, 5, 20)[1]
            pp.pprint(results0.argmax(2))
            pp.pprint(results1.argmax(2))

            # and draw the table
            # ar = self.table.params.reshape(2, 5, 4, 5, 4)
            # for state1 in range(len(constants.SOUNDS)):
            #     for state2 in range(4):
            #         pylab.pcolor(ar[1][state1][state2])
            #         pylab.draw()

        results0 = self.table.params.reshape(2, 4, 5, 20)[0]
        results1 = self.table.params.reshape(2, 4, 5, 20)[1]
        while True:
            time.sleep(60)
            pp.pprint(results0.argmax(2))
            pp.pprint(results1.argmax(2))