def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 0, 0, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1]])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
def someEpisodes(game_env, net, discountFactor=0.99, maxSteps=100,
                 avgOver=1, returnEvents=False, exploretoo=True):
    """ Return the fitness value for one episode of play, given the policy defined by a neural network. """
    task = GameTask(game_env)
    game_env.recordingEnabled = True
    game_env.reset()
    net.reset()
    task.maxSteps = maxSteps
    agent = LearningAgent(net)
    agent.learning = False
    agent.logging = False
    exper = EpisodicExperiment(task, agent)
    fitness = 0
    for _ in range(avgOver):
        rs = exper.doEpisodes(1)
        # add a slight bonus for more exploration, if rewards are identical
        if exploretoo:
            fitness += len(set(game_env._allEvents)) * 1e-6
        # the true, discounted reward
        fitness += sum([sum([v * discountFactor ** step for step, v in enumerate(r)]) for r in rs])
    fitness /= avgOver
    if returnEvents:
        return fitness, game_env._allEvents
    else:
        return fitness
def initExperiment(alg, optimistic=True):
    env = Maze(envmatrix, (7, 7))

    # create task
    task = MDPMazeTask(env)

    # create value table and initialize it (optimistic: ones, otherwise zeros)
    table = ActionValueTable(81, 4)
    if optimistic:
        table.initialize(1.)
    else:
        table.initialize(0.)

    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    learner = alg()

    # standard exploration is e-greedy, but a different type can be chosen as well
    # learner.explorer = BoltzmannExplorer()

    agent = LearningAgent(table, learner)
    agent.batchMode = False

    experiment = Experiment(task, agent)
    experiment.allRewards = []
    return experiment
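# --- Usage sketch (not from the original source) ---
# A minimal, hypothetical driver for the experiment returned by initExperiment();
# the loop sizes below are arbitrary, and Q is just one of the learners the
# helper accepts (SARSA or QLambda would be wired in the same way).
experiment = initExperiment(Q, optimistic=True)
agent = experiment.agent
for episode in range(100):          # arbitrary number of learning rounds
    experiment.doInteractions(100)  # gather experience in the maze
    agent.learn()                   # update the value table from the history
    agent.reset()                   # clear the history before the next round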
def train():
    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # The task is the game this time
    task = environment

    # Make the reinforcement learning agent (use a network because inputs are continuous)
    network = ActionValueNetwork(task.nSenses, task.nActions)

    # Use NFQ, the network-based variant of Q-learning, to train the value network
    learner = NFQ()
    learner.gamma = GAMMA

    agent = LearningAgent(network, learner)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the Learner
    meanScores = []
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Iteration ", i, " With mean score ", task.meanScore, "Max block achieved ", environment.maxGameBlock
        meanScores.append(task.meanScore)
        agent.learn()
        agent.reset()

    params = {"learningEpochs": LEARNING_EPOCHS,
              "gamesPerEpoch": GAMES_PER_EPOCH,
              "gamma": GAMMA}

    return meanScores, params, agent
def run_bbox(verbose=False):
    n_features = n_actions = max_time = -1

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
        n_features = bbox.get_num_of_features()
        n_actions = bbox.get_num_of_actions()
        max_time = bbox.get_max_time()

    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print av_table._params

    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)

    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()

    bbox.finish(verbose=1)
def getAction(self):
    # choose an action with Boltzmann exploration or Q-learning
    if self.nextAction is None:
        action = LearningAgent.getAction(self)
        self.lastaction = action
        return action
    else:
        # supervisor's suggestion, with tolerance
        if self.tolerance is not None:
            if (self.expectedReward * (1 + self.tolerance)) > self.module.getActionValue(self.nextAction):
                action = self.nextAction
                self.lastaction = action
                self.nextAction = None
                return action
            else:
                # act independently
                action = LearningAgent.getAction(self)
                self.lastaction = action
                return action
        # supervisor's suggestion, without tolerance
        else:
            action = self.nextAction
            self.lastaction = action
            self.nextAction = None
            return action
def __init__(self, n_threads=4, initial_port=19997, q_table_version=0, batch_size=None,
             learner=None, explorer=None):
    self.barrier = Barrier(n_threads + 1, timeout=720)
    self.n_threads = n_threads
    self.initial_port = initial_port
    self.batch_size = batch_size
    self.controller = MyActionValueTable(q_table_version)
    if learner is None:
        self.learner = Q(0.5, 0.9)
    else:
        self.learner = learner
    if explorer is None:
        self.explorer = self.learner.explorer = EpsilonGreedyExplorer(0.2, 0.998)
    else:
        self.explorer = self.learner.explorer = explorer
    self.agent = LearningAgent(self.controller, self.learner)
    # Logger initialization
    self.logger = logging.getLogger('master_logger')
    self.logger.setLevel(logging.DEBUG)
    self.logger.addHandler(logging.FileHandler(Utils.DATA_PATH + 'learning-tables/master.log'))
    self.failed_simulations = []
    self.n_episodes = 0
    self.simulations = []
    self.initialize_simulations()
def q_learning_table():
    controller = ActionValueTable(36, 4)
    learner = Q()
    controller.initialize(1.)

    agent = LearningAgent(controller, learner)

    score_list = []
    turn_list = []
    # +100 extra runs to match the training budget of the neural-network version
    for i in range(600):
        print_state(agent.module.getValue, 'table')
        score, turn = play(agent, 'table')
        score_list.append(score)
        turn_list.append(turn)
        agent.learn()
        agent.reset()
        print i, int(numpy.mean(score_list)), max(score_list), score, turn

    with open('./agent.dump', 'w') as f:
        pickle.dump(agent, f)
    with open('./score.dump', 'w') as f:
        pickle.dump([score_list, turn_list], f)
def __init__(self, name, num_states, num_actions, epsilon=0.3, gamma=0.99, alpha=0.95):
    self.controller = ActionValueTable(num_states, num_actions)
    self.controller.initialize(np.random.rand(num_states * num_actions))
    self.learner = Q(gamma=gamma, alpha=alpha)
    self.learner.batchMode = False
    self.learner.explorer.epsilon = epsilon
    LearningAgent.__init__(self, self.controller, self.learner)
    Agent.__init__(self, name)
def __init__(self, outdim, n_actions, random_state, rl_params):
    """ RL agent """
    module = SparseActionValueTable(n_actions, random_state)
    module.initialize(0.0)
    learner = EpisodeQ(alpha=rl_params.q_alpha,
                       w=rl_params.q_w,
                       gamma=rl_params.q_gamma)
    learner.explorer = EGreedyExplorer(random_state,
                                       epsilon=rl_params.exp_epsilon,
                                       decay=rl_params.exp_decay)
    LearningAgent.__init__(self, module, learner)
def learn(client):
    av_table = ActionValueNetwork(4, 1)
    learner = Reinforce()
    agent = LearningAgent(av_table, learner)
    env = CarEnvironment(client)
    task = CarTask(env)
    experiment = ContinuousExperiment(task, agent)
    while True:
        experiment.doInteractionsAndLearn(1)
        agent.learn()
class QAlgorithm:

    def Pause(self):  # if the menu says pause, pause execution
        while self.state == 1:
            time.sleep(.05)
        return True

    def Quit(self):  # if the menu says quit, stop running
        self.process.terminate()
        return False

    def Start(self):  # starts the bot
        if self.process == None:
            self.runBot()
            # self.process = multiprocessing.Process(target=self.runBot, args=[])
            # self.process.start()
        return True

    def CheckState(self):  # checks to see what state the menu says to be in
        if self.state == 0:
            self.Start()
        elif self.state == 1:
            self.Pause()
        elif self.state == 2:
            self.Quit()

    def GameOver(self):  # checks whether the state requires the bot to pause/quit, or if the game is over
        return self.CheckState() or self.sr.checkEndGame(self.endBox, self.gameOver)

    def __init__(self, rewardBox, box, gameOver, endGame, scoreArea):
        self.reward = rewardBox
        self.bbox = box
        self.environment = TEnviroment(box)  # custom environment class
        if os.path.isfile("bot.txt"):
            self.controller = pickle.load(open("bot.txt", "rb"))
        else:
            self.controller = ActionValueNetwork(50**2, 4)  # arguments: (framerate*maxPlaytime, number of actions)
        self.learner = Q()
        gf = {0: self.GameOver}
        self.agent = LearningAgent(self.controller, self.learner)
        self.task = TTask(self.environment, scoreArea, gf)  # needs custom task
        self.experiment = EpisodicExperiment(self.task, self.agent)
        self.process = None
        self.endBox = endGame

    def runBot(self):  # runs the bot for a single episode
        self.experiment.doEpisodes()
        self.agent.learn()
        self.agent.reset()
        file = open("bot.txt", "wb+")
        pickle.dump(self.controller, file)
def __init__(self, _id, module, learner=None):
    # attributes of this class
    self.id = _id
    self.horizontal_edge = lane.getEdgeID(trafficlights.getControlledLanes(self.id)[0])
    self.vertical_edge = lane.getEdgeID(trafficlights.getControlledLanes(str(_id))[2])

    # attributes of the parent class
    self.horizontalLoad = []
    self.verticalLoad = []
    self.averageHorizontal = []
    self.averageVertical = []
    self.nextAction = None
    self.expectedReward = None
    self.tolerance = None

    LearningAgent.__init__(self, module, learner)
def __init__(self, module, learner=None):
    '''
    Constructor
    '''
    LearningAgent.__init__(self, module, learner)
    self.__rules = []
    self.__states = {}
    self.__input = {}
    self.__buffer = {}
    # self.__rules.append(BackOffRule())
    self.__rules.append(BackOffRule2())
    self.__rules.append(LocomotionPrimitives())
    self.__states["driveBackStartTime"] = AgentMind.__driveBackStartTime
    self.__states["__lostTrackTurnStartTime"] = AgentMind.__lostTrackTurnStartTime
def learn(self, number_of_iterations):
    learner = Q(0.2, 0.8)
    task = CartMovingTask(self.environment)
    self.controller = ActionValueTable(
        reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)),
        self.force_granularity
    )
    self.controller.initialize(1.0)
    agent = LearningAgent(self.controller, learner)
    experiment = Experiment(task, agent)
    for i in range(number_of_iterations):
        experiment.doInteractions(1)
        agent.learn()
        agent.reset()
    with open("test.pcl", "w+") as f:
        pickle.dump(self.controller, f)
def setup_RL():
    # create the maze with walls (1)
    envmatrix = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    env = Maze(envmatrix, (7, 7))
    # create task
    task = MDPMazeTask(env)
    # create value table and initialize with zeros
    table = ActionValueTable(81, 4)
    table.initialize(0.)
    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    # learner = Q()
    learner = SARSA()
    # create agent
    agent = LearningAgent(table, learner)
    # create experiment
    experiment = Experiment(task, agent)
    return experiment, agent, table
def get_discrete_task_agent(generators, market, nStates, nOffer, markups, withholds,
                            maxSteps, learner, Pd0=None, Pd_min=0.0):
    """ Returns a tuple of task and agent for the given learner. """
    env = pyreto.discrete.MarketEnvironment(generators, market,
                                            numStates=nStates,
                                            numOffbids=nOffer,
                                            markups=markups,
                                            withholds=withholds,
                                            Pd0=Pd0, Pd_min=Pd_min)
    task = pyreto.discrete.ProfitTask(env, maxSteps=maxSteps)
    nActions = len(env._allActions)
    module = ActionValueTable(numStates=nStates, numActions=nActions)
    agent = LearningAgent(module, learner)
    return task, agent
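# --- Usage sketch (not from the original source) ---
# Hypothetical wiring of the (task, agent) pairs into a MarketExperiment, following
# the batch interact/learn/reset pattern used elsewhere in this collection; the
# market object, state/offer counts, markups, withholds and loop sizes are placeholders.
experiment = MarketExperiment([], [], market)
for g in generators:
    task, agent = get_discrete_task_agent([g], market, nStates=10, nOffer=1,
                                          markups=(0, 10, 20, 30), withholds=None,
                                          maxSteps=24, learner=Q(0.2, 0.99))
    experiment.tasks.append(task)
    experiment.agents.append(agent)
for _ in range(100):
    experiment.doInteractions(24)  # one simulated day per batch
    for agent in experiment.agents:
        agent.learn()
        agent.reset()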
def __init__(self):
    """ @brief: Setting up internal parameters for the RL module """
    # Navigation Task
    self._environment = NavigationEnvironment()
    self._task = NavigationTask(self._environment)

    # Number of States: (read from params.py)
    self._states = STATES
    self._state_limits = LIMITS
    # Total number of states:
    self._number_of_states = 1
    for i in self._states:
        self._number_of_states *= i

    # Number of actions
    self._actions = ACTION_STATES
    self._action_limits = ACTION_RANGE

    # Action Value Table directory
    self.tables_directory = os.path.dirname(__file__) + "/tables/"
    self.table_code = "S" + str(self._number_of_states) + "_" + "A" + str(self._actions)
    self._filename = FILENAME + self.table_code

    # Action Value Table setup
    self.load_AV_Table()

    # Declare ROS Service to store Action Value Table
    store_service = rospy.Service('store_table', StoreAVTable, self.store_cb)

    # Set up task parameters:
    self._task.set_params(COMMAND_DURATION, FUSION_WEIGHTS, TIME_GRANULARITY,
                          self._state_limits, MAX_REWARD, COST_THRESHOLD)

    # Agent set up
    self._learner = SARSA(alpha, gamma)
    self._learner._setExplorer(EpsilonGreedyExplorer(epsilon))
    self._agent = LearningAgent(self._av_table, self._learner)

    # Experiment set up
    self._experiment = Experiment(self._task, self._agent)
    self._experiment.set_params(STEP_SIZE)

    # Start print table thread
    if VISUALIZATION is True:
        try:
            # thread.start_new_thread(self.print_table, ())
            self.visualization_thread = Thread(target=self.print_table, args=())
            self.visualization_thread.start()
        except:
            print "Failed to start visualization thread!"

    print "Successfully initialized the RL module! (kappa)"
def maze():
    # import sys, time
    pylab.gray()
    pylab.ion()
    # The goal appears to be in the upper right
    structure = [
        "!!!!!!!!!!",
        "! ! ! ! !",
        "! !! ! ! !",
        "! ! !",
        "! !!!!!! !",
        "! ! ! !",
        "! ! !!!! !",
        "! !",
        "! !!!!! !",
        "! ! !",
        "!!!!!!!!!!",
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(100):
        experiment.doInteractions(100)
        agent.learn()
        agent.reset()
        # 4 actions, 81 locations/states (9x9 grid)
        # max(1) gives/plots the biggest objective function value for that square
        pylab.pcolor(controller.params.reshape(81, 4).max(1).reshape(9, 9))
        pylab.draw()

    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
def build(self, direction, x, y):
    new_tako = tako.Tako(direction, x, y, self)
    for gen in range(len(self.strand_1)):
        self.strand_1[gen].read(self.strand_2[gen], new_tako)
    # take care of net & make agent
    new_tako.net.sortModules()
    learner = ENAC()
    new_tako.agent = LearningAgent(new_tako.net, learner)
    return new_tako
def __init__(self, x, y, brain, learner, env):
    LearningAgent.__init__(self, brain.net, learner)
    self.cellType = 3
    self.brain = brain
    self.module = brain.net
    self.learner = learner
    self.env = env
    self.color = cell.BLACK
    self.x = x
    self.y = y
    self.num_interactions = 0
    self.age = 0
    self.colddown = 0
    self.speed = self.Speeds[0]
    self.energy = self.MaxEnergy
    self.food_sensor = 0
    self.hunger_sensor = 0
    self.target = [-1, -1]
def run():
    """
    number of states is:
        current value: 0-20
    number of actions:
        Stand=0, Hit=1
    """
    # define action value table
    av_table = ActionValueTable(MAX_VAL, MIN_VAL)
    av_table.initialize(0.)

    # define Q-learning agent
    q_learner = Q(Q_ALPHA, Q_GAMMA)
    q_learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, q_learner)

    # define the environment
    env = BlackjackEnv()

    # define the task
    task = BlackjackTask(env, verbosity=VERBOSE)

    # finally, define experiment
    experiment = Experiment(task, agent)

    # ready to go, start the process
    for _ in range(NB_ITERATION):
        experiment.doInteractions(1)
        if task.lastreward != 0:
            if VERBOSE:
                print "Agent learn"
            agent.learn()

    print '|First State|Choice 0 (Stand)|Choice 1 (Hit)|Relative value of Standing over Hitting|'
    print '|:-------:|:-------|:-----|:-----|'
    for i in range(MAX_VAL):
        print '| %s | %s | %s | %s |' % (
            (i + 1),
            av_table.getActionValues(i)[0],
            av_table.getActionValues(i)[1],
            av_table.getActionValues(i)[0] - av_table.getActionValues(i)[1]
        )
class RL:

    def __init__(self):
        self.av_table = ActionValueTable(4, 5)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        rassh.core.constants.rl_params = self.av_table.params.reshape(4, 5)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()
class RL:

    def __init__(self):
        self.av_table = ActionValueTable(2, 3)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        kippo.core.constants.rl_params = self.av_table.params.reshape(2, 3)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()
def createAgent(module):
    # create agent with controller and learner - use SARSA(), Q() or QLambda() here
    ## alpha -- learning rate (preference of new information)
    ## gamma -- discount factor (importance of future reward)
    # learner = Q(0.5, 0.99)
    learner = SARSA(0.5, 0.99)
    # learner = QLambda(0.5, 0.99, 0.9)
    agent = LearningAgent(module, learner)
    return agent
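# --- Usage sketch (not from the original source) ---
# Hypothetical use of createAgent() with the table-based maze setup that appears
# elsewhere in this collection; envmatrix and the loop sizes are placeholders.
table = ActionValueTable(81, 4)
table.initialize(0.)
agent = createAgent(table)
task = MDPMazeTask(Maze(envmatrix, (7, 7)))
experiment = Experiment(task, agent)
for _ in range(50):
    experiment.doInteractions(100)
    agent.learn()
    agent.reset()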
def explore_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list("!!!!!!!!!!"),
        list("! ! ! ! !"),
        list("! !! ! ! !"),
        list("! ! !"),
        list("! !!!!!! !"),
        list("! ! ! !"),
        list("! ! !!!! !"),
        list("! !"),
        list("! !!!!! !"),
        list("! ! !"),
        list("!!!!!!!!!!"),
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print("Maze map:")
    print("\n".join("".join(row) for row in maze))
    print("Greedy policy:")
    print("\n".join("".join(row) for row in greedy_policy))
    assert "\n".join("".join(row) for row in greedy_policy) == "NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN"
def main():
    # if os.path.exists('./agent.dump'):
    #     with open('./agent.dump') as f:
    #         agent = pickle.load(f)
    # else:
    controller = ActionValueNetwork(9, 4)
    learner = NFQ()
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        score = play(agent)
        score_list.append(score)

        # At this point a TypeError was raised:
        #   "only length-1 arrays can be converted to Python scalars"
        #   in pybrain/rl/learners/valuebased/q.py
        # => switching the learner from Q to NFQ fixed it, see
        #    http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        # agent.learn()
        agent.reset()

        # data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 2], [0, 0, 0, 2]]
        data = [[0, 0, 2], [0, 0, 0], [0, 0, 2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print i, int(numpy.mean(score_list)), max(score_list), move

        with open('./agent.dump', 'w') as f:
            pickle.dump(agent, f)
        with open('./score.dump', 'w') as f:
            pickle.dump(score_list, f)
def main():
    # Storing every state of 2048 in a table is hopeless:
    # there are on the order of 14^16 possible states.
    # controller = ActionValueTable(16, 4)
    # learner = Q()
    # controller.initialize(1.)

    controller = ActionValueNetwork(16, 4)
    learner = NFQ()
    # learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    score_list = []
    for i in range(10000):
        # if os.path.exists('./agent.dump'):
        #     with open('./agent.dump') as f:
        #         agent = pickle.load(f)

        print i, 'playing ...'
        score = play(agent)
        score_list.append(score)

        # A TypeError ("only length-1 arrays can be converted to Python scalars")
        # was raised in pybrain/rl/learners/valuebased/q.py when using Q here;
        # switching the learner from Q to NFQ fixed it, see
        # http://stackoverflow.com/questions/23755927/pybrain-training-a-actionvaluenetwork-doesnt-properly-work
        print i, 'learning ...'
        agent.learn()
        agent.reset()

        print i, 'evaluate sample ...'
        data = [[0, 0, 0, 0],
                [0, 0, 0, 0],
                [0, 0, 0, 2],
                [0, 0, 0, 2]]
        agent.integrateObservation(numpy.array(data).ravel())
        move = agent.getAction()
        print "  ", i, int(numpy.mean(score_list)), max(score_list), move

        if i % 20 == 0:
            print i, 'saving ...'
            with open('./agent.dump', 'w') as f:
                pickle.dump(agent, f)
            with open('./score.dump', 'w') as f:
                pickle.dump(score_list, f)
def createAgent(module):
    ### create agent with controller and learner - use SARSA(), Q() or QLambda() here
    ## alpha -- learning rate (preference of new information -- update value factor)
    ## gamma -- discount factor (importance of future reward -- next value factor)
    learner = Q(0.2, 0.99)
    # learner = SARSA(0.2, 0.99)
    ## learner = QLambda(0.5, 0.99, 0.9)

    explorer = learner.explorer
    explorer.decay = 1.0

    agent = LearningAgent(module, learner)
    return agent
def someEpisodes(game_env, net, discountFactor=0.99, maxSteps=100, avgOver=1, returnEvents=False):
    """ Return the fitness value for one episode of play, given the policy defined by a neural network. """
    task = GameTask(game_env)
    game_env.recordingEnabled = True
    game_env.reset()
    net.reset()
    task.maxSteps = maxSteps
    agent = LearningAgent(net)
    agent.learning = False
    agent.logging = False
    exper = EpisodicExperiment(task, agent)
    fitness = 0
    for _ in range(avgOver):
        rs = exper.doEpisodes(1)
        # add a slight bonus for more exploration, if rewards are identical
        fitness += len(set(game_env._allEvents)) * 1e-6
        # the true, discounted reward
        fitness += sum([sum([v * discountFactor ** step for step, v in enumerate(r)]) for r in rs])
    fitness /= avgOver
    if returnEvents:
        return fitness, game_env._allEvents
    else:
        return fitness
class PlayYourCardsRight(Feature):

    def __init__(self, text_to_speech, speech_to_text):
        Feature.__init__(self)

        # setup AV Table
        self.av_table = GameTable(13, 2)
        if (self.av_table.loadParameters() == False):
            self.av_table.initialize(0.)

        # setup a Q-Learning agent
        learner = Q(0.5, 0.0)
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)

        # setup game interaction
        self.game_interaction = GameInteraction(text_to_speech, speech_to_text)

        # setup environment
        environment = GameEnvironment(self.game_interaction)

        # setup task
        task = GameTask(environment, self.game_interaction)

        # setup experiment
        self.experiment = Experiment(task, self.agent)

    @property
    def is_speaking(self):
        return self.game_interaction.is_speaking

    def _thread(self, args):
        # let's play our cards right!
        while not self.is_stop:
            self.experiment.doInteractions(1)
            self.agent.learn()
            self.av_table.saveParameters()
def initExperiment(learnalg='Q', history=None, binEdges='10s',
                   scriptfile='./rlRunExperiment_v2.pl',
                   resetscript='./rlResetExperiment.pl'):

    if binEdges == '10s':
        centerBinEdges = centerBinEdges_10s
    elif binEdges == '30s':
        centerBinEdges = centerBinEdges_30s
    elif binEdges == 'lessperturbed':
        centerBinEdges = centerBinEdges_10s_lessperturbed
    elif binEdges is None:
        centerBinEdges = None
    else:
        raise Exception("No bins for given binEdges setting")

    env = OmnetEnvironment(centerBinEdges, scriptfile, resetscript)
    if history is not None:
        env.data = history['data']

    task = OmnetTask(env, centerBinEdges)
    if history is not None:
        task.allrewards = history['rewards']

    if learnalg == 'Q':
        nstates = env.numSensorBins ** env.numSensors
        if history is None:
            av_table = ActionValueTable(nstates, env.numActions)
            av_table.initialize(1.)
        else:
            av_table = history['av_table']
        learner = Q(0.1, 0.9)  # alpha, gamma
        learner._setExplorer(EpsilonGreedyExplorer(0.05))  # epsilon
    elif learnalg == 'NFQ':
        av_table = ActionValueNetwork(env.numSensors, env.numActions)
        learner = NFQ()
    else:
        raise Exception("learnalg unknown")

    agent = LearningAgent(av_table, learner)

    experiment = Experiment(task, agent)
    if history is None:
        experiment.nruns = 0
    else:
        experiment.nruns = history['nruns']
    return experiment
def testValueBased(self):
    """ Test value-based learner. """
    mkt = SmartMarket(self.case)
    exp = MarketExperiment([], [], mkt)
    for g in self.case.generators:
        env = DiscreteMarketEnvironment([g], mkt)
        dim_state, num_actions = (10, 10)
        exp.tasks.append(ProfitTask(env, dim_state, num_actions))
        module = ActionValueTable(dim_state, num_actions)
        module.initialize(1.0)
        # module = ActionValueNetwork(dimState=1, numActions=4)
        learner = SARSA()  # Q() QLambda()
        # learner.explorer = BoltzmannExplorer()  # default is e-greedy.
        exp.agents.append(LearningAgent(module, learner))

    for _ in range(1000):
        exp.doInteractions(24)  # interact with the env in batch mode
        for agent in exp.agents:
            agent.learn()
            agent.reset()
def get_continuous_task_agent(generators, market, nOffer, maxMarkup, maxWithhold,
                              maxSteps, learner):
    env = pyreto.continuous.MarketEnvironment(generators, market, nOffer,
                                              maxMarkup, maxWithhold)
    task = pyreto.continuous.ProfitTask(env, maxSteps=maxSteps)
    net = buildNetwork(env.outdim,
                       # 4,
                       env.indim,
                       bias=False,
                       # outputbias=True,
                       # hiddenclass=TanhLayer,
                       # outclass=TanhLayer
                       )
    # net._setParameters(([0.0]))
    agent = LearningAgent(net, learner)
    # agent.name = generators[0].name
    return task, agent
def main():
    rospy.init_node("lauron_reinforcement_learning")
    environment = RLEnvironment()
    dim_state = environment.joint_states.shape[0]
    num_actions = len(environment.actions)
    controller = ActionValueNetwork(dim_state, num_actions)
    learner = SARSA()
    agent = LearningAgent(controller, learner)
    task = RLTask(environment)
    experiment = Experiment(task, agent)

    episode_counter = 0
    while True:
        print("Training episode {}".format(episode_counter))
        experiment.doInteractions(NUM_INTERACTIONS)
        agent.learn()
        agent.reset()
        episode_counter += 1
def main():
    vrep.simxFinish(-1)  # just in case, close all opened connections
    client_id = vrep.simxStart('127.0.0.1', 19997, True, True, 5000, 5)  # Connect to V-REP

    if client_id < 0:
        print('Failed connecting to remote API server')
        return -1

    print('Connected to remote API server')

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = ActionValueTable(task.get_state_space_size(), task.get_action_space_size())
    controller.initialize(1.)
    file = open('standing-up-q.pkl', 'rb')
    controller._params = pickle.load(file)
    file.close()
    # learner = Q()
    agent = LearningAgent(controller)
    experiment = EpisodicExperiment(task, agent)

    i = 0
    while True:
        i += 1
        print('Iteration n° ' + str(i))
        experiment.doEpisodes(1)

    vrep.simxFinish(client_id)
def main():
    client_id = Utils.connectToVREP()

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = MyActionValueTable()
    learner = Q(0.5, 0.9)
    learner.explorer = EpsilonGreedyExplorer(0.15, 1)  # EpsilonGreedyBoltzmannExplorer()
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)

    controller.initialize(agent)

    i = 0
    try:
        while True:
            i += 1
            print('Episode ' + str(i))
            experiment.doEpisodes()
            agent.learn()
            agent.reset()
            print('mean: ' + str(numpy.mean(controller.params)))
            print('max: ' + str(numpy.max(controller.params)))
            print('min: ' + str(numpy.min(controller.params)))

            if i % 500 == 0:  # Save q-table every 500 episodes
                print('Save q-table')
                controller.save()
                task.t_table.save()

    except (KeyboardInterrupt, SystemExit):
        with open('../data/standing-up-q.pkl', 'wb') as handle:
            pickle.dump(controller.params, handle)
        task.t_table.save()
        controller.save()

    vrep.simxFinish(client_id)
""" Author: Jeremy M. Stober Program: NFQ_EXAMPLE.PY Date: Thursday, March 1 2012 Description: Test NFQ on my cartpole simulation. """ from pybrain.rl.agents import LearningAgent from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork from cartpole import CartPole import numpy as np module = ActionValueNetwork(4,2) learner = NFQ() learner.explorer.epsilon = 0.4 agent = LearningAgent(module, learner) env = CartPole() cnt = 0 for i in range(1000): env.reset() print "Episode: %d, Count: %d" % (i,cnt) cnt = 0 while not env.failure(): agent.integrateObservation(env.observation()) action = agent.getAction() pstate, paction, reward, state = env.move(action) cnt += 1 agent.giveReward(reward) agent.learn(1)
nb = len([bus for bus in case.buses if bus.type == pylon.PQ])
ng = len([g for g in case.online_generators if g.bus.type != pylon.REFERENCE])
net = buildNetwork(nb, ng, bias=False)

# Create an agent and select an episodic learner.
#learner = Reinforce()
learner = ENAC()
#learner.gd.rprop = True
## only relevant for RP
#learner.gd.deltamin = 0.0001
##agent.learner.gd.deltanull = 0.05
## only relevant for BP
#learner.gd.alpha = 0.01
#learner.gd.momentum = 0.9

agent = LearningAgent(net, learner)

# Adjust some parameters of the NormalExplorer.
sigma = [50.0] * ng
learner.explorer.sigma = sigma
#learner.explorer.epsilon = 0.01  # default: 0.3
#learner.learningRate = 0.01  # (0.1-0.001, down to 1e-7 for RNNs)

# Alternatively, use blackbox optimisation.
#learner = HillClimber(storeAllEvaluations=True)
##learner = CMAES(storeAllEvaluations=True)
##learner = FEM(storeAllEvaluations=True)
##learner = ExactNES(storeAllEvaluations=True)
##learner = PGPE(storeAllEvaluations=True)
#agent = OptimizationAgent(net, learner)
# define action-value table
# number of states is:
#    current value: 1-21
# number of actions:
#    Stand=0, Hit=1
av_table = ActionValueTable(21, 2)
av_table.initialize(0.)

# define Q-learning agent
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# define the environment
env = BlackjackEnv()

# define the task
task = BlackjackTask(env)

# finally, define experiment
experiment = Experiment(task, agent)

# ready to go, start the process
while True:
    experiment.doInteractions(1)
    agent.learn()
    agent.reset()
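# --- Inspection sketch (not from the original source) ---
# Hypothetical read-out of the learned table once the loop above is stopped
# (e.g. bounded to a fixed number of interactions); the formatting is illustrative.
for state in range(21):
    stand, hit = av_table.getActionValues(state)
    print 'hand value %2d: Q(stand)=%.3f  Q(hit)=%.3f' % (state + 1, stand, hit)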
from ghost import Ghost
from pacmanEnvironment import Environment

###############################################################
# The main function that begins running our Pacman-In-AI game #
###############################################################
if __name__ == "__main__":

    # Initialize our Action-Environment-Reward Table
    controller = ActionValueTable(196, 4)
    controller.initialize(0.)

    # Initialize Reinforcement Learning
    learner = Q(0.5, 0.0)
    learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    # Setup the PyBrain and PyGame Environments
    environment = Environment()
    game = RunPacman(environment)

    # Create the Task for the Pac-Man Agent to Accomplish and initialize the first Action
    task = PacmanTask(environment, game)
    task.performAction(np.array([1]))

    # The Experiment is the PyBrain link between the task to be completed and the agent completing it
    experiment = Experiment(task, agent)

    currentGame = 1

    # Continue to loop program until the 'X' on the GUI is clicked
    while True:
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    seed = parameters["seed"]
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)

    render = False
    plot = False

    plt.ion()

    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"], desiredValue=None)
    #print "dim: ", task.indim, task.outdim

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)

    # create agent with controller and learner (and its options)
    # % of random actions
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]

    agent = OptimizationAgent(module, PGPE(storeAllEvaluations=True,
                                           storeAllEvaluated=False,
                                           maxEvaluations=None,
                                           desiredEvaluation=1,
                                           verbose=False))
    # print agent
    # from pprint import pprint
    # pprint(vars(agent.learner))

    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []

    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]
    for episode in range(0, m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        #agent.learn(1)
        #renderer.drawPlot()

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True

        if (episode) % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            #experiment.agent = testagent
            #r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            #for i in range(0, parameters["TestWith"]):
            #    y = testexperiment.doEpisodes(1)
            #    print (agent.learner._allEvaluated)
            # from pprint import pprint
            # pprint(vars(task))

            l = parameters["TestWith"]

            task.N = parameters["MaxRunsPerEpisodeTest"]
            experiment.doEpisodes(l)
            task.N = parameters["MaxRunsPerEpisode"]

            resList = (agent.learner._allEvaluations)[-l:-1]
            # print agent.learner._allEvaluations

            from scipy import array
            rLen = len(resList)
            avReward = array(resList).sum() / rLen
            # print avReward
            # print resList
            # exit(0)

            # print("Parameters:", agent.learner._bestFound())
            # print(
            #     " Evaluation:", episode,
            #     " BestReward:", agent.learner.bestEvaluation,
            #     " AverageReward:", avReward)
            # if agent.learner.bestEvaluation == 0:
            #     # print resList[-20:-1]
            #     print "done"
            #     break

            performance.append(avReward)

            env.delay = False
            testagent.reset()
            #experiment.agent = agent

            # performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)

            # print "reward avg", r
            # print "explorer epsilon", learner.explorer.epsilon
            # print "num episodes", agent.history.getNumSequences()
            # print "update step", len(performance)

    # print "done"
    return performance

    #print "network", json.dumps(module.bn.net.E, indent=2)

#import sumatra.parameters as p
#import sys
#parameter_file = sys.argv[1]
#parameters = p.SimpleParameterSet(parameter_file)
#
#run(["BalanceTask", parameters])
################################################################################
### main

if __name__ == '__main__':
    world = WorldInteraction()

    predTable = ActionValueTable(PredatorInteraction.NSTATES,
                                 len(PredatorInteraction.ACTIONS))
    predTable.initialize(0.)

    predLearner = Q(ALPHA, GAMMA)
    predLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
    predAgent = LearningAgent(predTable, predLearner)

    predEnv = PredatorEnvironment(world)
    predTask = PredatorTask(predEnv)
    predExp = Experiment(predTask, predAgent)

    try:
        for t in xrange(MAX_TIME):
            print 't = %d' % t
            world.t = t
            predExp.doInteractions(1)
            predAgent.learn()
            print 'Colors vs. Q-table:'
            table_print(predTable._params, PredatorInteraction.NSTATES)
            print
agents = []
tasks = []
for g in bus1.generators:
    """ Create an environment for each agent with an asset and a market. """
    env = ParticipantEnvironment(g, mkt, n_offbids=2)

    """ Create a task for the agent to achieve. """
    task = ProfitTask(env)

    """ Build an artificial neural network for the agent. """
    net = buildNetwork(task.outdim, task.indim, bias=False, outputbias=False)
    # net._setParameters(array([9]))

    """ Create a learning agent with a learning algorithm. """
    agent = LearningAgent(module=net, learner=ENAC())

    """ Initialize parameters (variance). """
    # agent.setSigma([-1.5])

    """ Set learning options. """
    agent.learner.alpha = 2.0
    # agent.learner.rprop = True
    agent.actaspg = False
    # agent.disableLearning()

    agents.append(agent)
    tasks.append(task)

""" The Experiment will coordinate the interaction of the given agents and
their associated tasks. """
experiment = MarketExperiment(tasks, agents, mkt)
experiment.setRenderer(ExperimentRenderer())
plt.ion()

env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

module = ActionValueNetwork(4, 3)

task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4

agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)

def plotPerformance(values, fig):
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
    # Without the next line, the pyplot plot won't actually show up.
    plt.pause(0.001)

performance = []
if not render:
    pf_fig = plt.figure()