def __init__(self, n_threads=4, initial_port=19997, q_table_version=0, batch_size=None, learner=None, explorer=None):
    self.barrier = Barrier(n_threads + 1, timeout=720)
    self.n_threads = n_threads
    self.initial_port = initial_port
    self.batch_size = batch_size
    self.controller = MyActionValueTable(q_table_version)
    if learner is None:
        self.learner = Q(0.5, 0.9)
    else:
        self.learner = learner
    if explorer is None:
        self.explorer = self.learner.explorer = EpsilonGreedyExplorer(0.2, 0.998)
    else:
        self.explorer = self.learner.explorer = explorer
    self.agent = LearningAgent(self.controller, self.learner)

    # Logger initialization
    self.logger = logging.getLogger('master_logger')
    self.logger.setLevel(logging.DEBUG)
    self.logger.addHandler(logging.FileHandler(Utils.DATA_PATH + 'learning-tables/master.log'))

    self.failed_simulations = []
    self.n_episodes = 0
    self.simulations = []
    self.initialize_simulations()
def q_learning_table():
    controller = ActionValueTable(36, 4)
    learner = Q()
    controller.initialize(1.)
    agent = LearningAgent(controller, learner)

    score_list = []
    turn_list = []
    # +100 iterations to match the training budget of the neural-network version
    for i in range(600):
        print_state(agent.module.getValue, 'table')
        score, turn = play(agent, 'table')
        score_list.append(score)
        turn_list.append(turn)
        agent.learn()
        agent.reset()
        print i, int(numpy.mean(score_list)), max(score_list), score, turn

    with open('./agent.dump', 'w') as f:
        pickle.dump(agent, f)
    with open('./score.dump', 'w') as f:
        pickle.dump([score_list, turn_list], f)
def createAgent(module):
    ### create agent with controller and learner - use SARSA(), Q() or QLambda() here
    ## alpha -- learning rate (preference of new information -- update value factor)
    ## gamma -- discount factor (importance of future reward -- next value factor)
    learner = Q(0.2, 0.99)
    # learner = SARSA(0.2, 0.99)
    ## learner = QLambda(0.5, 0.99, 0.9)
    explorer = learner.explorer
    explorer.decay = 1.0
    agent = LearningAgent(module, learner)
    return agent
def __init__(self, name, num_states, num_actions, epsilon=0.3, gamma=0.99, alpha=0.95):
    self.controller = ActionValueTable(num_states, num_actions)
    self.controller.initialize(np.random.rand(num_states * num_actions))
    self.learner = Q(gamma=gamma, alpha=alpha)
    self.learner.batchMode = False
    self.learner.explorer.epsilon = epsilon
    LearningAgent.__init__(self, self.controller, self.learner)
    Agent.__init__(self, name)
def learn(self, number_of_iterations):
    learner = Q(0.2, 0.8)
    task = CartMovingTask(self.environment)
    self.controller = ActionValueTable(
        reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)),
        self.force_granularity)
    self.controller.initialize(1.)
    agent = LearningAgent(self.controller, learner)
    experiment = Experiment(task, agent)
    for i in range(number_of_iterations):
        experiment.doInteractions(1)
        agent.learn()
        agent.reset()
    with open("test.pcl", "w+") as f:
        pickle.dump(self.controller, f)
def initExperiment(learnalg='Q', history=None, binEdges='10s',
                   scriptfile='./rlRunExperiment_v2.pl',
                   resetscript='./rlResetExperiment.pl'):
    if binEdges == '10s':
        centerBinEdges = centerBinEdges_10s
    elif binEdges == '30s':
        centerBinEdges = centerBinEdges_30s
    elif binEdges == 'lessperturbed':
        centerBinEdges = centerBinEdges_10s_lessperturbed
    elif binEdges is None:
        centerBinEdges = None
    else:
        raise Exception("No bins for given binEdges setting")

    env = OmnetEnvironment(centerBinEdges, scriptfile, resetscript)
    if history is not None:
        env.data = history['data']

    task = OmnetTask(env, centerBinEdges)
    if history is not None:
        task.allrewards = history['rewards']

    if learnalg == 'Q':
        nstates = env.numSensorBins ** env.numSensors
        if history is None:
            av_table = ActionValueTable(nstates, env.numActions)
            av_table.initialize(1.)
        else:
            av_table = history['av_table']
        learner = Q(0.1, 0.9)  # alpha, gamma
        learner._setExplorer(EpsilonGreedyExplorer(0.05))  # epsilon
    elif learnalg == 'NFQ':
        av_table = ActionValueNetwork(env.numSensors, env.numActions)
        learner = NFQ()
    else:
        raise Exception("learnalg unknown")

    agent = LearningAgent(av_table, learner)

    experiment = Experiment(task, agent)
    if history is None:
        experiment.nruns = 0
    else:
        experiment.nruns = history['nruns']
    return experiment
def __init__(self, name, clientID, sensorHandle, bodyHandle):
    '''
    Constructor
    '''
    self.resetParameters()
    controller = ActionValueTable(150, 5)  # pyBrain
    controller.initialize(1.)              # pyBrain
    learner = Q()                          # pyBrain
    self.__mind = AgentMind(controller, learner)  # with pyBrain
    self.__controller = controller
    self.__name = name
    self.__clientID = clientID          # Client ID of the Dummy object
    self.__sensorHandle = sensorHandle  # Proximity sensor handle of the V-Rep agent
    self.__bodyHandle = bodyHandle      # BubbleRob body handle
    self.__mind.setInput("name", name)
    self.__pybrainEnvironment = LocomotionEnvironment()
    self.__pybrainTask = LocomotionTask(self.__pybrainEnvironment)
def maze():
    # import sys, time
    pylab.gray()
    pylab.ion()
    # The goal appears to be in the upper right
    structure = [
        '!!!!!!!!!!',
        '! ! ! ! !',
        '! !! ! ! !',
        '! ! !',
        '! !!!!!! !',
        '! ! ! !',
        '! ! !!!! !',
        '! !',
        '! !!!!! !',
        '! ! !',
        '!!!!!!!!!!',
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(100):
        experiment.doInteractions(100)
        agent.learn()
        agent.reset()
        # 4 actions, 81 locations/states (9x9 grid)
        # max(1) gives/plots the biggest objective function value for that square
        pylab.pcolor(controller.params.reshape(81, 4).max(1).reshape(9, 9))
        pylab.draw()

    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list('!!!!!!!!!!'),
        list('! ! ! ! !'),
        list('! !! ! ! !'),
        list('! ! !'),
        list('! !!!!!! !'),
        list('! ! ! !'),
        list('! ! !!!! !'),
        list('! !'),
        list('! !!!!! !'),
        list('! ! !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(
        ''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
def run():
    """
    number of states:
        current value: 0-20
    number of actions:
        Stand=0, Hit=1
    """
    # define action value table
    av_table = ActionValueTable(MAX_VAL, MIN_VAL)
    av_table.initialize(0.)

    # define Q-learning agent
    q_learner = Q(Q_ALPHA, Q_GAMMA)
    q_learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, q_learner)

    # define the environment
    env = BlackjackEnv()

    # define the task
    task = BlackjackTask(env, verbosity=VERBOSE)

    # finally, define experiment
    experiment = Experiment(task, agent)

    # ready to go, start the process
    for _ in range(NB_ITERATION):
        experiment.doInteractions(1)
        if task.lastreward != 0:
            if VERBOSE:
                print "Agent learn"
            agent.learn()

    print '|First State|Choice 0 (Stand)|Choice 1 (Hit)|Relative value of Standing over Hitting|'
    print '|:-------:|:-------|:-----|:-----|'
    for i in range(MAX_VAL):
        print '| %s | %s | %s | %s |' % (
            (i + 1),
            av_table.getActionValues(i)[0],
            av_table.getActionValues(i)[1],
            av_table.getActionValues(i)[0] - av_table.getActionValues(i)[1])
def main():
    client_id = Utils.connectToVREP()

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = MyActionValueTable()
    learner = Q(0.5, 0.9)
    learner.explorer = EpsilonGreedyExplorer(0.15, 1)  # EpsilonGreedyBoltzmannExplorer()
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)
    controller.initialize(agent)

    i = 0
    try:
        while True:
            i += 1
            print('Episode ' + str(i))
            experiment.doEpisodes()
            agent.learn()
            agent.reset()
            print('mean: ' + str(numpy.mean(controller.params)))
            print('max: ' + str(numpy.max(controller.params)))
            print('min: ' + str(numpy.min(controller.params)))

            if i % 500 == 0:  # Save q-table every 500 episodes
                print('Save q-table')
                controller.save()
                task.t_table.save()

    except (KeyboardInterrupt, SystemExit):
        with open('../data/standing-up-q.pkl', 'wb') as handle:
            pickle.dump(controller.params, handle)
        task.t_table.save()
        controller.save()
        vrep.simxFinish(client_id)
def __init__(self, text_to_speech, speech_to_text):
    Feature.__init__(self)

    # setup AV Table
    self.av_table = GameTable(13, 2)
    if (self.av_table.loadParameters() == False):
        self.av_table.initialize(0.)

    # setup a Q-Learning agent
    learner = Q(0.5, 0.0)
    learner._setExplorer(EpsilonGreedyExplorer(0.0))
    self.agent = LearningAgent(self.av_table, learner)

    # setup game interaction
    self.game_interaction = GameInteraction(text_to_speech, speech_to_text)

    # setup environment
    environment = GameEnvironment(self.game_interaction)

    # setup task
    task = GameTask(environment, self.game_interaction)

    # setup experiment
    self.experiment = Experiment(task, self.agent)
task.max_samples = 500
actions = len(environment.actions)

actionValueNetwork = ActionValueTable(task.outdim, task.indim)
actionValueNetwork.stdParams = 0.0001
actionValueNetwork.randomize()
# actionValueNetwork = ActionValueNetwork(task.outdim, task.indim)
# if os.path.isfile("q/q_train.npy"):
#     actionValueNetwork.param = np.load("q/q_train.npy")
# else:
actionValueNetwork.initialize(0.0001)
# if os.path.isfile("nfq.xml"):
actionValueNetwork.network = NetworkReader.readFrom('nfq.xml')

pylab.pcolor(actionValueNetwork.params.reshape(32, actions).max(1).reshape(8, 4).T)
pylab.pause(0.01)

learner = Q()
agent = LearningAgent(actionValueNetwork, learner)
experiment = Experiment(task, agent)

start = time()
i = 0
while True:
    for state in range(control.get_randomize_states()):
        control.randomize(state)
        task.reset()
        print("run %d" % i)
        experiment.doInteractions(1000)
states = 165   # Has to match class Env(Environment) - outdim in environment_01.py
actions = 2    # Has to match class Env(Environment) - indim in environment_01.py

try:
    # open action value table from .csv file
    arr = np.loadtxt('/home/pi/Desktop/ray_bot/ray_bot2.csv', delimiter=';')
except Exception as e:
    # print e
    # if the file does not exist (i.e. on the first run), create and initialize it with zeros
    arr = np.zeros((states, actions))

av_table = ActionValueTable(states, actions)
av_table.initialize(arr.flatten())

# define Q-learning agent
learner = Q(0.1, 0.5)
learner._setExplorer(EpsilonGreedyExplorer(0.5))
agent = LearningAgent(av_table, learner)

# define the environment
env = Env()

# define the task
task = Task(env)

# define experiment
experiment = Experiment(task, agent)

# ready to go, start the process
# while PIR_sensing(PIR)==1 and ultrasonic(ECHO, TRIG)==1:
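# --- Hedged continuation sketch (not part of the original snippet) ---
# The example above stops at the commented-out sensor loop. Assuming the same
# interaction pattern used by the other PyBrain examples in this collection,
# a minimal training loop that also persists the Q-table back to the .csv file
# loaded above might look like this (loop count and save interval are assumptions):
for _ in range(100):
    experiment.doInteractions(100)
    agent.learn()
    agent.reset()
    # persist the learned action values so the next run can resume from them
    np.savetxt('/home/pi/Desktop/ray_bot/ray_bot2.csv',
               av_table.params.reshape(states, actions), delimiter=';')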
if __name__ == "__main__":
    # testing the environment and task
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.learners import Q
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.explorers import EpsilonGreedyExplorer

    env = Chain()
    controller = ActionValueTable(env.outdim, env.indim)
    controller.initialize(1.)
    # controller.initialize(0.)

    # learner = Q(0.5, 0.8)  # alpha 0.5, gamma 0.8
    learner = Q()  # default alpha 0.5, gamma 0.99
    # learner._setExplorer(EpsilonGreedyExplorer(0.5))
    agent = LearningAgent(controller, learner)

    task = ChainTask(env)
    exp = Experiment(task, agent)

    reward = 0
    xs = []
    ys = []
    import matplotlib.pyplot as plt
    for i in xrange(5000):
        exp.doInteractions(1)
        agent.learn()
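        # --- Hedged continuation sketch (not from the original snippet) ---
        # The loop is cut off above; given the unused reward/xs/ys variables and
        # the matplotlib import, a plausible ending (an assumption, not the
        # author's code) tracks cumulative reward and plots it:
        if agent.lastreward is not None:
            reward += agent.lastreward
        xs.append(i)
        ys.append(reward)
        agent.reset()
    plt.plot(xs, ys)
    plt.xlabel('interaction')
    plt.ylabel('cumulative reward')
    plt.show()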
def Py_Brain():
    ############################
    # pybrain
    ############################
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg
    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task
    import pylab

    # pylab.gray()
    # pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])

    num_states = int(structure.shape[0] * structure.shape[1])
    SQRT = int(math.sqrt(num_states))
    # print structure.item((1, 3))

    # environment = Maze(structure, (7, 7))  # second parameter is goal field tuple
    environment = Maze(structure, (1, 3))  # second parameter is goal field tuple
    print type(environment)
    print environment

    # Standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4)  # [N, S, E, W]
    controller.initialize(1)

    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)

    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    # while True:
    for x in range(4):
        print x
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()
        pylab.pcolor(controller.params.reshape(num_states, 4).max(1).reshape(SQRT, SQRT))
        pylab.draw()
        # pylab.show()

    name = 'MAZE'
    plt.savefig(str(name) + '_PLOT.png')
    plt.close()
import sys, time
from scipy import *
from pybrain.rl.environments import Task
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.environments.mazes import Maze, MDPMazeTask
from pybrain.rl.experiments import Experiment
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, SARSA

var_structure_arr_ = array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                            [1, 0, 0, 1, 0, 0, 0, 0, 1],
                            [1, 0, 0, 1, 0, 0, 1, 0, 1],
                            [1, 0, 0, 1, 0, 0, 1, 0, 1],
                            [1, 0, 0, 1, 0, 1, 1, 0, 1],
                            [1, 0, 0, 0, 0, 0, 1, 0, 1],
                            [1, 1, 1, 1, 1, 1, 1, 0, 1],
                            [1, 0, 0, 0, 0, 0, 0, 0, 1],
                            [1, 1, 1, 1, 1, 1, 1, 1, 1]])

var_controller_ = ActionValueTable(81, 4)
var_controller_.initialize(1.0)

var_learner_ = Q()
var_Agent_ = LearningAgent(var_controller_, var_learner_)

# Fix: MDPMazeTask expects a Maze environment instance, not the Task class.
# The goal field (7, 7) is an assumption taken from the standard PyBrain maze tutorial.
var_environment_ = Maze(var_structure_arr_, (7, 7))
var_task_ = MDPMazeTask(var_environment_)

experiment = Experiment(var_task_, var_Agent_)
case.generators[1].p_cost = (0.0, 6.5, 200.0)
case.generators[2].p_cost = (0.0, 2.0, 200.0)

case.generators[0].c_shutdown = 100.0

# case.generators[0].p_min = 0.0  # TODO: Unit-decommitment.
# case.generators[1].p_min = 0.0
## case.generators[2].p_min = 0.0

case.generators[0].p_max = 100.0
case.generators[1].p_max = 70.0
case.generators[2].p_max = 70.0

vre = VariantRothErev(experimentation=0.55, recency=0.3)
vre.explorer = BoltzmannExplorer()  # tau=100, decay=0.95

learners = [vre, Q(), Reinforce()]

profile = [0.9, 0.6]
m = (20, 75)   # markups
nb = 1         # no. offers
ns = 3         # no. states
mx = 60.0      # max markup
weeks = 2
days = 2
outdir = "/tmp/case6ww1"
dc = True
from pacmanAgent import PacmanAgent
from runPacman import RunPacman
from ghost import Ghost
from pacmanEnvironment import Environment

###############################################################
# The main function that begins running our Pacman-In-AI game #
###############################################################
if __name__ == "__main__":

    # Initialize our Action-Environment-Reward Table
    controller = ActionValueTable(196, 4)
    controller.initialize(0.)

    # Initialize Reinforcement Learning
    learner = Q(0.5, 0.0)
    learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    # Setup the PyBrain and PyGame Environments
    environment = Environment()
    game = RunPacman(environment)

    # Create the Task for the Pac-Man Agent to Accomplish and initialize the first Action
    task = PacmanTask(environment, game)
    task.performAction(np.array([1]))

    # The Experiment is the PyBrain link between the task to be completed and the agent completing it
    experiment = Experiment(task, agent)
    currentGame = 1
def runMainProg():
    # Define Q-learning agents with different attributes to see
    # which one will come out as the better player.
    # It would be possible to loop through the number of players N
    # and create them, however they would all have to be the same
    # or other code would have to be added to dynamically create
    # their attributes.
    learner = []
    learner.append(Q(0.9, 0.0))
    learner[0]._setExplorer(EpsilonGreedyExplorer(0., 0.5))
    learner.append(Q(0.5, 0.5))
    learner[1]._setExplorer(EpsilonGreedyExplorer(0.29, 0.))
    learner.append(Q(0.1, 0.5))
    learner[2]._setExplorer(EpsilonGreedyExplorer(0.29, 0.5))
    learner.append(Q(0.5, 0.0))
    learner[3]._setExplorer(DiscreteStateDependentExplorer(0., 0.5))
    learner.append(Q(0.5, 0.5))
    learner[4]._setExplorer(DiscreteStateDependentExplorer(0.29, 0.5))

    # define a blackjack deck
    theDeck = BlackjackCardDeck()

    # define action value table, agent, task, and environment arrays
    av_table = []
    agent = []
    env = []
    task = []

    # Loop through the number of players, and set up the action value table,
    # associated agent, environment, and task, so they can play the game
    for i in range(0, N):
        av_table.append(ActionValueTable(22, 2))
        av_table[i].initialize(0.)
        agent.append(LearningAgent(av_table[i], learner[i]))
        env.append(BlackjackEnv(theDeck))
        env[i].createHand()
        task.append(BlackjackTask(env[i]))

    # define a Dealer
    dealer = BlackjackDealer(theDeck)

    # Run the game for a total of 1000 games. This value can be changed.
    for i in range(0, 1000):
        # This is the function that plays the game. The code for it is below.
        playGame(dealer, task, env, agent)

    # All of the games have been played, and now the results of the games played,
    # games won, tied, and lost are displayed.
    for i in range(0, N):
        print "Games Player ", i + 1, " Won Against The Dealer: ", GamesAgentWon[i]
        print "Games Player ", i + 1, " Lost Against The Dealer: ", TotalGames - GamesTied[i] - GamesAgentWon[i]
        print "Games Player ", i + 1, " Tied With The Dealer: ", GamesTied[i]
        print
    print "Total Games Played: ", TotalGames
    print

    # Create some arrays for the action value values, and the hits, and stands.
    # A new array is needed for the AV values because the AV table used for the program
    # is not an array that can be easily used to plot results, so below the AV values are
    # transferred to the array below for processing.
    theAVTables = []
    hits = []
    stands = []

    # Move the values from the AV table to the array created above, and populate the
    # hits and stands tables as well. The values in these tables will be used in the plot below.
    for i in range(0, N):
        print "Action Table Values for Player ", i + 1, ":"
        theAVTables.append([])
        hits.append([])
        stands.append([])
        for j in range(0, 22):
            print "The AV Value At ", (j + 1), " for Player ", i + 1, " is: ", av_table[i].getActionValues(j)
            theAVTables[i].append(av_table[i].getActionValues(j))
            hits[i].append(theAVTables[i][j][0])
            stands[i].append(theAVTables[i][j][1])
        print
    print

    subPlotVal = 511
    # The following uses matplotlib to display a graph of the results.
    for i in range(0, N):
        plt.figure(1)
        plt.subplot(subPlotVal)
        plot(hits[i], label="Hits")
        plot(stands[i], label="Stands")
        plt.ylabel('Probability')
        plt.title('Player ' + str(i + 1))
        plt.axis([0, 30, -3, 3])
        plt.legend()
        subPlotVal += 1
    plt.xlabel('Hand Value')
    plt.show()
from pybrain.rl.experiments import Experiment
from pybrain.rl.environments import Task
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, SARSA
from charge_opt import *
from power_env2 import *
import matplotlib.pyplot as plt

# set the environment
environment = power_env2(80, 5, 3)

# create the learner
controller = ActionValueTable(63, 3)
controller.initialize(1.0)
learner = Q(0.5, 1)  # discount factor set to 1 because it is an infinitely long game
agent = LearningAgent(controller, learner)

# set the task
task = charge_opt(environment, 0.8, 0.01)

# do experiment
number_of_runs = 20

# change the reward and run the whole experiment
task.change_reward(1, 0.0)

# create the experiment
experiment = Experiment(task, agent)
k = 0
Pd_min = get_pd_min(case, profile)

market = pyreto.SmartMarket(case, priceCap=cap, decommit=decommit,
                            auctionType=auctionType,
                            locationalAdjustment=locAdj)

experiment = pyreto.continuous.MarketExperiment([], [], market)

portfolios, sync_cond = get_portfolios3()

for gidx in portfolios:
    g = [case.generators[i] for i in gidx]

    learner = Q(alpha, gamma)
    learner.explorer.epsilon = epsilon
    learner.explorer.decay = decay

    task, agent = get_discrete_task_agent(g, market, nStates, nOffer, markups,
                                          withholds, maxSteps, learner, Pd0, Pd_min)

    print "ALL ACTIONS:", len(task.env._allActions) * nStates

    experiment.tasks.append(task)
    experiment.agents.append(agent)

passive = [case.generators[i] for i in sync_cond]
passive[0].p_min = 0.001  # Avoid invalid offer withholding.
passive[0].p_max = 0.002
task = GymTask.createTask(gymRawEnv)
env = task.env
env.setTransformation(transformation)
## env.setCumulativeRewardMode()

## create value table and initialize it (zeros here; random init commented out below)
table = ActionValueTable(env.numStates, env.numActions)
table = ActionValueTableWrapper(table)
table.initialize(0.0)
# table.initialize( np.random.rand( table.paramdim ) )

### create agent with controller and learner - use SARSA(), Q() or QLambda() here
## alpha -- learning rate (preference of new information)
## gamma -- discount factor (importance of future reward)
learner = Q(0.2, 0.99)
# learner = SARSA(0.2, 0.99)
## learner = QLambda(0.5, 0.99, 0.9)
explorer = learner.explorer
explorer.decay = 1.0

agent = LearningAgent(table, learner)
experiment = Experiment(task, agent)

## prevents "ImportError: sys.meta_path is None, Python is likely shutting down"
atexit.register(task.close)

render_demo = False
imax = 5000
period_print = 100
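# --- Hedged continuation sketch (not part of the original snippet) ---
# The setup above ends just before the training loop. Assuming imax and
# period_print are meant to drive the usual PyBrain interaction cycle (an
# assumption based on the other examples in this collection), a minimal loop
# could look like:
for i in range(1, imax + 1):
    experiment.doInteractions(1)
    agent.learn()
    agent.reset()
    if i % period_print == 0:
        print("iteration %d / %d" % (i, imax))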