def someEpisodes(game_env, net, discountFactor=0.99, maxSteps=100,
                 avgOver=1, returnEvents=False, exploretoo=True):
    """ Return the fitness value for one episode of play, given the policy
    defined by a neural network. """
    task = GameTask(game_env)
    game_env.recordingEnabled = True
    game_env.reset()
    net.reset()
    task.maxSteps = maxSteps
    agent = LearningAgent(net)
    agent.learning = False
    agent.logging = False
    exper = EpisodicExperiment(task, agent)
    fitness = 0
    for _ in range(avgOver):
        rs = exper.doEpisodes(1)
        # add a slight bonus for more exploration, if rewards are identical
        if exploretoo:
            fitness += len(set(game_env._allEvents)) * 1e-6
        # the true, discounted reward
        fitness += sum([sum([v * discountFactor ** step
                             for step, v in enumerate(r)]) for r in rs])
    fitness /= avgOver
    if returnEvents:
        return fitness, game_env._allEvents
    return fitness
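# A hedged sketch (not part of the original source): a fitness function with this
# signature can be handed straight to a PyBrain black-box optimizer, since any
# callable taking an evaluable and returning a score works as an evaluator.
# `game_env` and the network shape here are placeholders.
from pybrain.optimization import HillClimber
from pybrain.tools.shortcuts import buildNetwork

def fitness(net):
    # higher fitness is better; HillClimber maximizes by default
    return someEpisodes(game_env, net)

# net = buildNetwork(game_env.outdim, 6, game_env.indim)
# best_net, best_fitness = HillClimber(fitness, net, maxEvaluations=200).learn()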
def train():
    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # The environment also serves as the task
    task = environment

    # Set up the neural network
    neuralNet = buildNetwork(task.nSenses, HIDDEN_NODES, task.nActions)

    # Use a genetic algorithm as the trainer
    trainer = GA(populationSize=20, topProportion=0.2, elitism=False,
                 eliteProportion=0.25, mutationProb=0.1,
                 mutationStdDev=0.2, tournament=False, tournamentSize=2)
    agent = OptimizationAgent(neuralNet, trainer)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the network
    meanScores = []
    print "Starting GA training"
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Training Iteration", i, "With mean score", task.meanScore, \
            "Max block achieved", environment.maxGameBlock
        environment.maxGameBlock = 0
        meanScores.append(task.meanScore)

    params = {"learningEpochs": LEARNING_EPOCHS,
              "gamesPerEpoch": GAMES_PER_EPOCH,
              "hiddenNodes": HIDDEN_NODES}
    return meanScores, params, experiment
def train(self, episodes, maxSteps):
    avgReward = 0

    # set up environment and task
    self.env = InfoMaxEnv(self.objectNames, self.actionNames, self.numCategories)
    self.task = InfoMaxTask(self.env, maxSteps=maxSteps,
                            do_decay_beliefs=True, uniformInitialBeliefs=True)

    # create neural net and learning agent
    self.params = buildNetwork(self.task.outdim, self.task.indim,
                               bias=True, outclass=SoftmaxLayer)
    if self._PGPE:
        self.agent = OptimizationAgent(self.params, PGPE(minimize=False, verbose=False))
    elif self._CMAES:
        self.agent = OptimizationAgent(self.params, CMAES(minimize=False, verbose=False))

    # init and perform experiment
    exp = EpisodicExperiment(self.task, self.agent)

    for i in range(episodes):
        exp.doEpisodes(1)
        avgReward += self.task.getTotalReward()
        print "reward episode ", i, self.task.getTotalReward()

    # print summary info
    print "\naverage reward over training = ", avgReward / episodes

    # save trained network
    self._saveWeights()
class BaggerBot:
    def __init__(self, host, port, net=None):
        self.conn = ServerConnection(host, port)
        self.env = self.conn.env
        self.conn.join()
        self.task = SurviveTask(self.env, self.conn)
        self.net = buildNetwork(self.env.outdim, 4, self.env.indim, outclass=TanhLayer)
        self.agent = OptimizationAgent(self.net, PGPE())
        self.experiment = EpisodicExperiment(self.task, self.agent)

    def wait_connected(self):
        self.conn.wait_connected()

    def train(self):
        '''
        Play the game indefinitely: ask to be spawned, parse the incoming
        pregame data, then run episodes until we die. :)
        '''
        while self.env.in_game:
            # Ask to be spawned
            logging.info('Requesting spawn...')
            self.conn.send_spawn()
            while not self.env.playing:
                self.conn.parse_pregame()
            while self.env.playing:
                self.experiment.doEpisodes(100)
def train():
    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # The environment also serves as the task
    task = environment

    # Make the reinforcement learning agent (use a network because inputs are continuous)
    network = ActionValueNetwork(task.nSenses, task.nActions)

    # Use Q-learning for updating the estimates (NFQ is the variant for networks)
    learner = NFQ()
    learner.gamma = GAMMA
    agent = LearningAgent(network, learner)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the learner
    meanScores = []
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Iteration ", i, " With mean score ", task.meanScore, \
            "Max block achieved ", environment.maxGameBlock
        meanScores.append(task.meanScore)
        agent.learn()
        agent.reset()

    params = {"learningEpochs": LEARNING_EPOCHS,
              "gamesPerEpoch": GAMES_PER_EPOCH,
              "gamma": GAMMA}
    return meanScores, params, agent
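# A minimal sketch (an assumption, not from the original code) of exploiting the
# trained network greedily after train(): ActionValueNetwork.getMaxAction()
# returns argmax_a Q(s, a). The environment methods used here mirror the task
# interface the snippet above already relies on.
def playGreedy(environment, network):
    environment.reset()
    while not environment.isFinished():
        obs = environment.getObservation()
        action = network.getMaxAction(obs)
        environment.performAction(action)
    return environment.meanScore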
def main():
    """
    The task represents one full simulation. Therefore it is episodic. Each
    episode calls performAction after passing getObservation to the agent.
    Once isFinished is true, the reward is returned and one simulation is done.

    The net is the neural network. It has 7 input nodes, a hidden layer of 5
    nodes, and 2 output nodes. It is a feed-forward network using sigmoid
    activation functions.

    Under the hood:
        OptimizationAgent(module, learner)
        EpisodicExperiment.optimizer = learner
        learner.setEvaluator(task, module)
        optimizer.learn()
    """
    task = LanderTask(batchSize=1)
    net = buildNetwork(task.indim, 5, task.outdim)
    learner = StochasticHillClimber()
    agent = OptimizationAgent(net, learner)
    experiment = EpisodicExperiment(task, agent)
    experiment.doEpisodes(100000)

    tasks = [LanderTask(environment=Lander(acceleration=float(i)))
             for i in range(1, 4)]
    test_size = 1000
    for task in tasks:
        print("Running task with acceleration {}".format(task.env.acceleration))
        success = 0
        for _ in range(test_size):
            task.env.reset()
            while not task.isFinished():
                observation = task.getObservation()
                action = net.activate(observation)
                task.performAction(action)
            print("Finished a simulation with result {}".format(task.env.status))
            if task.env.status == 'landed':
                success += 1
        print("Succeeded {} times out of {}".format(success, test_size))
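# As the docstring above notes, EpisodicExperiment forwards the work to the
# optimizer, so the training phase is equivalent to calling the optimizer
# directly on the task (an editorial sketch, same pattern as the HillClimber
# example later in this file; maxEvaluations counts evaluations, not episodes):
# task = LanderTask(batchSize=1)
# net = buildNetwork(task.indim, 5, task.outdim)
# StochasticHillClimber(task, net, maxEvaluations=100000).learn()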
def main():
    client_id = Utils.connectToVREP()

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = MyActionValueTable()
    learner = Q(0.5, 0.9)
    learner.explorer = EpsilonGreedyExplorer(0.15, 1)  # EpsilonGreedyBoltzmannExplorer()
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)
    controller.initialize(agent)

    i = 0
    try:
        while True:
            i += 1
            print('Episode ' + str(i))
            experiment.doEpisodes()
            agent.learn()
            agent.reset()
            print('mean: ' + str(numpy.mean(controller.params)))
            print('max: ' + str(numpy.max(controller.params)))
            print('min: ' + str(numpy.min(controller.params)))

            if i % 500 == 0:  # Save q-table every 500 episodes
                print('Save q-table')
                controller.save()
                task.t_table.save()
    except (KeyboardInterrupt, SystemExit):
        with open('../data/standing-up-q.pkl', 'wb') as handle:
            pickle.dump(controller.params, handle)
        task.t_table.save()
        controller.save()

    vrep.simxFinish(client_id)
def main():
    vrep.simxFinish(-1)  # just in case, close all opened connections
    client_id = vrep.simxStart('127.0.0.1', 19997, True, True, 5000, 5)  # Connect to V-REP

    if client_id < 0:
        print('Failed connecting to remote API server')
        return -1
    print('Connected to remote API server')

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = ActionValueTable(task.get_state_space_size(), task.get_action_space_size())
    controller.initialize(1.)
    file = open('standing-up-q.pkl', 'rb')
    controller._params = pickle.load(file)
    file.close()
    # learner = Q()
    agent = LearningAgent(controller)
    experiment = EpisodicExperiment(task, agent)

    i = 0
    while True:
        i += 1
        print('Iteration n° ' + str(i))
        experiment.doEpisodes(1)

    vrep.simxFinish(client_id)
class RLModel():

    def __init__(self, rl_params, parameter_names, env, task, clean_after_call=False):
        """
        Parameters
        ----------
        rl_params : RLParams
        parameter_names : parameter names in order
        env : Environment model
        task : EpisodicTask instance
        clean_after_call : bool
        """
        self.rl_params = rl_params
        self.parameter_names = parameter_names
        self.env = env
        self.task = task
        self.agent = None
        self.clean_after_call = clean_after_call

    def to_dict(self):
        return {
            "rl_params": self.rl_params.to_dict(),
            "parameters": self.parameter_names,
        }

    def train_model(self, parameter_values, random_state=None):
        self._parameters(parameter_values)
        self._build_model(random_state)
        self._train_model()

    def __call__(self, *parameter_values, index_in_batch=None, random_state=None):
        """
        Simulates data. Interfaces to ELFI as a sequential simulator.

        Parameters
        ----------
        parameter_values : list of model variables
            Length should equal length of parameters
        random_state : random number generator

        Returns
        -------
        Simulated trajectories as a dict
        """
        print("SIM AT", parameter_values)
        self.train_model(parameter_values, random_state=random_state)
        log_dict = self.simulate(random_state)
        if self.clean_after_call is True:
            self.clean()
        return log_dict

    def get_policy(self):
        """ Returns the current policy of the agent """
        return self.agent.get_policy()

    def _parameters(self, parameter_values):
        """ Parse parameter values """
        self.p = dict()
        if len(self.parameter_names) != len(parameter_values):
            raise ValueError(
                "Number of model variables was {} ({}), expected {}".format(
                    len(parameter_values), parameter_values, len(self.parameter_names)))
        for name, val in zip(self.parameter_names, parameter_values):
            self.p[name] = float(val)
        logger.debug("Model parameters: {}".format(self.p))

    def _build_model(self, random_state):
        """ Initialize the model """
        self.env.setup(self.p, random_state)
        self.task.setup(self.p)
        outdim = self.task.env.outdim
        n_actions = self.task.env.numActions
        self.agent = RLAgent(outdim, n_actions, random_state, rl_params=self.rl_params)
        logger.debug("Model initialized")

    def _train_model(self):
        """ Uses reinforcement learning to find the optimal strategy """
        self.experiment = EpisodicExperiment(self.task, self.agent)
        n_epochs = int(self.rl_params.n_training_episodes / self.rl_params.n_episodes_per_epoch)
        logger.debug(
            "Fitting user model over {} epochs, each {} episodes, total {} episodes."
            .format(n_epochs, self.rl_params.n_episodes_per_epoch,
                    n_epochs * self.rl_params.n_episodes_per_epoch))
        for i in range(n_epochs):
            logger.debug("RL epoch {}".format(i))
            self.experiment.doEpisodes(self.rl_params.n_episodes_per_epoch)
            self.agent.learn()
            self.agent.reset()  # reset buffers

    def simulate(self, random_state):
        """ Simulates agent behavior in 'n_sim' episodes. """
        logger.debug("Simulating user actions ({} episodes)".format(
            self.rl_params.n_simulation_episodes))
        self.experiment = EpisodicExperiment(self.task, self.agent)
        # set training flag off
        self.task.env.training = False
        # deactivate learning for experiment
        self.agent.learning = False
        # deactivate exploration
        explorer = self.agent.learner.explorer
        self.agent.learner.explorer = EGreedyExplorer(
            epsilon=0, decay=1, random_state=random_state)
        self.agent.learner.explorer.module = self.agent.module
        # activate logging
        self.task.env.start_logging()
        # simulate behavior
        self.experiment.doEpisodes(self.rl_params.n_simulation_episodes)
        # store log data
        dataset = self.task.env.log
        # deactivate logging
        self.task.env.end_logging()
        # reactivate exploration
        self.agent.learner.explorer = explorer
        # reactivate learning for experiment
        self.agent.learning = True
        # set training flag back on
        self.task.env.training = True
        return dataset

    def clean(self):
        self.agent = None
        self.env.clean()
        self.task.clean()
        gc.collect()
import matplotlib.pyplot as plt
import numpy as np

from environments.continous_maze_discrete import CTS_Maze
from tasks.CTS_TASK import CTS_MazeTask
from pybrain.rl.experiments import EpisodicExperiment
from learners.baseline_learner import GP_SARSA
from agents.baseline_agent import GPSARSA_Agent

env = CTS_Maze([0.40, 0.40])  # goal
task = CTS_MazeTask(env)
learner = GP_SARSA(gamma=0.95)
learner.batchMode = False  # not in use here; set to True for batch learning
agent = GPSARSA_Agent(learner)
agent.logging = True
# epsilon-greedy exploration (with and without use of uncertainty)
exp = EpisodicExperiment(task, agent)

plt.ion()

performance = []  # reward accumulation; dump variable for any evaluation metric
sum = []
agent.reset()

for num_exp in range(100):
    performance = exp.doEpisodes(1)
    sum = np.append(sum, np.sum(performance))
    # decay exploration every 10 episodes
    if num_exp % 10 == 0:
        agent.init_exploration -= agent.init_exploration * 0.10
from numpy import array

# create environment
env = SimpleEnvironment()
env.setNoise(0.01)

# create task
task = MinimizeTask(env)

# create controller network (flat network)
net = buildNetwork(1, 1, bias=False)
net._setParameters(array([0.0]))

# create agent with controller and learner
agent = FiniteDifferenceAgent(net, FDBasic())

# initialize parameters (variance)
#agent.setSigma([-2.])

# learning options
agent.learner.alpha = 0.1
# agent.learner.rprop = True
experiment = EpisodicExperiment(task, agent)

best = 0.0
base = 0.0
rew = 0.0

for updates in range(1000):
    # testing step
    agent.disableLearning()
    experiment.doEpisodes(2)

    # append mean reward to sr array
    ret = []
    for n in range(agent.history.getNumSequences()):
        state, action, reward = agent.history.getSequence(n)
# the task is the game this time
task = environment

# make the reinforcement learning agent (use a network because inputs are continuous)
controller = ActionValueNetwork(task.nsenses, task.nactions)

# use Q-learning for updating the network (NFQ is the variant for networks)
learner = NFQ()
agent = LearningAgent(controller, learner)

# set up an experiment
experiment = EpisodicExperiment(task, agent)

meanscores = []
m = 0.0
for i in xrange(learning_eps):
    print i
    experiment.doEpisodes(games_per_ep)
    meanscores.append(task.meanscore)
    # keep a pickle of the best agent seen so far
    if meanscores[-1] > m:
        m = meanscores[-1]
        f = open("bestRL.pkl", 'w')
        pickle.dump(agent, f)
        f.close()
    agent.learn()
    agent.reset()
def run_experiment():
    # Create the controller network
    HIDDEN_NODES = 4

    RUNS = 2
    BATCHES = 1
    PRINTS = 1
    EPISODES = 500

    env = None
    start_state_net = None

    run_results = []

    # Set up plotting tools for the experiments
    tools = ExTools(BATCHES, PRINTS)

    # Run the experiment
    for run in range(RUNS):
        if run == 0:
            continue

        # If an environment already exists, shut it down
        if env:
            env.closeSocket()

        # Create the environment
        env = create_environment()

        # Create the task
        task = Pa10MovementTask(env)

        # Create the neural network. Only create the network once so it retains
        # the same starting values for each run.
        if start_state_net:
            net = start_state_net.copy()
        else:
            # Create the initial neural network
            net = create_network(
                in_nodes=env.obsLen,
                hidden_nodes=HIDDEN_NODES,
                out_nodes=env.actLen
            )
            start_state_net = net.copy()

        # Create the learning agent
        learner = HillClimber(storeAllEvaluations=True)
        agent = OptimizationAgent(net, learner)
        tools.agent = agent

        # Create the experiment
        experiment = EpisodicExperiment(task, agent)

        # Perform all episodes in the run
        for episode in range(EPISODES):
            experiment.doEpisodes(BATCHES)

        # Calculate results
        all_results = agent.learner._allEvaluations
        max_result = np.max(all_results)
        min_result = np.min(all_results)
        avg_result = np.sum(all_results) / len(all_results)
        run_results.append((run, max_result, min_result, avg_result))

        # Make the results directory if it does not exist
        if not os.path.exists(G_RESULTS_DIR):
            os.mkdir(G_RESULTS_DIR)

        # Write all results to the results file
        with open(os.path.join(G_RESULTS_DIR, 'run_%d.txt' % run), 'w+') as f:
            # Store the calculated max, min, avg
            f.write('RUN, MAX, MIN, AVG\n')
            f.write('%d, %f, %f, %f\n' % (run, max_result, min_result, avg_result))

            # Store all results from this run
            f.write('EPISODE, REWARD\n')
            for episode, result in enumerate(all_results):
                f.write('%d, %f\n' % (episode, result))

    return
env.setRenderer(CartPoleRenderer())
env.getRenderer().start()
env.delay = (episodes == 1)

# create task
task = BalanceTask(env, epilen)

# create controller network
net = buildNetwork(4, 1, bias=False)

# create agent and set parameters from command line
agent = LearningAgent(net, None)
agent.module._setParameters([float(sys.argv[1]), float(sys.argv[2]),
                             float(sys.argv[3]), float(sys.argv[4])])

# create experiment
experiment = EpisodicExperiment(task, agent)
experiment.doEpisodes(episodes)  # run environment

# collect the returns of each episode
ret = []
for n in range(agent.history.getNumSequences()):
    returns = agent.history.getSequence(n)
    reward = returns[2]
    ret.append(sum(reward, 0).item())

# print results
print ret, "mean:", mean(ret)
#env.getRenderer().stop()
hiddenUnits = 4
batch = 2   # number of samples per learning step
prnts = 1   # number of learning steps after which results are printed
epis = 5000000 / batch / prnts  # number of rollouts
numbExp = 10  # number of experiments
et = ExTools(batch, prnts)  # tool for printing and plotting

for runs in range(numbExp):
    # create environment
    # Options: Bool(OpenGL), Bool(Realtime simu. while client is connected), ServerIP(default: localhost), Port(default: 21560)
    env = FlexCubeEnvironment()
    # create task
    task = WalkTask(env)
    # create controller network
    net = buildNetwork(len(task.getObservation()), hiddenUnits, env.actLen, outclass=TanhLayer)
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(net, SimpleSPSA(storeAllEvaluations=True))
    et.agent = agent
    # create the experiment
    experiment = EpisodicExperiment(task, agent)

    # do the experiment
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)
        et.printResults((agent.learner._allEvaluations)[-50:-1], runs, updates)
    et.addExps()
et.showExps()
# To view what the simulation is doing at the moment, go to
# pybrain/rl/environments/flexcube/ and start renderer.py
# (python-openGL must be installed)
# create environment
# Options: Bool(OpenGL), Bool(Realtime simu. while client is connected), ServerIP(default: localhost), Port(default: 21560)
env = ShipSteeringEnvironment(False)
# create task
task = GoNorthwardTask(env, maxsteps=500)
# create controller network
net = buildNetwork(task.outdim, task.indim, outclass=TanhLayer)
# create agent with controller and learner
agent = FiniteDifferenceAgent(net, SPLA())
# learning options
agent.learner.gd.alpha = 0.3      # step size of \mu adaptation
agent.learner.gdSig.alpha = 0.15  # step size of \sigma adaptation
agent.learner.gd.momentum = 0.0
batch = 2  # number of samples per gradient estimate (was: 2; more here due to stochastic setting)
# create experiment
experiment = EpisodicExperiment(task, agent)
prnts = 1  # frequency of console output
epis = 2000 / batch / prnts  # actual rollouts

filename = "dataSPLA08NoRew" + repr(int(random.random() * 1000000.0)) + ".dat"
wf = open(filename, 'wb')

for updates in range(epis):
    for i in range(prnts):
        experiment.doEpisodes(batch)  # execute batch episodes
    agent.learn()   # learn from the gathered experience
    agent.reset()   # reset agent and environment

    # print out related data
    stp = (updates + 1) * batch * prnts
    print "Step: ", runs, "/", stp, "Best: ", agent.learner.best, \
        "Base: ", agent.learner.baseline, "Reward: ", agent.learner.reward
    wf.write(repr(stp) + "\n")
__author__ = 'Stubborn'

from pybrain.rl.environments.ode import CCRLEnvironment
from pybrain.rl.environments.ode.tasks import CCRLGlasTask
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure.modules.tanhlayer import TanhLayer
from pybrain.optimization import PGPE
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment

environment = CCRLEnvironment()
task = CCRLGlasTask(environment)

net = buildNetwork(len(task.getObservation()), 4, environment.indim, outclass=TanhLayer)

agent = OptimizationAgent(net, PGPE())
experiment = EpisodicExperiment(task, agent)

for updates in range(20000):
    experiment.doEpisodes(1)
def main():
    if len(sys.argv) != 2:
        print 'Please provide a path to a model data directory.'
        print ('The script will load the newest model data from the directory, '
               'then continue to improve that model')
        sys.exit(0)

    model_directory = sys.argv[1]
    existing_models = sorted(glob(os.path.join(model_directory, '*.rlmdl')))

    if existing_models:
        newest_model_name = existing_models[-1]
        iteration_count = int(newest_model_name[-12:-6]) + 1
        print 'Loading model {}'.format(newest_model_name)
        newest_model = open(newest_model_name, 'r')
        agent = pickle.load(newest_model)
    else:
        net = buildNetwork(Environment.outdim,
                           Environment.outdim + Environment.indim,
                           Environment.indim)
        agent = OptimizationAgent(net, PGPE())
        iteration_count = 1

    environment = Environment(LOCAL_HOST, PORT, PATH_TO_SCENE)
    task = Task(environment)

    experiment = EpisodicExperiment(task, agent)

    def signal_handler(signal, frame):
        print 'Exiting gracefully'
        environment.teardown()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    while True:
        time.sleep(1)
        print '>>>>> Running iteration {}'.format(iteration_count)

        # NOTE this weird stuff is hacky, but we need it to plug in our autosave
        # stuff properly. Took a long time to figure this out.
        experiment.optimizer.maxEvaluations = \
            experiment.optimizer.numEvaluations + experiment.optimizer.batchSize

        try:
            experiment.doEpisodes()
        except Exception as e:
            print 'ERROR RUNNING SIMULATION: \n{}'.format(e)
            environment.teardown()
        else:
            if iteration_count % AUTOSAVE_INTERVAL == 0:
                filename = str(iteration_count).zfill(6) + '.rlmdl'
                filename = os.path.join(model_directory, filename)
                f = open(filename, 'w+')
                print 'Saving model to {}'.format(filename)
                pickle.dump(agent, f)
            iteration_count += 1
        print 'Iteration finished <<<<<'
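# Editorial note on the maxEvaluations trick above (an interpretation, not from
# the original author): once an OptimizationAgent is attached, EpisodicExperiment
# delegates doEpisodes() to the optimizer, whose learn() runs until
# numEvaluations reaches maxEvaluations. Bumping maxEvaluations just past the
# current count therefore makes each doEpisodes() call run roughly one optimizer
# batch and return, giving the loop a safe point at which to pickle the agent.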
def run(nao, pad):
    # choose bottom cam, so nao can see object when standing next to it
    nao.camera.selectCam(1)

    env = grabbingEnvironment(nao)
    #env.connect(nao)
    task = grabbingTask(env)
    net = buildNetwork(len(task.getObservation()), 8, env.indim, bias=True, recurrent=True)
    print env.indim

    # TODO: train into RL modules; dataset needs to be merged with exploration data
    #generateTraining.generateTraining().runDeltaMovements(nao, net, env, pad)

    # other agent/learner combinations that were tried:
    #agent = LearningAgent(net, SARSA())
    #agent = LearningAgent(net, ENAC())
    #agent = LearningAgent(net, Reinforce())
    #agent = OptimizationAgent(net, SimpleSPSA(storeAllEvaluations=True, verbose=True))
    #agent = OptimizationAgent(net, HillClimber(storeAllEvaluations=True, verbose=True))
    #agent = OptimizationAgent(net, RandomSearch(storeAllEvaluations=True, verbose=True))

    testagent = OptimizationAgent(net, None, env)
    learner = grabbingPGPE(storeAllEvaluations=True, verbose=True, epsilon=1.0,
                           deltamax=5.0, sigmaLearningRate=0.1, learningRate=0.2)
    agent = OptimizationAgent(net, learner, env)
    experiment = EpisodicExperiment(task, agent)

    # only for simulator!
    nao.fractionMaxSpeed = 1.0

    print "#env"
    print " sensors:", env.outdim
    print " actions:", env.indim
    print " discreteStates:", env.discreteStates
    print " discreteActions:", env.discreteActions
    print
    print "#task"
    print " sensor_limits:", task.sensor_limits
    print " actor_limits:", task.actor_limits
    print " epilen: ", task.epiLen
    print "#EpisodicTask"
    print " discount:", task.discount
    print " batchsize:", task.batchSize
    print
    print "#PGPE"
    print " exploration type:", grabbingPGPE().exploration
    print " LearningRate:", grabbingPGPE().learningRate
    print " sigmaLearningRate:", grabbingPGPE().sigmaLearningRate
    print " epsilon:", grabbingPGPE().epsilon
    print " wDecay:", grabbingPGPE().wDecay
    print " momentum:", grabbingPGPE().momentum
    print " rprop:", grabbingPGPE().rprop

    count = 0
    while True:
        # one learning step after one episode of world-interaction
        count += 1
        print "learning #", count
        experiment.agent = agent
        experiment.doOptimization = True
        erg = experiment.doEpisodes(1)
        print erg

        if count > 8:
            # test performance (these real-world experiences are not used for training)
            print "testing"
            experiment.doOptimization = False
            erg = experiment.doEpisodes(1)
            #r = mean([sum(x) for x in experiment.doEpisodes(5)])

    print "finished grabbingTest"
    [1, 0, 0, 0, 0, 0, 0, 0, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1]])
side = 9
goal = 3, 2

env = mazeEnv(structure, goal)  # use maze environment for now; note pos is Y,X
# our own task and environment for later
#env = policyEnv()
thetask = MDPMazeTaskEpisodic(env)

# create neural net; create and train agent
theparams = buildNetwork(thetask.outdim, thetask.indim, bias=False)
agent = OptimizationAgent(theparams, CMAES())
exp = EpisodicExperiment(thetask, agent)

# train agent
exp.doEpisodes(NUM_EPISODES)

print "\ntotal reward = ", thetask.getTotalReward()
#print "initial weights: "; print theparams.params
print "\nNOTE positions below are (Y,X)\n"
print "getting observation 1"
print "robot = ", thetask.getObservation()
print "goal = ", goal
print "reward: ", thetask.getReward()
env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

module = ActionValueNetwork(4, 3)

task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4

agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)

def plotPerformance(values, fig):
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
    # Without the next line, the pyplot plot won't actually show up.
    plt.pause(0.001)

performance = []
if not render:
    pf_fig = plt.figure()

while(True):
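    # The loop body is truncated in the original; this is a minimal sketch of
    # the usual cycle (cf. the fuller cartpole variant later in this file, which
    # this mirrors; assumes mean() is in scope, e.g. from numpy): interact,
    # learn, then measure greedy performance with the test agent.
    experiment.doEpisodes(1)
    agent.learn()
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    testagent.reset()
    experiment.agent = agent
    performance.append(r)
    if not render:
        plotPerformance(performance, pf_fig)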
from pybrain_extension.environment.morse_environment import ContinuousControllerEnvironment
from pybrain_extension.task.red_cube_task import CameraPixelsRedCubeTask

try:
    import __builtin__
    input = getattr(__builtin__, 'raw_input')
except (ImportError, AttributeError):
    pass

with CurrentController(3) as control:
    environment = ContinuousControllerEnvironment(control)
    task = CameraPixelsRedCubeTask(environment, True)
    experiment = EpisodicExperiment(task, None)
    # control.calibrate()
    start = time()
    bias = True

    def eval_fitness(genomes):
        for g in genomes:
            # visualize.draw_net(g, view=False)
            agent = NeatAgent(g, bias=bias)
            g.fitness = 0
            for state in range(control.get_randomize_states()):
                control.randomize(state)
                g.fitness += task.f(agent)
                if not task.found_cube:
env = Maze(envmatrix, (7, 7))

# create task
task = MDPMazeTask(env)

# create value table and initialize with ones
table = ActionValueTable(81, 4)
table.initialize(1.)

# create agent with controller and learner - use SARSA(), Q() or QLambda() here
# learner = Q()
learner = SARSA()

# standard exploration is e-greedy, but a different type can be chosen as well
# learner.explorer = BoltzmannExplorer()

# create agent
agent = LearningAgent(table, learner)

# create experiment
# experiment = Experiment(task, agent)
experiment = EpisodicExperiment(task, agent)

# prepare plotting
pylab.gray()
pylab.ion()

for i in range(50):
    # interact with the environment (here in batch mode)
    experiment.doInteractions(100)
    agent.learn()
    agent.reset()

    # and draw the table
    pylab.imshow(table.params.reshape(81, 4).max(1).reshape(9, 9),
                 origin='upper', interpolation='none')
    pylab.title('Episode: ' + str(i + 1) + '/50')
    pylab.show()
    pylab.pause(0.01)
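# Editorial note: although an EpisodicExperiment is constructed above, the loop
# drives it with doInteractions(100), the step-based API inherited from the
# plain Experiment. With an episodic task one could equivalently run whole
# episodes per learning step, e.g.:
#
#     experiment.doEpisodes(5)
#     agent.learn()
#     agent.reset()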
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    seed = parameters["seed"]
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)

    render = False
    plot = False

    plt.ion()

    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"])
    #print "dim: ", task.indim, task.outdim

    # network mapping the state inputs to the action values
    module = ActionValueNetwork(task.outdim, task.indim)

    learner = NFQ()
    # % of random actions
    learner.explorer.epsilon = parameters["ExplorerEpsilon"]

    agent = LearningAgent(module, learner)
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []
    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]
    for episode in range(0, m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        agent.learn(1)
        #renderer.drawPlot()

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True

        if episode % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            env.delay = False
            testagent.reset()
            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
            # print "reward avg", r
            # print "explorer epsilon", learner.explorer.epsilon
            # print "num episodes", agent.history.getNumSequences()
            # print "update step", len(performance)

    # print "done"
    return performance
    #print "network", json.dumps(module.bn.net.E, indent=2)
from pybrain.rl.environments.cartpole.balancetask import BalanceTask
from pybrain.tools.shortcuts import buildNetwork
from pybrain.optimization import HillClimber
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.utilities import fListToString

# any episodic task
task = BalanceTask()
# any neural network controller
net = buildNetwork(task.outdim, 1, task.indim)
# any optimization algorithm to be plugged in, for example:
# learner = CMAES(storeAllEvaluations = True)
# or:
learner = HillClimber(storeAllEvaluations=True)
# in a non-optimization case the agent would be a LearningAgent:
# agent = LearningAgent(net, ENAC())
# here it is an OptimizationAgent:
agent = OptimizationAgent(net, learner)
# the agent and task are linked in an Experiment,
# and everything else happens under the hood.
exp = EpisodicExperiment(task, agent)
exp.doEpisodes(100)

print('Episodes learned from:', len(learner._allEvaluations))
n, fit = learner._bestFound()
print('Best fitness found:', fit)
print('with this network:')
print(n)
print('containing these parameters:')
print(fListToString(n.params, 4))
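# A small follow-up (editorial, not from the original): the best parameters
# found can be copied into any network of the same topology for later greedy use.
# best_net, best_fit = learner._bestFound()
# net._setParameters(best_net.params)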
# create task
task = GrowTask(env)

# create controller network (flat network)
net = buildNetwork(32, 10, 12)

# create agent with controller and learner
agent = FiniteDifferenceAgent(net, SPLA())

# learning options
agent.learner.gd.alpha = 0.05
agent.learner.gdSig.alpha = 0.1
agent.learner.gd.momentum = 0.0
agent.learner.epsilon = 2.0
agent.learner.initSigmas()

sr = []
experiment = EpisodicExperiment(task, agent)
for updates in range(1000):
    # training step
    for i in range(5):
        experiment.doEpisodes(10)
        agent.learn()
        print "parameters:", agent.module.params
        agent.reset()

    # testing step
    agent.disableLearning()
    experiment.doEpisodes(50)

    # append mean reward to sr array
    ret = []
    for n in range(agent.history.getNumSequences()):
        state, action, reward, _ = agent.history.getSequence(n)
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", task, parameters
    seed = parameters["seed"]
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed)

    render = False
    plot = False

    plt.ion()

    env = CartPoleEnvironment()
    env.randomInitialization = False
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    task_class = getattr(cp, task)
    task = task_class(env, 50)
    #print "dim: ", task.indim, task.outdim

    bmodule = ActionValueRAND(task.outdim, task.indim)
    rlearner = RAND()
    blearner = RAND()
    bagent = LearningAgent(bmodule, rlearner)

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE
    from scipy import array

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations=True,
                                           storeAllEvaluated=True,
                                           maxEvaluations=None,
                                           verbose=False))
    testagent = LearningAgent(module, None)
    pgpeexperiment = EpisodicExperiment(task, agent)
    randexperiment = EpisodicExperiment(task, bagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []
    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]

    # train PGPE
    for episode in range(0, 50):
        # one learning step after one episode of world-interaction
        y = pgpeexperiment.doEpisodes(1)

    be, bf = agent.learner._bestFound()
    print be, bf

    print "generate data"
    be.numActions = 1
    gdagent = LearningAgent(be, blearner)
    experiment = EpisodicExperiment(task, gdagent)
    for episode in range(0, 1000):
        # one learning step after one episode of world-interaction
        y = experiment.doEpisodes(1)
        x = randexperiment.doEpisodes(1)

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True

        l = 5
        resList = (agent.learner._allEvaluations)[-l:-1]
        rLen = len(resList)
        avReward = array(resList).sum() / rLen
        performance.append(avReward)

        env.delay = False
        testagent.reset()
        if plot:
            plotPerformance(performance, pf_fig)

    blearner.add_ds(rlearner.dataset)
    blearner.learn()
    #blearner.learnX(agent.learner._allEvaluated)
    print "done"
    return performance
task = LinearFATileCoding3456BalanceTask()
env = task.env

# The learning is very sensitive to the learning rate decay.
learner = SARSALambda_LinFA_ReplacingTraces(task.nactions, task.outdim,
                                            learningRateDecay=2000)
learner._lambda = 0.95
task.discount = learner.rewardDiscount

agent = LinearFA_Agent(learner)
agent.logging = False
exp = EpisodicExperiment(task, agent)

performance_agent = LinearFA_Agent(learner)
performance_agent.logging = False
performance_agent.greedy = True
performance_agent.learning = False

env.saveWheelContactTrajectories(True)
plt.ion()
plt.figure(figsize=(8, 4))
ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)

def update_wheel_trajectories():
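    # (body truncated in the original; judging from the near-identical snippet
    # at the end of this file, it plots the wheel-contact histories, e.g.:)
    # front_lines = ax2.plot(env.get_xfhist(), env.get_yfhist(), 'r')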
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    seed = parameters["seed"]
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)

    render = False
    plot = False

    plt.ion()

    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"], desiredValue=None)
    #print "dim: ", task.indim, task.outdim

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE
    from scipy import array

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations=True,
                                           storeAllEvaluated=False,
                                           maxEvaluations=None,
                                           desiredEvaluation=1,
                                           verbose=False))
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []
    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]
    for episode in range(0, m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True

        if episode % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            # run test episodes with the longer test horizon, then average
            # the stored evaluations
            l = parameters["TestWith"]
            task.N = parameters["MaxRunsPerEpisodeTest"]
            experiment.doEpisodes(l)
            task.N = parameters["MaxRunsPerEpisode"]

            resList = (agent.learner._allEvaluations)[-l:-1]
            rLen = len(resList)
            avReward = array(resList).sum() / rLen
            performance.append(avReward)

            env.delay = False
            testagent.reset()
            if plot:
                plotPerformance(performance, pf_fig)

    return performance
    #print "network", json.dumps(module.bn.net.E, indent=2)

#import sumatra.parameters as p
#import sys
#parameter_file = sys.argv[1]
#parameters = p.SimpleParameterSet(parameter_file)
#run(["BalanceTask", parameters])
agent = FiniteDifferenceAgent(net, SPLA())

# learning options
agent.learner.gd.alpha = 0.2       # step size of \mu adaptation
agent.learner.gdSig.alpha = 0.085  # step size of \sigma adaptation
agent.learner.gd.momentum = 0.0

# loading weights
if loadNet:
    agent.learner.original = loadWeights("grasp.wgt")
    agent.learner.gd.init(agent.learner.original)

agent.learner.epsilon = 0.2
agent.learner.initSigmas()

batch = 2  # number of samples per gradient estimate

# create experiment
experiment = EpisodicExperiment(task, agent)
prnts = 1  # frequency of console output
epis = 5000000 / batch / prnts  # actual rollouts

for updates in range(epis):
    for i in range(prnts):
        experiment.doEpisodes(batch)  # execute batch episodes
    agent.learn()   # learn from the gathered experience
    agent.reset()   # reset agent and environment

    # print out related data
    print "Step: ", runs, "/", (updates + 1) * batch * prnts, "Best: ", agent.learner.best,
    print "Base: ", agent.learner.baseline, "Reward: ", agent.learner.reward

    # saving weights
    if saveNet:
def train(self, size, goal, initPose, mapSelect, envSelect, episodes, maxSteps,
          goalTol, randomizeInitPose):
    avgReward = 0

    # set up environment and task
    self.env = mazeEnv(size, goal, initPose, mapSelect, envSelect, randomizeInitPose)
    self.task = MDPMazeTaskEpisodic(self.env, maxSteps, goalTol)

    # create neural net and learning agent
    self.params = buildNetwork(self.task.outdim, 48, self.task.indim,
                               bias=True, outclass=SoftmaxLayer)
    if self._PGPE:
        self.agent = OptimizationAgent(self.params, PGPE(minimize=True, verbose=False))
    elif self._CMAES:
        self.agent = OptimizationAgent(self.params, CMAES(minimize=True, verbose=False))

    # init experiment
    exp = EpisodicExperiment(self.task, self.agent)

    for i in range(0, episodes):
        exp.doEpisodes()
        avgReward += self.task.getTotalReward()
        print "reward episode ", i, self.task.getTotalReward()

    # print summary info
    print "\naverage reward over training = ", avgReward / episodes

    # import weights into network and save network
    if self._PGPE:
        for i in range(len(self.params.params)):
            self.params.params[i] = self.agent.learner.current[i]
        pickle.dump(self.params, open('policyNet.pkl', 'w'))
    elif self._CMAES:
        # the following code came from the WWInfoMaxCMAES.py script of the ICDL 2010 paper
        arz = randn(self.agent.learner.numParameters, self.agent.learner.batchSize)
        arx = tile(self.agent.learner.center.reshape(self.agent.learner.numParameters, 1),
                   (1, self.agent.learner.batchSize)) + \
            self.agent.learner.stepSize * dot(dot(self.agent.learner.B, self.agent.learner.D), arz)

        # Go through the parameters and pick the current best
        arfitness = zeros(self.agent.learner.batchSize)
        for k in xrange(self.agent.learner.batchSize):
            self.agent.learner.wrappingEvaluable._setParameters(arx[:, k])
            arfitness[k] = self.agent.learner._BlackBoxOptimizer__evaluator(
                self.agent.learner.wrappingEvaluable)

        # Sort by fitness and compute weighted mean into center
        tmp = sorted(map(lambda (x, y): (y, x), enumerate(ravel(arfitness))))
        arfitness = array(map(lambda x: x[0], tmp))
        arindex = array(map(lambda x: int(x[1]), tmp))
        arz = arz[:, arindex]
        curparams = arx[:, arindex[0]]

        # update network weights with selected parameters
        for i in range(len(self.params.params)):
            self.params.params[i] = curparams[i]

        # save trained network
        pickle.dump(self.params, open('policyNet.pkl', 'w'))
sum = []
track_time = []
dict_size = []

for repeat in range(1):
    env = CTS_Maze([0.50, 0.50])  # goal
    task = CTS_MazeTask(env)
    learner = GP_SARSA_SPARSE(gamma=0.95)
    learner.sigma = 1
    learner.batchMode = False  # not in use here; set to True for batch learning
    agent = GPSARSA_Agent(learner)
    agent.logging = True
    exp = EpisodicExperiment(task, agent)
    agent.reset()

    sum = []
    performance = []
    track_time = []
    agent.init_exploration = 1.0
    starttime = time.time()
    dict_size = []
    epsilon = []
    b = []
    c = []

    for num_exp in range(600):
        performance = exp.doEpisodes(1)
        sum = np.append(sum, np.sum(performance))
task = LinearFATileCoding3456BalanceTaskRewardBipolar()
learner = SARSALambda_LinFA_ReplacingTraces(task.nactions, task.outdim)
learner._lambda = 0.95
task.discount = learner.rewardDiscount

agent = LinearFA_Agent(learner)
agent.epsilonGreedy = True
agent.init_exploration = 0.5
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False

performance_agent = LinearFA_Agent(learner)
performance_agent.logging = False
performance_agent.greedy = True
performance_agent.learning = False

experiment = EpisodicExperiment(task, agent)

# TODO PyBrain says that the learning rate needs to decay, but I don't see that
# described in Randlov's paper.
# A higher number here means the learning rate decays slower.
learner.learningRateDecay = 2000
# NOTE increasing this number above from the default of 100 is what got the
# learning to actually happen, and fixed the bug/issue where the performance
# agent's performance stopped improving.

tr = LinearFATraining(
    'balance_sarsalambda_linfa_replacetrace_anneal_RewardBipolar_take2',
    experiment, performance_agent, verbose=True)
from pybrain.rl.environments.cartpole.balancetask import BalanceTask
from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.optimization import HillClimber

task = BalanceTask()
net = buildNetwork(task.outdim, 3, task.indim)

# direct optimization of the network on the task:
HillClimber(task, net, maxEvaluations=100).learn()

# the same thing, wrapped in the agent/experiment framework:
agent = OptimizationAgent(net, HillClimber())
exp = EpisodicExperiment(task, agent)
print(exp.doEpisodes(100))
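# Editorial note: doEpisodes() returns one list of per-step rewards per episode
# (which is what the print above shows), so aggregate returns can be computed
# directly:
# episode_rewards = exp.doEpisodes(10)
# totals = [sum(r) for r in episode_rewards]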
from pybrain.rl.environments.timeseries.timeseries import MonthlySnPEnvironment
from pybrain.rl.learners.directsearch.rrl import RRL
from pybrain.structure import RecurrentNetwork
from pybrain.structure import LinearLayer, SigmoidLayer, TanhLayer, BiasUnit
from pybrain.structure import FullConnection
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment
from numpy import sign, round
from matplotlib import pyplot

# Single linear layer with bias unit, and single tanh layer; the linear layer is
# what gets optimised.
net = RecurrentNetwork()
net.addInputModule(BiasUnit(name='bias'))
net.addOutputModule(TanhLayer(1, name='out'))
net.addRecurrentConnection(FullConnection(net['out'], net['out'], name='c3'))
net.addInputModule(LinearLayer(1, name='in'))
net.addConnection(FullConnection(net['in'], net['out'], name='c1'))
net.addConnection(FullConnection(net['bias'], net['out'], name='c2'))
net.sortModules()
net._setParameters([-8.79227886e-02, -8.29319017e+02, 1.25946474e+00])
print(net._params)

env = MonthlySnPEnvironment()
task = MaximizeReturnTask(env)
learner = RRL()  # ENAC() #Q_LinFA(2,1)
agent = LearningAgent(net, learner)
exp = EpisodicExperiment(task, agent)

exp.doEpisodes(10)
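# A hedged follow-up sketch (not in the original): after the episodes above, the
# recurrent net can be run over a price series to inspect the trading signal;
# sign() turns the tanh output into a long/short position. `observations` is a
# hypothetical sequence of inputs.
# net.reset()
# positions = [sign(net.activate([x])[0]) for x in observations]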
for runs in range(numbExp):
    # create environment
    # Options: Bool(OpenGL), Bool(Realtime simu. while client is connected), ServerIP(default: localhost), Port(default: 21560)
    if env != None:
        env.closeSocket()
    env = ShipSteeringEnvironment()
    # create task
    task = GoNorthwardTask(env, maxsteps=500)
    # create controller network
    net = buildNetwork(task.outdim, task.indim, outclass=TanhLayer)
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(net, PGPE(learningRate=0.3,
                                        sigmaLearningRate=0.15,
                                        momentum=0.0,
                                        epsilon=2.0,
                                        rprop=False,
                                        storeAllEvaluations=True))
    et.agent = agent
    # create experiment
    experiment = EpisodicExperiment(task, agent)

    # do the experiment
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)
        et.printResults((agent.learner._allEvaluations)[-50:-1], runs, updates)
    et.addExps()
et.showExps()
# To view what the simulation is doing at the moment, set the environment up with
# True, go to pybrain/rl/environments/ode/ and start viewer.py (python-openGL
# must be installed; see the PyBrain documentation)
    last_xf = self.env.last_xf
    last_yf = self.env.last_yf
    dist_to_goal_last = np.linalg.norm(target - np.array([last_xf, last_yf]))
    delta_dist = dist_to_goal - dist_to_goal_last
    return -delta_tilt - delta_dist * 0.01

task = LSPI_task()
learner = LSPI(9, 20)
task.rewardDiscount = 0.8
learner.rewardDiscount = 0.8

agent = LinearFA_Agent(learner)
agent.epsilonGreedy = True
exp = EpisodicExperiment(task, agent)
learner.learningRateDecay = 3000

max_agent = LinearFA_Agent(learner)
max_agent.learning = False
max_agent.greedy = True

task.env.saveWheelContactTrajectories(True)
plt.ion()
plt.figure(figsize=(8, 4))
ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)

def update_wheel_trajectories():
    front_lines = ax2.plot(task.env.get_xfhist(), task.env.get_yfhist(), 'r')