def __init__(self, env=None, maxsteps=1000, desiredValue=0):
    """
    :key env: (optional) an instance of a CartPoleEnvironment (or a subclass thereof)
    :key maxsteps: maximal number of steps (default: 1000)
    """
    self.desiredValue = desiredValue
    if env is None:
        env = CartPoleEnvironment()
    EpisodicTask.__init__(self, env)
    self.N = maxsteps
    self.t = 0

    # scale position and angle, don't scale velocities (unknown maximum)
    self.sensor_limits = [(-3, 3)]
    for i in range(1, self.outdim):
        if isinstance(self.env, NonMarkovPoleEnvironment) and i % 2 == 0:
            self.sensor_limits.append(None)
        else:
            self.sensor_limits.append((-np.pi, np.pi))
    # self.sensor_limits = [None] * 4

    # actor force limited to [-50, 50] Newton
    self.actor_limits = [(-50, 50)]
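
# The sensor_limits above are used by the Task base class to rescale raw
# observations into [-1, 1]. An illustrative standalone restatement of that
# linear mapping (a sketch of PyBrain's normalization, not the library code):
def normalize_sensor(value, limits):
    """Map a raw reading into [-1, 1] given (low, high) limits; None passes through."""
    if limits is None:
        return value
    lo, hi = limits
    return (value - lo) / (hi - lo) * 2.0 - 1.0

# e.g. a cart position of 1.5 with limits (-3, 3) maps to 0.5
assert normalize_sensor(1.5, (-3, 3)) == 0.5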
from pybrain.tools.example_tools import ExTools
from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.environments.cartpole import CartPoleEnvironment, BalanceTask
from pybrain.rl.agents import OptimizationAgent
from pybrain.optimization import ExactNES
from pybrain.rl.experiments import EpisodicExperiment

batch = 2    # number of samples per learning step
prnts = 100  # number of learning steps between printed results
epis = 4000 // (batch * prnts)  # number of rollouts (integer division, so range() gets an int)
numbExp = 10  # number of experiments
et = ExTools(batch, prnts)  # tool for printing and plotting

for runs in range(numbExp):
    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)
    # create controller network
    net = buildNetwork(4, 1, bias=False)
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(net, ExactNES(storeAllEvaluations=True))
    et.agent = agent
    # create the experiment
    experiment = EpisodicExperiment(task, agent)

    # do the experiment
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)
        # the learner's mutation step size sigma, not an exploration epsilon
        print "Sigma: ", agent.learner.sigma
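
# If you want a mean-return readout per print block instead of the raw sigma,
# the evaluations kept by storeAllEvaluations=True can be averaged. A minimal
# sketch under that assumption (the helper name is illustrative):
def mean_recent_return(learner, n):
    """Average of the learner's last n stored episode returns."""
    recent = learner._allEvaluations[-n:]
    return sum(recent) / float(len(recent))

# e.g. inside the update loop:
#   print "MeanReturn: ", mean_recent_return(agent.learner, batch * prnts)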
from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.environments.cartpole import CartPoleEnvironment, CartPoleRenderer, BalanceTask
from pybrain.rl.agents.learning import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment
from scipy import mean
import sys

episodes = 1
epilen = 200

if len(sys.argv) < 5:
    sys.exit('please give 4 parameters. run: "python play_cartpole.py <p1> <p2> <p3> <p4>"\n')

# create environment
env = CartPoleEnvironment()
env.setRenderer(CartPoleRenderer())
env.getRenderer().start()
env.delay = (episodes == 1)

# create task
task = BalanceTask(env, epilen)

# create controller network
net = buildNetwork(4, 1, bias=False)

# create agent and set parameters from command line
agent = LearningAgent(net, None)
agent.module._setParameters([float(sys.argv[1]), float(sys.argv[2]),
                             float(sys.argv[3]), float(sys.argv[4])])

# create experiment
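# The file stops at the experiment-creation comment. A minimal sketch of how
# such a playback script typically continues (modeled on PyBrain's cartpole
# example; the episode loop and return bookkeeping are an assumption):
experiment = EpisodicExperiment(task, agent)
experiment.doEpisodes(episodes)

# sum up the reward of each episode stored in the agent's history
ret = []
for n in range(agent.history.getNumSequences()):
    reward = agent.history.getSequence(n)[2]
    ret.append(sum(reward, 0).item())

print ret, "mean:", mean(ret)
env.getRenderer().stop()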
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    seed = parameters["seed"]
    # derive a per-process seed so parallel workers don't share random streams
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)

    render = False
    plot = False

    plt.ion()
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    # resolve the task class by name from the cartpole module
    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"], desiredValue=None)
    #print "dim: ", task.indim, task.outdim

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]  # % of random actions
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations=True,
                                           storeAllEvaluated=False,
                                           maxEvaluations=None,
                                           desiredEvaluation=1,
                                           verbose=False))
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []
    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]
    for episode in range(0, m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        #agent.learn(1)
        #renderer.drawPlot()

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        if episode % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            # evaluate with the longer test episode limit, then restore it
            l = parameters["TestWith"]
            task.N = parameters["MaxRunsPerEpisodeTest"]
            experiment.doEpisodes(l)
            task.N = parameters["MaxRunsPerEpisode"]

            # average the stored returns of the test episodes
            # (note: the slice [-l:-1] skips the most recent evaluation)
            resList = (agent.learner._allEvaluations)[-l:-1]
            from scipy import array
            rLen = len(resList)
            avReward = array(resList).sum() / rLen

            performance.append(avReward)
            env.delay = False
            testagent.reset()
            if plot:
                plotPerformance(performance, pf_fig)

    return performance

# example driver (commented out in the original):
#import sumatra.parameters as p
#import sys
#parameter_file = sys.argv[1]
#parameters = p.SimpleParameterSet(parameter_file)
#run(["BalanceTask", parameters])
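
# A minimal sketch of driving run() in parallel, which is presumably why the
# per-process seeding above exists. The parameter values and the Pool setup
# are illustrative assumptions, not part of the original module:
import multiprocessing

example_parameters = {
    "seed": 42,
    "MaxRunsPerEpisode": 200,
    "MaxRunsPerEpisodeTest": 500,
    "MaxTotalEpisodes": 1000,
    "EpisodesPerLearn": 1,
    "TestAfter": 10,
    "TestWith": 20,
}

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=4)
    # each worker runs the same configuration; seeds diverge via the process id
    results = pool.map(run, [("BalanceTask", example_parameters)] * 4)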
# __future__ imports must be the first statement in the file
from __future__ import print_function

import numpy as np
from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask, CartPoleRenderer
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.explorers import BoltzmannExplorer
from numpy import array, arange, meshgrid, pi, zeros, mean
from matplotlib import pyplot as plt

# switch this to True if you want to see the cart balancing the pole (slower)
render = False

plt.ion()

env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

module = ActionValueNetwork(4, 3)

task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4

agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)
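
# The script ends after building the experiment; the training loop is missing.
# A minimal sketch of the usual NFQ loop with periodic greedy evaluation
# (modeled on PyBrain's NFQ example; episode counts are illustrative):
performance = []
for episode in range(100):
    # one exploratory episode, then a batch NFQ update
    experiment.doEpisodes(1)
    agent.learn()
    agent.reset()

    # evaluate greedily: testagent has no learner, hence no exploration
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    print("episode", episode, "mean test return", r)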
from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.experiments import EpisodicExperiment
from training import NFQTraining

task = DiscreteBalanceTask(CartPoleEnvironment(), 100)
action_value_function = ActionValueNetwork(4, 3, name='CartPoleNFQActionValueNetwork')
learner = NFQ()
#learner.gamma = 0.99
learner.explorer.epsilon = 0.4
# keep the task's discount consistent with the learner's gamma
task.discount = learner.gamma

agent = LearningAgent(action_value_function, learner)
performance_agent = LearningAgent(action_value_function, None)
experiment = EpisodicExperiment(task, agent)

tr = NFQTraining('cartpole_nfq', experiment, performance_agent)
tr.train(7000, performance_interval=1, n_performance_episodes=5)
from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask, CartPoleRenderer
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.explorers import BoltzmannExplorer
from numpy import array, arange, meshgrid, pi, zeros, mean
from matplotlib import pyplot as plt

# switch this to True if you want to see the cart balancing the pole (slower)
render = False
#render = True

plt.ion()

env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

module = ActionValueNetwork(4, 3)

task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4

agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    seed = parameters["seed"]
    # derive a per-process seed so parallel workers don't share random streams
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)

    render = False
    plot = False

    plt.ion()
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    # resolve the task class by name from the cartpole module
    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"])
    #print "dim: ", task.indim, task.outdim

    # network maps the task's state observations to one value per action
    module = ActionValueNetwork(task.outdim, task.indim)
    learner = NFQ()
    # % of random actions
    learner.explorer.epsilon = parameters["ExplorerEpsilon"]

    agent = LearningAgent(module, learner)
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []
    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]
    for episode in range(0, m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        agent.learn(1)
        #renderer.drawPlot()

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        if episode % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            env.delay = False
            testagent.reset()
            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
            # print "reward avg", r
            # print "explorer epsilon", learner.explorer.epsilon
            # print "num episodes", agent.history.getNumSequences()
            # print "update step", len(performance)

    return performance
    #print "network", json.dumps(module.bn.net.E, indent=2)
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", task, parameters
    seed = parameters["seed"]
    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed)

    render = False
    plot = False

    plt.ion()
    env = CartPoleEnvironment()
    env.randomInitialization = False
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()

    task_class = getattr(cp, task)
    task = task_class(env, 50)
    #print "dim: ", task.indim, task.outdim

    # random-action baseline agent
    bmodule = ActionValueRAND(task.outdim, task.indim)
    rlearner = RAND()
    blearner = RAND()
    bagent = LearningAgent(bmodule, rlearner)

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]  # % of random actions
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations=True,
                                           storeAllEvaluated=True,
                                           maxEvaluations=None,
                                           verbose=False))
    testagent = LearningAgent(module, None)
    pgpeexperiment = EpisodicExperiment(task, agent)
    randexperiment = EpisodicExperiment(task, bagent)

    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)

    performance = []
    if plot:
        pf_fig = plt.figure()

    m = parameters["MaxTotalEpisodes"] / parameters["EpisodesPerLearn"]  # (unused below)

    # train PGPE first
    for episode in range(0, 50):
        # one learning step after one episode of world-interaction
        y = pgpeexperiment.doEpisodes(1)

    be, bf = agent.learner._bestFound()
    print be, bf

    print "generate data"
    be.numActions = 1
    gdagent = LearningAgent(be, blearner)
    experiment = EpisodicExperiment(task, gdagent)
    for episode in range(0, 1000):
        # one episode with the best-found policy, one with the random baseline
        y = experiment.doEpisodes(1)
        x = randexperiment.doEpisodes(1)
        #renderer.drawPlot()

        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True

        # average the last few stored PGPE evaluations as a performance estimate
        # (note: the slice [-l:-1] skips the most recent evaluation)
        l = 5
        resList = (agent.learner._allEvaluations)[-l:-1]
        from scipy import array
        rLen = len(resList)
        avReward = array(resList).sum() / rLen
        performance.append(avReward)

        env.delay = False
        testagent.reset()
        if plot:
            plotPerformance(performance, pf_fig)

    # train the RAND learner on the dataset gathered by the random agent
    blearner.add_ds(rlearner.dataset)
    blearner.learn()
    #blearner.learnX(agent.learner._allEvaluated)
    print "done"
    return performance
def main():
    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)
    sim_task = SimBalanceTask(prediction=reward_prediction, maxsteps=200)
    all_params = lasagne.layers.get_all_params(l_action_formed)

    records = []
    real_world_sample_counts = []

    for time in xrange(50):
        records.append([])
        _all_params = lasagne.layers.get_all_params(l_action_formed)
        _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4, 1)))

        baseline = None
        num_parameters = 4  # four policy parameters
        init_sigma = 3      # initial sigma for parameter perturbations
        sigmas = ones(num_parameters) * init_sigma
        best_reward = -1000
        current = all_params[0].get_value()[:, 0]
        arg_reward = []
        previous_cost = 10000
        real_world_sample_count = 0
        thinking_count = 0
        cost_confidence = 2

        for n in xrange(1500):
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)

            if previous_cost <= cost_confidence:
                # critic is trusted: evaluate both perturbed policies in simulation
                rewards1, actions1, observations1, last_obs1, reward1 = \
                    one_sim_iteration(sim_task, all_params=current + epsilon)
                rewards2, actions2, observations2, last_obs2, reward2 = \
                    one_sim_iteration(sim_task, all_params=current - epsilon)
                thinking_count += 1
                if thinking_count == 2:
                    # after two simulated steps, force a real-world sample again
                    previous_cost = 10000
                    thinking_count = 0
            else:
                # perform actions in the real environment
                rewards1, actions1, observations1, last_obs1, reward1 = \
                    one_iteration(task=task, all_params=current + epsilon)
                real_world_sample_count += 1
                if reward1 > best_reward:
                    best_reward = reward1
                rewards2, actions2, observations2, last_obs2, reward2 = \
                    one_iteration(task=task, all_params=current - epsilon)
                real_world_sample_count += 1
                if reward2 > best_reward:
                    best_reward = reward2

                # prepare data from the first rollout
                actions1 = theano_form(actions1, shape=(len(actions1), 1))
                observations1 = theano_form(observations1, shape=(len(observations1), 4))
                predicted_obs1 = concatenate([observations1[1::], [last_obs1]])
                input_data1 = concatenate([actions1, observations1], axis=1)
                output_data1 = concatenate([theano_form(rewards1, shape=(len(rewards1), 1)),
                                            predicted_obs1], axis=1)
                critic_train_inputs1 = list(chunks(input_data1, N_CTIME_STEPS))
                critic_train_outputs1 = list(chunks(output_data1, N_CTIME_STEPS))

                # prepare data from the second rollout
                actions2 = theano_form(actions2, shape=(len(actions2), 1))
                observations2 = theano_form(observations2, shape=(len(observations2), 4))
                predicted_obs2 = concatenate([observations2[1::], [last_obs2]])
                input_data2 = concatenate([actions2, observations2], axis=1)
                output_data2 = concatenate([theano_form(rewards2, shape=(len(rewards2), 1)),
                                            predicted_obs2], axis=1)
                critic_train_inputs2 = list(chunks(input_data2, N_CTIME_STEPS))
                critic_train_outputs2 = list(chunks(output_data2, N_CTIME_STEPS))

                # anneal the critic's training target, but never below cost_confidence
                train_base_line = (700 - n * 6) / 2 if (700 - n * 6) / 2 > cost_confidence else cost_confidence

                # train critic on rollout 1 until its cost drops below the baseline
                count1 = 0
                while True:
                    count1 += 1
                    costs1 = []
                    for input, output in zip(critic_train_inputs1, critic_train_outputs1):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs1.append(train(critic_train_input, critic_train_output))
                    if mean(costs1) < train_base_line:
                        break
                    else:
                        if not count1 % 50:
                            print mean(costs1)
                        #print "mean cost 1: ", mean(costs1), "baseline :", train_base_line
                    if count1 > 1:
                        break

                # train critic on rollout 2 the same way
                count2 = 0
                while True:
                    count2 += 1
                    costs2 = []
                    for input, output in zip(critic_train_inputs2, critic_train_outputs2):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs2.append(train(critic_train_input, critic_train_output))
                    if mean(costs2) < train_base_line:
                        break
                    else:
                        if not count2 % 50:
                            print mean(costs2)
                        #print "mean cost 2: ", mean(costs2), "baseline :", train_base_line
                    if count2 > 1:
                        break

                previous_cost = sum(costs1) + sum(costs2)

            mreward = (reward1 + reward2) / 2.

            if baseline is None:
                # first learning step
                baseline = mreward
                fakt = 0.
                fakt2 = 0.
            else:
                # calc the gradients
                if reward1 != reward2:
                    # gradient estimate a la SPSA, but with likelihood gradient and normalization
                    fakt = (reward1 - reward2) / (2. * best_reward - reward1 - reward2)
                else:
                    fakt = 0.
                # normalized sigma gradient with moving-average baseline
                norm = (best_reward - baseline)
                if norm != 0.0:
                    fakt2 = (mreward - baseline) / (best_reward - baseline)
                else:
                    fakt2 = 0.0

            # update baseline
            baseline = 0.9 * baseline + 0.1 * mreward

            # update parameters and sigmas
            current = current + LEARNING_RATE * fakt * epsilon
            if fakt2 > 0:
                # sigma adaptation follows only positive gradients; apply update locally
                sigmas = sigmas + LEARNING_RATE * fakt2 * (epsilon * epsilon - sigmas * sigmas) / sigmas

            # test set
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            _, _, _, _, test_reward1 = one_iteration(task=task, all_params=current + epsilon)
            _, _, _, _, test_reward2 = one_iteration(task=task, all_params=current - epsilon)
            test_mreward = (test_reward1 + test_reward2) / 2.0
            arg_reward.append(test_mreward)

            print n
            if not n % 10:
                print "test_reward 1:", test_reward1
                _, _, _, _, sim_test_reward1 = one_sim_iteration(task=sim_task, all_params=current + epsilon)
                print "simulated reward 1:", sim_test_reward1
                print "test_reward 2:", test_reward2
                _, _, _, _, sim_test_reward2 = one_sim_iteration(task=sim_task, all_params=current - epsilon)
                print "simulated reward 2:", sim_test_reward2
                print "previous_cost:", previous_cost
                print "real_world_sample_count:", real_world_sample_count
                temp_arg = sum(arg_reward) / len(arg_reward)
                records[time].append([real_world_sample_count, temp_arg])
                print "best reward:", best_reward, "average reward:", temp_arg
                print
                arg_reward = []

        real_world_sample_counts.append(real_world_sample_count)

    #print records
    pickle.dump(records, open("records_lambda_mu.p", "wb"))
    pickle.dump(real_world_sample_counts, open("real_world_sample_counts_mu.p", "wb"))
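
# The symmetric-sampling update inside main() is compact and easy to misread.
# A standalone restatement with illustrative names (a sketch, assuming scalar
# rewards and numpy-array parameters; not part of the original file):
def spsa_like_update(current, sigmas, epsilon, reward1, reward2,
                     best_reward, baseline, learning_rate):
    """One SPSA-like parameter/sigma update, mirroring the loop above."""
    mreward = (reward1 + reward2) / 2.0
    # likelihood-gradient factor, normalized by distance to the best reward
    fakt = 0.0
    if reward1 != reward2:
        fakt = (reward1 - reward2) / (2.0 * best_reward - reward1 - reward2)
    # sigma-gradient factor against a moving-average baseline
    fakt2 = 0.0
    if best_reward != baseline:
        fakt2 = (mreward - baseline) / (best_reward - baseline)
    baseline = 0.9 * baseline + 0.1 * mreward
    current = current + learning_rate * fakt * epsilon
    if fakt2 > 0:  # follow only positive sigma gradients
        sigmas = sigmas + learning_rate * fakt2 * (epsilon ** 2 - sigmas ** 2) / sigmas
    return current, sigmas, baseline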
from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.environments.cartpole import CartPoleEnvironment, CartPoleRenderer, BalanceTask
from pybrain.rl.agents.learning import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment
from scipy import mean
import sys
import pylab

episodes = 1
epilen = 200

if len(sys.argv) < 5:
    sys.exit('please give 4 parameters. run: "python play_cartpole.py <p1> <p2> <p3> <p4>"\n')

# create environment
env = CartPoleEnvironment()
env.setRenderer(CartPoleRenderer())
env.getRenderer().start()
env.delay = (episodes == 1)

# create task
task = BalanceTask(env, epilen)

# create controller network
net = buildNetwork(4, 1, bias=False)

# create agent and set parameters from command line
agent = LearningAgent(net, None)
arg1 = float(sys.argv[1])
arg2 = float(sys.argv[2])
arg3 = float(sys.argv[3])
arg4 = float(sys.argv[4])
agent.module._setParameters([arg1, arg2, arg3, arg4])
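
# This variant additionally imports pylab, presumably to plot per-episode
# returns. A minimal sketch of such a continuation (an assumption, mirroring
# the non-plotting playback script above):
experiment = EpisodicExperiment(task, agent)
rewards = experiment.doEpisodes(episodes)

# one summed return per episode
returns = [sum(r) for r in rewards]
print returns, "mean:", mean(returns)

pylab.plot(returns, 'o-')
pylab.xlabel('episode')
pylab.ylabel('return')
pylab.show()
env.getRenderer().stop()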