Example #1
    def train(self, episodes, maxSteps):

        avgReward = 0

        # set up environment and task
        self.env = InfoMaxEnv(self.objectNames, self.actionNames,
                              self.numCategories)
        self.task = InfoMaxTask(self.env, maxSteps=maxSteps, \
           do_decay_beliefs = True, uniformInitialBeliefs = True)

        # create neural net and learning agent
        self.params = buildNetwork(self.task.outdim, self.task.indim, \
            bias=True, outclass=SoftmaxLayer)

        if self._PGPE:
            self.agent = OptimizationAgent(self.params,
                                           PGPE(minimize=False, verbose=False))
        elif self._CMAES:
            self.agent = OptimizationAgent(
                self.params, CMAES(minimize=False, verbose=False))

        # init and perform experiment
        exp = EpisodicExperiment(self.task, self.agent)

        for i in range(episodes):
            exp.doEpisodes(1)
            avgReward += self.task.getTotalReward()
            print "reward episode ", i, self.task.getTotalReward()

        # print initial info
        print "\naverage reward over training = ", avgReward / episodes

        # save trained network
        self._saveWeights()
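
The _saveWeights helper itself is not part of this snippet. A hypothetical sketch of what such a helper could look like, assuming the trained network is simply pickled the way Example #18 below stores its policy network (the file name policyNet.pkl is borrowed from that example, not confirmed for this class):

    def _saveWeights(self):
        # hypothetical sketch: pickle the trained network,
        # mirroring the pickle.dump(self.params, ...) call in Example #18
        import pickle
        with open('policyNet.pkl', 'wb') as f:
            pickle.dump(self.params, f)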
Example #2
def train():

    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # The task is the game this time
    task = environment

    # Make the reinforcement learning agent (use a network because inputs are continuous)
    network = ActionValueNetwork(task.nSenses, task.nActions)

    # Use NFQ (neural fitted Q iteration), the network-based variant of Q-learning
    learner = NFQ()
    learner.gamma = GAMMA

    agent = LearningAgent(network, learner)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the Learner
    meanScores = []
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Iteration ", i, " With mean score ", task.meanScore, "Max block achieved ", environment.maxGameBlock
        meanScores.append(task.meanScore)
        agent.learn()
        agent.reset()

    params = {"learningEpochs": LEARNING_EPOCHS, "gamesPerEpoch": GAMES_PER_EPOCH, "gamma": GAMMA }
    return meanScores, params, agent
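
A minimal usage sketch for this trainer, assuming LEARNING_EPOCHS, GAMES_PER_EPOCH and GAMMA are defined at module level as the code above expects; it just calls train() and plots the returned mean scores, much like the plotting done at the end of Example #24:

import matplotlib.pyplot as plt

meanScores, params, agent = train()
plt.plot(meanScores)
plt.title("Mean score per epoch (gamma = %s)" % params["gamma"])
plt.show()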
Example #3
class BaggerBot:
	def __init__(self, host, port, net=None):
		self.conn = ServerConnection(host, port)
		self.env = self.conn.env
		self.conn.join()
		self.task = SurviveTask(self.env, self.conn)
		self.net = buildNetwork(self.env.outdim, 4, self.env.indim, outclass=TanhLayer)
		self.agent = OptimizationAgent(self.net, PGPE())
		self.experiment = EpisodicExperiment(self.task, self.agent)

	def wait_connected(self):
		self.conn.wait_connected()

	def train(self):
		'''
		Infinitely play the game. Figure out the next move(s), parse incoming
		data, discard all that, do stupid stuff and die :)
		'''
		while self.env.in_game:
			# Ask to be spawned
			logging.info('Requesting spawn...')
			self.conn.send_spawn()
			while not self.env.playing:
				self.conn.parse_pregame()
			while self.env.playing:
				self.experiment.doEpisodes(100)
Example #4
	def train(self, episodes, maxSteps):
 	
		avgReward = 0

		# set up environment and task
		self.env = InfoMaxEnv(self.objectNames, self.actionNames, self.numCategories)
		self.task = InfoMaxTask(self.env, maxSteps=maxSteps, \
					do_decay_beliefs = True, uniformInitialBeliefs = True)

		# create neural net and learning agent
		self.params = buildNetwork(self.task.outdim, self.task.indim, \
						bias=True, outclass=SoftmaxLayer)

		if self._PGPE:
			self.agent = OptimizationAgent(self.params, PGPE(minimize=False,verbose=False))
		elif self._CMAES:
			self.agent = OptimizationAgent(self.params, CMAES(minimize=False,verbose=False))

		# init and perform experiment
		exp = EpisodicExperiment(self.task, self.agent)

		for i in range(episodes):        
			exp.doEpisodes(1)
			avgReward += self.task.getTotalReward()
			print "reward episode ",i,self.task.getTotalReward()

		# print initial info
		print "\naverage reward over training = ",avgReward/episodes

		# save trained network
		self._saveWeights()
Example #5
def train():

    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # Store the environment as the task
    task = environment

    # Set up the Neural Network
    neuralNet = buildNetwork(task.nSenses, HIDDEN_NODES, task.nActions)

    # Use a Genetic Algorithm as the Trainer
    trainer = GA( populationSize=20, topProportion=0.2, elitism=False
                , eliteProportion=0.25, mutationProb=0.1
                , mutationStdDev=0.2, tournament=False
                , tournamentSize=2 )

    agent = OptimizationAgent(neuralNet, trainer)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the network
    meanScores = []
    print "Starting HillClimberNN"
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Training Iteration", i, "With mean score ", task.meanScore, "Max block achieved ", environment.maxGameBlock
        environment.maxGameBlock = 0
        meanScores.append(task.meanScore)

    params = {"learningEpochs": LEARNING_EPOCHS, "gamesPerEpoch": GAMES_PER_EPOCH, "hiddenNodes": HIDDEN_NODES }
    return meanScores, params, experiment
Example #6
def someEpisodes(game_env, net, discountFactor=0.99, maxSteps=100, avgOver=1, returnEvents=False, exploretoo=True):
    """ Return the fitness value for one episode of play, given the policy defined by a neural network. """

    task = GameTask(game_env)
    game_env.recordingEnabled = True        
    game_env.reset()        
    net.reset()
    task.maxSteps=maxSteps
    agent = LearningAgent(net)
    agent.learning = False
    agent.logging = False
    exper = EpisodicExperiment(task, agent)
    fitness = 0
    for _ in range(avgOver):
        rs = exper.doEpisodes(1)
        # add a slight bonus for more exploration, if rewards are identical
        if exploretoo:
            fitness += len(set(game_env._allEvents)) * 1e-6
        # the true, discounted reward        
        fitness += sum([sum([v*discountFactor**step for step, v in enumerate(r)]) for r in rs])
    fitness /= avgOver
    if returnEvents:
        return fitness, game_env._allEvents
    else:
        return fitness
Example #7
def main():
    """
    The task represents one full simulation. Therefore it is episodic.
    Each episode calls performAction after passing getObservation to the agent.
    Once isFinished is true, the reward is returned and one simulation is done.

    The net is the neural network. It has 7 input nodes, a hidden layer of 5
    nodes, and 2 output nodes. It is a feed-forward network using sigmoid
    activation functions.

    OptimizationAgent(module, learner)
    EpisodicExperiment.optimizer = learner
    learner.setEvaluator(task, module)
    optimizer.learn()
    """
    task = LanderTask(batchSize=1)
    net = buildNetwork(task.indim, 5, task.outdim)
    learner = StochasticHillClimber()
    agent = OptimizationAgent(net, learner)
    experiment = EpisodicExperiment(task, agent)
    experiment.doEpisodes(100000)

    tasks = [LanderTask(environment=Lander(acceleration=float(i)))
             for i in range(1, 4)]
    test_size = 1000
    for task in tasks:
        print("Running task with acceleration {}".format(task.env.acceleration))
        success = 0
        for _ in range(test_size):
            task.env.reset()
            while not task.isFinished():
                observation = task.getObservation()
                action = net.activate(observation)
                task.performAction(action)
            print("Finished a simulation with result {}".format(task.env.status))
            if task.env.status == 'landed':
                success += 1
        print("Succeeded {} times out of {}".format(success, test_size))
Example #8
def main():
    client_id = Utils.connectToVREP()

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = MyActionValueTable()
    learner = Q(0.5, 0.9)
    learner.explorer = EpsilonGreedyExplorer(0.15, 1)  # EpsilonGreedyBoltzmannExplorer()
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)

    controller.initialize(agent)

    i = 0
    try:
        while True:
            i += 1
            print('Episode ' + str(i))
            experiment.doEpisodes()
            agent.learn()
            agent.reset()
            print('mean: '+str(numpy.mean(controller.params)))
            print('max: '+str(numpy.max(controller.params)))
            print('min: '+str(numpy.min(controller.params)))

            if i % 500 == 0:  # Save q-table every 500 episodes
                print('Save q-table')
                controller.save()
                task.t_table.save()

    except (KeyboardInterrupt, SystemExit):
        with open('../data/standing-up-q.pkl', 'wb') as handle:
            pickle.dump(controller.params, handle)
        task.t_table.save()
        controller.save()

    vrep.simxFinish(client_id)
Example #9
def main():
    vrep.simxFinish(-1)  # just in case, close all opened connections
    client_id = vrep.simxStart('127.0.0.1', 19997, True, True, 5000,
                               5)  # Connect to V-REP

    if client_id < 0:
        print('Failed connecting to remote API server')
        return -1

    print('Connected to remote API server')

    # Define RL elements
    environment = StandingUpEnvironment(client_id)

    task = StandingUpTask(environment)

    controller = ActionValueTable(task.get_state_space_size(),
                                  task.get_action_space_size())
    controller.initialize(1.)

    file = open('standing-up-q.pkl', 'rb')
    controller._params = pickle.load(file)
    file.close()

    # learner = Q()
    agent = LearningAgent(controller)

    experiment = EpisodicExperiment(task, agent)

    i = 0
    while True:
        i += 1
        print('Iteration n° ' + str(i))
        experiment.doEpisodes(1)

    vrep.simxFinish(client_id)
Example #10
def someEpisodes(game_env, net, discountFactor=0.99, maxSteps=100, avgOver=1, returnEvents=False):
    """ Return the fitness value for one episode of play, given the policy defined by a neural network. """
    task = GameTask(game_env)
    game_env.recordingEnabled = True        
    game_env.reset()        
    net.reset()
    task.maxSteps=maxSteps
    agent = LearningAgent(net)
    agent.learning = False
    agent.logging = False
    exper = EpisodicExperiment(task, agent)
    fitness = 0
    for _ in range(avgOver):
        rs = exper.doEpisodes(1)
        # add a slight bonus for more exploration, if rewards are identical
        fitness += len(set(game_env._allEvents)) * 1e-6
        # the true, discounted reward        
        fitness += sum([sum([v*discountFactor**step for step, v in enumerate(r)]) for r in rs])
    fitness /= avgOver
    if returnEvents:
        return fitness, game_env._allEvents
    else:
        return fitness
Example #11
def plotPerformance(values, fig):
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
    # Without the next line, the pyplot plot won't actually show up.
    plt.pause(0.001)

performance = []

if not render:
    pf_fig = plt.figure()

while True:
    # one learning step after one episode of world-interaction
    experiment.doEpisodes(1)
    agent.learn(1)

    # test performance (these real-world experiences are not used for training)
    if render:
        env.delay = True
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    env.delay = False
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    if not render:
        plotPerformance(performance, pf_fig)
Example #12
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
    # Without the next line, the pyplot plot won't actually show up.
    plt.pause(0.001)


performance = []

if not render:
    pf_fig = plt.figure()

while (True):
    # one learning step after one episode of world-interaction
    experiment.doEpisodes(1)
    agent.learn(1)

    # test performance (these real-world experiences are not used for training)
    if render:
        env.delay = True
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(5)])
    env.delay = False
    testagent.reset()
    experiment.agent = agent

    performance.append(r)
    if not render:
        plotPerformance(performance, pf_fig)
Example #13
agent = GPSARSA_Agent(learner)
agent.logging = True

exp = EpisodicExperiment(
    task,
    agent)  # epsilon greedy exploration (with and without use of uncertainty)
plt.ion()

i = 1000
performance = []  #reward accumulation, dump variable for any evaluation metric
sum = []
agent.reset()
i = 0
for num_exp in range(100):

    performance = exp.doEpisodes(1)
    sum = np.append(sum, np.sum(performance))

    if (num_exp % 10 == 0):
        agent.init_exploration -= agent.init_exploration * 0.10

    agent.learn()
    print('alpha', np.dot(learner.inv, learner.ret_reward().T))

    agent.reset()
'''
b=learner.ret_cov()

a=learner.state_dict

print(sum)
Example #14
class RLModel():
    def __init__(self,
                 rl_params,
                 parameter_names,
                 env,
                 task,
                 clean_after_call=False):
        """

        Parameters
        ----------
        rl_params : RLParams
        parameter_names : parameter names in order
        env : Environment model
        task : EpisodicTask instance
        clean_after_call: bool
        """
        self.rl_params = rl_params
        self.parameter_names = parameter_names
        self.env = env
        self.task = task
        self.agent = None
        self.clean_after_call = clean_after_call

    def to_dict(self):
        return {
            "rl_params": self.rl_params.to_dict(),
            "parameters": self.parameters,
        }

    def train_model(self, parameter_values, random_state=None):
        self._parameters(parameter_values)
        self._build_model(random_state)
        self._train_model()

    def __call__(self,
                 *parameter_values,
                 index_in_batch=None,
                 random_state=None):
        """ Simulates data.
        Interfaces to ELFI as a sequential simulator.

        Parameters
        ----------
        parameter_values : list of model variables
            Length should equal length of parameters
        random_state: random number generator

        Returns
        -------
        Simulated trajectories as a dict
        """
        print("SIM AT", parameter_values)
        self.train_model(parameter_values, random_state=random_state)
        log_dict = self.simulate(random_state)
        if self.clean_after_call is True:
            self.clean()
        return log_dict

    def get_policy(self):
        """ Returns the current policy of the agent
        """
        return self.agent.get_policy()

    def _parameters(self, parameter_values):
        """ Parse parameter values
        """
        self.p = dict()
        if len(self.parameter_names) != len(parameter_values):
            raise ValueError(
                "Number of model variables was {} ({}), expected {}".format(
                    len(parameter_values), parameter_values,
                    len(self.parameter_names)))
        for name, val in zip(self.parameter_names, parameter_values):
            self.p[name] = float(val)
        logger.debug("Model parameters: {}".format(self.p))

    def _build_model(self, random_state):
        """ Initialize the model
        """
        self.env.setup(self.p, random_state)
        self.task.setup(self.p)
        outdim = self.task.env.outdim
        n_actions = self.task.env.numActions
        self.agent = RLAgent(outdim,
                             n_actions,
                             random_state,
                             rl_params=self.rl_params)
        logger.debug("Model initialized")

    def _train_model(self):
        """ Uses reinforcement learning to find the optimal strategy
        """
        self.experiment = EpisodicExperiment(self.task, self.agent)
        n_epochs = int(self.rl_params.n_training_episodes /
                       self.rl_params.n_episodes_per_epoch)
        logger.debug(
            "Fitting user model over {} epochs, each {} episodes, total {} episodes."
            .format(n_epochs, self.rl_params.n_episodes_per_epoch,
                    n_epochs * self.rl_params.n_episodes_per_epoch))
        for i in range(n_epochs):
            logger.debug("RL epoch {}".format(i))
            self.experiment.doEpisodes(self.rl_params.n_episodes_per_epoch)
            self.agent.learn()
            self.agent.reset()  # reset buffers

    def simulate(self, random_state):
        """ Simulates agent behavior in 'n_sim' episodes.
        """
        logger.debug("Simulating user actions ({} episodes)".format(
            self.rl_params.n_simulation_episodes))
        self.experiment = EpisodicExperiment(self.task, self.agent)

        # set training flag off
        self.task.env.training = False
        # deactivate learning for experiment
        self.agent.learning = False
        # deactivate exploration
        explorer = self.agent.learner.explorer
        self.agent.learner.explorer = EGreedyExplorer(
            epsilon=0, decay=1, random_state=random_state)
        self.agent.learner.explorer.module = self.agent.module
        # activate logging
        self.task.env.start_logging()

        # simulate behavior
        self.experiment.doEpisodes(self.rl_params.n_simulation_episodes)
        # store log data
        dataset = self.task.env.log

        # deactivate logging
        self.task.env.end_logging()
        # reactivate exploration
        self.agent.learner.explorer = explorer
        # reactivate learning for experiment
        self.agent.learning = True
        # set training flag back on
        self.task.env.training = True

        return dataset

    def clean(self):
        self.agent = None
        self.env.clean()
        self.task.clean()
        gc.collect()
Example #15
# initialize parameters (variance)
#agent.setSigma([-2.])
# learning options
agent.learner.alpha = 0.1
# agent.learner.rprop = True
experiment = EpisodicExperiment(task, agent)

best = 0.0
base = 0.0
rew = 0.0

for updates in range(1000):

    # testing step
    agent.disableLearning()
    experiment.doEpisodes(2)

    # append mean reward to sr array
    ret = []
    for n in range(agent.history.getNumSequences()):
        state, action, reward = agent.history.getSequence(n)
        ret.append(sum(reward, 0).item())
    rew = mean(ret)
    base = 0.9 * base + 0.1 * rew
    if rew > best: best = rew
    print "Parameters:", agent.module.params, "Epsilon: ", agent.learner.epsilon, "Best: ", best, "Base: ", base, "Reward %f\n" % rew
    agent.enableLearning()
    agent.reset()

    # training step
    for i in range(5):
Example #16
# create agent with controller and learner
agent = FiniteDifferenceAgent(net, SPLA())
# learning options
agent.learner.gd.alpha = 0.05
agent.learner.gdSig.alpha = 0.1
agent.learner.gd.momentum = 0.0
agent.learner.epsilon = 2.0
agent.learner.initSigmas()

sr = []

experiment = EpisodicExperiment(task, agent)
for updates in range(1000):
    # training step
    for i in range(5):
        experiment.doEpisodes(10)
        agent.learn()
        print "parameters:", agent.module.params
        agent.reset()
        
    # learning step
    agent.disableLearning()
    experiment.doEpisodes(50)
    # append mean reward to sr array
    ret = []
    for n in range(agent.history.getNumSequences()):
        state, action, reward, _ = agent.history.getSequence(n)
        ret.append( sum(reward, 0).item() )
    sr.append(mean(ret))
        
    agent.enableLearning()
Example #17
def run(nao,pad):

    # ################################
    # choose bottom cam, so nao can see object when standing next to it
    nao.camera.selectCam(1)
    
    env = grabbingEnvironment(nao)
    #env.connect(nao)

    task = grabbingTask(env)

    net = buildNetwork(len(task.getObservation()),8, env.indim, bias = True, recurrent=True)
    print env.indim
    #net = ActionValueNetwork(5,4)
    #, outclass=TanhLayer)
    #, hiddenclass=TanhLayer, outclass=TanhLayer

    # not correct right now..
    # TODO: train into RL Modules, dataset needs to be merged with exploration data
    #generateTraining.generateTraining().runDeltaMovements(nao,net,env,pad)


    #module = ActionValueNetwork(3, 3)
    #module = NeuronLayer(40)

    #agent = LearningAgent(net, SARSA())
    #learner = PolicyGradientLearner()
    #learner._setExplorer(StateDependentExplorer(3,3))
    #learner._setModule(module)
    #agent = LearningAgent(module, learner)
    #agent = LearningAgent(net, ENAC())
    #agent = LearningAgent(net, Reinforce())

    #learner = NFQ()
    #learner.explorer.epsilon = 0.4
    #agent = LearningAgent(net, learner)

    testagent = OptimizationAgent(net,None,env)
    #agent = LearningAgent(module, Q())
    #agent = LearningAgent(module, QLambda())
    learner = grabbingPGPE(storeAllEvaluations = True, verbose = True, epsilon = 1.0, deltamax =5.0, sigmaLearningRate = 0.1, learningRate = 0.2)
    agent = OptimizationAgent(net, learner,env)
    #agent = OptimizationAgent(net, SimpleSPSA(storeAllEvaluations = True, verbose = True))
    #agent = OptimizationAgent(net, HillClimber(storeAllEvaluations = True, verbose = True))

    #agent = OptimizationAgent(net, RandomSearch(storeAllEvaluations = True, verbose = True))
    
    experiment = EpisodicExperiment(task, agent)
    # only for optimizationAgent
    #experiment.doOptimization = True

    # only for simulator!
    nao.fractionMaxSpeed = 1.0



    print "#env"
    print "  sensors:", env.outdim
    print "  actions:", env.indim
    print "  discreteStates:", env.discreteStates
    print "  discreteActions:", env.discreteActions
    
    print
    print "#task"
    print "  sensor_limits:", task.sensor_limits
    print "  actor_limits:", task.actor_limits
    print "  epilen: ", task.epiLen
    print "#EpisodicTask"
    print "  discount:", task.discount
    print "  batchsize:", task.batchSize
    

    print
    print "#PGPE"
    print "  exploration type:", grabbingPGPE().exploration
    print "  LearningRate:", grabbingPGPE().learningRate
    print "  sigmaLearningRate:", grabbingPGPE().sigmaLearningRate
    print "  epsilon:", grabbingPGPE().epsilon
    print "  wDecay:", grabbingPGPE().wDecay
    print "  momentum:", grabbingPGPE().momentum
    print "  rprop:", grabbingPGPE().rprop



#    # switch this to True if you want to see the cart balancing the pole (slower)
#    render = False
#
#    plt.ion()
#
#    env = CartPoleEnvironment()
#    if render:
#        renderer = CartPoleRenderer()
#        env.setRenderer(renderer)
#        renderer.start()
#
#    module = ActionValueNetwork(4, 3)
#
#    task = DiscreteBalanceTask(env, 100)
#    learner = NFQ()
#    learner.explorer.epsilon = 0.4
#
#    agent = LearningAgent(module, learner)
#    testagent = LearningAgent(module, None)
#    experiment = EpisodicExperiment(task, agent)
#
#    performance = []
#
#    if not render:
#        pf_fig = plt.figure()

    count = 0
    while(True):
            # one learning step after one episode of world-interaction
        count += 1
        print "learning #",count
        experiment.agent = agent
        experiment.doOptimization = True
        erg = experiment.doEpisodes(1)
        print erg
        #experiment.doOptimization = False
        #print "agent learn"
        #agent.learner.learn(1)

        if count > 8:
        # test performance (these real-world experiences are not used for training)
#        if render:
#            env.delay = True
            #experiment.agent = testagent
            print "testing"
            experiment.doOptimization = False

            erg = experiment.doEpisodes(1)
            summe = 0
            #print erg
#            for x in erg:
#                summe = sum(x)
#            print summe
        #r = mean([sum(x) for x in experiment.doEpisodes(5)])
#        env.delay = False
#            testagent.reset()
        

#        performance.append(r)
#        if not render:
#            plotPerformance(performance, pf_fig)

#        print "reward avg", r
#        print "explorer epsilon", learner.explorer.epsilon
#        print "num episodes", agent.history.getNumSequences()
#        print "update step", len(performance)




#    #for updates in range(5000000)
#    updates = 0
#    episodes = 10
#    while True:
#        updates += 1
#        #raw_input("next episode")
#        print "lerne episode:",updates
#        experiment.doEpisodes(episodes)
##        print "lernen beendet, starte testlauf"
#        env.reset()
##        if updates > 0:
##            experiment.doInteractions(20)
##            rewsum = 0
##            rewlist = []
##            for i in range (0,100):
##                rew = task.performAction(net.activate(task.getObservation()))
##
##                task.getObservation()
##                rewlist.append(rew)
##                rewsum += rew
#                #print "  testlauf: ",updates,"aktion: ",i+1," reward: ",rew
##            print "-> summe = ",rewsum, " avg: ",rewsum / 100.0
#            #print "episodes:",updates," rewsum: ",rewsum," testrewards:",rewlist
##            #x = "episode:" + updates + " testrewards:" + rewlist
##            #o.write(x)
##            for i in range(0,len(rewlist)):
##                x = (updates % 20) - 10
##                y = i - 10
##                z = rewlist[i]
##                #g.plot((x,y,z),x=1, y=2, z=3)
#
#            #g.doplot()
#
#
#
#        #print "-------------------------------------------------------------------"

    print "finished grabbingTest"
Example #18
	def train(self, size, goal, initPose, mapSelect, envSelect, episodes, maxSteps, goalTol, randomizeInitPose):
 	
		avgReward = 0

		# set up environment and task
		self.env = mazeEnv(size, goal, initPose, mapSelect, envSelect, randomizeInitPose)
		self.task = MDPMazeTaskEpisodic(self.env, maxSteps, goalTol)

		# create neural net and learning agent
		self.params = buildNetwork(self.task.outdim, 48, self.task.indim, \
		bias=True, outclass=SoftmaxLayer)

		if self._PGPE:
			self.agent = OptimizationAgent(self.params, PGPE(minimize=True,verbose=False))
		elif self._CMAES:
			self.agent = OptimizationAgent(self.params, CMAES(minimize=True,verbose=False))

		# init experiment
		exp = EpisodicExperiment(self.task, self.agent)

		for i in range(0, episodes):        
			exp.doEpisodes()
			avgReward += self.task.getTotalReward()
			print "reward episode ",i,self.task.getTotalReward()

		# print initial info
		print "\naverage reward over training = ",avgReward/episodes

		# import weights into network and save network
		if self._PGPE:
			for i in range(len(self.params.params)):
				self.params.params[i] = self.agent.learner.current[i]
			pickle.dump(self.params, open('policyNet.pkl','w'))

		elif self._CMAES:

			################ following code came from WWInfoMaxCMAES.py script from ICDL 2010 paper
			arz = randn(self.agent.learner.numParameters, self.agent.learner.batchSize)
			arx = tile(self.agent.learner.center.reshape(self.agent.learner.numParameters, 1),\
			(1, self.agent.learner.batchSize)) + \
			self.agent.learner.stepSize * dot(dot(self.agent.learner.B, self.agent.learner.D), arz)
			# Go through the parameters and pick the current best 
			arfitness = zeros(self.agent.learner.batchSize)
			for k in xrange(self.agent.learner.batchSize):
			  self.agent.learner.wrappingEvaluable._setParameters(arx[:, k]);
			  arfitness[k] = self.agent.learner._BlackBoxOptimizer__evaluator\
			(self.agent.learner.wrappingEvaluable)

			# Sort by fitness and compute weighted mean into center
			tmp = sorted(map(lambda (x, y): (y, x), enumerate(ravel(arfitness))))
			arfitness = array(map(lambda x: x[0], tmp))
			arindex = array(map(lambda x: int(x[1]), tmp))

			arz = arz[:, arindex]
			curparams = arx[:, arindex[0]];

			# update network weights with selected parameters
			for i in range(len(self.params.params)):
				self.params.params[i] = curparams[i]
			# save trained network
			pickle.dump(self.params, open('policyNet.pkl','w'))
Example #19
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)


    
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"],desiredValue=None)

    #print "dim: ", task.indim, task.outdim

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)

    # % of random actions
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations = True,storeAllEvaluated=False, maxEvaluations=None,desiredEvaluation=1, verbose=False))
#
#    print agent
#    from pprint import pprint
#    pprint (vars(agent.learner))
    
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    
    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    for episode in range(0,m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        #agent.learn(1)
    
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        
        if (episode) % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            
            #experiment.agent = testagent
            #r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            #for i in range(0,parameters["TestWith"]):
#            y = testexperiment.doEpisodes(1)
#            print (agent.learner._allEvaluated)
#                
#            
#            from pprint import pprint
#            pprint (vars(task))
                
            l = parameters["TestWith"]
            
            task.N = parameters["MaxRunsPerEpisodeTest"]
            experiment.doEpisodes(l)
            task.N = parameters["MaxRunsPerEpisode"]

            resList = (agent.learner._allEvaluations)[-l:-1]
            
#            print agent.learner._allEvaluations
            from scipy import array

            rLen = len(resList)
            avReward = array(resList).sum()/rLen
#            print avReward
#            print resList
#            exit(0)
#            print("Parameters:", agent.learner._bestFound())
#            print(
#                " Evaluation:", episode,
#                " BestReward:", agent.learner.bestEvaluation,
#                " AverageReward:", avReward)
#            if agent.learner.bestEvaluation == 0:
#                
#                print resList[-20:-1]
#                print "done"
#                break
            performance.append(avReward)
            

            env.delay = False
            testagent.reset()
            #experiment.agent = agent
        
#            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
#    print "done"
    return performance
            
        #print "network",   json.dumps(module.bn.net.E, indent=2)
            
            
#import sumatra.parameters as p
#import sys
#parameter_file = sys.argv[1]
#parameters = p.SimpleParameterSet(parameter_file)
#
#
#run(["BalanceTask",parameters])
Example #20
def main():
    if len(sys.argv) != 2:
        print 'Please provide a path to a model data directory.'
        print ('The script will load the newest model data from the directory, '
               'then continue to improve that model')
        sys.exit(0)

    model_directory = sys.argv[1]
    existing_models = sorted(glob(os.path.join(model_directory, '*.rlmdl')))

    if existing_models:
        newest_model_name = existing_models[-1]
        iteration_count = int(newest_model_name[-12:-6]) + 1
        print 'Loading model {}'.format(newest_model_name)

        newest_model = open(newest_model_name, 'r')
        agent = pickle.load(newest_model)
    else:
        net = buildNetwork(Environment.outdim,
                           Environment.outdim + Environment.indim,
                           Environment.indim)
        agent = OptimizationAgent(net, PGPE())
        iteration_count = 1

    environment = Environment(LOCAL_HOST, PORT, PATH_TO_SCENE)
    task = Task(environment)


    experiment = EpisodicExperiment(task, agent)


    def signal_handler(signal, frame):
        print 'Exiting gracefully'
        environment.teardown()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)


    while True:
        time.sleep(1)

        print '>>>>> Running iteration {}'.format(iteration_count)
        # NOTE this weird stuff is hacky, but we need it to plug in our autosave
        # stuff properly. Took a long time to figure this out.
        experiment.optimizer.maxEvaluations = experiment.optimizer.numEvaluations + experiment.optimizer.batchSize

        try:
            experiment.doEpisodes()
        except Exception as e:
            print 'ERROR RUNNING SIMULATION: \n{}'.format(e)
            environment.teardown()
        else:
            if iteration_count % AUTOSAVE_INTERVAL == 0:
                filename = str(iteration_count).zfill(6) + '.rlmdl'
                filename = os.path.join(model_directory, filename)
                f = open(filename, 'w+')
                print 'Saving model to {}'.format(filename)

                pickle.dump(agent, f)

            iteration_count += 1

        print 'Iteration finished <<<<<'
Example #21
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()


performance = []

sv_fig = plt.figure()
pf_fig = plt.figure()

# experiment.doEpisodes(50)
    
while(True):
    env.delay = True
    experiment.doEpisodes(10)
    env.delay = False

    while agent.history.getNumSequences() > 50:
        agent.history.removeSequence(0)
        
    agent.learn(20)
    
    experiment.agent = testagent
    r = mean([sum(x) for x in experiment.doEpisodes(20)])

    testagent.reset()
    experiment.agent = agent
    

    performance.append(r) 
Example #22
  side = 9
  goal = 3,2

  env = mazeEnv(structure, goal)   #use maze environment for now; note pos is Y,X

  # our own task and environment for later
  #env = policyEnv()
  thetask = MDPMazeTaskEpisodic(env)

  # create neural net; create and train agent
  theparams = buildNetwork(thetask.outdim, thetask.indim, bias=False)
  agent = OptimizationAgent(theparams, CMAES())
  exp = EpisodicExperiment(thetask, agent)

  # train agent        
  exp.doEpisodes(NUM_EPISODES)
  print "\ntotal reward = ",thetask.getTotalReward()

  #print "\n"
  #print "initial weights: "; print theparams.params
  print "\n"
  print "NOTE positions below are (Y,X)"

  print "\n"
  print "getting observation 1"
  print "robot = ",thetask.getObservation()
  print "goal  = ",goal
  print "reward: ", thetask.getReward()

  print "\n"
  print "performing action 1"
Example #23
def run_experiment():
    # Create the controller network
    HIDDEN_NODES = 4

    RUNS = 2
    BATCHES = 1
    PRINTS = 1
    EPISODES = 500

    env = None
    start_state_net = None

    run_results = []

    # Set up plotting tools for the experiments
    tools = ExTools(BATCHES, PRINTS)

    # Run the experiment
    for run in range(RUNS):
        if run == 0:
            continue

        # If an environment already exists, shut it down
        if env:
            env.closeSocket()

        # Create the environment
        env = create_environment()

        # Create the task
        task = Pa10MovementTask(env)

        # Create the neural network. Only create the network once so it retains
        # the same starting values for each run.
        if start_state_net:
            net = start_state_net.copy()
        else:
            # Create the initial neural network
            net = create_network(
                    in_nodes=env.obsLen,
                    hidden_nodes=HIDDEN_NODES,
                    out_nodes=env.actLen
            )
            start_state_net = net.copy()

        # Create the learning agent
        learner = HillClimber(storeAllEvaluations=True)
        agent = OptimizationAgent(net, learner)
        tools.agent = agent

        # Create the experiment
        experiment = EpisodicExperiment(task, agent)

        # Perform all episodes in the run
        for episode in range(EPISODES):
            experiment.doEpisodes(BATCHES)

        # Calculate results
        all_results = agent.learner._allEvaluations
        max_result = np.max(all_results)
        min_result = np.min(all_results)
        avg_result = np.sum(all_results) / len(all_results)
        run_results.append((run, max_result, min_result, avg_result))

        # Make the results directory if it does not exist
        if not os.path.exists(G_RESULTS_DIR):
            os.mkdir(G_RESULTS_DIR)

        # Write all results to the results file
        with open(os.path.join(G_RESULTS_DIR, 'run_%d.txt' % run), 'w+') as f:
            # Store the calculated max, min, avg
            f.write('RUN, MAX, MIN, AVG\n')
            f.write('%d, %f, %f, %f\n' % (run, max_result, min_result, avg_result))

            # Store all results from this run
            f.write('EPISODE, REWARD\n')
            for episode, result in enumerate(all_results):
                f.write('%d, %f\n' % (episode, result))

    return
Example #24
# use NFQ (neural fitted Q iteration), the network-based variant of Q-learning
learner = NFQ()

agent = LearningAgent(controller, learner)



#set up an experiment
experiment = EpisodicExperiment(task, agent)

meanscores = []
m = 0.0
for i in xrange(learning_eps):
    print i
    experiment.doEpisodes(games_per_ep)
    meanscores.append(task.meanscore)
    if meanscores[-1] > m:
        m = meanscores[-1]
        f = open("bestRL.pkl",'w')
        pickle.dump(agent,f)
        f.close()
    agent.learn()
    agent.reset()

import matplotlib.pyplot as plt
plt.plot(meanscores)

plt.title("Mean Agent Score Per Batch")
plt.show()
Example #25
from pybrain.rl.experiments import EpisodicExperiment

# any episodic task
task = BalanceTask()

# any neural network controller
net = buildNetwork(task.outdim, 1, task.indim)

# any optimization algorithm to be plugged in, for example:
# learner = CMAES(storeAllEvaluations = True)
# or:
learner = HillClimber(storeAllEvaluations=True)

# in a non-optimization case the agent would be a LearningAgent:
# agent = LearningAgent(net, ENAC())
# here it is an OptimizationAgent:
agent = OptimizationAgent(net, learner)

# the agent and task are linked in an Experiment
# and everything else happens under the hood.
exp = EpisodicExperiment(task, agent)
exp.doEpisodes(100)

print('Episodes learned from:', len(learner._allEvaluations))
n, fit = learner._bestFound()
print('Best fitness found:', fit)
print('with this network:')
print(n)
print('containing these parameters:')
print(fListToString(n.params, 4))
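
The comments above mention the non-optimization alternative. A minimal sketch of that variant, assuming the same BalanceTask and network as in this example; with a LearningAgent the policy-gradient learner (ENAC here) has to be invoked explicitly after collecting episodes:

from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.directsearch.enac import ENAC

# learning variant of the example above: the learner updates the
# network in place each time agent.learn() is called
agent = LearningAgent(net, ENAC())
exp = EpisodicExperiment(task, agent)
for _ in range(100):
    exp.doEpisodes(1)
    agent.learn()
    agent.reset()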
Example #26
env.getRenderer().start()
env.delay = (episodes == 1)

# create task
task = BalanceTask(env, epilen)

# create controller network
net = buildNetwork(4, 1, bias=False)

# create agent and set parameters from command line
agent = LearningAgent(net, None)
agent.module._setParameters([float(sys.argv[1]), float(sys.argv[2]), float(sys.argv[3]), float(sys.argv[4])])

# create experiment
experiment = EpisodicExperiment(task, agent)
experiment.doEpisodes(episodes)

# run environment
ret = []
for n in range(agent.history.getNumSequences()):
    returns = agent.history.getSequence(n)
    reward = returns[2]
    ret.append( sum(reward, 0).item() )

# print results
print ret, "mean:",mean(ret)
#env.getRenderer().stop()



Example #27
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", task,parameters
    
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed)
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    env.randomInitialization = False
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, 50)

    #print "dim: ", task.indim, task.outdim
    
    # to inputs state and 4 actions
    bmodule = ActionValueRAND(task.outdim, task.indim)
    rlearner = RAND()

    blearner = RAND()
    # % of random actions
    
    bagent = LearningAgent(bmodule, rlearner)
    
    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)

    # % of random actions
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations = True,storeAllEvaluated=True, maxEvaluations=None, verbose=False))


    
    
    testagent = LearningAgent(module, None)
    pgpeexperiment = EpisodicExperiment(task, agent)
    randexperiment = EpisodicExperiment(task, bagent)


    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    
    ## train pgpe
    for episode in range(0,50):
        # one learning step after one episode of world-interaction
        y =pgpeexperiment.doEpisodes(1)
        
    be, bf = agent.learner._bestFound()
    print be,bf
    
    print "generate data"
    be.numActions = 1
    gdagent = LearningAgent(be, blearner)
    experiment = EpisodicExperiment(task, gdagent)
    
    for episode in range(0,1000):
#        print episode, " of 1000"
        # one learning step after one episode of world-interaction
        y =experiment.doEpisodes(1)
        
#        print y
        x = randexperiment.doEpisodes(1)
#        print len(y[0])
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        

        l = 5
        resList = (agent.learner._allEvaluations)[-l:-1]
        
#            print agent.learner._allEvaluations
        from scipy import array

        rLen = len(resList)
        avReward = array(resList).sum()/rLen
#            print avReward
#            print resList
#            exit(0)
#            print("Parameters:", agent.learner._bestFound())
#            print(
#                " Evaluation:", episode,
#                " BestReward:", agent.learner.bestEvaluation,
#                " AverageReward:", avReward)
#            if agent.learner.bestEvaluation == 0:
#                
#                print resList[-20:-1]
#                print "done"
#                break
        #print resList
        performance.append(avReward)
        

        env.delay = False
        testagent.reset()
        #experiment.agent = agent
    
#            performance.append(r)
        if plot:
            plotPerformance(performance, pf_fig)
            
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
    blearner.add_ds(rlearner.dataset)
    
    blearner.learn()
    #blearner.learnX(agent.learner._allEvaluated)
    print "done"
    return performance
Example #28
hiddenUnits = 4
batch=2 #number of samples per learning step
prnts=1 #number of learning steps after results are printed
epis=5000000/batch/prnts #number of rollouts
numbExp=10 #number of experiments
et = ExTools(batch, prnts) #tool for printing and plotting

for runs in range(numbExp):
    # create environment
    #Options: Bool(OpenGL), Bool(Realtime simu. while client is connected), ServerIP(default:localhost), Port(default:21560)
    env = FlexCubeEnvironment()
    # create task
    task = WalkTask(env)
    # create controller network
    net = buildNetwork(len(task.getObservation()), hiddenUnits, env.actLen, outclass=TanhLayer)    
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(net, SimpleSPSA(storeAllEvaluations = True))
    et.agent = agent
     # create the experiment
    experiment = EpisodicExperiment(task, agent)

    #Do the experiment
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)
        et.printResults((agent.learner._allEvaluations)[-50:-1], runs, updates)
    et.addExps()
et.showExps()
#To view what the simulation is doing at the moment, go to pybrain/rl/environments/flexcube/ and start renderer.py (python-openGL must be installed)
Example #29
    # set up environment, task, neural net, agent, and experiment
    env = InfoMaxEnv(object_names, action_names, num_objects, False)
    task = InfoMaxTask(env, max_steps=max_steps)
    net = buildNetwork(task.outdim,
                       task.indim,
                       bias=True,
                       outclass=SoftmaxLayer)

    if algorithm == 'pgpe':
        agent = OptimizationAgent(
            net, PGPE(storeAllEvaluations=True, minimize=False, verbose=False))
    elif algorithm == 'cmaes':
        agent = OptimizationAgent(net, CMAES(minimize=False, verbose=False))

    experiment = EpisodicExperiment(task, agent)
    experiment.doEpisodes(1)

    #agent.learner.wrappingEvaluable._setParameters(best_params2)
    agent.learner._setInitEvaluable(best_params2)

    joint_probs_learned = np.zeros(
        (num_best_test_runs, num_objects, max_steps, num_categories))
    joint_probs_handcoded = np.zeros(
        (num_best_test_runs, num_objects, max_steps, num_categories))
    learned_steps = []
    handcoded_steps = []

    learned_true = []
    handcoded_true = []

    avg_prob_learned = np.zeros((num_best_test_runs, num_objects, max_steps))
Example #30
ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)


def update_wheel_trajectories():
    front_lines = ax2.plot(env.get_xfhist(), env.get_yfhist(), 'r')
    back_lines = ax2.plot(env.get_xbhist(), env.get_ybhist(), 'b')
    plt.axis('equal')


perform_cumrewards = []
for irehearsal in range(7000):

    # Learn.
    # ------
    r = exp.doEpisodes(1)
    # Discounted reward.
    cumreward = exp.task.getTotalReward()
    #print 'cumreward: %.4f; nsteps: %i; learningRate: %.4f' % (
    #        cumreward, len(r[0]), exp.agent.learner.learningRate)

    if irehearsal % 50 == 0:
        # Perform (no learning).
        # ----------------------
        # Swap out the agent.
        exp.agent = performance_agent

        # Perform.
        r = exp.doEpisodes(1)
        perform_cumreward = task.getTotalReward()
        perform_cumrewards.append(perform_cumreward)
Example #31
    # learning options
    agent.learner.gd.alpha = 0.3 #step size of \mu adaption
    agent.learner.gdSig.alpha = 0.15 #step size of \sigma adaption
    agent.learner.gd.momentum = 0.0
    batch=2 #number of samples per gradient estimate (was: 2; more here due to stochastic setting)
    #create experiment
    experiment = EpisodicExperiment(task, agent)
    prnts=1 #frequency of console output
    epis=2000/batch/prnts
    
    #actual roll outs
    filename="dataSPLA08NoRew"+repr(int(random.random()*1000000.0))+".dat"
    wf = open(filename, 'wb')
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch) #execute #batch episodes
            agent.learn() #learn from the gathered experience
            agent.reset() #reset agent and environment
        #print out related data
        stp = (updates+1)*batch*prnts
        print "Step: ", runs, "/", stp, "Best: ", agent.learner.best, "Base: ", agent.learner.baseline, "Reward: ", agent.learner.reward   
        wf.write(repr(stp)+"\n") 
        wf.write(repr(agent.learner.baseline[0])+"\n") 
        if useGraphics:
            pl.addData(0,float(stp),agent.learner.baseline)
            pl.addData(1,float(stp),agent.learner.best)
            pl.update()

        #if updates/100 == float(updates)/100.0:
        #    saveWeights("walk.wgt", agent.learner.original)  
    wf.close()      
Example #32
    agent.learner.gdSig.alpha = 0.085  #step size of \sigma adaption
    agent.learner.gd.momentum = 0.0

    #Loading weights
    if loadNet:
        agent.learner.original = loadWeights("grasp.wgt")
        agent.learner.gd.init(agent.learner.original)
        agent.learner.epsilon = 0.2
        agent.learner.initSigmas()

    batch = 2  #number of samples per gradient estimate
    #create experiment
    experiment = EpisodicExperiment(task, agent)
    prnts = 1  #frequency of console output
    epis = 5000000 / batch / prnts

    #actual roll outs
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)  #execute batch episodes
            agent.learn()  #learn from the gathered experience
            agent.reset()  #reset agent and environment
        #print out related data
        print "Step: ", runs, "/", (
            updates + 1) * batch * prnts, "Best: ", agent.learner.best,
        print "Base: ", agent.learner.baseline, "Reward: ", agent.learner.reward
        #Saving weights
        if saveNet:
            if updates / 100 == float(updates) / 100.0:
                saveWeights(saveName, agent.learner.original)
Example #33
__author__ = 'Stubborn'

from pybrain.rl.environments.ode import CCRLEnvironment
from pybrain.rl.environments.ode.tasks import CCRLGlasTask
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure.modules.tanhlayer import TanhLayer
from pybrain.optimization import PGPE
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment

environment = CCRLEnvironment()
task = CCRLGlasTask(environment)

net = buildNetwork(len(task.getObservation()), 4, environment.indim, outclass=TanhLayer)

agent = OptimizationAgent(net, PGPE())

experiment = EpisodicExperiment(task, agent)

for updates in range(20000):
    experiment.doEpisodes(1)



Example #34
from pybrain.rl.environments.cartpole.balancetask import BalanceTask
from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.optimization import HillClimber

task = BalanceTask()
net = buildNetwork(task.outdim, 3, task.indim)
HillClimber(task, net, maxEvaluations=100).learn()
agent = OptimizationAgent(net, HillClimber())
exp = EpisodicExperiment(task, agent)
print(exp.doEpisodes(100))
Example #35
from pybrain.rl.environments.timeseries.timeseries import MonthlySnPEnvironment
from pybrain.rl.learners.directsearch.rrl import RRL

from pybrain.structure import RecurrentNetwork
from pybrain.structure import LinearLayer, SigmoidLayer, TanhLayer, BiasUnit
from pybrain.structure import FullConnection
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment

from numpy import sign, round
from matplotlib import pyplot

net= RecurrentNetwork()
# Single linear layer with bias unit and a single tanh layer; the linear layer is what's optimised
net.addInputModule(BiasUnit(name='bias'))
net.addOutputModule(TanhLayer(1, name='out'))
net.addRecurrentConnection(FullConnection(net['out'], net['out'], name='c3'))
net.addInputModule(LinearLayer(1,name='in'))
net.addConnection(FullConnection(net['in'],net['out'],name='c1'))
net.addConnection((FullConnection(net['bias'],net['out'],name='c2')))
net.sortModules()
net._setParameters([-8.79227886e-02, -8.29319017e+02, 1.25946474e+00])
print(net._params)
env=MonthlySnPEnvironment()
task=MaximizeReturnTask(env)
learner = RRL() # ENAC() #Q_LinFA(2,1)
agent = LearningAgent(net,learner)
exp=EpisodicExperiment(task,agent)

exp.doEpisodes(10)
Example #36
ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)


def update_wheel_trajectories():
    front_lines = ax2.plot(env.get_xfhist(), env.get_yfhist(), "r")
    back_lines = ax2.plot(env.get_xbhist(), env.get_ybhist(), "b")
    plt.axis("equal")


perform_cumrewards = []
for irehearsal in range(7000):

    # Learn.
    # ------
    r = exp.doEpisodes(1)
    # Discounted reward.
    cumreward = exp.task.getTotalReward()
    # print 'cumreward: %.4f; nsteps: %i; learningRate: %.4f' % (
    #        cumreward, len(r[0]), exp.agent.learner.learningRate)

    if irehearsal % 50 == 0:
        # Perform (no learning).
        # ----------------------
        # Swap out the agent.
        exp.agent = performance_agent

        # Perform.
        r = exp.doEpisodes(1)
        perform_cumreward = task.getTotalReward()
        perform_cumrewards.append(perform_cumreward)
Example #37
for runs in range(numbExp):
    # create environment
    #Options: Bool(OpenGL), Bool(Realtime simu. while client is connected), ServerIP(default:localhost), Port(default:21560)
    if env != None: env.closeSocket()
    env = ShipSteeringEnvironment()
    # create task
    task = GoNorthwardTask(env, maxsteps=500)
    # create controller network
    net = buildNetwork(task.outdim, task.indim, outclass=TanhLayer)
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(
        net,
        PGPE(learningRate=0.3,
             sigmaLearningRate=0.15,
             momentum=0.0,
             epsilon=2.0,
             rprop=False,
             storeAllEvaluations=True))
    et.agent = agent
    #create experiment
    experiment = EpisodicExperiment(task, agent)

    #Do the experiment
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)
        et.printResults((agent.learner._allEvaluations)[-50:-1], runs, updates)
    et.addExps()
et.showExps()
#To view what the simulation is doing at the moment set the environment with True, go to pybrain/rl/environments/ode/ and start viewer.py (python-openGL must be installed, see PyBrain documentation)
Example #38
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)


    
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"])

    #print "dim: ", task.indim, task.outdim
    
    # to inputs state and 4 actions
    module = ActionValueNetwork(task.outdim, task.indim)
    

    learner = NFQ()
    # % of random actions
    learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = LearningAgent(module, learner)
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    
    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    for episode in range(0,m):
        # one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        agent.learn(1)
    
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        
        if (episode) % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            
            #experiment.agent = testagent
            r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            
            env.delay = False
            testagent.reset()
            #experiment.agent = agent
        
            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
#    print "done"
    return performance
            
        #print "network",   json.dumps(module.bn.net.E, indent=2)