Example #1
def someEpisodes(game_env, net, discountFactor=0.99, maxSteps=100, avgOver=1, returnEvents=False, exploretoo=True):
    """ Return the fitness value for one episode of play, given the policy defined by a neural network. """

    task = GameTask(game_env)
    game_env.recordingEnabled = True        
    game_env.reset()        
    net.reset()
    task.maxSteps=maxSteps
    agent = LearningAgent(net)
    agent.learning = False
    agent.logging = False
    exper = EpisodicExperiment(task, agent)
    fitness = 0
    for _ in range(avgOver):
        rs = exper.doEpisodes(1)
        # add a slight bonus for more exploration, if rewards are identical
        if exploretoo:
            fitness += len(set(game_env._allEvents)) * 1e-6
        # the true, discounted reward        
        fitness += sum([sum([v*discountFactor**step for step, v in enumerate(r)]) for r in rs])
    fitness /= avgOver
    if returnEvents:
        return fitness, game_env._allEvents
    else:
        return fitness
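A possible follow-up, not part of the original snippet: because someEpisodes returns a scalar fitness for a given network, it can serve directly as the evaluator of one of PyBrain's black-box optimizers. The sketch below assumes game_env and net are an existing game environment and a compatible network module.

# Hedged usage sketch: optimize the network weights against the fitness defined above.
from pybrain.optimization import HillClimber

optimizer = HillClimber(lambda module: someEpisodes(game_env, module), net,
                        maxEvaluations=200, storeAllEvaluations=True)
best_net, best_fitness = optimizer.learn()  # learn() returns (best evaluable, its fitness)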
Example #2
def train():

    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # Store the environment as the task
    task = environment

    # Set up the Neural Network
    neuralNet = buildNetwork(task.nSenses, HIDDEN_NODES, task.nActions)

    # Use a Genetic Algorithm as the Trainer
    trainer = GA( populationSize=20, topProportion=0.2, elitism=False
                , eliteProportion=0.25, mutationProb=0.1
                , mutationStdDev=0.2, tournament=False
                , tournamentSize=2 )

    agent = OptimizationAgent(neuralNet, trainer)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the network
    meanScores = []
    print "Starting HillClimberNN"
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Training Iteration", i, "With mean score ", task.meanScore, "Max block achieved ", environment.maxGameBlock
        environment.maxGameBlock = 0
        meanScores.append(task.meanScore)

    params = {"learningEpochs": LEARNING_EPOCHS, "gamesPerEpoch": GAMES_PER_EPOCH, "hiddenNodes": HIDDEN_NODES }
    return meanScores, params, experiment
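A hedged follow-up, not in the original: because the GA is wrapped in an OptimizationAgent, the EpisodicExperiment exposes it as experiment.optimizer, so the best network found during training can be recovered from the returned experiment.

# Assumed usage of the train() function above.
meanScores, params, experiment = train()
bestNet, bestFitness = experiment.optimizer._bestFound()
print "Best fitness found:", bestFitness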
Example #3
    def train(self, episodes, maxSteps):

        avgReward = 0

        # set up environment and task
        self.env = InfoMaxEnv(self.objectNames, self.actionNames,
                              self.numCategories)
        self.task = InfoMaxTask(self.env, maxSteps=maxSteps, \
           do_decay_beliefs = True, uniformInitialBeliefs = True)

        # create neural net and learning agent
        self.params = buildNetwork(self.task.outdim, self.task.indim, \
            bias=True, outclass=SoftmaxLayer)

        if self._PGPE:
            self.agent = OptimizationAgent(self.params,
                                           PGPE(minimize=False, verbose=False))
        elif self._CMAES:
            self.agent = OptimizationAgent(
                self.params, CMAES(minimize=False, verbose=False))

        # init and perform experiment
        exp = EpisodicExperiment(self.task, self.agent)

        for i in range(episodes):
            exp.doEpisodes(1)
            avgReward += self.task.getTotalReward()
            print "reward episode ", i, self.task.getTotalReward()

        # print initial info
        print "\naverage reward over training = ", avgReward / episodes

        # save trained network
        self._saveWeights()
Example #4
class BaggerBot:
	def __init__(self, host, port, net=None):
		self.conn = ServerConnection(host, port)
		self.env = self.conn.env
		self.conn.join()
		self.task = SurviveTask(self.env, self.conn)
		self.net = buildNetwork(self.env.outdim, 4, self.env.indim, outclass=TanhLayer)
		self.agent = OptimizationAgent(self.net, PGPE())
		self.experiment = EpisodicExperiment(self.task, self.agent)

	def wait_connected(self):
		self.conn.wait_connected()

	def train(self):
		'''
		Infinitely play the game. Figure out the next move(s), parse incoming
		data, discard all that, do stupid stuff and die :)
		'''
		while self.env.in_game:
			# Ask to be spawned
			logging.info('Requesting spawn...')
			self.conn.send_spawn()
			while not self.env.playing:
				self.conn.parse_pregame()
			while self.env.playing:
				self.experiment.doEpisodes(100)
Example #5
	def train(self, episodes, maxSteps):
 	
		avgReward = 0

		# set up environment and task
		self.env = InfoMaxEnv(self.objectNames, self.actionNames, self.numCategories)
		self.task = InfoMaxTask(self.env, maxSteps=maxSteps, \
					do_decay_beliefs = True, uniformInitialBeliefs = True)

		# create neural net and learning agent
		self.params = buildNetwork(self.task.outdim, self.task.indim, \
						bias=True, outclass=SoftmaxLayer)

		if self._PGPE:
			self.agent = OptimizationAgent(self.params, PGPE(minimize=False,verbose=False))
		elif self._CMAES:
			self.agent = OptimizationAgent(self.params, CMAES(minimize=False,verbose=False))

		# init and perform experiment
		exp = EpisodicExperiment(self.task, self.agent)

		for i in range(episodes):        
			exp.doEpisodes(1)
			avgReward += self.task.getTotalReward()
			print "reward episode ",i,self.task.getTotalReward()

		# print initial info
		print "\naverage reward over training = ",avgReward/episodes

		# save trained network
		self._saveWeights()
Example #6
    def simulate(self, random_state):
        """ Simulates agent behavior in 'n_sim' episodes.
        """
        logger.debug("Simulating user actions ({} episodes)".format(
            self.rl_params.n_simulation_episodes))
        self.experiment = EpisodicExperiment(self.task, self.agent)

        # set training flag off
        self.task.env.training = False
        # deactivate learning for experiment
        self.agent.learning = False
        # deactivate exploration
        explorer = self.agent.learner.explorer
        self.agent.learner.explorer = EGreedyExplorer(
            epsilon=0, decay=1, random_state=random_state)
        self.agent.learner.explorer.module = self.agent.module
        # activate logging
        self.task.env.start_logging()

        # simulate behavior
        self.experiment.doEpisodes(self.rl_params.n_simulation_episodes)
        # store log data
        dataset = self.task.env.log

        # deactivate logging
        self.task.env.end_logging()
        # reactivate exploration
        self.agent.learner.explorer = explorer
        # reactivate learning for experiment
        self.agent.learning = True
        # set training flag back on
        self.task.env.training = True

        return dataset
Example #7
def train():

    # Make the environment
    environment = TwentyFortyEightEnvironment()

    # The task is the game this time
    task = environment

    # Make the reinforcement learning agent (use a network because inputs are continuous)
    network = ActionValueNetwork(task.nSenses, task.nActions)

    # Use Q learning for updating the table (NFQ is for networks)
    learner = NFQ()
    learner.gamma = GAMMA

    agent = LearningAgent(network, learner)

    # Set up an experiment
    experiment = EpisodicExperiment(task, agent)

    # Train the Learner
    meanScores = []
    for i in xrange(LEARNING_EPOCHS):
        experiment.doEpisodes(GAMES_PER_EPOCH)
        print "Iteration ", i, " With mean score ", task.meanScore, "Max block achieved ", environment.maxGameBlock
        meanScores.append(task.meanScore)
        agent.learn()
        agent.reset()

    params = {"learningEpochs": LEARNING_EPOCHS, "gamesPerEpoch": GAMES_PER_EPOCH, "gamma": GAMMA }
    return meanScores, params, agent
Example #8
 def _train_model(self):
     """ Uses reinforcement learning to find the optimal strategy
     """
     self.experiment = EpisodicExperiment(self.task, self.agent)
     n_epochs = int(self.rl_params.n_training_episodes /
                    self.rl_params.n_episodes_per_epoch)
     logger.debug(
         "Fitting user model over {} epochs, each {} episodes, total {} episodes."
         .format(n_epochs, self.rl_params.n_episodes_per_epoch,
                 n_epochs * self.rl_params.n_episodes_per_epoch))
     for i in range(n_epochs):
         logger.debug("RL epoch {}".format(i))
         self.experiment.doEpisodes(self.rl_params.n_episodes_per_epoch)
         self.agent.learn()
         self.agent.reset()  # reset buffers
Example #9
	def __init__(self, host, port, net=None):
		self.conn = ServerConnection(host, port)
		self.env = self.conn.env
		self.conn.join()
		self.task = SurviveTask(self.env, self.conn)
		self.net = buildNetwork(self.env.outdim, 4, self.env.indim, outclass=TanhLayer)
		self.agent = OptimizationAgent(self.net, PGPE())
		self.experiment = EpisodicExperiment(self.task, self.agent)
Example #10
def main():
    """
    The task represents one full simulation. Therefore it is episodic.
    Each episode calls performAction after passing getObservation to the agent.
    Once isFinished is true, the reward is returned and one simulation is done.

    The net is the neural network. It has 7 input nodes, a hidden layer of 5
    nodes, and 2 output nodes. It is a feed-forward network using sigmoid
    activation functions.

    OptimizationAgent(module, learner)
    EpisodicExperiment.optimizer = learner
    learner.setEvaluator(task, module)
    optimizer.learn()
    """
    task = LanderTask(batchSize=1)
    net = buildNetwork(task.indim, 5, task.outdim)
    learner = StochasticHillClimber()
    agent = OptimizationAgent(net, learner)
    experiment = EpisodicExperiment(task, agent)
    experiment.doEpisodes(100000)

    tasks = [LanderTask(environment=Lander(acceleration=float(i)))
             for i in range(1, 4)]
    test_size = 1000
    for task in tasks:
        print("Running task with acceleration {}".format(task.env.acceleration))
        success = 0
        for _ in range(test_size):
            task.env.reset()
            while not task.isFinished():
                observation = task.getObservation()
                action = net.activate(observation)
                task.performAction(action)
            print("Finished a simulation with result {}".format(task.env.status))
            if task.env.status == 'landed':
                success += 1
        print("Succeeded {} times out of {}".format(success, test_size))
Example #11
def main():
    client_id = Utils.connectToVREP()

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = MyActionValueTable()
    learner = Q(0.5, 0.9)
    learner.explorer = EpsilonGreedyExplorer(0.15, 1)  # EpsilonGreedyBoltzmannExplorer()
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)

    controller.initialize(agent)

    i = 0
    try:
        while True:
            i += 1
            print('Episode ' + str(i))
            experiment.doEpisodes()
            agent.learn()
            agent.reset()
            print('mean: '+str(numpy.mean(controller.params)))
            print('max: '+str(numpy.max(controller.params)))
            print('min: '+str(numpy.min(controller.params)))

            if i % 500 == 0:  # Save q-table every 500 episodes
                print('Save q-table')
                controller.save()
                task.t_table.save()

    except (KeyboardInterrupt, SystemExit):
        with open('../data/standing-up-q.pkl', 'wb') as handle:
            pickle.dump(controller.params, handle)
        task.t_table.save()
        controller.save()

    vrep.simxFinish(client_id)
Example #12
def someEpisodes(game_env, net, discountFactor=0.99, maxSteps=100, avgOver=1, returnEvents=False):
    """ Return the fitness value for one episode of play, given the policy defined by a neural network. """
    task = GameTask(game_env)
    game_env.recordingEnabled = True        
    game_env.reset()        
    net.reset()
    task.maxSteps=maxSteps
    agent = LearningAgent(net)
    agent.learning = False
    agent.logging = False
    exper = EpisodicExperiment(task, agent)
    fitness = 0
    for _ in range(avgOver):
        rs = exper.doEpisodes(1)
        # add a slight bonus for more exploration, if rewards are identical
        fitness += len(set(game_env._allEvents)) * 1e-6
        # the true, discounted reward        
        fitness += sum([sum([v*discountFactor**step for step, v in enumerate(r)]) for r in rs])
    fitness /= avgOver
    if returnEvents:
        return fitness, game_env._allEvents
    else:
        return fitness
Example #13
def main():
    vrep.simxFinish(-1)  # just in case, close all opened connections
    client_id = vrep.simxStart('127.0.0.1', 19997, True, True, 5000,
                               5)  # Connect to V-REP

    if client_id < 0:
        print('Failed connecting to remote API server')
        return -1

    print('Connected to remote API server')

    # Define RL elements
    environment = StandingUpEnvironment(client_id)

    task = StandingUpTask(environment)

    controller = ActionValueTable(task.get_state_space_size(),
                                  task.get_action_space_size())
    controller.initialize(1.)

    file = open('standing-up-q.pkl', 'rb')
    controller._params = pickle.load(file)
    file.close()

    # learner = Q()
    agent = LearningAgent(controller)

    experiment = EpisodicExperiment(task, agent)

    i = 0
    while True:
        i += 1
        print('Iteration n° ' + str(i))
        experiment.doEpisodes(1)

    vrep.simxFinish(client_id)
Example #14
env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

module = ActionValueNetwork(4, 3)

task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4

agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)


def plotPerformance(values, fig):
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
    # Without the next line, the pyplot plot won't actually show up.
    plt.pause(0.001)


performance = []

if not render:
    pf_fig = plt.figure()
Example #15
class RLModel():
    def __init__(self,
                 rl_params,
                 parameter_names,
                 env,
                 task,
                 clean_after_call=False):
        """

        Parameters
        ----------
        rl_params : RLParams
        parameter_names : parameter names in order
        env : Environment model
        task : EpisodicTask instance
        clean_after_call: bool
        """
        self.rl_params = rl_params
        self.parameter_names = parameter_names
        self.env = env
        self.task = task
        self.agent = None
        self.clean_after_call = clean_after_call

    def to_dict(self):
        return {
            "rl_params": self.rl_params.to_dict(),
            "parameters": self.parameters,
        }

    def train_model(self, parameter_values, random_state=None):
        self._parameters(parameter_values)
        self._build_model(random_state)
        self._train_model()

    def __call__(self,
                 *parameter_values,
                 index_in_batch=None,
                 random_state=None):
        """ Simulates data.
        Interfaces to ELFI as a sequential simulator.

        Parameters
        ----------
        parameter_values : list of model variables
            Length should equal length of parameters
        random_state: random number generator

        Returns
        -------
        Simulated trajectories as a dict
        """
        print("SIM AT", parameter_values)
        self.train_model(parameter_values, random_state=random_state)
        log_dict = self.simulate(random_state)
        if self.clean_after_call is True:
            self.clean()
        return log_dict

    def get_policy(self):
        """ Returns the current policy of the agent
        """
        return self.agent.get_policy()

    def _parameters(self, parameter_values):
        """ Parse parameter values
        """
        self.p = dict()
        if len(self.parameter_names) != len(parameter_values):
            raise ValueError(
                "Number of model variables was {} ({}), expected {}".format(
                    len(parameter_values), parameter_values,
                    len(self.parameter_names)))
        for name, val in zip(self.parameter_names, parameter_values):
            self.p[name] = float(val)
        logger.debug("Model parameters: {}".format(self.p))

    def _build_model(self, random_state):
        """ Initialize the model
        """
        self.env.setup(self.p, random_state)
        self.task.setup(self.p)
        outdim = self.task.env.outdim
        n_actions = self.task.env.numActions
        self.agent = RLAgent(outdim,
                             n_actions,
                             random_state,
                             rl_params=self.rl_params)
        logger.debug("Model initialized")

    def _train_model(self):
        """ Uses reinforcement learning to find the optimal strategy
        """
        self.experiment = EpisodicExperiment(self.task, self.agent)
        n_epochs = int(self.rl_params.n_training_episodes /
                       self.rl_params.n_episodes_per_epoch)
        logger.debug(
            "Fitting user model over {} epochs, each {} episodes, total {} episodes."
            .format(n_epochs, self.rl_params.n_episodes_per_epoch,
                    n_epochs * self.rl_params.n_episodes_per_epoch))
        for i in range(n_epochs):
            logger.debug("RL epoch {}".format(i))
            self.experiment.doEpisodes(self.rl_params.n_episodes_per_epoch)
            self.agent.learn()
            self.agent.reset()  # reset buffers

    def simulate(self, random_state):
        """ Simulates agent behavior in 'n_sim' episodes.
        """
        logger.debug("Simulating user actions ({} episodes)".format(
            self.rl_params.n_simulation_episodes))
        self.experiment = EpisodicExperiment(self.task, self.agent)

        # set training flag off
        self.task.env.training = False
        # deactivate learning for experiment
        self.agent.learning = False
        # deactivate exploration
        explorer = self.agent.learner.explorer
        self.agent.learner.explorer = EGreedyExplorer(
            epsilon=0, decay=1, random_state=random_state)
        self.agent.learner.explorer.module = self.agent.module
        # activate logging
        self.task.env.start_logging()

        # simulate behavior
        self.experiment.doEpisodes(self.rl_params.n_simulation_episodes)
        # store log data
        dataset = self.task.env.log

        # deactivate logging
        self.task.env.end_logging()
        # reactivate exploration
        self.agent.learner.explorer = explorer
        # reactivate learning for experiment
        self.agent.learning = True
        # set training flag back on
        self.task.env.training = True

        return dataset

    def clean(self):
        self.agent = None
        self.env.clean()
        self.task.clean()
        gc.collect()
Example #16
from environments.continous_maze_discrete import CTS_Maze
from tasks.CTS_TASK import CTS_MazeTask
from pybrain.rl.experiments import EpisodicExperiment
from learners.baseline_learner import GP_SARSA
from agents.baseline_agent import GPSARSA_Agent
env = CTS_Maze([0.40, 0.40])  #goal

task = CTS_MazeTask(env)
learner = GP_SARSA(gamma=0.95)
learner.batchMode = False  #extra , not in use , set to True for batch learning
agent = GPSARSA_Agent(learner)
agent.logging = True

exp = EpisodicExperiment(
    task,
    agent)  #epsilon greedy exploration (with and without use of uncertainty)
plt.ion()

i = 1000
performance = []  #reward accumulation, dump variable for any evaluation metric
sum = []
agent.reset()
i = 0
for num_exp in range(100):

    performance = exp.doEpisodes(1)
    sum = np.append(sum, np.sum(performance))

    if (num_exp % 10 == 0):
        agent.init_exploration -= agent.init_exploration * 0.10
Example #17
# create environment
env = SimpleEnvironment()
env.setNoise(0.01)
# create task
task = MinimizeTask(env)
# create controller network (flat network)
net = buildNetwork(1, 1, bias=False)
net._setParameters(array([0.0]))
# create agent with controller and learner
agent = FiniteDifferenceAgent(net, FDBasic())
# initialize parameters (variance)
#agent.setSigma([-2.])
# learning options
agent.learner.alpha = 0.1
# agent.learner.rprop = True
experiment = EpisodicExperiment(task, agent)

best = 0.0
base = 0.0
rew = 0.0

for updates in range(1000):

    # testing step
    agent.disableLearning()
    experiment.doEpisodes(2)

    # append mean reward to sr array
    ret = []
    for n in range(agent.history.getNumSequences()):
        state, action, reward = agent.history.getSequence(n)
Example #18
from pybrain.rl.environments.ode import CCRLEnvironment
from pybrain.rl.environments.ode.tasks import CCRLGlasTask
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure.modules.tanhlayer import TanhLayer
from pybrain.optimization import PGPE
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment

environment = CCRLEnvironment()
task = CCRLGlasTask(environment)
net = buildNetwork(len(task.getObservation()),4,environment.indim,outclass=TanhLayer)

agent = OptimizationAgent(net,PGPE())
experiment = EpisodicExperiment(task,agent)
for updates in range(20000):
    experiment.doEpisodes(1)

Example #19
#the task is the game this time
task = environment

#make the reinforcement learning agent (use a network because inputs are continuous)
controller = ActionValueNetwork(task.nsenses,task.nactions)

#use Q learning for updating the table (NFQ is for networks)
learner = NFQ()

agent = LearningAgent(controller, learner)



#set up an experiment
experiment = EpisodicExperiment(task, agent)

meanscores = []
m = 0.0
for i in xrange(learning_eps):
    print i
    experiment.doEpisodes(games_per_ep)
    meanscores.append(task.meanscore)
    if meanscores[-1] > m:
        m = meanscores[-1]
        f = open("bestRL.pkl",'w')
        pickle.dump(agent,f)
        f.close()
    agent.learn()
    agent.reset()
Example #20
def run_experiment():
    # Create the controller network
    HIDDEN_NODES = 4

    RUNS = 2
    BATCHES = 1
    PRINTS = 1
    EPISODES = 500

    env = None
    start_state_net = None

    run_results = []

    # Set up plotting tools for the experiments
    tools = ExTools(BATCHES, PRINTS)

    # Run the experiment
    for run in range(RUNS):
        if run == 0:
            continue

        # If an environment already exists, shut it down
        if env:
            env.closeSocket()

        # Create the environment
        env = create_environment()

        # Create the task
        task = Pa10MovementTask(env)

        # Create the neural network. Only create the network once so it retains
        # the same starting values for each run.
        if start_state_net:
            net = start_state_net.copy()
        else:
            # Create the initial neural network
            net = create_network(
                    in_nodes=env.obsLen,
                    hidden_nodes=HIDDEN_NODES,
                    out_nodes=env.actLen
            )
            start_state_net = net.copy()

        # Create the learning agent
        learner = HillClimber(storeAllEvaluations=True)
        agent = OptimizationAgent(net, learner)
        tools.agent = agent

        # Create the experiment
        experiment = EpisodicExperiment(task, agent)

        # Perform all episodes in the run
        for episode in range(EPISODES):
            experiment.doEpisodes(BATCHES)

        # Calculate results
        all_results = agent.learner._allEvaluations
        max_result = np.max(all_results)
        min_result = np.min(all_results)
        avg_result = np.sum(all_results) / len(all_results)
        run_results.append((run, max_result, min_result, avg_result))

        # Make the results directory if it does not exist
        if not os.path.exists(G_RESULTS_DIR):
            os.mkdir(G_RESULTS_DIR)

        # Write all results to the results file
        with open(os.path.join(G_RESULTS_DIR, 'run_%d.txt' % run), 'w+') as f:
            # Store the calculated max, min, avg
            f.write('RUN, MAX, MIN, AVG\n')
            f.write('%d, %f, %f, %f\n' % (run, max_result, min_result, avg_result))

            # Store all results from this run
            f.write('EPISODE, REWARD\n')
            for episode, result in enumerate(all_results):
                f.write('%d, %f\n' % (episode, result))

    return
Example #21
env.setRenderer(CartPoleRenderer())
env.getRenderer().start()
env.delay = (episodes == 1)

# create task
task = BalanceTask(env, epilen)

# create controller network
net = buildNetwork(4, 1, bias=False)

# create agent and set parameters from command line
agent = LearningAgent(net, None)
agent.module._setParameters([float(sys.argv[1]), float(sys.argv[2]), float(sys.argv[3]), float(sys.argv[4])])

# create experiment
experiment = EpisodicExperiment(task, agent)
experiment.doEpisodes(episodes)

# run environment
ret = []
for n in range(agent.history.getNumSequences()):
    returns = agent.history.getSequence(n)
    reward = returns[2]
    ret.append( sum(reward, 0).item() )

# print results
print ret, "mean:",mean(ret)
#env.getRenderer().stop()


Example #22
hiddenUnits = 4
batch=2 #number of samples per learning step
prnts=1 #number of learning steps after results are printed
epis=5000000/batch/prnts #number of rollouts
numbExp=10 #number of experiments
et = ExTools(batch, prnts) #tool for printing and plotting

for runs in range(numbExp):
    # create environment
    #Options: Bool(OpenGL), Bool(Realtime simu. while client is connected), ServerIP(default:localhost), Port(default:21560)
    env = FlexCubeEnvironment()
    # create task
    task = WalkTask(env)
    # create controller network
    net = buildNetwork(len(task.getObservation()), hiddenUnits, env.actLen, outclass=TanhLayer)    
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(net, SimpleSPSA(storeAllEvaluations = True))
    et.agent = agent
     # create the experiment
    experiment = EpisodicExperiment(task, agent)

    #Do the experiment
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)
        et.printResults((agent.learner._allEvaluations)[-50:-1], runs, updates)
    et.addExps()
et.showExps()
#To view what the simulation is doing at the moment, go to pybrain/rl/environments/flexcube/ and start renderer.py (python-openGL must be installed)
Example #23
 # create environment
 #Options: Bool(OpenGL), Bool(Realtime simu. while client is connected), ServerIP(default:localhost), Port(default:21560)
 env = ShipSteeringEnvironment(False)
 # create task
 task = GoNorthwardTask(env,maxsteps = 500)
 # create controller network
 net = buildNetwork(task.outdim, task.indim, outclass=TanhLayer)
 # create agent with controller and learner
 agent = FiniteDifferenceAgent(net, SPLA())
 # learning options
 agent.learner.gd.alpha = 0.3 #step size of \mu adaption
 agent.learner.gdSig.alpha = 0.15 #step size of \sigma adaption
 agent.learner.gd.momentum = 0.0
 batch=2 #number of samples per gradient estimate (was: 2; more here due to stochastic setting)
 #create experiment
 experiment = EpisodicExperiment(task, agent)
 prnts=1 #frequency of console output
 epis=2000/batch/prnts
 
 #actual roll outs
 filename="dataSPLA08NoRew"+repr(int(random.random()*1000000.0))+".dat"
 wf = open(filename, 'wb')
 for updates in range(epis):
     for i in range(prnts):
         experiment.doEpisodes(batch) #execute #batch episodes
         agent.learn() #learn from the gathered experience
         agent.reset() #reset agent and environment
     #print out related data
     stp = (updates+1)*batch*prnts
     print "Step: ", runs, "/", stp, "Best: ", agent.learner.best, "Base: ", agent.learner.baseline, "Reward: ", agent.learner.reward   
     wf.write(repr(stp)+"\n") 
Example #24
__author__ = 'Stubborn'

from pybrain.rl.environments.ode import CCRLEnvironment
from pybrain.rl.environments.ode.tasks import CCRLGlasTask
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure.modules.tanhlayer import TanhLayer
from pybrain.optimization import PGPE
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment

environment = CCRLEnvironment()
task = CCRLGlasTask(environment)

net = buildNetwork(len(task.getObservation()), 4, environment.indim, outclass=TanhLayer)

agent = OptimizationAgent(net, PGPE())

experiment = EpisodicExperiment(task, agent)

for updates in range(20000):
    experiment.doEpisodes(1)



Example #25
def main():
    if len(sys.argv) != 2:
        print 'Please provide a path to a model data directory.'
        print ('The script will load the newest model data from the directory, '
               'then continue to improve that model')
        sys.exit(0)

    model_directory = sys.argv[1]
    existing_models = sorted(glob(os.path.join(model_directory, '*.rlmdl')))

    if existing_models:
        newest_model_name = existing_models[-1]
        iteration_count = int(newest_model_name[-12:-6]) + 1
        print 'Loading model {}'.format(newest_model_name)

        newest_model = open(newest_model_name, 'r')
        agent = pickle.load(newest_model)
    else:
        net = buildNetwork(Environment.outdim,
                           Environment.outdim + Environment.indim,
                           Environment.indim)
        agent = OptimizationAgent(net, PGPE())
        iteration_count = 1

    environment = Environment(LOCAL_HOST, PORT, PATH_TO_SCENE)
    task = Task(environment)


    experiment = EpisodicExperiment(task, agent)


    def signal_handler(signal, frame):
        print 'Exiting gracefully'
        environment.teardown()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)


    while True:
        time.sleep(1)

        print '>>>>> Running iteration {}'.format(iteration_count)
        # NOTE this weird stuff is hacky, but we need it to plug in our autosave
        # stuff properly. Took a long time to figure this out.
        experiment.optimizer.maxEvaluations = experiment.optimizer.numEvaluations + experiment.optimizer.batchSize

        try:
            experiment.doEpisodes()
        except Exception as e:
            print 'ERROR RUNNING SIMULATION: \n{}'.format(e)
            environment.teardown()
        else:
            if iteration_count % AUTOSAVE_INTERVAL == 0:
                filename = str(iteration_count).zfill(6) + '.rlmdl'
                filename = os.path.join(model_directory, filename)
                f = open(filename, 'w+')
                print 'Saving model to {}'.format(filename)

                pickle.dump(agent, f)

            iteration_count += 1

        print 'Iteration finished <<<<<'
Example #26
def run(nao,pad):





    # ################################
    # choose bottom cam, so nao can see object when standing next to it
    nao.camera.selectCam(1)
    
    env = grabbingEnvironment(nao)
    #env.connect(nao)

    task = grabbingTask(env)

    net = buildNetwork(len(task.getObservation()),8, env.indim, bias = True, recurrent=True)
    print env.indim
    #net = ActionValueNetwork(5,4)
    #, outclass=TanhLayer)
    #, hiddenclass=TanhLayer, outclass=TanhLayer

    # not correct right now..
    # TODO: train into RL Modules, dataset needs to be merged with exploration data
    #generateTraining.generateTraining().runDeltaMovements(nao,net,env,pad)


    #module = ActionValueNetwork(3, 3)
    #module = NeuronLayer(40)

    #agent = LearningAgent(net, SARSA())
    #learner = PolicyGradientLearner()
    #learner._setExplorer(StateDependentExplorer(3,3))
    #learner._setModule(module)
    #agent = LearningAgent(module, learner)
    #agent = LearningAgent(net, ENAC())
    #agent = LearningAgent(net, Reinforce())

    #learner = NFQ()
    #learner.explorer.epsilon = 0.4
    #agent = LearningAgent(net, learner)

    testagent = OptimizationAgent(net,None,env)
    #agent = LearningAgent(module, Q())
    #agent = LearningAgent(module, QLambda())
    learner = grabbingPGPE(storeAllEvaluations = True, verbose = True, epsilon = 1.0, deltamax =5.0, sigmaLearningRate = 0.1, learningRate = 0.2)
    agent = OptimizationAgent(net, learner,env)
    #agent = OptimizationAgent(net, SimpleSPSA(storeAllEvaluations = True, verbose = True))
    #agent = OptimizationAgent(net, HillClimber(storeAllEvaluations = True, verbose = True))

    #agent = OptimizationAgent(net, RandomSearch(storeAllEvaluations = True, verbose = True))
    
    experiment = EpisodicExperiment(task, agent)
    # only for optimizationAgent
    #experiment.doOptimization = True

    # only for simulator!
    nao.fractionMaxSpeed = 1.0



    print "#env"
    print "  sensors:", env.outdim
    print "  actions:", env.indim
    print "  discreteStates:", env.discreteStates
    print "  discreteActions:", env.discreteActions
    
    print
    print "#task"
    print "  sensor_limits:", task.sensor_limits
    print "  actor_limits:", task.actor_limits
    print "  epilen: ", task.epiLen
    print "#EpisodicTask"
    print "  discount:", task.discount
    print "  batchsize:", task.batchSize
    

    print
    print "#PGPE"
    print "  exploration type:", grabbingPGPE().exploration
    print "  LearningRate:", grabbingPGPE().learningRate
    print "  sigmaLearningRate:", grabbingPGPE().sigmaLearningRate
    print "  epsilon:", grabbingPGPE().epsilon
    print "  wDecay:", grabbingPGPE().wDecay
    print "  momentum:", grabbingPGPE().momentum
    print "  rprop:", grabbingPGPE().rprop



#    # switch this to True if you want to see the cart balancing the pole (slower)
#    render = False
#
#    plt.ion()
#
#    env = CartPoleEnvironment()
#    if render:
#        renderer = CartPoleRenderer()
#        env.setRenderer(renderer)
#        renderer.start()
#
#    module = ActionValueNetwork(4, 3)
#
#    task = DiscreteBalanceTask(env, 100)
#    learner = NFQ()
#    learner.explorer.epsilon = 0.4
#
#    agent = LearningAgent(module, learner)
#    testagent = LearningAgent(module, None)
#    experiment = EpisodicExperiment(task, agent)
#
#    performance = []
#
#    if not render:
#        pf_fig = plt.figure()

    count = 0
    while(True):
            # one learning step after one episode of world-interaction
        count += 1
        print "learning #",count
        experiment.agent = agent
        experiment.doOptimization = True
        erg = experiment.doEpisodes(1)
        print erg
        #experiment.doOptimization = False
        #print "agent learn"
        #agent.learner.learn(1)

        if count > 8:
        # test performance (these real-world experiences are not used for training)
#        if render:
#            env.delay = True
            #experiment.agent = testagent
            print "testing"
            experiment.doOptimization = False

            erg = experiment.doEpisodes(1)
            summe = 0
            #print erg
#            for x in erg:
#                summe = sum(x)
#            print summe
        #r = mean([sum(x) for x in experiment.doEpisodes(5)])
#        env.delay = False
#            testagent.reset()
        

#        performance.append(r)
#        if not render:
#            plotPerformance(performance, pf_fig)

#        print "reward avg", r
#        print "explorer epsilon", learner.explorer.epsilon
#        print "num episodes", agent.history.getNumSequences()
#        print "update step", len(performance)




#    #for updates in range(5000000)
#    updates = 0
#    episodes = 10
#    while True:
#        updates += 1
#        #raw_input("next episode")
#        print "lerne episode:",updates
#        experiment.doEpisodes(episodes)
##        print "lernen beendet, starte testlauf"
#        env.reset()
##        if updates > 0:
##            experiment.doInteractions(20)
##            rewsum = 0
##            rewlist = []
##            for i in range (0,100):
##                rew = task.performAction(net.activate(task.getObservation()))
##
##                task.getObservation()
##                rewlist.append(rew)
##                rewsum += rew
#                #print "  testlauf: ",updates,"aktion: ",i+1," reward: ",rew
##            print "-> summe = ",rewsum, " avg: ",rewsum / 100.0
#            #print "episodes:",updates," rewsum: ",rewsum," testrewards:",rewlist
##            #x = "episode:" + updates + " testrewards:" + rewlist
##            #o.write(x)
##            for i in range(0,len(rewlist)):
##                x = (updates % 20) - 10
##                y = i - 10
##                z = rewlist[i]
##                #g.plot((x,y,z),x=1, y=2, z=3)
#
#            #g.doplot()
#
#
#
#        #print "-------------------------------------------------------------------"

    print "finished grabbingTest"
Example #27
                         [1, 0, 0, 0, 0, 0, 0, 0, 1],
                         [1, 1, 1, 1, 1, 1, 1, 1, 1]])

  side = 9
  goal = 3,2

  env = mazeEnv(structure, goal)   #use maze environment for now; note pos is Y,X

  # our own task and environment for later
  #env = policyEnv()
  thetask = MDPMazeTaskEpisodic(env)

  # create neural net; create and train agent
  theparams = buildNetwork(thetask.outdim, thetask.indim, bias=False)
  agent = OptimizationAgent(theparams, CMAES())
  exp = EpisodicExperiment(thetask, agent)

  # train agent        
  exp.doEpisodes(NUM_EPISODES)
  print "\ntotal reward = ",thetask.getTotalReward()

  #print "\n"
  #print "initial weights: "; print theparams.params
  print "\n"
  print "NOTE positions below are (Y,X)"

  print "\n"
  print "getting observation 1"
  print "robot = ",thetask.getObservation()
  print "goal  = ",goal
  print "reward: ", thetask.getReward()
Example #28
env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

module = ActionValueNetwork(4, 3)

task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4

agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)

def plotPerformance(values, fig):
    plt.figure(fig.number)
    plt.clf()
    plt.plot(values, 'o-')
    plt.gcf().canvas.draw()
    # Without the next line, the pyplot plot won't actually show up.
    plt.pause(0.001)

performance = []

if not render:
    pf_fig = plt.figure()

while(True):
Example #29
from pybrain_extension.environment.morse_environment import ContinuousControllerEnvironment
from pybrain_extension.task.red_cube_task import CameraPixelsRedCubeTask

try:
    import __builtin__
    input = getattr(__builtin__, 'raw_input')
except (ImportError, AttributeError):
    pass


with CurrentController(3) as control:
    environment = ContinuousControllerEnvironment(control)
    task = CameraPixelsRedCubeTask(environment, True)

    experiment = EpisodicExperiment(task, None)

    # control.calibrate()

    start = time()
    bias = True

    def eval_fitness(genomes):
        for g in genomes:
            # visualize.draw_net(g, view=False)
            agent = NeatAgent(g, bias=bias)
            g.fitness = 0
            for state in range(control.get_randomize_states()):
                control.randomize(state)
                g.fitness += task.f(agent)
                if not task.found_cube:
Example #30
env = Maze(envmatrix, (7, 7))
# create task
task = MDPMazeTask(env)
# create value table and initialize with ones
table = ActionValueTable(81, 4)
table.initialize(1.)
# create agent with controller and learner - use SARSA(), Q() or QLambda() here
# learner = Q()
learner = SARSA()
# standard exploration is e-greedy, but a different type can be chosen as well
# learner.explorer = BoltzmannExplorer()
# create agent
agent = LearningAgent(table, learner)
# create experiment
# experiment = Experiment(task, agent)
experiment = EpisodicExperiment(task, agent)
# prepare plotting
pylab.gray()
pylab.ion()
for i in range(50):
    # interact with the environment (here in batch mode)
    experiment.doInteractions(100)
    agent.learn()
    agent.reset()
    # and draw the table
    pylab.imshow(table.params.reshape(81, 4).max(1).reshape(9, 9),
                 origin='upper',
                 interpolation='none')
    pylab.title('Episode: ' + str(i + 1) + '/50')
    pylab.show()
    pylab.pause(0.01)
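A small hedged follow-up, not in the original snippet: after the loop, the learned greedy policy can be read back from the value table, one action index per discrete maze state.

# Assumed read-out of the greedy action for each of the 81 states.
greedy_policy = [table.getMaxAction(s) for s in range(81)]
print(greedy_policy)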
Example #31
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)


    
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"])

    #print "dim: ", task.indim, task.outdim
    
    # to inputs state and 4 actions
    module = ActionValueNetwork(task.outdim, task.indim)
    

    learner = NFQ()
    # % of random actions
    learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = LearningAgent(module, learner)
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    
    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    for episode in range(0,m):
    	# one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        agent.learn(1)
    
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        
        if (episode) % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            
            #experiment.agent = testagent
            r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            
            env.delay = False
            testagent.reset()
            #experiment.agent = agent
        
            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
#    print "done"
    return performance
            
        #print "network",   json.dumps(module.bn.net.E, indent=2)
Example #32
from pybrain.rl.experiments import EpisodicExperiment

# any episodic task
task = BalanceTask()

# any neural network controller
net = buildNetwork(task.outdim, 1, task.indim)

# any optimization algorithm to be plugged in, for example:
# learner = CMAES(storeAllEvaluations = True)
# or:
learner = HillClimber(storeAllEvaluations=True)

# in a non-optimization case the agent would be a LearningAgent:
# agent = LearningAgent(net, ENAC())
# here it is an OptimizationAgent:
agent = OptimizationAgent(net, learner)

# the agent and task are linked in an Experiment
# and everything else happens under the hood.
exp = EpisodicExperiment(task, agent)
exp.doEpisodes(100)

print('Episodes learned from:', len(learner._allEvaluations))
n, fit = learner._bestFound()
print('Best fitness found:', fit)
print('with this network:')
print(n)
print('containing these parameters:')
print(fListToString(n.params, 4))
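For contrast, a hedged sketch of the LearningAgent alternative that the comments above mention, using the ENAC policy-gradient learner instead of a black-box optimizer (same task and net as in the snippet):

from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import ENAC

pg_agent = LearningAgent(net, ENAC())
pg_exp = EpisodicExperiment(task, pg_agent)
for _ in range(100):
    pg_exp.doEpisodes(10)  # gather a batch of episodes
    pg_agent.learn()       # one policy-gradient update
    pg_agent.reset()       # clear the episode buffer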
Example #33
# create task
task = GrowTask(env)
# create controller network (flat network)
net = buildNetwork(32, 10, 12)
# create agent with controller and learner
agent = FiniteDifferenceAgent(net, SPLA())
# learning options
agent.learner.gd.alpha = 0.05
agent.learner.gdSig.alpha = 0.1
agent.learner.gd.momentum = 0.0
agent.learner.epsilon = 2.0
agent.learner.initSigmas()

sr = []

experiment = EpisodicExperiment(task, agent)
for updates in range(1000):
    # training step
    for i in range(5):
        experiment.doEpisodes(10)
        agent.learn()
        print "parameters:", agent.module.params
        agent.reset()
        
    # learning step
    agent.disableLearning()
    experiment.doEpisodes(50)
    # append mean reward to sr array
    ret = []
    for n in range(agent.history.getNumSequences()):
        state, action, reward, _ = agent.history.getSequence(n)
Example #34
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", task,parameters
    
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed)
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    env.randomInitialization = False
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, 50)

    #print "dim: ", task.indim, task.outdim
    
    # to inputs state and 4 actions
    bmodule = ActionValueRAND(task.outdim, task.indim)
    rlearner = RAND()

    blearner = RAND()
    # % of random actions
    
    bagent = LearningAgent(bmodule, rlearner)
    
    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)

    # % of random actions
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations = True,storeAllEvaluated=True, maxEvaluations=None, verbose=False))


    
    
    testagent = LearningAgent(module, None)
    pgpeexperiment = EpisodicExperiment(task, agent)
    randexperiment = EpisodicExperiment(task, bagent)


    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    
    ## train pgpe
    for episode in range(0,50):
    	# one learning step after one episode of world-interaction
        y =pgpeexperiment.doEpisodes(1)
        
    be, bf = agent.learner._bestFound()
    print be,bf
    
    print "generate data"
    be.numActions = 1
    gdagent = LearningAgent(be, blearner)
    experiment = EpisodicExperiment(task, gdagent)
    
    for episode in range(0,1000):
#        print episode, " of 1000"
    	# one learning step after one episode of world-interaction
        y =experiment.doEpisodes(1)
        
#        print y
        x = randexperiment.doEpisodes(1)
#        print len(y[0])
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        

        l = 5
        resList = (agent.learner._allEvaluations)[-l:-1]
        
#            print agent.learner._allEvaluations
        from scipy import array

        rLen = len(resList)
        avReward = array(resList).sum()/rLen
#            print avReward
#            print resList
#            exit(0)
#            print("Parameters:", agent.learner._bestFound())
#            print(
#                " Evaluation:", episode,
#                " BestReward:", agent.learner.bestEvaluation,
#                " AverageReward:", avReward)
#            if agent.learner.bestEvaluation == 0:
#                
#                print resList[-20:-1]
#                print "done"
#                break
        #print resList
        performance.append(avReward)
        

        env.delay = False
        testagent.reset()
        #experiment.agent = agent
    
#            performance.append(r)
        if plot:
            plotPerformance(performance, pf_fig)
            
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
    blearner.add_ds(rlearner.dataset)
    
    blearner.learn()
    #blearner.learnX(agent.learner._allEvaluated)
    print "done"
    return performance
Example #35
task = LinearFATileCoding3456BalanceTask()
env = task.env

# The learning is very sensitive to the learning rate decay.
learner = SARSALambda_LinFA_ReplacingTraces(task.nactions,
                                            task.outdim,
                                            learningRateDecay=2000)
learner._lambda = 0.95

task.discount = learner.rewardDiscount

agent = LinearFA_Agent(learner)
agent.logging = False

exp = EpisodicExperiment(task, agent)

performance_agent = LinearFA_Agent(learner)
performance_agent.logging = False
performance_agent.greedy = True
performance_agent.learning = False

env.saveWheelContactTrajectories(True)
plt.ion()
plt.figure(figsize=(8, 4))

ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)


def update_wheel_trajectories():
Example #36
def run(arg):
    task = arg[0]
    parameters = arg[1]
    #print "run with", parameters
    
    seed = parameters["seed"]
   

    process_id = hash(multiprocessing.current_process()._identity)
    numpy.random.seed(seed + process_id)


    
    
    render = False    
    plot = False
    
    plt.ion()
    
    env = CartPoleEnvironment()
    if render:
        renderer = CartPoleRenderer()
        env.setRenderer(renderer)
        renderer.start()
    
    task_class = getattr(cp, task)
    task = task_class(env, parameters["MaxRunsPerEpisode"])
    
    testtask = task_class(env, parameters["MaxRunsPerEpisodeTest"],desiredValue=None)

    #print "dim: ", task.indim, task.outdim

    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.rl.agents import OptimizationAgent
    from pybrain.optimization import PGPE

    module = buildNetwork(task.outdim, task.indim, bias=False)
    # create agent with controller and learner (and its options)

    # % of random actions
    #learner.explorer.epsilon = parameters["ExplorerEpsilon"]
    
    
    agent = OptimizationAgent(module, PGPE(storeAllEvaluations = True,storeAllEvaluated=False, maxEvaluations=None,desiredEvaluation=1, verbose=False))
#
#    print agent
#    from pprint import pprint
#    pprint (vars(agent.learner))
    
    testagent = LearningAgent(module, None)
    experiment = EpisodicExperiment(task, agent)
    testexperiment = EpisodicExperiment(testtask, testagent)

    
    def plotPerformance(values, fig):
        plt.figure(fig.number)
        plt.clf()
        plt.plot(values, 'o-')
        plt.gcf().canvas.draw()
        # Without the next line, the pyplot plot won't actually show up.
        plt.pause(0.001)
    
    performance = []
    
    if plot:
        pf_fig = plt.figure()
    
    m = parameters["MaxTotalEpisodes"]/parameters["EpisodesPerLearn"]
    for episode in range(0,m):
    	# one learning step after one episode of world-interaction
        experiment.doEpisodes(parameters["EpisodesPerLearn"])
        #agent.learn(1)
    
        #renderer.drawPlot()
        
        # test performance (these real-world experiences are not used for training)
        if plot:
            env.delay = True
        
        if (episode) % parameters["TestAfter"] == 0:
            #print "Evaluating at episode: ", episode
            
            #experiment.agent = testagent
            #r = mean([sum(x) for x in testexperiment.doEpisodes(parameters["TestWith"])])
            #for i in range(0,parameters["TestWith"]):
#            y = testexperiment.doEpisodes(1)
#            print (agent.learner._allEvaluated)
#                
#            
#            from pprint import pprint
#            pprint (vars(task))
                
            l = parameters["TestWith"]
            
            task.N = parameters["MaxRunsPerEpisodeTest"]
            experiment.doEpisodes(l)
            task.N = parameters["MaxRunsPerEpisode"]

            resList = (agent.learner._allEvaluations)[-l:-1]
            
#            print agent.learner._allEvaluations
            from scipy import array

            rLen = len(resList)
            avReward = array(resList).sum()/rLen
#            print avReward
#            print resList
#            exit(0)
#            print("Parameters:", agent.learner._bestFound())
#            print(
#                " Evaluation:", episode,
#                " BestReward:", agent.learner.bestEvaluation,
#                " AverageReward:", avReward)
#            if agent.learner.bestEvaluation == 0:
#                
#                print resList[-20:-1]
#                print "done"
#                break
            performance.append(avReward)
            

            env.delay = False
            testagent.reset()
            #experiment.agent = agent
        
#            performance.append(r)
            if plot:
                plotPerformance(performance, pf_fig)
        
#            print "reward avg", r
#            print "explorer epsilon", learner.explorer.epsilon
#            print "num episodes", agent.history.getNumSequences()
#            print "update step", len(performance)
            
#    print "done"
    return performance
            
        #print "network",   json.dumps(module.bn.net.E, indent=2)
            
            
#import sumatra.parameters as p
#import sys
#parameter_file = sys.argv[1]
#parameters = p.SimpleParameterSet(parameter_file)
#
#
#run(["BalanceTask",parameters])
Example #37
    agent = FiniteDifferenceAgent(net, SPLA())
    # learning options
    agent.learner.gd.alpha = 0.2  #step size of \mu adaption
    agent.learner.gdSig.alpha = 0.085  #step size of \sigma adaption
    agent.learner.gd.momentum = 0.0

    #Loading weights
    if loadNet:
        agent.learner.original = loadWeights("grasp.wgt")
        agent.learner.gd.init(agent.learner.original)
        agent.learner.epsilon = 0.2
        agent.learner.initSigmas()

    batch = 2  #number of samples per gradient estimate
    #create experiment
    experiment = EpisodicExperiment(task, agent)
    prnts = 1  #frequency of console output
    epis = 5000000 / batch / prnts

    #actual roll outs
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)  #execute batch episodes
            agent.learn()  #learn from the gathered experience
            agent.reset()  #reset agent and environment
        #print out related data
        print "Step: ", runs, "/", (
            updates + 1) * batch * prnts, "Best: ", agent.learner.best,
        print "Base: ", agent.learner.baseline, "Reward: ", agent.learner.reward
        #Saving weights
        if saveNet:
Example #38
	def train(self, size, goal, initPose, mapSelect, envSelect, episodes, maxSteps, goalTol, randomizeInitPose):
 	
		avgReward = 0

		# set up environment and task
		self.env = mazeEnv(size, goal, initPose, mapSelect, envSelect, randomizeInitPose)
		self.task = MDPMazeTaskEpisodic(self.env, maxSteps, goalTol)

		# create neural net and learning agent
		self.params = buildNetwork(self.task.outdim, 48, self.task.indim, \
		bias=True, outclass=SoftmaxLayer)

		if self._PGPE:
			self.agent = OptimizationAgent(self.params, PGPE(minimize=True,verbose=False))
		elif self._CMAES:
			self.agent = OptimizationAgent(self.params, CMAES(minimize=True,verbose=False))

		# init experiment
		exp = EpisodicExperiment(self.task, self.agent)

		for i in range(0, episodes):        
			exp.doEpisodes()
			avgReward += self.task.getTotalReward()
			print "reward episode ",i,self.task.getTotalReward()

		# print initial info
		print "\naverage reward over training = ",avgReward/episodes

		# import weights into network and save network
		if self._PGPE:
			for i in range(len(self.params.params)):
				self.params.params[i] = self.agent.learner.current[i]
			pickle.dump(self.params, open('policyNet.pkl','w'))

		elif self._CMAES:

			################ following code came from WWInfoMaxCMAES.py script from ICDL 2010 paper
			arz = randn(self.agent.learner.numParameters, self.agent.learner.batchSize)
			arx = tile(self.agent.learner.center.reshape(self.agent.learner.numParameters, 1),\
			(1, self.agent.learner.batchSize)) + \
			self.agent.learner.stepSize * dot(dot(self.agent.learner.B, self.agent.learner.D), arz)
			# Go through the parameters and pick the current best 
			arfitness = zeros(self.agent.learner.batchSize)
			for k in xrange(self.agent.learner.batchSize):
			  self.agent.learner.wrappingEvaluable._setParameters(arx[:, k]);
			  arfitness[k] = self.agent.learner._BlackBoxOptimizer__evaluator\
			(self.agent.learner.wrappingEvaluable)

			# Sort by fitness and compute weighted mean into center
			tmp = sorted(map(lambda (x, y): (y, x), enumerate(ravel(arfitness))))
			arfitness = array(map(lambda x: x[0], tmp))
			arindex = array(map(lambda x: int(x[1]), tmp))

			arz = arz[:, arindex]
			curparams = arx[:, arindex[0]];

			# update network weights with selected parameters
			for i in range(len(self.params.params)):
				self.params.params[i] = curparams[i]
			# save trained network
			pickle.dump(self.params, open('policyNet.pkl','w'))
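# Once policyNet.pkl has been written, the trained policy can be reused without
# any learner attached. A minimal sketch of that idea (the zero observation is a
# placeholder; in practice it would come from the maze task's getObservation()):
import pickle
from numpy import argmax, zeros

policy_net = pickle.load(open('policyNet.pkl'))
observation = zeros(policy_net.indim)             # placeholder observation of the right size
action_probs = policy_net.activate(observation)   # softmax output, one value per action
print(argmax(action_probs))                       # index of the greedy action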
Exemple #39
0
import time

import numpy as np
from pybrain.rl.experiments import EpisodicExperiment

# CTS_Maze, CTS_MazeTask, GP_SARSA_SPARSE and GPSARSA_Agent are this project's
# own modules and are assumed to be importable from the surrounding package.
# Note: `sum` shadows the Python builtin; the name is kept as in the original script.
sum = []

track_time = []
dict_size = []

for repeat in range(1):
    env = CTS_Maze([0.50, 0.50])  # goal

    task = CTS_MazeTask(env)
    learner = GP_SARSA_SPARSE(gamma=0.95)
    learner.sigma = 1
    learner.batchMode = False  # not used here; set to True for batch learning
    agent = GPSARSA_Agent(learner)
    agent.logging = True

    exp = EpisodicExperiment(task, agent)
    agent.reset()
    sum = []
    performance = []
    track_time = []
    agent.init_exploration = 1.0
    starttime = time.time()
    dict_size = []
    epsilon = []

    b = []
    c = []
    for num_exp in range(600):
        performance = exp.doEpisodes(1)
        sum = np.append(sum, np.sum(performance))
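# After the training loop, the `sum` array above holds one cumulative reward per
# episode. A short follow-up sketch for inspecting it (assumes matplotlib is
# available; nothing here is part of the original script):
import matplotlib.pyplot as plt

plt.plot(sum)
plt.xlabel('episode')
plt.ylabel('total reward')
plt.title('GP-SARSA learning curve')
plt.show()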
Exemple #40
0
task = LinearFATileCoding3456BalanceTaskRewardBipolar()
learner = SARSALambda_LinFA_ReplacingTraces(task.nactions, task.outdim)
learner._lambda = 0.95
task.discount = learner.rewardDiscount
agent = LinearFA_Agent(learner)
agent.epsilonGreedy = True
agent.init_exploration = 0.5
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False
performance_agent = LinearFA_Agent(learner)
performance_agent.logging = False
performance_agent.greedy = True
performance_agent.learning = False
experiment = EpisodicExperiment(task, agent)

# TODO PyBrain says that the learning rate needs to decay, but I don't see that
# described in Randlov's paper.
# A higher number here means the learning rate decays slower.
learner.learningRateDecay = 2000
# NOTE: raising this value above its default of 100 is what made learning
# actually take off, and it fixed the issue where the performance agent's
# results stopped improving.

tr = LinearFATraining(
    'balance_sarsalambda_linfa_replacetrace_anneal_RewardBipolar_take2',
    experiment,
    performance_agent,
    verbose=True)
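# LinearFATraining above is this project's own training driver. Without it, the
# same train/evaluate split can be sketched directly on the experiment by
# swapping agents; the episode counts below are illustrative only, and both
# agents are assumed to follow the standard PyBrain agent interface.
for round_ in range(10):
    experiment.agent = agent
    experiment.doEpisodes(100)            # exploring, learning episodes

    experiment.agent = performance_agent
    rewards = experiment.doEpisodes(5)    # greedy evaluation episodes
    returns = [sum(r) for r in rewards]   # doEpisodes() yields per-step reward lists
    print('round %d: mean greedy return %.2f' % (round_, float(sum(returns)) / len(returns)))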
Exemple #41
0
from pybrain.rl.environments.cartpole.balancetask import BalanceTask
from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.agents import OptimizationAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.optimization import HillClimber

task = BalanceTask()
net = buildNetwork(task.outdim, 3, task.indim)
HillClimber(task, net, maxEvaluations=100).learn()
agent = OptimizationAgent(net, HillClimber())
exp = EpisodicExperiment(task, agent)
print(exp.doEpisodes(100))
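# doEpisodes(100) returns a list with one reward sequence per episode, so the
# print above dumps raw per-step rewards. A short sketch for condensing that
# output into per-episode returns instead (illustrative, not part of the original):
all_rewards = exp.doEpisodes(100)
returns = [sum(r) for r in all_rewards]
print(max(returns))   # best total return seen over these rollouts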
Exemple #42
0
from pybrain.rl.environments.timeseries.timeseries import MonthlySnPEnvironment
from pybrain.rl.learners.directsearch.rrl import RRL
# MaximizeReturnTask (used below) is assumed to live alongside the timeseries
# environment; adjust this import path to wherever it is defined in your tree.
from pybrain.rl.environments.timeseries.maximizereturntask import MaximizeReturnTask

from pybrain.structure import RecurrentNetwork
from pybrain.structure import LinearLayer, SigmoidLayer, TanhLayer, BiasUnit
from pybrain.structure import FullConnection
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment

from numpy import sign, round
from matplotlib import pyplot

net = RecurrentNetwork()
# Single linear input layer with a bias unit feeding a single tanh output layer;
# the connections into the tanh layer are what gets optimised.
net.addInputModule(BiasUnit(name='bias'))
net.addOutputModule(TanhLayer(1, name='out'))
net.addRecurrentConnection(FullConnection(net['out'], net['out'], name='c3'))
net.addInputModule(LinearLayer(1, name='in'))
net.addConnection(FullConnection(net['in'], net['out'], name='c1'))
net.addConnection(FullConnection(net['bias'], net['out'], name='c2'))
net.sortModules()
net._setParameters([-8.79227886e-02, -8.29319017e+02, 1.25946474e+00])
print(net._params)
env = MonthlySnPEnvironment()
task = MaximizeReturnTask(env)
learner = RRL()  # ENAC() # Q_LinFA(2, 1)
agent = LearningAgent(net, learner)
exp = EpisodicExperiment(task, agent)

exp.doEpisodes(10)
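# After the episodes above, the tanh output can be read as a trading signal;
# `sign` is imported above presumably for exactly that. A hedged sketch, using a
# made-up monthly-return observation of 0.01 rather than real S&P data:
position = sign(net.activate([0.01])[0])   # interpreted here as +1 = long, -1 = short
print(position)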
Exemple #43
0
            self._etraces[argwhere(action_bit != 1), argstate] = 0.0


import matplotlib.pyplot as plt  # plt is used below for the live trajectory plot

task = LinearFATileCoding3456BalanceTask()
env = task.env

# The learning is very sensitive to the learning rate decay.
learner = SARSALambda_LinFA_ReplacingTraces(task.nactions, task.outdim, learningRateDecay=2000)
learner._lambda = 0.95

task.discount = learner.rewardDiscount

agent = LinearFA_Agent(learner)
agent.logging = False

exp = EpisodicExperiment(task, agent)

performance_agent = LinearFA_Agent(learner)
performance_agent.logging = False
performance_agent.greedy = True
performance_agent.learning = False

env.saveWheelContactTrajectories(True)
plt.ion()
plt.figure(figsize=(8, 4))

ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)


def update_wheel_trajectories():
Exemple #44
0
for runs in range(numbExp):
    # create environment
    #Options: Bool(OpenGL), Bool(Realtime simu. while client is connected), ServerIP(default:localhost), Port(default:21560)
    if env is not None: env.closeSocket()
    env = ShipSteeringEnvironment()
    # create task
    task = GoNorthwardTask(env, maxsteps=500)
    # create controller network
    net = buildNetwork(task.outdim, task.indim, outclass=TanhLayer)
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(
        net,
        PGPE(learningRate=0.3,
             sigmaLearningRate=0.15,
             momentum=0.0,
             epsilon=2.0,
             rprop=False,
             storeAllEvaluations=True))
    et.agent = agent
    #create experiment
    experiment = EpisodicExperiment(task, agent)

    #Do the experiment
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)
        et.printResults((agent.learner._allEvaluations)[-50:-1], runs, updates)
    et.addExps()
et.showExps()
# To watch the simulation while it runs, create the environment with OpenGL enabled (True), go to pybrain/rl/environments/ode/ and start viewer.py (python-opengl must be installed; see the PyBrain documentation).
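# The loop above relies on `et`, `numbExp`, `epis`, `prnts` and `batch` being set
# up earlier in the script. In PyBrain's shipped ODE examples that setup looks
# roughly like the following; the values are illustrative, and the ExTools import
# path and signature may differ in your PyBrain version:
from pybrain.tools.example_tools import ExTools

numbExp = 5                   # independent experiment runs
batch = 2                     # episodes per learning step
prnts = 100                   # learning steps between console outputs
epis = 4000 / batch / prnts   # outer iterations per run
env = None                    # so the `env is not None` check is safe on the first run
et = ExTools(batch, prnts)    # collects and prints results across runs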
Exemple #45
0
        last_xf = self.env.last_xf
        last_yf = self.env.last_yf
        dist_to_goal_last = np.linalg.norm(target -
                                           np.array([last_xf, last_yf]))
        delta_dist = dist_to_goal - dist_to_goal_last
        return -delta_tilt - delta_dist * 0.01


import numpy as np               # used in the reward shaping above
import matplotlib.pyplot as plt  # used for the live trajectory plot below

task = LSPI_task()
learner = LSPI(9, 20)
task.rewardDiscount = 0.8
learner.rewardDiscount = 0.8

agent = LinearFA_Agent(learner)
agent.epsilonGreedy = True
exp = EpisodicExperiment(task, agent)
learner.learningRateDecay = 3000
max_agent = LinearFA_Agent(learner)
max_agent.learning = False
max_agent.greedy = True

task.env.saveWheelContactTrajectories(True)
plt.ion()
plt.figure(figsize=(8, 4))

ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)


def update_wheel_trajectories():
    front_lines = ax2.plot(task.env.get_xfhist(), task.env.get_yfhist(), 'r')