def testMaze():
    # simplified version of the reinforcement learning tutorial example
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 0, 0, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1]])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(3):
        experiment.doInteractions(40)

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    assert '\n'.join(''.join(row) for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'

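# For reference, a minimal self-contained version of the batch loop shared by
# the examples on this page (interact -> learn -> reset). The 5x5 maze is the
# one from testMaze() above; the iteration counts are arbitrary choices, not
# taken from any one example.
import numpy as np
from pybrain.rl.environments.mazes import Maze, MDPMazeTask
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.learners import Q
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import Experiment

structure = np.array([[1, 1, 1, 1, 1],
                      [1, 0, 0, 0, 1],
                      [1, 0, 1, 0, 1],
                      [1, 0, 1, 0, 1],
                      [1, 1, 1, 1, 1]])
environment = Maze(structure, (3, 3))             # goal cell as (row, col)
controller = ActionValueTable(structure.size, 4)  # one row per cell, 4 actions (NESW)
controller.initialize(1.)
agent = LearningAgent(controller, Q())
experiment = Experiment(MDPMazeTask(environment), agent)

for _ in range(50):
    experiment.doInteractions(40)  # gather a batch of experience
    agent.learn()                  # update the Q-table from the stored history
    agent.reset()                  # clear the history before the next batch
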
def testNet(learner, moduleNet, env, maxPlaneStartDist, stepSize, numAngs, thermRadius):
    # Turn off exploration
    from pybrain.rl.explorers.discrete.egreedy import EpsilonGreedyExplorer
    learner._setExplorer(EpsilonGreedyExplorer(0))
    agent = LearningAgent(moduleNet, learner)

    # Move the plane back to the start by resetting the environment
    env = contEnv.contThermEnvironment(maxPlaneStartDist, stepSize, numAngs, thermRadius)
    from simpleThermalTask import SimpThermTask
    task = SimpThermTask(env)
    from pybrain.rl.experiments import Experiment
    experiment = Experiment(task, agent)

    # Have the plane move 100 times, and plot its position (hopefully it moves to the high-reward area)
    testIter = 100
    trainResults = [env.distPlane()]
    for i in range(testIter):
        experiment.doInteractions(1)
        trainResults.append(env.distPlane())

    # Plot the results
    import matplotlib.pyplot as plt
    plt.figure(1)
    plt.plot(trainResults, 'o')
    plt.ylabel('Distance from center of thermal')
    plt.xlabel('Interaction iteration')
    plt.title('Test Results for Neural Fitted Q Learner')
    plt.show()

def run_bbox(verbose=False):
    n_features = n_actions = max_time = -1

    if bbox.is_level_loaded():
        bbox.reset_level()
    else:
        bbox.load_level("../levels/train_level.data", verbose=1)
    n_features = bbox.get_num_of_features()
    n_actions = bbox.get_num_of_actions()
    max_time = bbox.get_max_time()

    av_table = ActionValueTable(n_features, n_actions)
    av_table.initialize(0.2)
    print(av_table._params)

    learner = Q(0.5, 0.1)
    learner._setExplorer(EpsilonGreedyExplorer(0.4))
    agent = LearningAgent(av_table, learner)

    environment = GameEnvironment()
    task = GameTask(environment)
    experiment = Experiment(task, agent)

    while environment.finish_flag:
        experiment.doInteractions(1)
        agent.learn()

    bbox.finish(verbose=1)

def learn(self, number_of_iterations):
    learner = Q(0.2, 0.8)
    task = CartMovingTask(self.environment)
    # number of states is the product of the sizes of all discretized ranges
    self.controller = ActionValueTable(
        reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)),
        self.force_granularity)
    self.controller.initialize(1.)
    agent = LearningAgent(self.controller, learner)
    experiment = Experiment(task, agent)
    for i in range(number_of_iterations):
        experiment.doInteractions(1)
        agent.learn()
        agent.reset()
    with open("test.pcl", "wb") as f:  # binary mode, as pickle requires
        pickle.dump(self.controller, f)

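# Counterpart sketch (assumed, not part of the original): reloading the
# pickled controller written by learn() above; "test.pcl" is the file name
# used there.
import pickle

with open("test.pcl", "rb") as f:
    controller = pickle.load(f)
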
def maze():
    # import sys, time
    pylab.gray()
    pylab.ion()
    # The goal appears to be in the upper right.
    # NOTE: interior blanks in these rows are approximate; the source listing
    # collapsed runs of spaces, so each row is padded back to the 10-column
    # width the border rows imply.
    structure = [
        "!!!!!!!!!!",
        "! !  ! ! !",
        "! !! ! ! !",
        "!    !   !",
        "! !!!!!! !",
        "! !    ! !",
        "! ! !!!! !",
        "!        !",
        "! !!!!!  !",
        "!   !    !",
        "!!!!!!!!!!",
    ]
    structure = np.array([[ord(c) - ord(" ") for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.0)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(100):
        experiment.doInteractions(100)
        agent.learn()
        agent.reset()

        # 4 actions, shape.prod() = 110 locations/states (11x10 grid);
        # max(1) gives/plots the biggest objective function value for each square
        pylab.pcolor(controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape))
        pylab.draw()

    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list("NESW"))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(" #"))[structure])
    print("Maze map:")
    print("\n".join("".join(row) for row in maze))
    print("Greedy policy:")
    print("\n".join("".join(row) for row in greedy_policy))

def main():
    rospy.init_node("lauron_reinforcement_learning")
    environment = RLEnvironment()
    dim_state = environment.joint_states.shape[0]
    num_actions = len(environment.actions)
    controller = ActionValueNetwork(dim_state, num_actions)
    learner = SARSA()
    agent = LearningAgent(controller, learner)
    task = RLTask(environment)
    experiment = Experiment(task, agent)
    episode_counter = 0
    while True:
        print("Training episode {}".format(episode_counter))
        experiment.doInteractions(NUM_INTERACTIONS)
        agent.learn()
        agent.reset()
        episode_counter += 1

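# Side note (a sketch, not from the original): once trained, the greedy action
# for a state vector can be queried from an ActionValueNetwork directly. This
# reuses dim_state and controller from main() above; the zero vector is only a
# placeholder input of the right dimension.
import numpy as np

state = np.zeros(dim_state)
best_action = controller.getMaxAction(state)
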
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    # NOTE: interior blanks in these rows are approximate; the source listing
    # collapsed runs of spaces, so each row is padded back to the 10-column
    # width the border rows imply.
    structure = [
        list('!!!!!!!!!!'),
        list('! !  ! ! !'),
        list('! !! ! ! !'),
        list('!    !   !'),
        list('! !!!!!! !'),
        list('! !    ! !'),
        list('! ! !!!! !'),
        list('!        !'),
        list('! !!!!!  !'),
        list('!   !    !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row] for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    # sanity check: the greedy policy covers the full maze grid
    # (the exact greedy string varies from run to run, so it is not asserted here)
    assert greedy_policy.shape == tuple(shape)

def run():
    """
    number of states is:
        current value: 0-20
    number of actions:
        Stand=0, Hit=1
    """
    # define action value table
    av_table = ActionValueTable(MAX_VAL, MIN_VAL)
    av_table.initialize(0.)

    # define Q-learning agent
    q_learner = Q(Q_ALPHA, Q_GAMMA)
    q_learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, q_learner)

    # define the environment
    env = BlackjackEnv()

    # define the task
    task = BlackjackTask(env, verbosity=VERBOSE)

    # finally, define experiment
    experiment = Experiment(task, agent)

    # ready to go, start the process
    for _ in range(NB_ITERATION):
        experiment.doInteractions(1)
        if task.lastreward != 0:
            if VERBOSE:
                print("Agent learn")
            agent.learn()

    print('|First State|Choice 0 (Stand)|Choice 1 (Hit)|Relative value of Standing over Hitting|')
    print('|:-------:|:-------|:-----|:-----|')
    for i in range(MAX_VAL):
        print('| %s | %s | %s | %s |' % (
            (i + 1),
            av_table.getActionValues(i)[0],
            av_table.getActionValues(i)[1],
            av_table.getActionValues(i)[0] - av_table.getActionValues(i)[1]
        ))

class RL:
    def __init__(self):
        self.av_table = ActionValueTable(4, 5)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        rassh.core.constants.rl_params = self.av_table.params.reshape(4, 5)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()

class RL:
    def __init__(self):
        self.av_table = ActionValueTable(2, 3)
        self.av_table.initialize(0.1)
        learner = SARSA()
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)
        env = HASSHEnv()
        task = HASSHTask(env)
        self.experiment = Experiment(task, self.agent)

    def go(self):
        global rl_params
        kippo.core.constants.rl_params = self.av_table.params.reshape(2, 3)[0]
        self.experiment.doInteractions(1)
        self.agent.learn()

class PlayYourCardsRight(Feature):
    def __init__(self, text_to_speech, speech_to_text):
        Feature.__init__(self)

        # setup AV Table
        self.av_table = GameTable(13, 2)
        if self.av_table.loadParameters() == False:
            self.av_table.initialize(0.)

        # setup a Q-Learning agent
        learner = Q(0.5, 0.0)
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)

        # setup game interaction
        self.game_interaction = GameInteraction(text_to_speech, speech_to_text)

        # setup environment
        environment = GameEnvironment(self.game_interaction)

        # setup task
        task = GameTask(environment, self.game_interaction)

        # setup experiment
        self.experiment = Experiment(task, self.agent)

    @property
    def is_speaking(self):
        return self.game_interaction.is_speaking

    def _thread(self, args):
        # let's play our cards right!
        while not self.is_stop:
            self.experiment.doInteractions(1)
            self.agent.learn()
            self.av_table.saveParameters()

table = PropensityTable(payouts.shape[0])
table.initialize(500.0)

# learner = RothErev(experimentation=0.55, recency=0.3)
learner = VariantRothErev(experimentation=0.65, recency=0.3)
learner.explorer = BoltzmannExplorer(tau=100.0, decay=0.9995)
agent = LearningAgent(table, learner)
experiment = Experiment(task, agent)

epis = int(1e1)
batch = 2
avgRewards = scipy.zeros(epis)
allActions = scipy.zeros(epis * batch)
c = 0
for i in range(epis):
    experiment.doInteractions(batch)
    avgRewards[i] = scipy.mean(agent.history["reward"])
    allActions[c:c + batch] = agent.history["action"].flatten() + 1
    agent.learn()
    agent.reset()
    c += batch

pylab.figure(figsize=(16, 6))
# pylab.plot(avgRewards)
pylab.plot(allActions)
pylab.show()

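# Aside (a standalone sketch, independent of the PyBrain class): the Boltzmann
# explorer used above draws actions with probability roughly proportional to
# exp(value / tau), with tau decaying over time. The selection rule itself, in
# plain numpy:
import numpy as np

def boltzmann_choice(values, tau):
    # subtract the max before exponentiating for numerical stability
    p = np.exp((values - values.max()) / tau)
    p /= p.sum()
    return np.random.choice(len(values), p=p)
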
# controller.initialize(0.)

# learner = Q(0.5, 0.8)  # alpha 0.5, gamma 0.8
learner = Q()  # default alpha 0.5, gamma 0.99
# learner._setExplorer(EpsilonGreedyExplorer(0.5))
agent = LearningAgent(controller, learner)

task = ChainTask(env)
exp = Experiment(task, agent)

reward = 0
xs = []
ys = []
import matplotlib.pyplot as plt
for i in range(5000):
    exp.doInteractions(1)
    agent.learn()
    reward += agent.lastreward
    if i % 100 == 0:
        xs.append(i)
        ys.append(reward)
        print(i)
        # print(learner.laststate, learner.lastaction, learner.lastreward)
        # print(controller.params.reshape(5, 2))

print("TOTAL REWARD:", reward)
print(ys)

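# The matplotlib import in the snippet above is never used; a short sketch
# that actually plots the cumulative-reward curve collected in xs/ys:
plt.plot(xs, ys)
plt.xlabel('interaction')
plt.ylabel('cumulative reward')
plt.show()
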
from ObjectLocalizerEnvironment import ObjectLocalizerEnvironment
from DeepQNetwork import DeepQNetwork
from DeepQLearning import DeepQLearning
from MDPObjectLocalizerTask import MDPObjectLocalizerTask
from ObjectLocalizationAgent import ObjectLocalizationAgent

print('Starting Environment')
epsilon = 1.0
environment = ObjectLocalizerEnvironment(config.get('imageDir'), config.get('candidatesFile'), 'Training')
print('Initializing DeepQNetwork')
controller = DeepQNetwork()
controller.setEpsilonGreedy(epsilon)
print('Initializing Q Learner')
learner = DeepQLearning()
print('Preparing Agent')
agent = ObjectLocalizationAgent(controller, learner)
print('Configuring Task')
task = MDPObjectLocalizerTask(environment, config.get('groundTruth'))
print('Setting up Experiment')
experiment = Experiment(task, agent)

i = 0
print('Main Loop')
while i < config.geti('maximumEpochs'):
    print('Epoch', i, '(epsilon:{:5.3f})'.format(epsilon))
    experiment.doInteractions(int(config.get('numInteractions')))
    agent.learn()
    agent.reset()
    i += 1
    epsilon = adjustEpsilon(config.geti('maximumEpochs'), i, epsilon)
    controller.setEpsilonGreedy(epsilon)

# pylab.gray()
# pylab.ion()

# Learning phase
# Num iterations used for PROHA Workshop preliminary evaluation
# numIterations = 1600
numIterations = 1500
numInteractions = 600
# Num iterations used for PROHA and PROLE slides
# numIterations = 10
# numInteractions = 3

for i in range(numIterations):
    # interact with the environment (here in batch mode)
    experiment.doInteractions(numInteractions)
    agent.learn()
    agent.reset()

    # # and draw the table
    # # pylab.pcolor(table.params.reshape(numStates, numActions).max(1).reshape(numStates, numStates))
    # # # pylab.savefig('myfilename_%2d.png' % (i))
    # # pylab.show(block=True)
    # # print(table.params.reshape(numStates, numActions).max(1).reshape(numStates, 1))
    # print("\nIteration: %d" % (i))
    # print(table.params.reshape(numStates, numActions))
    # # print("-------------------------------------------------")

# exit(0)
# print(table.params.reshape(numStates, numActions))

# Set up the PyBrain and PyGame environments
environment = Environment()
game = RunPacman(environment)

# Create the task for the Pac-Man agent to accomplish and initialize the first action
task = PacmanTask(environment, game)
task.performAction(np.array([1]))

# The Experiment is the PyBrain link between the task to be completed and the agent completing it
experiment = Experiment(task, agent)

currentGame = 1
# Continue to loop until the 'X' on the GUI is clicked
while True:
    # Let the agent interact with the environment (move in a direction), then learn from it
    experiment.doInteractions(1)
    agent.learn()

    # Check whether the current Pac-Man game ended and a new one needs to start
    if game.wonGame == 1 or game.wonGame == -1:
        currentGame += 1
        # Store the information the agent has learned in long-term memory,
        # clear the short-term memory to reduce any chance of overfitting,
        # and reset the Pac-Man game and the environment for the next game
        agent.reset()
        environment.resetMap()
        game.__init__(environment)

)
predTable.initialize(0.)
predLearner = Q(ALPHA, GAMMA)
predLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
predAgent = LearningAgent(predTable, predLearner)
predEnv = PredatorEnvironment(world)
predTask = PredatorTask(predEnv)
predExp = Experiment(predTask, predAgent)

try:
    for t in range(MAX_TIME):
        print('t = %d' % t)
        world.t = t
        predExp.doInteractions(1)
        predAgent.learn()
        print('Colors vs. Q-table:')
        table_print(predTable._params, PredatorInteraction.NSTATES)
        print()
except KeyboardInterrupt:
    pass
finally:
    print('Background: %s' % BKGD_COLOR)
    print('Colors vs. Final Q-table:')
    table_print(predTable._params, PredatorInteraction.NSTATES)
    print()

counts = {'ate': {}, 'poison': 0, 'death': 0, 'poisondeath': 0, 'rejected': {}}

class ReinforcementLearningRunner():
    def __init__(self, mode):
        self.mode = mode
        cu.mem('Reinforcement Learning Started')
        self.environment = RegionFilteringEnvironment(config.get(mode + 'Database'), mode)
        self.controller = QNetwork()
        cu.mem('QNetwork controller created')
        self.learner = None
        self.agent = RegionFilteringAgent(self.controller, self.learner)
        self.task = RegionFilteringTask(self.environment, config.get(mode + 'GroundTruth'))
        self.experiment = Experiment(self.task, self.agent)

    def runEpoch(self, interactions, maxImgs):
        img = 0
        s = cu.tic()
        while img < maxImgs:
            self.experiment.doInteractions(interactions)
            self.agent.learn()
            self.agent.reset()
            self.environment.loadNextEpisode()
            img += 1
        s = cu.toc('Run epoch with ' + str(maxImgs) + ' episodes', s)

    def run(self):
        if self.mode == 'train':
            self.agent.persistMemory = True
            self.agent.startReplayMemory(len(self.environment.db.images),
                                         config.geti('trainInteractions'),
                                         config.geti('stateFeatures'))
            self.train()
        elif self.mode == 'test':
            self.agent.persistMemory = False
            self.test()

    def train(self):
        interactions = config.geti('trainInteractions')
        minEpsilon = config.getf('minTrainingEpsilon')
        epochSize = len(self.environment.db.images) / 2
        epsilon = 1.0
        self.controller.setEpsilonGreedy(epsilon)
        print('Epoch 0: Exploration')
        self.runEpoch(interactions, len(self.environment.db.images))
        self.learner = QLearning()
        self.agent.learner = self.learner
        epoch = 1
        egEpochs = config.geti('epsilonGreedyEpochs')
        while epoch <= egEpochs:
            epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
            if epsilon < minEpsilon:
                epsilon = minEpsilon
            self.controller.setEpsilonGreedy(epsilon)
            print('Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon))
            self.runEpoch(interactions, epochSize)
            epoch += 1
        epoch = 1
        maxEpochs = config.geti('exploitLearningEpochs')
        while epoch <= maxEpochs:
            print('Epoch', epoch + egEpochs, '(exploitation mode: epsilon={:5.3f})'.format(epsilon))
            self.runEpoch(interactions, epochSize)
            epoch += 1

    def test(self):
        interactions = config.geti('testInteractions')
        self.controller.setEpsilonGreedy(config.getf('testEpsilon'))
        self.runEpoch(interactions, len(self.environment.db.images))

class Player:
    def __init__(self):
        self.environment = GameEnv()
        av_table = ActionValueTable(self.environment.outdim, self.environment.indim)
        av_table.initialize(0.)
        # todo: save & restore agent's state
        learner = Q()
        learner._setExplorer(EpsilonGreedyExplorer())
        agent = LearningAgent(av_table, learner)
        self.agent = agent
        self.task = GameTask(self.environment)
        self.experiment = Experiment(self.task, self.agent)

    def name(self, index):
        self.me = index
        [self.opp1, self.opp2] = [i for i in range(3) if i != self.me]

    def hand(self, card):
        self.environment.reset()
        self.environment.setHand(card)
        self.environment.setStack(300)

    def bet1(self, min):
        self.environment.setPhase('bet-1')
        self.environment.setMinBet(min)
        self.experiment.doInteractions(1)
        bet = self.environment.getTranslatedAction()
        return bet

    def bet1_info(self, bets):
        opp1_bet = bets[self.opp1]
        opp2_bet = bets[self.opp2]
        self.environment.setOpponentsBets(opp1_bet, opp2_bet)

    def call1(self, current_bet):
        self.environment.setPhase('call-1')
        self.environment.setToCall(current_bet)
        self.experiment.doInteractions(1)
        is_calling = self.environment.getTranslatedAction()
        return is_calling

    def call1_info(self, in_game):
        opp1_in_game = in_game[self.opp1]
        opp2_in_game = in_game[self.opp2]
        self.environment.setOpponentsFolded(not opp1_in_game, not opp2_in_game)

    def bet2(self, min):
        self.environment.setPhase('bet-2')
        self.environment.setMinBet(min)
        self.experiment.doInteractions(1)
        bet = self.environment.getTranslatedAction()
        return bet

    def bet2_info(self, bets):
        opp1_bet = bets[self.opp1]
        opp2_bet = bets[self.opp2]
        self.environment.setOpponentsBets(opp1_bet, opp2_bet)

    def call2(self, current_bet):
        self.environment.setPhase('call-1')
        self.environment.setToCall(current_bet)
        self.experiment.doInteractions(1)
        is_calling = self.environment.getTranslatedAction()
        return is_calling

    def call2_info(self, in_game):
        opp1_in_game = in_game[self.opp1]
        opp2_in_game = in_game[self.opp2]

    def showdown(self, hand):
        opp1_hand = hand[self.opp1]
        opp2_hand = hand[self.opp2]

    def result(self, winnings):
        my_winnings = winnings[self.me]
        opp1_winnings = winnings[self.opp1]
        opp2_winnings = winnings[self.opp2]
        self.environment.setPhase('results')
        self.task.setWinnings(my_winnings)
        self.experiment.doInteractions(1)
        self.agent.learn()
        self.agent.reset()

)
mimicTable.initialize(0.)
mimicLearner = Q(ALPHA, GAMMA)
mimicLearner._setExplorer(EpsilonGreedyExplorer(EPSILON))
mimicAgent = LearningAgent(mimicTable, mimicLearner)
mimicEnv = MimicryPreyEnvironment(world)
mimicTask = MimicryPreyTask(mimicEnv)
mimicExp = Experiment(mimicTask, mimicAgent)

try:
    for t in range(MAX_TIME):
        print('t = %d' % t)
        world.t = t
        predExp.doInteractions(1)
        predAgent.learn()
        mimicExp.doInteractions(1)
        mimicAgent.learn()
        print('Mimicker Colors vs. Q-table:')
        table_print(mimicTable._params, MimicryPreyInteraction.NSTATES)
        print('Predator Colors vs. Q-table:')
        table_print(predTable._params, PredatorInteraction.NSTATES)
        print()
except KeyboardInterrupt:
    pass
finally:
    print('Background: %s' % BKGD_COLOR)
    print('Predator Colors vs. Final Q-table:')

import numpy

env = HitTheGoalEnv(5)
task = HitTheGoalTask(env, [5, 0, 0])
net = buildNetwork(2, 1, bias=False)

# create agent with controller and learner (and its options)
# agent = OptimizationAgent(net, CMAES())
# agent.learner.setEvaluator(task, agent.module)
agent = LearningAgent(net, Reinforce())
# agent.learner.explorer = EpsilonGreedyExplorer(0.0)
# agent.learner._setExplorer(EpsilonGreedyExplorer(0.0))
# agent.learner.explorer.sigma = [0.1]
# print(agent.learner.explorer.sigma)
# exit()

experiment = Experiment(task, agent)
itr = 0
# task.performAction(numpy.array([36]))
while True:
    # print(itr)
    # agent.learner.maxEvaluations += 1
    # agent.learner.learn()
    experiment.doInteractions(50)
    agent.learn()
    agent.reset()
    task.reset()
    # env.reset()
    # itr = itr + 1

def Py_Brain():
    ############################
    # pybrain
    ############################
    import math
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg
    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task
    import pylab

    # pylab.gray()
    # pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])
    num_states = int(structure.shape[0] * structure.shape[1])
    SQRT = int(math.sqrt(num_states))
    # print(structure.item((1, 3)))

    # environment = Maze(structure, (7, 7))  # second parameter is the goal field tuple
    environment = Maze(structure, (1, 3))  # second parameter is the goal field tuple
    print(type(environment))
    print(environment)

    # The standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4)  # [N, S, E, W]
    controller.initialize(1)
    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)  # no-op check left in the original
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    # while True:
    for x in range(4):
        print(x)
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()
        pylab.pcolor(controller.params.reshape(num_states, 4).max(1).reshape(SQRT, SQRT))
        pylab.draw()
        # pylab.show()

    name = 'MAZE'
    plt.savefig(str(name) + '_PLOT.png')
    plt.close()

# set the task
task = charge_opt(environment, 0.8, 0.01)

# do the experiment
number_of_runs = 20

# change the reward and run the whole experiment
task.change_reward(1, 0.0)

# create the experiment
experiment = Experiment(task, agent)
k = 0
while k < number_of_runs:
    experiment.doInteractions(96)
    agent.learn()
    agent.reset()
    # log some data from the first and last run
    if k == 0:  # if it is the first run
        first_run_time2 = environment.log_time
        first_run_soc2 = environment.log_soc
        first_run_volt2 = environment.log_volt
    # if k == number_of_runs - 1:  # if it is the last run
    #     last_run_time2 = environment.log_time
    #     last_run_soc2 = environment.log_soc
    #     last_run_volt2 = environment.log_volt
    environment.reset()
    k += 1

agent.learning = False  # to keep it from exploring

learner = Q()
agent = LearningAgent(actionValueNetwork, learner)
experiment = Experiment(task, agent)

start = time()
i = 0
while True:
    for state in range(control.get_randomize_states()):
        control.randomize(state)
        task.reset()
        print("run %d" % i)
        experiment.doInteractions(1000)
        agent.learn()
        agent.reset()
        with open('q/rewards.csv', 'a') as f:
            f.write("%f,%d\n" % (task.getTotalReward(), time() - start))
        # print("learn")
        # agent.learn()
        # agent.reset()
        # control.pause()
        pylab.pcolor(actionValueNetwork.params.reshape(32, actions).max(1).reshape(8, 4).T)
        pylab.pause(0.01)
    if (i % 20) == 0:
        print("save network")

exit("quiting") def start(unused_addr, args, message): print("RL starting") while True: experiment.doInteractions(6) # make a number of interaction in-between learning agent.learn() agent.reset() if __name__ == "__main__": # dispatch osc messages disp = dispatcher.Dispatcher() disp.map("/test", print) # dumb input message disp.map("/quit", self_quit, "ok") disp.map("/start", start, "ok") #disp.map("/iterate", self_quit, "ok") #disp.map("/reset", reset, "ok") server = osc_server.ThreadingOSCUDPServer( ("127.0.0.1", listening_port), disp) print("Serving on {}".format(server.server_address)) #server.serve_forever() while True: experiment.doInteractions(6) # make a number of interaction in-between learning agent.learn() agent.reset()
# define action-value table
# number of states:
#   current value: 1-21
# number of actions:
#   Stand=0, Hit=1
av_table = ActionValueTable(21, 2)
av_table.initialize(0.)

# define Q-learning agent
learner = Q(0.5, 0.0)
learner._setExplorer(EpsilonGreedyExplorer(0.0))
agent = LearningAgent(av_table, learner)

# define the environment
env = BlackjackEnv()

# define the task
task = BlackjackTask(env)

# finally, define experiment
experiment = Experiment(task, agent)

# ready to go, start the process
while True:
    experiment.doInteractions(1)
    agent.learn()
    agent.reset()

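# Follow-up sketch (not part of the original): reading the learned greedy
# policy back out of the table above; getActionValues(state) returns that
# state's row of Q-values, one entry per action.
for state in range(21):
    q_stand, q_hit = av_table.getActionValues(state)
    print(state + 1, 'Stand' if q_stand >= q_hit else 'Hit')
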
import pickle
import time

# Create environment
sub_env = Environment(20, 20)
world = World(sub_env)

# Brain for the animat; we have already trained the data
f = open('neuro.net', 'rb')  # binary mode, as pickle requires
trained_net = pickle.load(f)
brain = BrainController(trained_net)

# Learning method we use
# learner = PolicyGradientLearner()
learner = ENAC()
learner._setLearningRate(0.2)

# Create an animat
animat = StupidAnimat(trained_net, learner, sub_env)

# Establish a task
task = InteractTask(world, animat)
brain.validate_net()
experiment = Experiment(task, animat)

while True:
    experiment.doInteractions(10000)
    animat.learn()
    animat.reset()
    brain.validate_net()
    time.sleep(3)

# standard exploration is e-greedy, but a different type can be chosen as well
# learner.explorer = BoltzmannExplorer()

# create agent
agent = LearningAgent(table, learner)

# create experiment
experiment = Experiment(task, agent)

# prepare plotting
pylab.gray()
pylab.ion()

# for i in range(100):
while True:
    # interact with the environment (here in batch mode)
    experiment.doInteractions(matrix_size)
    agent.learn()
    agent.reset()

    # and draw the table
    print(table.params.reshape(matrix_size, 2))
    # print(table.params.reshape(matrix_size, matrix_size))
    pylab.pcolor(table.params.reshape(matrix_size, 2).max(1).reshape(matrix_size, 1))
    # pylab.pcolor(table.params.reshape(matrix_size, matrix_size).max(1).reshape(matrix_size, 1))
    pylab.draw()
    pylab.ion()
    pylab.show()

print("training complete")

class RlOp(threading.Thread):
    episodes = 1
    epilen = 200

    def __init__(self, event_queue_name, hub_queue_name):
        super().__init__()

        # create environment
        self.conn = boto.sqs.connect_to_region(constants.REGION)
        self.event_queue = self.conn.get_queue(event_queue_name)
        self.event_queue.set_message_class(MHMessage)
        self.env = DogEnv(DogEnv.ALL_QUIET, DogEnv.ALL_QUIET, self.event_queue, hub_queue_name)
        self.env.delay = (self.episodes == 1)

        # create task
        self.task = QuietDogTask(self.env)

        # create value table and initialize with ones
        # TODO: Get number of states from DogEnv
        self.table = ActionValueTable(2 * 5 * 4, 5 * 4)
        self.table.initialize(1.)

        # create agent with controller and learner - use SARSA(), Q() or QLambda() here
        self.learner = SARSA()

        # standard exploration is e-greedy, but a different type can be chosen as well
        self.learner.explorer = BoltzmannExplorer()

        # create agent
        self.agent = DogAgent(self.table, self.learner)

        # create experiment
        self.experiment = Experiment(self.task, self.agent)

    def run(self):
        self.call_run()

    def call_run(self):
        print('RlOp: running')

        # prepare plotting
        pylab.gray()
        pylab.ion()

        for i in range(1000):
            # interact with the environment (here in batch mode)
            self.experiment.doInteractions(100)
            self.agent.learn()
            self.agent.reset()

            results0 = self.table.params.reshape(2, 4, 5, 20)[0]
            results1 = self.table.params.reshape(2, 4, 5, 20)[1]
            pp.pprint(results0.argmax(2))
            pp.pprint(results1.argmax(2))

            # and draw the table
            # ar = self.table.params.reshape(2, 5, 4, 5, 4)
            # for state1 in range(len(constants.SOUNDS)):
            #     for state2 in range(4):
            #         pylab.pcolor(ar[1][state1][state2])
            #         pylab.draw()

        results0 = self.table.params.reshape(2, 4, 5, 20)[0]
        results1 = self.table.params.reshape(2, 4, 5, 20)[1]
        while True:
            time.sleep(60)
            pp.pprint(results0.argmax(2))
            pp.pprint(results1.argmax(2))