def __init__(self, n_threads=4, initial_port=19997, q_table_version=0,
                 batch_size=None, learner=None, explorer=None):
        self.barrier = Barrier(n_threads + 1, timeout=720)
        self.n_threads = n_threads
        self.initial_port = initial_port
        self.batch_size = batch_size

        self.controller = MyActionValueTable(q_table_version)
        if learner is None:
            self.learner = Q(0.5, 0.9)
        else:
            self.learner = learner

        if explorer is None:
            self.explorer = self.learner.explorer = EpsilonGreedyExplorer(0.2, 0.998)
        else:
            self.explorer = self.learner.explorer = explorer
        self.agent = LearningAgent(self.controller, self.learner)
        # Logger initialization
        self.logger = logging.getLogger('master_logger')
        self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(logging.FileHandler(Utils.DATA_PATH + 'learning-tables/master.log'))
        self.failed_simulations = []
        self.n_episodes = 0
        self.simulations = []
        self.initialize_simulations()
Example #2
def q_learning_table():
    controller = ActionValueTable(36, 4)
    learner = Q()
    controller.initialize(1.)

    agent = LearningAgent(controller, learner)

    score_list = []
    turn_list = []
    # +100 extra iterations on top of the neural-net side's training
    for i in range(600):
        print_state(agent.module.getValue, 'table')

        score, turn = play(agent, 'table')
        score_list.append(score)
        turn_list.append(turn)

        agent.learn()
        agent.reset()

        print(i, int(numpy.mean(score_list)), max(score_list), score, turn)

        with open('./agent.dump', 'wb') as f:  # binary mode for pickle
            pickle.dump(agent, f)
        with open('./score.dump', 'wb') as f:
            pickle.dump([score_list, turn_list], f)
Example #3
def createAgent(module):
    ### create agent with controller and learner - use SARSA(), Q() or QLambda() here
    ## alpha -- learning rate (preference of new information -- update value factor)
    ## gamma -- discount factor (importance of future reward -- next value factor)
    
    learner = Q(0.2, 0.99)
    # learner = SARSA(0.2, 0.99)
    ## learner = QLambda(0.5, 0.99, 0.9)
    explorer = learner.explorer
    explorer.decay = 1.0  # a decay of 1.0 keeps the exploration rate constant
     
    agent = LearningAgent(module, learner)
    return agent
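A minimal usage sketch (the sizes are illustrative, not from the original; any PyBrain action-value module can serve as `module`):

from pybrain.rl.learners.valuebased import ActionValueTable

module = ActionValueTable(36, 4)  # 36 states, 4 actions (illustrative sizes)
module.initialize(0.0)
agent = createAgent(module)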
Example #4
 def __init__(self,
              name,
              num_states,
              num_actions,
              epsilon=0.3,
              gamma=0.99,
              alpha=0.95):
     self.controller = ActionValueTable(num_states, num_actions)
     self.controller.initialize(np.random.rand(num_states * num_actions))
     self.learner = Q(gamma=gamma, alpha=alpha)
     self.learner.batchMode = False
     self.learner.explorer.epsilon = epsilon
     LearningAgent.__init__(self, self.controller, self.learner)
     Agent.__init__(self, name)
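Instantiation then only needs a name and the table dimensions (a sketch; the excerpt does not show the class name, so TabularQAgent here is hypothetical):

agent = TabularQAgent('player-1', num_states=36, num_actions=4,
                      epsilon=0.3, gamma=0.99, alpha=0.95)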
Example #5
 def learn(self, number_of_iterations):
     learner = Q(0.2, 0.8)
     task = CartMovingTask(self.environment)
     self.controller = ActionValueTable(
         # number of states = product of the sizes of all discretized ranges
         # (on Python 3, reduce lives in functools)
         reduce(lambda x, y: x * y, map(lambda x: len(x), self.ranges)),
         self.force_granularity)
     self.controller.initialize(1.)
     agent = LearningAgent(self.controller, learner)
     experiment = Experiment(task, agent)
     for i in range(number_of_iterations):
         experiment.doInteractions(1)
         agent.learn()
         agent.reset()
     with open("test.pcl", "wb") as f:  # binary mode for pickle
         pickle.dump(self.controller, f)
Example #6
def initExperiment(learnalg='Q',
                   history=None,
                   binEdges='10s',
                   scriptfile='./rlRunExperiment_v2.pl',
                   resetscript='./rlResetExperiment.pl'):

    if binEdges == '10s':
        centerBinEdges = centerBinEdges_10s
    elif binEdges == '30s':
        centerBinEdges = centerBinEdges_30s
    elif binEdges == 'lessperturbed':
        centerBinEdges = centerBinEdges_10s_lessperturbed
    elif binEdges is None:
        centerBinEdges = None
    else:
        raise ValueError("No bins for given binEdges setting")

    env = OmnetEnvironment(centerBinEdges, scriptfile, resetscript)
    if history is not None:
        env.data = history['data']

    task = OmnetTask(env, centerBinEdges)
    if history is not None:
        task.allrewards = history['rewards']

    if learnalg == 'Q':
        nstates = env.numSensorBins**env.numSensors
        if history is None:
            av_table = ActionValueTable(nstates, env.numActions)
            av_table.initialize(1.)
        else:
            av_table = history['av_table']
        learner = Q(0.1, 0.9)  # alpha, gamma
        learner._setExplorer(EpsilonGreedyExplorer(0.05))  # epsilon
    elif learnalg == 'NFQ':
        av_table = ActionValueNetwork(env.numSensors, env.numActions)
        learner = NFQ()
    else:
        raise ValueError("learnalg unknown")

    agent = LearningAgent(av_table, learner)

    experiment = Experiment(task, agent)
    if history is None:
        experiment.nruns = 0
    else:
        experiment.nruns = history['nruns']
    return experiment
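A later run can resume by rebuilding the history dict from a finished experiment, using only the keys the function reads (a sketch):

history = {'data': experiment.task.env.data,
           'rewards': experiment.task.allrewards,
           'av_table': experiment.agent.module,
           'nruns': experiment.nruns}
experiment = initExperiment(learnalg='Q', history=history)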
Example #7
 def __init__(self, name, clientID, sensorHandle, bodyHandle):
     '''
     Constructor
     '''
     self.resetParameters()
     controller = ActionValueTable(150, 5)   # pyBrain
     controller.initialize(1.)               # pyBrain
     learner = Q()                           # pyBrain
     self.__mind = AgentMind(controller, learner)  # with pyBrain
     self.__controller = controller
     self.__name = name
     self.__clientID = clientID          # Client ID of the Dummy object
     self.__sensorHandle = sensorHandle  # Proximity sensor handle of the V-Rep agent
     self.__bodyHandle = bodyHandle      # BubbleRob body handle
     self.__mind.setInput("name", name)
     self.__pybrainEnvironment = LocomotionEnvironment()
     self.__pybrainTask = LocomotionTask(self.__pybrainEnvironment)
Example #8
    def maze():
        # import sys, time
        pylab.gray()
        pylab.ion()
        # The goal appears to be in the upper right
        structure = [
            '!!!!!!!!!!',
            '! !  ! ! !',
            '! !! ! ! !',
            '!    !   !',
            '! !!!!!! !',
            '! ! !    !',
            '! ! !!!! !',
            '!        !',
            '! !!!!!  !',
            '!   !    !',
            '!!!!!!!!!!',
            ]
        structure = np.array([[ord(c)-ord(' ') for c in row] for row in structure])
        shape = np.array(structure.shape)
        environment = Maze(structure, tuple(shape - 2))
        controller = ActionValueTable(shape.prod(), 4)
        controller.initialize(1.)
        learner = Q()
        agent = LearningAgent(controller, learner)
        task = MDPMazeTask(environment)
        experiment = Experiment(task, agent)

        for i in range(100):
            experiment.doInteractions(100)
            agent.learn()
            agent.reset()
            # one value per state: take the max over the 4 actions and plot it on the maze grid
            pylab.pcolor(controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape))
            pylab.draw()

        # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
        greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
        greedy_policy = np.flipud(np.array(list('NESW'))[greedy_policy].reshape(shape))
        maze = np.flipud(np.array(list(' #'))[structure])
        print('Maze map:')
        print('\n'.join(''.join(row) for row in maze))
        print('Greedy policy:')
        print('\n'.join(''.join(row) for row in greedy_policy))
Example #9
def test_maze():
    # simplified version of the reinforcement learning tutorial example
    structure = [
        list('!!!!!!!!!!'),
        list('! !  ! ! !'),
        list('! !! ! ! !'),
        list('!    !   !'),
        list('! !!!!!! !'),
        list('! ! !    !'),
        list('! ! !!!! !'),
        list('!        !'),
        list('! !!!!!  !'),
        list('!   !    !'),
        list('!!!!!!!!!!'),
    ]
    structure = np.array([[ord(c) - ord(' ') for c in row]
                          for row in structure])
    shape = np.array(structure.shape)
    environment = Maze(structure, tuple(shape - 2))
    controller = ActionValueTable(shape.prod(), 4)
    controller.initialize(1.)
    learner = Q()
    agent = LearningAgent(controller, learner)
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    for i in range(30):
        experiment.doInteractions(30)
        agent.learn()
        agent.reset()

    # smoke-test that the parameter table reshapes onto the maze grid
    controller.params.reshape(shape.prod(), 4).max(1).reshape(*shape)
    # (0, 0) is upper left and (0, N) is upper right, so flip matrix upside down to match NESW action order
    greedy_policy = np.argmax(controller.params.reshape(shape.prod(), 4), 1)
    greedy_policy = np.flipud(
        np.array(list('NESW'))[greedy_policy].reshape(shape))
    maze = np.flipud(np.array(list(' #'))[structure])
    print('Maze map:')
    print('\n'.join(''.join(row) for row in maze))
    print('Greedy policy:')
    print('\n'.join(''.join(row) for row in greedy_policy))
    # NOTE: the expected string below corresponds to a smaller 5x5 maze; with the
    # 11x10 structure above, the joined policy grid has 11 rows of 10 characters
    assert '\n'.join(
        ''.join(row)
        for row in greedy_policy) == 'NNNNN\nNSNNN\nNSNNN\nNEENN\nNNNNN'
Example #10
def run():
    """
    number of states is:
    current value: 0-20

    number of actions:
    Stand=0, Hit=1 """

    # define action value table: ActionValueTable(numStates, numActions);
    # the constants come from the surrounding module
    av_table = ActionValueTable(MAX_VAL, MIN_VAL)
    av_table.initialize(0.)

    # define Q-learning agent
    q_learner = Q(Q_ALPHA, Q_GAMMA)
    q_learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(av_table, q_learner)

    # define the environment
    env = BlackjackEnv()

    # define the task
    task = BlackjackTask(env, verbosity=VERBOSE)

    # finally, define experiment
    experiment = Experiment(task, agent)

    # ready to go, start the process
    for _ in range(NB_ITERATION):
        experiment.doInteractions(1)
        if task.lastreward != 0:
            if VERBOSE:
                print "Agent learn"
            agent.learn()

    print('|First State|Choice 0 (Stand)|Choice 1 (Hit)|Relative value of Standing over Hitting|')
    print('|:-------:|:-------|:-----|:-----|')
    for i in range(MAX_VAL):
        print('| %s | %s | %s | %s |' % (
            (i + 1),
            av_table.getActionValues(i)[0], av_table.getActionValues(i)[1],
            av_table.getActionValues(i)[0] - av_table.getActionValues(i)[1]))
Example #11
def main():
    client_id = Utils.connectToVREP()

    # Define RL elements
    environment = StandingUpEnvironment(client_id)
    task = StandingUpTask(environment)
    controller = MyActionValueTable()
    learner = Q(0.5, 0.9)
    learner.explorer = EpsilonGreedyExplorer(0.15, 1)  # EpsilonGreedyBoltzmannExplorer()
    agent = LearningAgent(controller, learner)
    experiment = EpisodicExperiment(task, agent)

    controller.initialize(agent)

    i = 0
    try:
        while True:
            i += 1
            print('Episode ' + str(i))
            experiment.doEpisodes()
            agent.learn()
            agent.reset()
            print('mean: '+str(numpy.mean(controller.params)))
            print('max: '+str(numpy.max(controller.params)))
            print('min: '+str(numpy.min(controller.params)))

            if i % 500 == 0:  # Save q-table every 500 episodes
                print('Save q-table')
                controller.save()
                task.t_table.save()

    except (KeyboardInterrupt, SystemExit):
        with open('../data/standing-up-q.pkl', 'wb') as handle:
            pickle.dump(controller.params, handle)
        task.t_table.save()
        controller.save()

    vrep.simxFinish(client_id)
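Restoring a saved table later mirrors the interrupt handler above (a sketch; `_setParameters` is PyBrain's generic parameter setter):

import pickle

with open('../data/standing-up-q.pkl', 'rb') as handle:
    controller._setParameters(pickle.load(handle))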
Example #12
    def __init__(self, text_to_speech, speech_to_text):
        Feature.__init__(self)

        # setup AV Table
        self.av_table = GameTable(13, 2)
        if not self.av_table.loadParameters():
            self.av_table.initialize(0.)

        # setup a Q-Learning agent
        learner = Q(0.5, 0.0)
        learner._setExplorer(EpsilonGreedyExplorer(0.0))
        self.agent = LearningAgent(self.av_table, learner)

        # setup game interaction
        self.game_interaction = GameInteraction(text_to_speech, speech_to_text)

        # setup environment
        environment = GameEnvironment(self.game_interaction)

        # setup task
        task = GameTask(environment, self.game_interaction)

        # setup experiment
        self.experiment = Experiment(task, self.agent)
Example #13
    task.max_samples = 500

    actions = len(environment.actions)

    actionValueNetwork = ActionValueTable(task.outdim, task.indim)
    actionValueNetwork.stdParams = 0.0001
    actionValueNetwork.randomize()
    # actionValueNetwork = ActionValueNetwork(task.outdim,task.indim)
    # if os.path.isfile("q/q_train.npy"):
    #    actionValueNetwork.param = np.load("q/q_train.npy")
    #else: actionValueNetwork.initialize(0.0001)
    # if os.path.isfile("nfq.xml"): actionValueNetwork.network = NetworkReader.readFrom('nfq.xml')
    pylab.pcolor(actionValueNetwork.params.reshape(32, actions).max(1).reshape(8,4).T)
    pylab.pause(0.01)

    learner = Q()
    agent = LearningAgent(actionValueNetwork, learner)
    experiment = Experiment(task, agent)

    start = time()

    i = 0

    while True:
        for state in range(control.get_randomize_states()):
            control.randomize(state)
            task.reset()

            print("run %d" % i)

            experiment.doInteractions(1000)
Example #14
states = 165  #Has to match class Env(Environment) - outdim  in environment_01.py
actions = 2  #Has to match class Env(Environment) - indim  in environment_01.py

try:
    arr = np.loadtxt('/home/pi/Desktop/ray_bot/ray_bot2.csv', delimiter=';')
    # open action value table  from .csv file
except Exception as e:
    #    print(e)
    # if the file does not exist (i.e. on the first run), create and
    # initialize the table with zeros instead
    arr = np.zeros((states, actions))

av_table = ActionValueTable(states, actions)
av_table.initialize(arr.flatten())

# define Q-learning agent
learner = Q(0.1, 0.5)
learner._setExplorer(EpsilonGreedyExplorer(0.5))
agent = LearningAgent(av_table, learner)

# define the environment
env = Env()

# define the task
task = Task(env)

# define experiment
experiment = Experiment(task, agent)

# ready to go, start the process
#while PIR_sensing(PIR)==1 and ultrasonic (ECHO, TRIG)==1:
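The table is loaded from CSV above but never written back; the matching save would be (a sketch, assuming the same path and layout):

np.savetxt('/home/pi/Desktop/ray_bot/ray_bot2.csv',
           av_table.params.reshape(states, actions), delimiter=';')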
Example #15
if __name__ == "__main__":
    # testing the environment and task

    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.learners import Q
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.explorers import EpsilonGreedyExplorer

    env = Chain()
    controller = ActionValueTable(env.outdim, env.indim)
    controller.initialize(1.)
    #    controller.initialize(0.)

    #    learner = Q(0.5, 0.8) # alpha 0.5, gamma 0.8
    learner = Q()  # default alpha 0.5, gamma 0.99
    #    learner._setExplorer(EpsilonGreedyExplorer(0.5))
    agent = LearningAgent(controller, learner)

    task = ChainTask(env)
    exp = Experiment(task, agent)

    reward = 0
    xs = []
    ys = []

    import matplotlib.pyplot as plt
    for i in range(5000):
        exp.doInteractions(1)
        agent.learn()
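        # (sketch) the unused reward/xs/ys and the matplotlib import suggest the
        # loop also tracked cumulative reward -- an assumption, not in the original:
        reward += agent.lastreward
        xs.append(i)
        ys.append(reward)
    plt.plot(xs, ys)
    plt.show()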
Example #16
def Py_Brain():
    ############################
    # pybrain
    ############################
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap
    import itertools
    from scipy import linalg

    from pybrain.rl.environments.mazes import Maze, MDPMazeTask
    from pybrain.rl.learners.valuebased import ActionValueTable
    from pybrain.rl.agents import LearningAgent
    from pybrain.rl.learners import Q, SARSA
    from pybrain.rl.experiments import Experiment
    from pybrain.rl.environments import Task

    import pylab
    #pylab.gray()
    #pylab.ion()

    '''
    structure = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                          [1, 0, 0, 1, 0, 0, 0, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 0, 1, 0, 1],
                          [1, 0, 0, 1, 0, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 1, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 0, 1],
                          [1, 0, 0, 0, 0, 0, 0, 0, 1],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1]])
    '''
    structure = np.array([[1, 1, 1, 1, 1],
                          [1, 1, 0, 0, 1],
                          [1, 1, 0, 1, 1],
                          [1, 0, 0, 1, 1],
                          [1, 1, 1, 1, 1]])

    num_states = int(structure.shape[0]*structure.shape[1])
    SQRT = int(math.sqrt(num_states))
    #print structure.item((1, 3))
    #environment = Maze(structure, (7, 7)) #second parameter is goal field tuple
    environment = Maze(structure, (1, 3)) #second parameter is goal field tuple
    print(type(environment))
    print(environment)
    # Standard maze environment comes with the following 4 actions:
    # North, South, East, West
    controller = ActionValueTable(num_states, 4) #[N, S, E, W] 
    controller.initialize(1)

    learner = Q()
    agent = LearningAgent(controller, learner)
    np.not_equal(agent.lastobs, None)  # no-op: the result is discarded
    task = MDPMazeTask(environment)
    experiment = Experiment(task, agent)

    #while True:
    for x in range(4):
        print(x)
        experiment.doInteractions(10)
        agent.learn()
        agent.reset()

        pylab.pcolor(controller.params.reshape(num_states,4).max(1).reshape(SQRT,SQRT))
        pylab.draw()
        #pylab.show()
        name='MAZE'
        plt.savefig(str(name)+'_PLOT.png')
    plt.close()
Example #17
import sys, time
from scipy import *

from pybrain.rl.environments import Task
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.environments.mazes import Maze, MDPMazeTask
from pybrain.rl.experiments import Experiment
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, SARSA

var_structure_arr_ = array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                            [1, 0, 0, 1, 0, 0, 0, 0, 1],
                            [1, 0, 0, 1, 0, 0, 1, 0, 1],
                            [1, 0, 0, 1, 0, 0, 1, 0, 1],
                            [1, 0, 0, 1, 0, 1, 1, 0, 1],
                            [1, 0, 0, 0, 0, 0, 1, 0, 1],
                            [1, 1, 1, 1, 1, 1, 1, 0, 1],
                            [1, 0, 0, 0, 0, 0, 0, 0, 1],
                            [1, 1, 1, 1, 1, 1, 1, 1, 1]])

var_environment_ = Maze(var_structure_arr_, (7, 7))  # goal cell, as in the PyBrain maze tutorial

var_controller_ = ActionValueTable(81, 4)
var_controller_.initialize(1.0)

var_learner_ = Q()
var_Agent_ = LearningAgent(var_controller_, var_learner_)

var_task_ = MDPMazeTask(var_environment_)

experiment = Experiment(var_task_, var_Agent_)
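The experiment above is assembled but never run; a driving loop in the style of the earlier maze examples would be (a sketch):

for _ in range(100):
    experiment.doInteractions(100)
    var_Agent_.learn()
    var_Agent_.reset()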
Example #18
    case.generators[1].p_cost = (0.0, 6.5, 200.0)
    case.generators[2].p_cost = (0.0, 2.0, 200.0)

    case.generators[0].c_shutdown = 100.0

    #case.generators[0].p_min = 0.0 # TODO: Unit-decommitment.
    #case.generators[1].p_min = 0.0
    ##case.generators[2].p_min = 0.0

    case.generators[0].p_max = 100.0
    case.generators[1].p_max = 70.0
    case.generators[2].p_max = 70.0

    vre = VariantRothErev(experimentation=0.55, recency=0.3)
    vre.explorer = BoltzmannExplorer()  #tau=100, decay=0.95)
    learners = [vre, Q(), Reinforce()]

    profile = [0.9, 0.6]

    m = (20, 75)  # markups
    nb = 1  # no. offers
    ns = 3  # no. states

    mx = 60.0  # max markup

    weeks = 2
    days = 2

    outdir = "/tmp/case6ww1"
    dc = True
Example #19
from pacmanAgent import PacmanAgent
from runPacman import RunPacman
from ghost import Ghost
from pacmanEnvironment import Environment

###############################################################
# The main function that begins running our Pacman-In-AI game #
###############################################################
if __name__ == "__main__":

    # Initialize our Action-Environment-Reward Table
    controller = ActionValueTable(196, 4)
    controller.initialize(0.)

    # Initialize Reinforcement Learning
    learner = Q(0.5, 0.0)
    learner._setExplorer(EpsilonGreedyExplorer(0.0))
    agent = LearningAgent(controller, learner)

    # Setup the PyBrain and PyGame Environments
    environment = Environment()
    game = RunPacman(environment)

    # Create the Task for the Pac-Man Agent to Accomplish and initialize the first Action
    task = PacmanTask(environment, game)
    task.performAction(np.array([1]))

    # The Experiment is the PyBrain link between the task to be completed and the agent completing it
    experiment = Experiment(task, agent)
    currentGame = 1
Example #20
def runMainProg():
    # define Q-learning agents with different attributes to see
    # which one will come out as the better player.

    # It would be possible to loop through the number of players N
    # and create them, however they would all have to be the same
    # or other code would have to be added to dynamically create
    # their attributes.
    learner = []
    learner.append(Q(0.9, 0.0))
    learner[0]._setExplorer(EpsilonGreedyExplorer(0., 0.5))
    learner.append(Q(0.5, 0.5))
    learner[1]._setExplorer(EpsilonGreedyExplorer(0.29, 0.))
    learner.append(Q(0.1, 0.5))
    learner[2]._setExplorer(EpsilonGreedyExplorer(0.29, 0.5))
    learner.append(Q(0.5, 0.0))
    learner[3]._setExplorer(DiscreteStateDependentExplorer(0., 0.5))
    learner.append(Q(0.5, 0.5))
    learner[4]._setExplorer(DiscreteStateDependentExplorer(0.29, 0.5))

    #define a blackjack deck
    theDeck = BlackjackCardDeck()

    #define action value table, agent, task, and environment arrays
    av_table = []
    agent = []
    env = []
    task = []

    #Loop through the number of players, and set up the action value table,
    #associated agent, environment, and task, so they can play the game
    for i in range(0, N):
        av_table.append(ActionValueTable(22, 2))
        av_table[i].initialize(0.)
        agent.append(LearningAgent(av_table[i], learner[i]))
        env.append(BlackjackEnv(theDeck))
        env[i].createHand()
        task.append(BlackjackTask(env[i]))

    #define a Dealer
    dealer = BlackjackDealer(theDeck)

    #run the game for a total of 1000 games. This value can be changed.
    for i in range(0, 1000):
        #This is the function that plays the game. The code for it is below.
        playGame(dealer, task, env, agent)

    #All of the games have been played, and now the results of the games played,
    #games won, tied, and lost are displayed.
    for i in range(0, N):
        print "Games Player ", i + 1, " Won Against The Dealer: ", GamesAgentWon[
            i]
        print "Games Player ", i + 1, " Lost Against The Dealer: ", TotalGames - GamesTied[
            i] - GamesAgentWon[i]
        print "Games Player ", i + 1, " Tied With The Dealer: ", GamesTied[i]
        print
    print "Total Games Played: ", TotalGames
    print

    #Create some arrays for the action value values, and the hits, and stands.
    #A new array is needed for the AV values because the AV table used for the program
    #is not an array that can be easily used to plot results, so below the AV values are
    #transfered to the array below for processing.
    theAVTables = []
    hits = []
    stands = []

    #Move the values from the AV table to the array created above, and populate the
    #hits, and stands tables as well. The values in these tables will be used in the plot below.
    for i in range(0, N):
        print "Action Table Values for Player ", i + 1, ":"
        theAVTables.append([])
        hits.append([])
        stands.append([])
        for j in range(0, 22):
            print "The AV Value At ", (
                j + 1
            ), " for Player ", i + 1, " is: ", av_table[i].getActionValues(j)
            theAVTables[i].append(av_table[i].getActionValues(j))
            hits[i].append(theAVTables[i][j][0])
            stands[i].append(theAVTables[i][j][1])
        print
        print

    subPlotVal = 511

    #The following uses matplot lib to display a nice graph about the results.
    for i in range(0, N):
        plt.figure(1)
        plt.subplot(subPlotVal)
        plt.plot(hits[i], label="Hits")
        plt.plot(stands[i], label="Stands")
        plt.ylabel('Probability')
        plt.title('Player ' + str(i + 1))
        plt.axis([0, 30, -3, 3])
        plt.legend()
        subPlotVal += 1

    plt.xlabel('Hand Value')

    plt.show()
Example #21
from pybrain.rl.experiments import Experiment
from pybrain.rl.environments import Task
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q, SARSA
from charge_opt import *
from power_env2 import *
import matplotlib.pyplot as plt

#set the environment
environment = power_env2(80, 5, 3)

# create the learner
controller = ActionValueTable(63, 3)
controller.initialize(1.0)
learner = Q(0.5, 1)  # alpha 0.5; gamma 1.0 leaves future reward undiscounted
agent = LearningAgent(controller, learner)

# set the task
task = charge_opt(environment, 0.8, 0.01)

#do experiment
number_of_runs = 20

#change the reward and run the whole experiment
task.change_reward(1, 0.0)

# create the experiment
experiment = Experiment(task, agent)

k = 0
Example #22
    Pd_min = get_pd_min(case, profile)

    market = pyreto.SmartMarket(case,
                                priceCap=cap,
                                decommit=decommit,
                                auctionType=auctionType,
                                locationalAdjustment=locAdj)

    experiment = pyreto.continuous.MarketExperiment([], [], market)

    portfolios, sync_cond = get_portfolios3()

    for gidx in portfolios:
        g = [case.generators[i] for i in gidx]

        learner = Q(alpha, gamma)
        learner.explorer.epsilon = epsilon
        learner.explorer.decay = decay

        task, agent = get_discrete_task_agent(g, market, nStates, nOffer,
                                              markups, withholds, maxSteps,
                                              learner, Pd0, Pd_min)

        print "ALL ACTIONS:", len(task.env._allActions) * nStates

        experiment.tasks.append(task)
        experiment.agents.append(agent)

    passive = [case.generators[i] for i in sync_cond]
    passive[0].p_min = 0.001  # Avoid invalid offer withholding.
    passive[0].p_max = 0.002
Example #23
task = GymTask.createTask(gymRawEnv)
env = task.env
env.setTransformation(transformation)
## env.setCumulativeRewardMode()

## create value table and initialize with ones
table = ActionValueTable(env.numStates, env.numActions)
table = ActionValueTableWrapper(table)
table.initialize(0.0)
# table.initialize( np.random.rand( table.paramdim ) )

### create agent with controller and learner - use SARSA(), Q() or QLambda() here
## alpha -- learning rate (preference of new information)
## gamma -- discount factor (importance of future reward)

learner = Q(0.2, 0.99)
# learner = SARSA(0.2, 0.99)
## learner = QLambda(0.5, 0.99, 0.9)
explorer = learner.explorer
explorer.decay = 1.0

agent = LearningAgent(table, learner)

experiment = Experiment(task, agent)

## prevents "ImportError: sys.meta_path is None, Python is likely shutting down"
atexit.register(task.close)

render_demo = False
imax = 5000
period_print = 100
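A driver loop consistent with these settings might look like this (a sketch; an assumption, not from the original):

for i in range(imax):
    experiment.doInteractions(1)
    agent.learn()
    agent.reset()
    if i % period_print == 0:
        print('iteration', i)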