Example No. 1
        return self.sa2x(s, a)
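
# For context, a minimal sketch of the linear model this fragment appears to
# belong to: predict computes Q(s, a) = theta . x(s, a), and grad returns the
# feature vector x(s, a). The feature dimension D and the sa2x encoding are
# assumptions, not shown in the listing.
import numpy as np

D = 25  # assumed feature dimension

class Model:
    def __init__(self):
        # one weight per feature
        self.theta = np.random.randn(D) / np.sqrt(D)

    def sa2x(self, s, a):
        # encode a (state, action) pair as a feature vector x (encoding assumed)
        raise NotImplementedError

    def predict(self, s, a):
        # Q(s, a) = theta . x(s, a)
        return self.theta.dot(self.sa2x(s, a))

    def grad(self, s, a):
        # the gradient of theta . x(s, a) with respect to theta is x(s, a)
        return self.sa2x(s, a)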


def getPolicyQ(model, s):
    # we need Q(s,a) to choose an action
    # i.e. a = argmax[a]{ Q(s,a) }
    policyQ = {}
    for a in allActions:
        q = model.predict(s, a)
        policyQ[a] = q
    return policyQ
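
# A minimal sketch of the argmax described in the comment above; maxDict is an
# assumed helper name, not part of the listing.
def maxDict(d):
    # return the (key, value) pair whose value is largest
    return max(d.items(), key=lambda kv: kv[1])

# usage: a, maxQ = maxDict(getPolicyQ(model, s))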


if __name__ == '__main__':

    grid = negativeGrid(stepCost=-0.1)

    # initialize model
    model = Model()

    # repeat until convergence
    t = 1.0    # counters that grow slowly so rates divided by them decay
    t2 = 1.0
    deltas = []
    for i in range(20000):
        if i % 100 == 0:
            t += 0.01
            t2 += 0.01
        alpha = learningRate / t2  # decaying step size; learningRate is defined outside this fragment
        s = (2, 0)
        grid.setState(s)
Example No. 2

def tdPrediction(grid, policy, valueF, alpha, gamma):
    # assumed name and signature for this fragment's enclosing function
    s = (2, 0)  # start state
    grid.setState(s)
    statesRewardsList = [(s, 0)]  # (state, reward) pairs, kept for inspection
    # traverse the grid till we reach the terminal state
    while not grid.isGameOver():
        oldS = s
        a = randomAction(policy[s])
        r = grid.move(a)
        s = grid.getCurrentState()
        statesRewardsList.append((s, r))
        # TD(0) update: V(s) <- V(s) + alpha * [r + gamma * V(s') - V(s)]
        valueF[oldS] += alpha * (r + gamma * valueF[s] - valueF[oldS])
    return valueF


if __name__ == '__main__':

    grid = negativeGrid()
    valueF = {}
    for s in grid.allStates():
        valueF[s] = 0
    # state -> action
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
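
    # a minimal driver sketch: assumes the fragment above is wrapped as
    # tdPrediction(grid, policy, valueF, alpha, gamma); that name and signature
    # are assumptions, as are the gamma/alpha values below
    gamma = 0.9
    alpha = 0.1
    for episode in range(10000):
        valueF = tdPrediction(grid, policy, valueF, alpha, gamma)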
Example No. 3
import numpy as np
from grid_world import standardGrid, negativeGrid
from iterative_policy_evaluation import printValues, printPolicy
# This is the windy grid problem: a wind blows across the cells, so the agent
# cannot be sure it will actually move where it intends. In the case taken here,
# if the agent picks a direction (say L), it moves that way only with probability
# 0.5; the remaining 0.5 is split equally among the other directions (see the
# sketch that completes the evaluation loop at the end of this example).

if __name__ == '__main__':

    smallEnough = 1e-3  # convergence threshold for policy evaluation
    gamma = 0.9
    grid = negativeGrid(stepCost=-1.0)
    states = grid.allStates()
    allActions = ('U', 'D', 'L', 'R')
    valueF = {}
    policy = {}
    for s in states:
        valueF[s] = 0

    # initial policy: a random action in each non-terminal state
    for s in grid.actions.keys():
        policy[s] = np.random.choice(allActions)

    while True:
        #policy evaluation step
        while True:
            largest = 0  # largest value change seen in this sweep
            for s in states:
                if s in policy:
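                    # sketch continuation implementing the update described in the
                    # header comments; the 0.5 and 0.5/3 transition probabilities
                    # are assumptions taken from those comments, not from the listing
                    oldV = valueF[s]
                    newV = 0
                    for a in allActions:
                        # the chosen action succeeds with probability 0.5; the rest
                        # is split equally among the other three directions
                        p = 0.5 if a == policy[s] else 0.5 / 3
                        grid.setState(s)
                        r = grid.move(a)
                        newV += p * (r + gamma * valueF[grid.getCurrentState()])
                    valueF[s] = newV
                    largest = max(largest, np.abs(oldV - valueF[s]))
            if largest < smallEnough:
                break
        # (a policy improvement step would follow here in the outer loop)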