Example #1
import random

def qlearning(grid, policy, evaler, num_iter1, alpha):
    """Q-learning with a linear function approximator over the policy's features."""
    actions = grid.actions
    gamma = grid.gamma
    y = []
    # Initialize every feature weight to a small constant.
    for i in range(len(policy.theta)):
        policy.theta[i] = 0.1

    for iter1 in range(num_iter1):
        # Record the current policy's evaluation score before each episode.
        y.append(evaler.eval(policy))
        f = grid.start()
        a = random.choice(actions)  # random initial action
        t = False
        count = 0
        while not t and count < 100:
            t, f1, r = grid.receive(a)
            # Off-policy target: maximum Q-value over all actions in the next state.
            qmax = max(policy.qfunc(f1, a1) for a1 in actions)
            update(policy, f, a, r + gamma * qmax, alpha)
            f = f1
            a = policy.epsilon_greedy(f1)  # behave epsilon-greedily
            count += 1

    return policy, y
Example #2
import random

def sarsa(grid, policy, evaler, num_iter1, alpha):
    """SARSA (on-policy TD control) with a linear function approximator."""
    actions = grid.actions
    gamma = grid.gamma
    y = []
    # Initialize every feature weight to a small constant.
    for i in range(len(policy.theta)):
        policy.theta[i] = 0.1

    for iter1 in range(num_iter1):
        # Record the current policy's evaluation score before each episode.
        y.append(evaler.eval(policy))
        f = grid.start()
        a = random.choice(actions)  # random initial action
        t = False
        count = 0
        while not t and count < 100:
            t, f1, r = grid.receive(a)
            # On-policy target: Q-value of the action actually chosen next.
            a1 = policy.epsilon_greedy(f1)
            update(policy, f, a, r + gamma * policy.qfunc(f1, a1), alpha)
            f = f1
            a = a1
            count += 1

    return policy, y
Example #3
def update(policy, f, a, tvalue, alpha):
    """One gradient step on the squared TD error of a linear Q-function."""
    pvalue = policy.qfunc(f, a)           # current estimate Q(f, a)
    error = pvalue - tvalue               # TD error against the target tvalue
    fea = policy.get_fea_vec(f, a)        # feature vector = gradient of Q w.r.t. theta
    policy.theta -= alpha * error * fea   # assumes theta is a NumPy array
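The three snippets above rely on an interface that is not shown: a policy exposing theta, qfunc, get_fea_vec, and epsilon_greedy, an environment exposing actions, gamma, start, and receive, and an evaluator exposing eval. The following is a minimal, self-contained sketch of that interface so the examples can be run end to end; the names LinearPolicy, ChainGrid, and GreedyEvaler and all of their internals are hypothetical stand-ins, not the original repository's classes.

import random
import numpy as np


class LinearPolicy:
    """Q(s, a) = theta . phi(s, a) with one-hot state-action features."""
    def __init__(self, num_states, actions, epsilon=0.1):
        self.actions = actions
        self.epsilon = epsilon
        self.theta = np.zeros(num_states * len(actions))

    def get_fea_vec(self, f, a):
        phi = np.zeros_like(self.theta)
        phi[f * len(self.actions) + self.actions.index(a)] = 1.0
        return phi

    def qfunc(self, f, a):
        return float(np.dot(self.theta, self.get_fea_vec(f, a)))

    def epsilon_greedy(self, f):
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        return max(self.actions, key=lambda a: self.qfunc(f, a))


class ChainGrid:
    """Tiny chain MDP: 'left'/'right' moves, reward 1.0 at the rightmost state."""
    def __init__(self, length=5, gamma=0.9):
        self.actions = ['left', 'right']
        self.gamma = gamma
        self.length = length
        self.state = 0

    def start(self):
        self.state = 0
        return self.state

    def receive(self, a):
        step = -1 if a == 'left' else 1
        self.state = min(self.length - 1, max(0, self.state + step))
        terminal = self.state == self.length - 1
        return terminal, self.state, (1.0 if terminal else 0.0)


class GreedyEvaler:
    """Scores a policy by the discounted return of a greedy rollout."""
    def __init__(self, grid, max_steps=20):
        self.grid, self.max_steps = grid, max_steps

    def eval(self, policy):
        rollout = ChainGrid(self.grid.length, self.grid.gamma)
        f, total, discount = rollout.start(), 0.0, 1.0
        for _ in range(self.max_steps):
            a = max(rollout.actions, key=lambda act: policy.qfunc(f, act))
            t, f, r = rollout.receive(a)
            total += discount * r
            discount *= rollout.gamma
            if t:
                break
        return total


grid = ChainGrid()
policy = LinearPolicy(num_states=grid.length, actions=grid.actions)
policy, y = qlearning(grid, policy, GreedyEvaler(grid), num_iter1=50, alpha=0.1)
print(y[-1])  # evaluation score of the learned policy after training

The same objects can be passed unchanged to sarsa; the only difference between the two learners is the bootstrap target (max over actions vs. the epsilon-greedy action actually taken).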