Example 1
def q_learning(episodes, alpha=0.1, discount_factor=1, epsilon=0.1, k=1):
    Q = {}
    pi = {}
    total_states = getAllStates()
    for s in total_states:
        Q[s] = 0
        pi[s[0]] = 0
    env = environment()
    episodic_rewards = []
    for i in range(episodes):
        history = []
        total_episodic_reward = 0
        # print("Episode is ",i)
        current_state = env.reset()
        # resample until env.reset() returns a valid start state
        while not (0 <= current_state[0] <= 31 and current_state[1][1] == 0):
            current_state = env.reset()
        T = 1e10
        t = 0
        while True:
            if t < T:
                hashcurrentstate = (current_state[0], current_state[1],
                                    tuple(current_state[2]))
                action = epsilon_greedy(hashcurrentstate, Q, 2, epsilon)
                pi[hashcurrentstate] = action
                s_t, r_t, terminate = env.step(action)
                total_episodic_reward += r_t
                # if current_state[0] >= 0 and current_state[0] <=31 and current_state[1][1] == 0:
                hashstate = (current_state[0], current_state[1],
                             tuple(current_state[2]))
                history.append((hashstate, action, r_t))
                current_state = s_t
                if terminate:
                    T = t + 1
            tau = t - k + 1
            if tau >= 0:
                G = 0
                for j in range(tau + 1, min(tau + k, T) + 1):
                    G += pow(discount_factor, j - (tau + 1)) * history[j - 1][2]
                if tau + k < T:
                    # bootstrap with the greedy (max over actions) value estimate
                    G += pow(discount_factor, k) * max(
                        Q[(history[tau + k - 1][0], 0)],
                        Q[(history[tau + k - 1][0], 1)])
                Q[(history[tau][0], history[tau][1])] += alpha * (
                    G - Q[(history[tau][0], history[tau][1])])
            t += 1
            if tau == T - 1:
                episodic_rewards.append(total_episodic_reward)
                break

    m = 0
    for v in Q.values():  # max Q-value, kept as a sanity check
        m = max(v, m)
    # print(m)
    # print(Q)
    return Q, episodic_rewards
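
These examples call an epsilon_greedy helper that is not shown in this file. Judging only from the call sites (epsilon_greedy(state, Q, 2, epsilon), with Q keyed by (state, action) tuples), a minimal sketch could look like the following; the body is an assumption, not the repo's actual implementation:

import random

def epsilon_greedy(state, Q, num_actions, epsilon):
    # Hypothetical sketch: with probability epsilon explore uniformly at
    # random, otherwise exploit the highest stored value for this state.
    if random.random() < epsilon:
        return random.randrange(num_actions)
    return max(range(num_actions), key=lambda a: Q[(state, a)])
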
Example 2
def sarsa_lambda(episodes,
                 alpha=0.1,
                 discount_factor=1,
                 epsilon=0.1,
                 lmbda=0.5,
                 decay=False):
    Q = {}
    total_states = getAllStates()
    for s in total_states:
        Q[s] = 0
    env = environment()
    episodic_rewards = []
    initial_epsilon = epsilon  # remember the starting epsilon for decay
    for i in range(episodes):
        total_episodic_reward = 0
        E = {}
        for s in total_states:
            E[s] = 0
        # print("Episode ",i)
        current_state = env.reset()
        # resample until env.reset() returns a valid start state
        while not (0 <= current_state[0] <= 31 and current_state[1][1] == 0):
            current_state = env.reset()
        current_action = 1  # fixed initial action for every episode
        while True:
            # if current_state[0] >= 0 and current_state[0] <=31 and current_state[1][1] == 0:
            hashcurrentstate = (current_state[0], current_state[1],
                                tuple(current_state[2]))
            next_state, reward, terminate = env.step(current_action)
            total_episodic_reward += reward
            hashnextstate = (next_state[0], next_state[1],
                             tuple(next_state[2]))
            if decay:
                # recompute from the starting epsilon each step; reassigning
                # epsilon in place would compound the division
                epsilon = initial_epsilon / (i + 1)
            next_action = epsilon_greedy(hashnextstate, Q, 2, epsilon)
            delta = (reward + discount_factor * Q[(hashnextstate, next_action)]
                     - Q[(hashcurrentstate, current_action)])
            # accumulating eligibility trace for the visited (state, action) pair
            E[(hashcurrentstate, current_action)] += 1
            for s in Q.keys():
                Q[s] += alpha * delta * E[s]
                E[s] = discount_factor * lmbda * E[s]  # decay every trace
            current_state = next_state
            current_action = next_action
            if terminate:
                episodic_rewards.append(total_episodic_reward)
                break

    m = 0
    for v in Q.values():  # max Q-value, kept as a sanity check
        m = max(v, m)
    # print(m)
    # print(Q)
    return Q, episodic_rewards
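
A hypothetical way to call sarsa_lambda and eyeball the learning curve (the episode count and hyperparameters below are arbitrary, and getAllStates/environment must be importable, as in the smoke test at the end of this file):

Q, rewards = sarsa_lambda(episodes=200, alpha=0.1, lmbda=0.5, decay=True)
# crude learning-curve check: mean episodic reward over the last 50 episodes
tail = rewards[-50:]
print(sum(tail) / len(tail))
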
Example 3
def monte_carlo(episodes, mode="fv", discount=1):
    Q = {}
    total_states = getAllStates()
    for s in total_states:
        Q[s] = 0
    total_return = {}
    env = environment()
    for i in range(episodes):
        # print("episode number is")
        # print(i)
        history = []
        current_state = env.reset()
        while True:
            action = fixedPolicy(current_state)
            next_state, reward, terminate = env.step(action)
            # if current_state[0] >= 0 and current_state[0] <=31 and current_state[1][1] == 0:
            hashstate = (current_state[0], current_state[1],
                         tuple(current_state[2]))
            history.append((hashstate, action, reward))
            current_state = next_state
            if terminate:
                break
        expected_return = 0

        # Work backwards through the episode: G_t = R_{t+1} + discount * G_{t+1},
        # where history[t][2] holds R_{t+1}.
        for t in range(len(history) - 1, -1, -1):
            expected_return = discount * expected_return + history[t][2]
            flag = 0
            if mode == "fv":
                # first-visit check: skip this pair if the same (state, action)
                # occurred earlier in the episode
                for j in range(0, t):
                    if (history[j][0] == history[t][0]
                            and history[j][1] == history[t][1]):
                        flag = 1
                        break
            s = history[t][0]
            a = history[t][1]
            if flag == 0:
                if total_return.get((s, a)) is None:
                    total_return[(s, a)] = []
                total_return[(s, a)].append(expected_return)
                if Q.get((s, a)) is None:
                    Q[(s, a)] = 0
                Q[(s, a)] = (sum(total_return[(s, a)]) /
                             len(total_return[(s, a)]))
    m = 0
    for k in Q:  # max of the first state component, kept as a sanity check
        m = max(k[0][0], m)
    # print(m)
    # print(len(Q.keys()))
    return Q
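
monte_carlo evaluates an unspecified fixedPolicy. The smoke test at the end of this file drives the simulator by accelerating until state[0] reaches 25 and then coasting, so a stand-in with the same shape could be the following (purely an assumption about the real policy):

def fixedPolicy(state):
    # Hypothetical stand-in: accelerate (action 1) until the first state
    # component reaches 25, then coast (action 0).
    return 1 if state[0] < 25 else 0
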
Example 4
def td_learning(episodes, alpha=0.1, discount_factor=1, k=1):
    Q = {}
    total_states = getAllStates()
    for s in total_states:
        Q[s] = 0
    env = environment()
    for i in range(episodes):
        history = []
        # print("Episode is ",i)
        current_state = env.reset()
        # resample until env.reset() returns a valid start state
        while not (0 <= current_state[0] <= 31 and current_state[1][1] == 0):
            current_state = env.reset()
        T = 1e10
        t = 0
        while True:
            if t < T:
                action = fixedPolicy(current_state)
                s_t, r_t, terminate = env.step(action)
                # if current_state[0] >= 0 and current_state[0] <=31 and current_state[1][1] == 0:
                hashstate = (current_state[0], current_state[1],
                             tuple(current_state[2]))
                history.append((hashstate, action, r_t))
                current_state = s_t
                if terminate:
                    T = t + 1
            tau = t - k + 1
            if tau >= 0:
                G = 0
                for j in range(tau + 1, min(tau + k, T) + 1):
                    G += pow(discount_factor, j - (tau + 1)) * history[j - 1][2]
                if tau + k < T:
                    G += pow(discount_factor, k) * Q[(history[tau + k - 1][0],
                                                      history[tau + k - 1][1])]
                Q[(history[tau][0], history[tau][1])] += alpha * (
                    G - Q[(history[tau][0], history[tau][1])])
            t += 1
            if tau == T - 1:
                break

    m = 0
    for n, v in Q.items():
        m = max(v, m)
    # print(m)
    return Q
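
The n-step target computed inside td_learning (and q_learning above) is the discounted reward sum R_{tau+1} + ... + gamma^(k-1) * R_{min(tau+k, T)}, plus a bootstrap term when tau + k < T. A self-contained sketch of that computation, assuming rewards[t] stores R_{t+1} exactly as the history lists here do, and T is the (integer) episode length:

def n_step_return(rewards, bootstrap_value, tau, k, T, gamma=1.0):
    # Discounted sum of rewards R_{tau+1} .. R_{min(tau+k, T)};
    # rewards[t] holds R_{t+1}, hence the j - 1 index.
    G = sum(gamma ** (j - (tau + 1)) * rewards[j - 1]
            for j in range(tau + 1, min(tau + k, T) + 1))
    if tau + k < T:
        # bootstrap_value stands in for the Q estimate at the truncation point
        G += gamma ** k * bootstrap_value
    return G
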
Example 5
def n_step_sarsa(episodes,
                 alpha=0.1,
                 discount_factor=1,
                 k=1,
                 epsilon=0.1,
                 decay=False):
    Q = {}
    pi = {}
    total_states = getAllStates()
    for s in total_states:
        Q[s] = 0
        pi[s[0]] = 0
    env = environment()
    episodic_rewards = []
    initial_epsilon = epsilon  # remember the starting epsilon for decay
    for i in range(episodes):
        history = []
        total_episodic_reward = 0
        # print("Episode is ",i)
        current_state = env.reset()
        # while not (current_state[0] >= 0 and current_state[0] <=31 and current_state[1][1] == 0) :
        #     current_state = env.reset()
        if decay:
            # recompute from the starting epsilon; reassigning epsilon in
            # place would compound the division across episodes
            epsilon = initial_epsilon / (i + 1)
        hashstate = (current_state[0], current_state[1],
                     tuple(current_state[2]))
        # current_action = fixedPolicy(current_state)
        current_action = epsilon_greedy(hashstate, Q, 2, epsilon)
        # seed the history with the start state and the action actually chosen
        history.append((hashstate, current_action))
        T = 1e10
        t = 0
        while True:
            # print(t)
            if t < T:
                s_t, r_t, terminate = env.step(current_action)
                # print(terminate)
                # print(r_t)
                total_episodic_reward += r_t
                current_state = s_t
                # if current_state[0] >= 0 and current_state[0] <=31 and current_state[1][1] == 0:
                hashstate = (current_state[0], current_state[1],
                             tuple(current_state[2]))
                if terminate:
                    T = t + 1
                else:
                    # current_action = fixedPolicy(current_state)
                    current_action = epsilon_greedy(hashstate, Q, 2, epsilon)
                history.append((hashstate, current_action, r_t))
            tau = t - k + 1
            if tau >= 0:
                G = 0
                for j in range(tau + 1, min(tau + k, T) + 1):
                    G += pow(discount_factor, j - (tau + 1)) * history[j][2]
                if tau + k < T:
                    G += pow(discount_factor, k) * Q[(history[tau + k - 1][0],
                                                      history[tau + k - 1][1])]
                Q[(history[tau][0], history[tau][1])] += alpha * (
                    G - Q[(history[tau][0], history[tau][1])])
            t += 1
            if tau == T - 1:
                episodic_rewards.append(total_episodic_reward)
                break

    m = 0
    for v in Q.values():  # max Q-value, kept as a sanity check
        m = max(v, m)
    # print(m)
    # print(Q)
    return Q, episodic_rewards
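
A hypothetical training run for n_step_sarsa (all hyperparameters below are arbitrary):

Q, rewards = n_step_sarsa(episodes=500, alpha=0.1, k=3, epsilon=0.1)
# coarse moving average of the episodic reward, as a learning-curve check
window = 50
for start in range(0, len(rewards) - window + 1, window):
    chunk = rewards[start:start + window]
    print(start, sum(chunk) / len(chunk))
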
from player import Player
from simulator import environment

# Smoke test for the simulator: accelerate (action 1) until the first state
# component reaches 25, then coast (action 0) until the episode terminates.
a = environment()
terminate = False
flag = 0
while not terminate:
    if flag == 0:
        state, reward, terminate = a.step(1)
        print(state[0])
    else:
        state, reward, terminate = a.step(0)
    if state[0] >= 25:
        flag = 1
print(state)
print(reward)