terminal = False
state = game.initialise_state()
action = epsilon_greedy(state)
E_matrix = np.zeros_like(theta)  # eligibility trace, one entry per weight
while not terminal:
    # take action a, observe r, s'
    next_state, reward = game.step(state, action)
    terminal = next_state.terminal
    # choose a' from s' using the policy derived from Q
    if not terminal:
        next_action = epsilon_greedy(next_state)
        delta = reward + Q(next_state, next_action) - Q(state, action)
    else:
        delta = reward - Q(state, action)
    # decay the trace, add the features of the pair just taken, and step the weights
    E_matrix = lmd * E_matrix + psi(state, action)
    theta += alpha * delta * E_matrix
    if not terminal:
        state = next_state
        action = next_action
game.visualise(V(generate_Q()))
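The block above leans on a few helpers defined elsewhere: a binary feature map psi, a linear action-value estimate Q(s, a) = theta . psi(s, a), and an epsilon-greedy policy over it. A minimal sketch of how they might fit together follows; the action set, exploration rate, feature length and the toy hashed feature map are assumptions for illustration, not taken from the original code.

import numpy as np

ACTIONS = [0, 1]      # hypothetical discrete action set
EPSILON = 0.05        # hypothetical fixed exploration rate
NUM_FEATURES = 36     # hypothetical length of the binary feature vector
theta = np.zeros(NUM_FEATURES)  # weight vector updated by the block above

def psi(state, action):
    # Toy stand-in for the real feature coding: one-hot on a hashed
    # (state, action) index, just so the sketch runs end to end.
    features = np.zeros(NUM_FEATURES)
    features[hash((str(state), action)) % NUM_FEATURES] = 1.0
    return features

def Q(state, action):
    # Linear action-value estimate used throughout the update above.
    return np.dot(theta, psi(state, action))

def epsilon_greedy(state):
    # Explore with probability EPSILON, otherwise act greedily w.r.t. Q.
    if np.random.rand() < EPSILON:
        return np.random.choice(ACTIONS)
    return max(ACTIONS, key=lambda a: Q(state, a))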
terminal = False
E_matrix = np.zeros_like(Q_matrix)  # one eligibility trace per (state, action) entry
state = game.initialise_state()
action = epsilon_greedy(allQ(state), allN(state))
while not terminal:
    next_state, reward = game.step(state, action)
    terminal = next_state.terminal
    if not terminal:
        next_action = epsilon_greedy(allQ(next_state), allN(next_state))
        delta = reward + Q(next_state, next_action) - Q(state, action)
    else:
        delta = reward - Q(state, action)
    # bump the trace and the visit count for the pair just taken
    allE(state)[int(action)] += 1
    allN(state)[int(action)] += 1
    alpha = 1 / N(state, action)
    # every entry of Q moves towards the TD target, weighted by its trace
    Q_matrix += alpha * delta * E_matrix
    E_matrix *= lmd
    if not terminal:
        state = next_state
        action = next_action
game.visualise(V(Q_matrix))
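In the tabular version, epsilon_greedy receives the row of action values and the row of visit counts for the current state, which points at a count-based exploration schedule: the more often a state has been visited, the less the agent explores. A minimal sketch of such a helper, assuming a constant N0 controlling the decay (the constant and the tie-breaking are assumptions):

import numpy as np

N0 = 100.0  # hypothetical constant controlling how quickly exploration decays

def epsilon_greedy(q_row, n_row):
    # Exploration rate shrinks as the state accumulates visits.
    epsilon = N0 / (N0 + np.sum(n_row))
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_row))  # explore: uniform random action
    return int(np.argmax(q_row))              # exploit: greedy action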
while not terminal:
    # take action a, observe r, s'
    state, reward = game.step(state, action)
    terminal = state.terminal
    if terminal:
        # assuming history starts as [initial_state, initial_action], the slices
        # below recover visited states (0::3), actions taken (1::3) and rewards (2::3)
        state_action_pairs = zip(history[0::3], history[1::3])
        history.append(reward)
        history.append(state)
        Gt = sum(history[2::3])  # undiscounted return of the episode
        for s, a in state_action_pairs:
            increment_n(s, a)
            alpha = 1 / N(s, a)
            # Monte Carlo policy gradient step: score function scaled by the advantage
            advantage = Gt - Q(s, a, theta)
            theta += alpha * score_function(s, a, theta) * advantage
    else:
        # sample the next action from the softmax policy and extend the history
        action = softmax_policy(state, theta)
        history.append(reward)
        history.append(state)
        history.append(action)
# k is the episode index from the enclosing training loop (not shown here)
if k % 10000 == 0:
    print("MSE: " + str(round(np.sum((Q_star - generate_Q(theta))**2), 2)))
game.visualise(V(generate_Q(theta)))
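Here softmax_policy samples an action from a Gibbs distribution over action preferences, and score_function is the gradient of the log-policy that drives the update. A minimal sketch of both, assuming linear preferences theta . psi(s, a) and reusing the hypothetical psi and ACTIONS from the earlier sketch (all of which are assumptions for illustration):

import numpy as np

def action_probabilities(state, theta):
    # Softmax over linear action preferences theta . psi(state, a).
    preferences = np.array([np.dot(theta, psi(state, a)) for a in ACTIONS])
    preferences -= np.max(preferences)  # subtract the max for numerical stability
    exp_prefs = np.exp(preferences)
    return exp_prefs / np.sum(exp_prefs)

def softmax_policy(state, theta):
    # Sample an action according to the softmax probabilities.
    return np.random.choice(ACTIONS, p=action_probabilities(state, theta))

def score_function(state, action, theta):
    # Gradient of log pi(action | state): the chosen pair's features minus the
    # probability-weighted average of features over all actions.
    probs = action_probabilities(state, theta)
    expected_psi = sum(p * psi(state, a) for p, a in zip(probs, ACTIONS))
    return psi(state, action) - expected_psi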