def main():
    g = Grid(4, 4)
    terminals = [{"x": 3, "y": 0, "reward": 1},
                 {"x": 1, "y": 3, "reward": 1},
                 {"x": 2, "y": 3, "reward": -10},
                 {"x": 3, "y": 3, "reward": 10}]
    blocks = [{"x": 1, "y": 1}]
    g.init_world(terminals, blocks)
    np.random.seed(62)
    mdp.value_iteration(g, -0.02, 0.8, 0.8)
    mdp.policy_iteration(g, -0.02, 0.8, 0.8)
    mdp.q_function(g, "s6", -0.02, 0.8, 0.1, 0.1, 0.8, 1000000)
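# A minimal sketch of the tabular Q-learning update that a sampling-based routine
# like q_function above would typically apply over its episodes. The Q-table layout,
# the helper names (q_update, choose_action) and the default hyperparameters are
# assumptions for illustration, not the API of the mdp module called above.
import random


def q_update(Q, s, a, r, s_next, actions, alpha=0.1, gamma=0.8):
    """One Q-learning backup: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))."""
    best_next = max(Q.get((s_next, a2), 0.0) for a2 in actions)
    Q[(s, a)] = Q.get((s, a), 0.0) + alpha * (r + gamma * best_next - Q.get((s, a), 0.0))


def choose_action(Q, s, actions, epsilon=0.1):
    """Epsilon-greedy action selection over the current Q-table."""
    if random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: Q.get((s, a), 0.0))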
def _policy_iteration_slow(self):
    old_policy = dict(self.mdp.policy)
    for i in range(100):
        policy_iteration(old_policy, self.mdp, num_iter=1)
        self.gridworldwindow.update_grid(self.mdp.values, self.mdp.policy)
        self.gridworldwindow.window.update()
        time.sleep(0.25)
        self.gridworldwindow.window.update()
        new_policy = dict(self.mdp.policy)
        if policy_converged(new_policy, old_policy):
            break
        old_policy = new_policy
    self.gridworldwindow.show_dialog('Policy Iteration has converged in {} steps!'.format(i + 1))
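# A minimal sketch of the policy_converged check used above, assuming both policies
# are plain dicts mapping each state to its chosen action; the helper in the actual
# codebase may compare them differently.
def policy_converged(new_policy, old_policy):
    """Return True when the chosen action is unchanged for every state."""
    return new_policy == old_policy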
def _policy_iteration_1_step(self):
    policy, values = policy_iteration(self.mdp.policy, self.mdp, num_iter=1)
    self.gridworld.update_grid(values, policy)
    self.mdp.update_values(values)
    self.mdp.update_policy(policy)
def solve(self, episodes=200, iterations=200, reset=True, seed=False, gamma=0.95):
    mdp = EnvMDP(self.env, gamma=gamma)
    self.policy = policy_iteration(mdp)
    self.U = value_iteration(mdp, epsilon=1e-12)
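# A sketch of the Bellman-backup loop that an AIMA-style value_iteration(mdp, epsilon)
# performs, shown for reference. It assumes the MDP exposes states, actions(s), R(s),
# T(s, a) -> [(prob, next_state)] and gamma, as EnvMDP appears to; this illustrates
# the algorithm and is not the exact library code called by solve() above.
def value_iteration_sketch(mdp, epsilon=1e-12):
    U1 = {s: 0.0 for s in mdp.states}
    while True:
        U = U1.copy()
        delta = 0.0
        for s in mdp.states:
            # Bellman backup: immediate reward plus discounted best expected utility.
            U1[s] = mdp.R(s) + mdp.gamma * max(
                sum(p * U[s1] for (p, s1) in mdp.T(s, a)) for a in mdp.actions(s))
            delta = max(delta, abs(U1[s] - U[s]))
        # Stop when the largest change bounds the error of U below epsilon.
        if delta <= epsilon * (1 - mdp.gamma) / mdp.gamma:
            return U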
def get_reachable(m):
    reachable_states = [m.init]
    new_rs = []
    policy = mdp.policy_iteration(m)
    while True:
        for state in reachable_states:
            if state in policy:
                for (result_state, prob) in m.T(state, policy[state]).items():
                    new_rs = new_rs + [result_state, state]
                    # new_rs = new_rs + [policy[state]]
        new_rs = list(set(new_rs))
        # Compare as sets: list(set(...)) does not preserve order, so list equality
        # could fail even when no new states were found.
        if set(new_rs) == set(reachable_states):
            print("breaking: " + str(new_rs))
            # return new_rs
            break
        else:
            reachable_states = new_rs
    return True  # Let's make sure the whole loop works first
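# A more direct fixed-point sketch of the same reachability computation, using a set
# and an explicit frontier to avoid the list-ordering issue above. It assumes, as in
# get_reachable, that m.T(state, action) maps successor states to probabilities and
# that the policy dict covers every non-terminal state.
def reachable_under_policy(m, policy):
    """States reachable from m.init when actions are chosen by the given policy."""
    reachable = {m.init}
    frontier = [m.init]
    while frontier:
        state = frontier.pop()
        if state in policy:
            for successor in m.T(state, policy[state]):
                if successor not in reachable:
                    reachable.add(successor)
                    frontier.append(successor)
    return reachable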
def _policy_iteration_100_steps(self):
    policy_iteration(self.mdp, num_iter=100)
    self.gridworld.update_grid(self.mdp.values, self.mdp.policy)
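# A minimal sketch of the evaluate-then-improve loop behind a policy_iteration(mdp, num_iter=...)
# call like the one above. The interface assumed here (mdp.states, mdp.actions(s), mdp.R(s),
# mdp.T(s, a) -> [(prob, next_state)], mdp.gamma) is for illustration only; the gridworld MDP
# driven by this GUI may expose its model differently.
def policy_iteration_sketch(mdp, num_iter=100, eval_sweeps=20):
    policy = {s: next(iter(mdp.actions(s)), None) for s in mdp.states}
    U = {s: 0.0 for s in mdp.states}
    for _ in range(num_iter):
        # Policy evaluation: a few sweeps of the fixed-policy Bellman backup.
        for _ in range(eval_sweeps):
            U = {s: mdp.R(s) + mdp.gamma * sum(p * U[s1] for p, s1 in mdp.T(s, policy[s]))
                 for s in mdp.states}
        # Policy improvement: act greedily with respect to the current utilities.
        unchanged = True
        for s in mdp.states:
            best = max(mdp.actions(s),
                       key=lambda a: sum(p * U[s1] for p, s1 in mdp.T(s, a)))
            if best != policy[s]:
                policy[s] = best
                unchanged = False
        if unchanged:
            break
    return policy, U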
    def actions(self, state):
        """Set of actions that can be performed in this state. By default, a
        fixed list of actions, except for terminal states. Override this
        method if you need to specialize by state."""
        if "dead" in (state.player.classify_hp(), state.opponent.classify_hp()):
            return [None]
        else:
            return state.player.moveset

    def T(self, state, action):
        if action is None:
            return [(0.0, state)]
        else:
            # The opponent counters with one of its moves, chosen uniformly at random.
            p = 1.0 / len(state.opponent.moveset)
            your_action = action(self.moveset_values[action])(state)
            return [(p, counter_attack(self.moveset_values[counter_attack])(your_action))
                    for counter_attack in state.opponent.moveset]


initial_state = State(
    Entity(5, 1, 1, [attack_opponent, weaken_defense, boost_attack]),
    Entity(5, 1, 1, [attack_opponent, weaken_attack, boost_defense]),
)
initial_moveset_values = {boost_attack: 1, boost_defense: 1,
                          weaken_attack: 1, weaken_defense: 1,
                          attack_opponent: 1}
m = BattleSimulation(initial_moveset_values, initial_state)
solution = mdp.policy_iteration(m)
pprint.PrettyPrinter(indent=4).pprint(solution)
    for i_action in range(num_actions):
        pi_dict[i_state][i_action] = default_action

[S, A, P, R, gamma, pi] = mdp.create_MDP(state_space, action_space, P_dict, R_dict, gamma, pi_dict)

for i_state in range(num_states):
    # print(i_state)
    for i_action in range(num_actions):
        # Normalize each transition row so the probabilities sum to one
        # (the small constant guards against division by zero for all-zero rows).
        P[i_action, i_state, :] = deepcopy(
            P[i_action, i_state, :] / (sum(P[i_action, i_state, :]) + 10**(-10)))
        # print(P[i_action, i_state, :])
a = 1

best_action, vk = mdp.policy_iteration(S, A, P, R, gamma, pi)
print(best_action)

avg_best_pol = np.zeros(t.shape[0]) - 1
for i_t in range(t.shape[0]):
    sum_t = 0
    t_val = t[i_t]
    counter_t = 0
    for i_state in range(num_states):
        if near(state_space[i_state][0], t_val):
            sum_t = sum_t + action_space[int(best_action[i_state])]
            counter_t = counter_t + 1
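# A minimal sketch of the near() comparison used above, assuming it is a simple
# absolute floating-point tolerance check; the tolerance value is an illustrative guess.
def near(a, b, tol=1e-6):
    """True when two floats agree to within an absolute tolerance."""
    return abs(a - b) < tol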
    return q_table, rewards


if __name__ == '__main__':
    mdp = PresentationEnvironment()

    # Hyperparameters
    episodes = 400
    steps_per_episode = 10
    alpha = 0.95    # learning rate
    epsilon = 1.    # exploration-exploitation rate
    gamma = 0.3     # discount rate

    q_table, rewards_over_time = q_learning(mdp, episodes, steps_per_episode, alpha, epsilon, gamma)
    optimal_policy = policy_iteration(mdp)
    q_policy = [np.argmax(q_table[s, :]) for s in range(mdp.number_of_states)]
    optimal_path, optimal_rewards = simulate(mdp, optimal_policy)
    q_path, q_rewards = simulate(mdp, q_policy)

    # Plot rewards over episodes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(rewards_over_time, '-b', label='Q-learning')
    ax.plot([optimal_rewards] * len(rewards_over_time), '--r', label='Optimal solution')
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Cumulative Rewards')
    plt.figtext(0.96, 0.02, 'alpha: {}; gamma: {}'.format(alpha, gamma), ha="right", fontsize=8)
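# A minimal sketch of the simulate() rollout used above, assuming the environment
# exposes a reset()/step(action) interface and that a policy is indexable by state
# (a dict or a list of greedy actions). The interface is an assumption for
# illustration, not the actual PresentationEnvironment API.
def simulate(env, policy, max_steps=10):
    """Follow the policy from the start state, returning the visited path and total reward."""
    state = env.reset()
    path, total_reward = [state], 0
    for _ in range(max_steps):
        action = policy[state]
        state, reward, done = env.step(action)
        path.append(state)
        total_reward += reward
        if done:
            break
    return path, total_reward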
    table = [header] + table
    table = [[(numfmt % x if isnumber(x) else x) for x in row] for row in table]
    maxlen = lambda seq: max(map(len, seq))
    # Materialize the sizes list so it can be reused for every row (a bare map()
    # iterator would be exhausted after the first row in Python 3).
    sizes = list(map(maxlen, zip(*[list(map(str, row)) for row in table])))
    for row in table:
        print(sep.join(getattr(str(x), j)(size)
                       for (j, size, x) in zip(justs, sizes, row)))


prize = 1
trap = -1
neg = -0.4
mdp1 = GridMDP([[neg, trap, prize],
                [neg, None, neg],
                [neg, neg, neg]],
               terminals=[(1, 2), (2, 2)], error=.8)

print("GRID")
print()
print("Value iteration")
pi = best_policy(mdp1, value_iteration(mdp1, .01))
print_table(mdp1.to_arrows(pi))
print("Policy iteration")
print_table(mdp1.to_arrows(policy_iteration(mdp1)))
print("Q Learning")
pi = best_policyQ(mdp1, qlearn(mdp1, (0, 0), 5000))
print_table(mdp1.to_arrows(pi))
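# A minimal, AIMA-style sketch of the best_policy() extraction used above: given the
# utilities U produced by value_iteration, pick for each state the action with the
# highest one-step expected utility under the transition model. This is a hedged
# illustration; the project's own best_policy/best_policyQ may differ in detail.
def expected_utility(a, s, U, mdp):
    """Expected utility of doing action a in state s, given utilities U."""
    return sum(p * U[s1] for (p, s1) in mdp.T(s, a))


def best_policy(mdp, U):
    """Map each state to the action that maximizes one-step expected utility."""
    return {s: max(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))
            for s in mdp.states}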
def _policy_iteration_100_steps(self):
    policy, values = policy_iteration(self.mdp.policy, self.mdp, num_iter=100)
    self.gridworldwindow.update_grid(values, policy)
    self.mdp.update_values(values)
    self.mdp.update_policies(policy)