def main():
  env = standard_grid()
  qs = monte_carlo_control(standard_grid, epsilon_soft_greedy)
  render_qs_policy(env, qs)
  print()
  env = negative_grid()
  qs = monte_carlo_control(negative_grid, epsilon_soft_greedy)
  render_qs_policy(env, qs)
Example #2
def main():
    print('Standard Grid:')
    env = standard_grid()
    v_star = value_iteration(env)
    render_vs(env, v_star)
    render_policy(env, policy_from_v(env, v_star))

    print('Negative Grid:')
    env = negative_grid()
    v_star = value_iteration(env)
    render_vs(env, v_star)
    render_policy(env, policy_from_v(env, v_star))
def main():
    print('Standard Grid:')
    env = standard_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)

    print('Negative Grid:')
    env = negative_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 10e-4
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# this is deterministic
# all p(s',r|s,a) = 1 or 0
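
# Because every p(s',r|s,a) is either 0 or 1, the expectation in the Bellman
# backup collapses to a single successor: V(s) = r + GAMMA * V(s').
# A minimal policy-evaluation sketch under that assumption; grid.set_state()
# is assumed to follow the same grid_world API used in the other examples.
def evaluate_policy_deterministic(grid, policy):
    V = {s: 0 for s in grid.all_states()}
    while True:
        biggest_change = 0
        for s in grid.actions:              # non-terminal states only
            old_v = V[s]
            grid.set_state(s)               # assumed setter for the agent's position
            r = grid.move(policy[s])        # deterministic: one successor, one reward
            V[s] = r + GAMMA * V[grid.current_state()]
            biggest_change = max(biggest_change, abs(old_v - V[s]))
        if biggest_change < SMALL_ENOUGH:
            return V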

if __name__ == '__main__':
    # this grid gives you a reward of -0.1 for every non-terminal state
    # we want to see if this will encourage finding a shorter path to the goal
    grid = negative_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    # we'll randomly choose an action and update as we learn
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    # initial policy
    print("initial policy:")
    print_policy(policy, grid)
        if env.is_terminal(env.current_state()):
            target = reward
        else:
            target = reward + discount * next_est

        # semi-gradient update of the weights toward the TD target for the current state
        theta = theta + alpha * (target - c_est) * x
    return theta


def semi_gradient_td(create_env, policy, episodes=100000):
    theta = np.random.randn(4) / 2

    for _ in range(episodes):
        theta = td_episode(create_env, policy, theta)

    return theta


if __name__ == '__main__':
    env = standard_grid()
    theta = semi_gradient_td(standard_grid, eps_win_policy)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)

    print()

    env = negative_grid()
    theta = semi_gradient_td(negative_grid, eps_win_policy)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)
Example #6

def random_action(a, eps=0.1):
    p = np.random.random()  # returns a random float from [0,1)
    if p < (1 - eps):
        return a
    else:
        return np.random.choice(all_possible_actions)


# We then start the code. There is no separate play_game function, because playing the game
# and doing the updates cannot be separated: the updates need to be done while playing the game.
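
# A minimal sketch of that interleaved loop: act epsilon-greedily from Q,
# observe (r, s'), and update Q(s,a) immediately (the Q-learning rule).
# It assumes the grid has already been placed in a start state and that Q
# contains an entry for every state, as initialized in the block below.
def play_one_episode(grid, Q, eps=0.1, alpha=0.1, gamma=0.9):
    s = grid.current_state()
    while not grid.game_over():
        a = max(Q[s], key=Q[s].get)        # greedy action from Q[s]
        a = random_action(a, eps)          # ...randomized with probability eps
        r = grid.move(a)
        s2 = grid.current_state()
        # Q-learning target: r + gamma * max_a' Q(s', a')
        Q[s][a] += alpha * (r + gamma * max(Q[s2].values()) - Q[s][a])
        s = s2
    return Q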

if __name__ == '__main__':

    grid = negative_grid(step_cost=-0.1)  # we want to penalize each movement of the agent

    # We then initialize Q(s,a)
    Q = {}
    states = grid.all_states()
    for s in states:
        Q[s] = {}  # initialize as a dictionary; it also serves as the policy
        for a in all_possible_actions:
            Q[s][a] = 0

    # We also keep track of how often Q[s] has been updated.
    # This is needed for adapting the learning rate.

    update_counts = {}
    update_counts_sa = {}
    for s in states:
Example #7
ALPHA = 2.0
ALL_POSSIBLE_ACTIONS = ['U', 'D', 'L', 'R']

def greedy_from(Qs):
    best_action = None
    best_value = float('-inf')
    for action in Qs:
        value = Qs[action]
        if value > best_value:
            best_action = action
            best_value = value 
    return best_action, best_value


if __name__ == '__main__':
    grid = negative_grid(-0.5)
    print('rewards')
    print_values(grid.rewards, grid)
    S = grid.all_states()
    Q = {}
    num_seen_sa = {}
    for s in S:
        Q[s] = {}
        for a in ALL_POSSIBLE_ACTIONS:
            num_seen_sa[(s, a)] = 0
            if grid.is_terminal(s):
                Q[s][a] = 0
            else:
                Q[s][a] = np.random.rand()
    N = 10000
    deltas = []
  # with the standard grid (zero step cost) we can easily end up with
  # suboptimal policies, e.g.
  # ---------------------------
  #   R  |   R  |   R  |      |
  # ---------------------------
  #   R* |      |   U  |      |
  # ---------------------------
  #   U  |   R  |   U  |   L  |
  # since going R at (1,0) (shown with a *) incurs no cost, it's OK to keep doing that.
  # we'll either end up staying in the same spot, or back to the start (2,0), at which
  # point we would then just go back up, or at (0,0), at which point we can continue
  # on right.
  # instead, let's penalize each movement so the agent will find a shorter route.
  #
  # grid = standard_grid()
  grid = negative_grid(step_cost=-0.1)

  # print rewards
  print "rewards:"
  print_values(grid.rewards, grid)

  # no policy initialization, we will derive our policy from most recent Q
  # enumerate all (s,a) pairs, each will have its own weight in our "dumb" model
  # essentially each weight will be a measure of Q(s,a) itself
  states = grid.all_states()
  for s in states:
    SA2IDX[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
      SA2IDX[s][a] = IDX
      IDX += 1
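
  # With SA2IDX built above, a one-hot feature vector of length IDX turns the
  # "dumb" linear model theta.dot(x) into a plain table lookup, which is why
  # each weight is a measure of Q(s,a) itself. sa2x below is an illustrative
  # sketch (numpy as np assumed imported as in the other examples), not
  # necessarily this repo's own encoder.
  def sa2x(s, a):
    x = np.zeros(IDX)
    x[SA2IDX[s][a]] = 1.0
    return x

  theta = np.random.randn(IDX) / np.sqrt(IDX)  # one weight per (s, a) pair
  # prediction: theta.dot(sa2x(s, a)) == theta[SA2IDX[s][a]]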
Example #9
import numpy as np
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 10e-4
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

if __name__ == "__main__":
    grid = negative_grid(-.1)

    print("rewards: ")
    print_values(grid.rewards, grid)

    states = grid.all_states()

    print("\ninitial initalization: ")
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print_policy(policy, grid)

    # Value Initialization
    V = {}
    for s in states:
        V[s] = 0

    while True:
        delta = 0
        for s in states:
            old_Vs = V[s]
                    action_value = reward + self.discount * self.state_values[
                        next_state]
                    if action_value > best_action_value:
                        best_action_value = action_value
                        best_action = a
                    self.state_values[s] += p_a * action_value
                    self.env.undo_move(a)
                self.policy[s] = best_action
                deltas[i] = np.abs(self.state_values[s] - old_value)

            t += 1
            print("Number of iterations: ", t)
            grid_world.print_policy(self.policy, self.env)
            grid_world.print_values(self.state_values, self.env)


if __name__ == "__main__":

    optimizer = PolicyOptimizer(environment=grid_world.negative_grid())
    grid_world.print_policy(optimizer.policy, optimizer.env)
    grid_world.print_values(optimizer.state_values, optimizer.env)
    optimizer.perform_value_iteration(wind=None)

    # Windy Gridworld: each action has a 50% chance to fail, another action (chosen at random) is performed instead
    optimizer = PolicyOptimizer(environment=grid_world.negative_grid())
    grid_world.print_policy(optimizer.policy, optimizer.env)
    grid_world.print_values(optimizer.state_values, optimizer.env)
    optimizer.perform_value_iteration(
        wind='right',
        wind_force=0.26)  # 0.25 is the threshold at which the optimal policy switches
Example #11
import numpy as np
from grid_world import standard_grid, negative_grid
from visualize import print_values, print_policies
from q_study_functional import action_eps_greedy, get_max_Q, get_Qs
from model import Model

GAMMA = 0.9
ALPHA = 0.1
ALL_POSSIBLE_ACTIONS = ('R', 'L', 'U', 'D')

if __name__ == "__main__":
    grid = negative_grid(step_cost=-50)
    print("rewards:")
    print_values(grid.rewards, grid.width, grid.height)
    model = Model()
    t = 1.0
    for it in range(30000):
        if it % 100 == 0:
            t += 1e-2
        if it % 1000 == 0:
            print("it: {}".format(it))
        # 0. initialize condition
        alpha = ALPHA / t
        current_grid = (3, 0)
        grid.set_initial_grid(current_grid)
        start_flag = True
        Qs = get_Qs(model, current_grid, ALL_POSSIBLE_ACTIONS)
        while not grid.game_over():
            if start_flag is True:
                # 1. choose random action
                action = np.random.choice(ALL_POSSIBLE_ACTIONS)
import numpy as np
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# next state and reward will now have some randomness
# you'll go in your desired direction with probability 0.5
# you'll go in a random direction a' != a with probability 0.5/3
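
# A sketch of the transition model described above: the chosen action is
# executed with probability 0.5, otherwise one of the other three actions is
# picked uniformly (probability 0.5/3 each). windy_action is an illustrative
# helper, not part of grid_world itself.
def windy_action(a):
  p = np.random.random()
  if p < 0.5:
    return a
  # choose uniformly among the three actions that are not a
  return np.random.choice([other for other in ALL_POSSIBLE_ACTIONS if other != a])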

if __name__ == '__main__':
  # this grid gives you a reward of -0.1 for every non-terminal state
  # we want to see if this will encourage finding a shorter path to the goal
  grid = negative_grid(step_cost=-1.0)
  # grid = negative_grid(step_cost=-0.1)
  # grid = standard_grid()

  # print rewards
  print "rewards:"
  print_values(grid.rewards, grid)

  # state -> action
  # we'll randomly choose an action and update as we learn
  policy = {}
  for s in grid.actions.keys():
    policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

  # initial policy
  print "initial policy:"
Example #13
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from utils import print_policy, print_values

ALL_POSSIBLE_ACTIONS = ['U', 'D', 'L', 'R']
SMALL_ENOUGH = 10e-4
GAMMA = 0.9

if __name__ == '__main__':
    # randomly initialize V and pi
    V = {}
    grid = negative_grid(-1.901)
    print('Rewards:')
    print_values(grid.rewards, grid)

    S = grid.all_states()
    for state in S:
        V[state] = 0

    pi = {}
    for state in grid.actions.keys():
        pi[state] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('Initial Policy')
    print_policy(pi, grid)
    print('Initial Vs')
    print_values(V, grid)

    iteration = 0
    while True:
    def grad(self, s, a):
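        # gradient of the linear model theta.dot(x(s,a)) with respect to theta
        # is just the feature vector x(s,a) itself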
        return self.sa2x(s, a)


def getQs(model, s):
    # we need Q(s,a) to choose an action
    Qs = {}
    for a in ACTIONS:
        q_sa = model.predict(s, a)
        Qs[a] = q_sa
    return Qs


if __name__ == '__main__':

    grid = negative_grid(step_cost=-0.1)

    # rewards
    print('rewards')
    print_values(grid.rewards, grid)

    # no policy initialization; the policy will be derived from Q

    states = grid.all_states()
    for s in states:
        SA2IDX[s] = {}
        for a in ACTIONS:
            SA2IDX[s][a] = IDX
            IDX += 1

    # initialize the model
Example #15
                r = grid.move(other_action)
                mean_v += p * (r + gamma * steps_values[grid.current_state()])
            if mean_v > best_val:
                best_val = mean_v
                new_action = candidate_action
        if new_action != current_action:
            improved_policy[s] = new_action
            is_converged = False

    return is_converged, improved_policy


if __name__ == "__main__":

    # grid = standard_grid()
    grid = negative_grid(-1.0)
    print_values(grid.rewards, grid)

    # Step 1: initialize policy and state values
    policy = dict(
        zip(grid.actions.keys(),
            np.random.choice(ALL_POSSIBLE_ACTIONS, len(grid.actions.keys()))))

    V = defaultdict(int)
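    # terminal states are never assigned in the loop below, so defaultdict(int) leaves them at 0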
    for s in grid.actions:
        V[s] = np.random.random()

    print("Initialized values:")
    print_values(V, grid)
    print("Initialized pplicy:")
    print_policy(policy, grid)
Example #16
import numpy as np
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

threshold = 1e-3
gamma = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# Add some randomness

if __name__ == "__main__":
    grid = negative_grid(step_cost=-1.0)

    print('Rewards:')
    print_values(grid.rewards, grid)
    print('')

    policy = {}
    # Randomly choose action and update as we learn
    for state in grid.actions:
        policy[state] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('Initial Policy:')
    print_policy(policy, grid)

    V = {}
    states = grid.all_states()
    for state in states:
        # V(s) = 0
        if state in grid.actions:
            V[state] = np.random.random()
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
import numpy as np
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# this is deterministic
# all p(s',r|s,a) = 1 or 0

if __name__ == '__main__':
  # this grid gives you a reward of -0.1 for every non-terminal state
  # we want to see if this will encourage finding a shorter path to the goal
  grid = negative_grid()

  # print rewards
  print "rewards:"
  print_values(grid.rewards, grid)

  # state -> action
  # we'll randomly choose an action and update as we learn
  policy = {}
  for s in grid.actions.keys():
    policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

  # initial policy
  print "initial policy:"
  print_policy(policy, grid)
# Glenn Thomas
# 2020-06-01
# TD (temporal difference) examples

import grid_world as gw
import numpy as np
import td
import dynamic_programming_functions as dp
import matplotlib.pyplot as plt

# Set up
g = gw.negative_grid(step_reward=-0.1)
g.windy = 0.5

# Set up policies
fixed_policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
}
actions = g.actions_array
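# convert each policy entry (one or more action letters) into a probability
# vector over actions_array, uniform over the listed actions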
for key, value in fixed_policy.items():
    probs = np.zeros(len(actions))
    probs[np.isin(actions, value)] = 1 / len(value)
    fixed_policy[key] = probs
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 10e-4
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

if __name__ == '__main__':
    grid = negative_grid(step_cost=-1.0)  # try standard

    print "rewards:"
    print_values(grid.rewards, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print "initial policy:"
    print_policy(policy, grid)

    V = {}
    states = grid.all_states()
    for s in states:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            # terminal state
            V[s] = 0