Example #1
def run_trial(name,
              agent,
              rounds,
              epochs,
              given_params,
              save=True,
              animation=True):
    save_path = f"/Models/{name}/"
    if save:
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        # p.dump(given_params, open(f"/Models/{name}/env.txt", "wb"))
        np.savetxt(f"/Models/{name}/env.txt", np.asarray(given_params[0]))
        with open(f"/Models/{name}/env_penalty.txt", "w") as f:
            f.write(f"Collision penalty is: {given_params[1]}")
        f.close()

    for round_num in range(rounds):
        world = Gridworld(WORLD_SIZE, ACTION_INFO, given_params)
        agent.set_world(world)
        world.place_agent(0, 0)
        run_epoch(agent,
                  world,
                  round_num,
                  epochs,
                  f"/Models/{name}",
                  save=save,
                  animate=animation)
        if save:
            agent.end_round(name, round_num)
Example #2
def main():

    print(f"Starting Cliff Walk, initializing world and agent")

    # Create args to use for instantiating a world and agent object
    cliff_world_args = {
        "H": H,
        "W": W,
        "wind": WIND,
        "start_pos": START_POS,
        "end_pos": END_POS,
        "stochastic": False,
        "variance": 0,
        "hazard": HAZARD
    }

    cliff_agent_args = {
        "alpha": ALPHA,
        "eps": EPS,
        "gamma": GAMMA,
        "alpha_ramp": ALPHA_RAMP,
        "actions": ACTIONS
    }

    # Create the world and agent
    cliff_world = Gridworld(**cliff_world_args)
    cliff_agent = GridAgent(START_POS, cliff_world.H, cliff_world.W, NUM_A,
                            **cliff_agent_args)

    # Train the agent for specified eps, occasionally printing to console
    print(f"Training agent for {TRAIN_EPS} episodes")
    cliff_agent.train_agent(cliff_world,
                            print_moves=1000,
                            move_timeout=1000,
                            episodes=TRAIN_EPS,
                            ramp_alpha=True,
                            method=GridAgent.Q_LEARNING)

    # Check the final policy (no training, acting 100% greedily)
    print(f"\nDone training agent for {TRAIN_EPS} episodes")
    print(f"Checking path and returns for trained agent")
    G, path = cliff_agent.run_episode(cliff_world, train=False)

    print(f"GridAgent received reward {-G} (smaller is better)\n")

    # Visualize the final policy
    policy = cliff_agent.get_policy(world=cliff_world, visual=True)
    for a in policy:
        print(a)
    print()

    # Plot the final path that agent took, and its rewards over course of training
    y, x = zip(*path)
    fig, ax = plt.subplots(2, 1)
    ax[0].imshow(cliff_world.get_image(), origin="upper")
    # ax[0].set_ylim(ax[0].get_ylim()[::-1])
    ax[0].plot(x, y)

    ax[1].plot(cliff_agent.G_list)
    plt.show()
def main():

    print(f"Starting Windy Gridworld, initializing world and agent")

    windy_world_args = {
        "H": H,
        "W": W,
        "wind": WIND,
        "start_pos": START_POS,
        "end_pos": END_POS,
        "stochastic": True,
        "variance": 1
    }

    windy_agent_args = {
        "alpha": ALPHA,
        "eps": EPS,
        "gamma": GAMMA,
        "alpha_ramp": ALPHA_RAMP,
        "actions": ACTIONS
    }

    windy_world = Gridworld(**windy_world_args)
    windy_agent = GridAgent(START_POS, windy_world.H, windy_world.W, NUM_A,
                            **windy_agent_args)
    # windy_agent.set_Q_to_default()

    print(f"Training agent for {TRAIN_EPS} episodes")
    windy_agent.train_agent(windy_world,
                            print_moves=PRINT_AFTER_MOVES,
                            move_timeout=MOVE_TIMEOUT,
                            episodes=TRAIN_EPS)

    print(f"\nDone training agent for {TRAIN_EPS} episodes")
    print(f"Checking path and returns for trained agent")
    G, path = windy_agent.run_episode(windy_world, train=False)

    print(f"GridAgent completed task in {-G} moves\n")
    policy = windy_agent.get_policy(world=windy_world, visual=True)
    for a in policy:
        print(a)
    print()

    print(windy_world.get_image())

    y, x = zip(*path)
    fig, ax = plt.subplots(2, 1)
    ax[0].imshow(windy_world.get_image(), origin="upper")
    ax[0].plot(x, y)
    ax[1].plot(windy_agent.G_list)
    plt.show()
Example #4
def main():

    # define variables
    theta = 0.000001
    discount_factor = 0.8

    # create a grid object
    grid = Gridworld(5)

    # initialize a policy: create an array of dimension (number of states by number of actions)
    # for equal probability amongst all actions, divide everything by the number of actions
    policy = np.ones([state_count, action_count]) / action_count

    # run policy evaluation
    final_value_map, max_iter, delta, policy = policy_evaluation(
        grid.valueMap, grid.states, discount_factor, theta, grid.reward,
        grid.p_transition, grid.transition_prob, policy)

    # print the final value function
    print("Total Iterations: ")
    print(max_iter)
    print("Value Function: ")
    np.set_printoptions(precision=4)
    print(final_value_map)

    # print delta vs iterations
    import matplotlib.pyplot as plt
    # plot iteration vs delta
    plt.plot(range(max_iter), delta)
    plt.title('Policy Evaluation with Discount Factor ' + str(discount_factor))
    plt.xlabel('Iterations')
    plt.ylabel('Max Delta')
    plt.savefig('graphs/policy_evaluation_' + str(int(discount_factor * 100)) +
                '.png')
    plt.show()
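The policy_evaluation routine called above is defined elsewhere in that project. As a rough reference only, a minimal iterative policy evaluation sketch over a generic tabular MDP (hypothetical P and R array layouts, not the repo's Gridworld API) might look like:

import numpy as np

def iterative_policy_evaluation(P, R, policy, discount_factor=0.8, theta=1e-6):
    """Sweep V(s) = sum_a pi(a|s) * (R[s, a] + gamma * sum_s' P[s, a, s'] * V(s'))
    until the largest per-sweep change falls below theta.
    P: (n_states, n_actions, n_states) transition probabilities (assumed layout)
    R: (n_states, n_actions) expected rewards (assumed layout)
    """
    n_states, n_actions, _ = P.shape
    V = np.zeros(n_states)
    deltas = []
    while True:
        delta = 0.0
        for s in range(n_states):
            v_new = sum(policy[s, a] * (R[s, a] + discount_factor * P[s, a] @ V)
                        for a in range(n_actions))
            delta = max(delta, abs(v_new - V[s]))
            V[s] = v_new  # in-place (Gauss-Seidel style) sweep
        deltas.append(delta)
        if delta < theta:
            return V, len(deltas), deltas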
def show_game(args):
    env = Gridworld(rows=5, cols=5, greens=3, reds=2)
    tf.reset_default_graph()
    qnet = get_qnet(args)

    saver = tf.train.Saver()
    tf.get_default_graph().finalize()
    with tf.Session(config=tf.ConfigProto(operation_timeout_in_ms=10000)) as sess:
        saver.restore(sess, args.restore_ckpt)
        done = False
        state = preprocess_img(env.reset())
        _ = env.render()
        reward, turns = 0, 0

        while not done:
            t1 = time.time()
            action = qnet.predict(sess, normalize(np.array([state])))[0]

            img, r, done, _ = env.step(action)
            _ = env.render()
            state = preprocess_img(img)
            reward += r
            turns += 1
            time.sleep(max(0, .2 - (time.time() - t1)))
    print('turns =', turns, ' reward =', reward, ' reward/turn =', reward/turns)
    def __init__(self, dyna=False, plus=False, experiment=False):
        self.randomizeAction = 0.1
        self.agent = Agent(Actions)
        self.world = Gridworld(self.agent, self)
        self.Q = np.zeros((self.world.Width, self.world.Height,
                           len(Actions)))  # initialize q table to zeros

        self.goalreward = 1
        self.rewards = []
        self.cumulativeReward = 0
        self.completedEpisodes = 0
        self.stepsPerEpisode = []

        self.updatePolicy = self.basicQPolicy
        self.PLUS = plus
        self.EXPERIMENT = experiment
        if (dyna):
            self.updatePolicy = self.DynaQPolicy

        # For Dyna-Q
        self.numModelUpdates = 50
        self.model = self.BuildModel()

        # For Dyna-Q+, a table of how long it's been since a state-action was visited, and an incrementer for easy addition
        # if (self.PLUS):
        #     self.randomizeAction = 1.0
        self.timestep = 0
        self.history = dict()  #self.BuildHistory()

        self.lookingForNextWin = False
        self.timeSinceLooking = 0

        self.visitCount = np.zeros((self.world.Width, self.world.Height))

        # standardized random number generator for action selection
        self.random = random.Random()
        self.random.seed(12)
def train(args):
    """
    This function trains a neural network on how to play brickbreaker. It is
    meant to be identical to how DeepMind's paper "Playing Atari with Deep
    Reinforcement Learning" works.
    :param args: parser.parse_args
    :return:
    """
    with open(os.path.join(args.ckpt_dir, args.train_record_fname), 'a') as f:
        f.write("BasicGridworld -- begin training --\n")

    tf.reset_default_graph()
    env = Gridworld(rows=5, cols=5, greens=3, reds=2)
    qnet = get_qnet(args)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    # Don't want to change the graph once we begin playing the game.
    tf.get_default_graph().finalize()
    with tf.Session(config=tf.ConfigProto(
            operation_timeout_in_ms=10000)) as sess:
        sess.run(init)
        e = args.e_i
        last_output_ep = 0
        rewards = []
        transitions = 0  # number of transitions updated against
        next_output = args.output_period

        while transitions < args.train_steps:
            r, e, t = play_episode(args, sess, env, qnet, e)
            if transitions == 0 and t > 0:
                # Output status from before training starts.
                write_output(args, sess, saver, last_output_ep, e, rewards,
                             transitions)
                last_output_ep = len(rewards)

            transitions += t
            rewards.append(r)

            if transitions > next_output:
                # Regular output during training.
                write_output(args, sess, saver, last_output_ep, e, rewards,
                             transitions)
                next_output += args.output_period
                last_output_ep = len(rewards)

    with open(os.path.join(args.ckpt_dir, args.train_record_fname), 'a') as f:
        f.write('\n\n')
def main():
    goals = [(7,0)]
    anti_goals = [(1,0),(2,0),(3,0),(4,0),(5,0),(6,0)]
    env = Gridworld(8, 4, goals, anti_goals)

    # get baseline random performance
    q = init_state_action_map(env)
    estimate_performance(env, q, 1)

    # learn q
    print("running double q-learning...")
    q1, q2 = double_q_learning(env)
    print("double q-learning complete")

    # determine post-training performance
    estimate_performance(env, q2, 0.01)
    visualize_performance(env, q2)
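double_q_learning itself is not listed in this example; its defining step is the decoupled update in which one table selects the greedy next action and the other evaluates it. A minimal sketch of that single backup, assuming a plain 2-D array layout for the tables (not necessarily the repo's representation):

import numpy as np

def double_q_update(q1, q2, s, a, r, s_next, alpha=0.1, gamma=0.9):
    # With probability 0.5, update q1 using q2's estimate of q1's greedy action;
    # otherwise perform the symmetric update on q2.
    if np.random.rand() < 0.5:
        best = np.argmax(q1[s_next])
        q1[s, a] += alpha * (r + gamma * q2[s_next, best] - q1[s, a])
    else:
        best = np.argmax(q2[s_next])
        q2[s, a] += alpha * (r + gamma * q1[s_next, best] - q2[s, a])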
Example #9
def main():
    goals = [(7, 0)]
    anti_goals = [(1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)]
    env = Gridworld(8, 4, goals, anti_goals)

    # init q and get baseline random performance
    q = init_state_action_map(env)
    estimate_performance(env, q, 1)

    # learn q
    print("running sarsa...")
    q = sarsa(env, q)
    print("sarsa complete")

    # determine post-training performance
    estimate_performance(env, q, 0.01)
    visualize_performance(env, q)
def test_model(model, mode='static', display=True):
    i = 0
    test_game = Gridworld(size=5, mode=mode)
    state_ = test_game.board.render_np().reshape(
        1, input_size) + np.random.rand(1, input_size) / 10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while (status == 1):  #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_)  #B
        action = action_set[action_]
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        state_ = test_game.board.render_np().reshape(
            1, input_size) + np.random.rand(1, input_size) / 10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward, ))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward, ))
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break

    win = True if status == 2 else False
    return win
def main():
    x_limit = 8
    y_limit = 4
    goals = [(7, 3)]
    anti_goals = []

    env = Gridworld(x_limit, y_limit, goals, anti_goals, kings_moves=False)
    num_episodes = 100

    # determine the baseline performance that results from taking random moves
    avg = sum([len(generate_random_episode(env))
               for _ in range(num_episodes)]) / float(num_episodes)
    print "baseline random performance: " + str(avg)

    # learn q
    print "running n-step sarsa..."
    q = n_step_sarsa(env)
    print "n-step sarsa complete"

    # determine post-training performance
    avg = sum([
        len(generate_epsilon_greedy_episode(env, q))
        for _ in range(num_episodes)
    ]) / float(num_episodes)
    print "post learning performance: " + str(avg)

    # visualize post-training episode
    state = env.reset()
    while True:
        env.render()
        time.sleep(0.25)
        action = choose_epsilon_greedy_action(q, state, 0.1)
        state, _, done, _ = env.step(action)  # take a random action
        if done:
            env.render(close=True)
            break
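Similarly, n_step_sarsa is defined elsewhere in that repo; what distinguishes it from one-step sarsa is the truncated n-step return used as the update target. A small hypothetical helper illustrating that target (not the source's code):

def n_step_target(rewards, q, s_boot, a_boot, gamma, bootstrap=True):
    # G = r_0 + gamma*r_1 + ... + gamma^(n-1)*r_(n-1)  [+ gamma^n * Q(s_boot, a_boot)]
    # rewards: the n rewards observed after the state-action pair being updated
    # q[s][a]: tabular action values; (s_boot, a_boot) is the pair reached n steps later
    G = sum((gamma ** i) * r for i, r in enumerate(rewards))
    if bootstrap:  # drop the bootstrap term if the episode ended within the n steps
        G += (gamma ** len(rewards)) * q[s_boot][a_boot]
    return G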
Example #12
optimizer = tf.keras.optimizers.Adam(learning_rate)

gamma = 0.9
epsilon = 0.3

epochs = 5000
losses = []
mem_size = 1000
batch_size = 200
replay = deque(maxlen=mem_size)
max_moves = 50
h = 0
sync_freq = 500  #A
j = 0
for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(
        1, 64) + np.random.rand(1, 64) / 100.0
    state1 = state1_  #torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while (status == 1):
        j += 1
        mov += 1
        qval = model.predict(state1)
        if (random.random() < epsilon):
            action_ = np.random.randint(0, 4)
        else:
            action_ = np.argmax(qval)

        action = action_set[action_]
Example #13
        # get new state and reward after taking action from current state
        new_state_vector, reward = grid.transition_reward(
            state_vector, action_vector)
        state_vector = list(new_state_vector)

        # save state, action chosen and reward to list
        state_list.append(state_vector)
        action_list.append(action_vector)
        reward_list.append(reward)

    return state_list, action_list, reward_list


# create a grid object
grid = Gridworld(5)

# initialize other parameters
gamma = 0.99
lr = 0.1
epsilon = [0.01, 0.1, 0.25]
runs = 20
episode_length = 500
window_length = int(episode_length / 20)

reward_epsilon = []
reward_run_all = []
test_reward_epsilon = []
test_reward_run_all = []

# plot
Example #14
def Sarsa(gamma, lr, epsilon, runs, step_number, episode_length):

    # create a grid object
    grid = Gridworld(5)
    window_length = int(episode_length/20)

    # define variables for plotting purposes
    reward_epsilon = []
    reward_run_all = []
    test_reward_epsilon = []
    test_reward_run_all = []
    label = []
    for r in range(1, runs+1):
        label.append(str(r))

    # begin iterating over every epsilon
    for eps in epsilon:

        # reset some lists
        Q_values_list = []
        reward_run = []
        test_reward_run =[]

        # begin iterating over a set amount of runs (20)
        for run in range(1, runs+1):

            # initialize q values for all state action pairs
            global Q_values
            Q_values = np.zeros((state_count, action_count))

            # define lists for plots
            reward_episode = []
            test_reward_episode = []
            delta_list = []

            # SARSA BEGINS ------------------------------------------------------------------------------------------
            # iterate over episodes
            for episode in range(episode_length):
                
                # initialize/reset parameters
                reward_list = []
                delta = 0
                
                # initialize state (output: [4, 4])
                state_vector = grid.initial_state()
                state_index = grid.states.index(state_vector)

                # choose an action based on epsilon-greedy (output: action index ie. 0)
                action_index = choose_action(state_index, eps)
                action_vector = actions[action_index]

                # iterate over 200 steps within each episode
                for step in range(step_number):

                    # get the next state and reward after taking the chosen action in the current state
                    next_state_vector, reward = grid.transition_reward(state_vector, action_vector)
                    next_state_index = grid.states.index(list(next_state_vector))
                    
                    # add reward to list
                    reward_list.append(reward)
                    
                    # choose an action based on epsilon-greedy (output: action index ie. 0)
                    next_action_index = choose_action(next_state_index, eps)
                    next_action_vector = actions[next_action_index]

                    # compute the Sarsa update once; track the max Q-value change for plotting
                    Q_value = Q_values[state_index][action_index] + lr*(reward + gamma*Q_values[next_state_index][next_action_index] - Q_values[state_index][action_index])
                    delta = max(delta, np.abs(Q_value - Q_values[state_index][action_index]))

                    # update Q value
                    Q_values[state_index][action_index] = Q_value
                    
                    # update state and action vector
                    state_vector = list(next_state_vector)
                    state_index = grid.states.index(state_vector)
                    action_vector = list(next_action_vector)
                    action_index = next_action_index
                
                # append lists for plotting purposes
                delta_list.append(delta)
                reward_episode.append(sum(reward_list))
                
                # TESTING AFTER EACH EPISODE ------------------------------------------------------------
                # initialize policy
                policy = np.zeros((state_count, action_count))            
                # Generate Greedy policy based on Q_values after each episode
                for state in range(len(Q_values)):
                    # find the best action at each state
                    best_action = np.argmax(Q_values[state])
                    # write deterministic policy based on Q_values
                    policy[state][best_action] = 1
                # Generate test trajectory with the greedy policy
                state_list, action_list, test_reward_list = generate_episode(step_number, grid, policy)
                test_reward_episode.append(sum(test_reward_list))
                #----------------------------------------------------------------------------------------

                # print current episode
                clear_output(wait=True)
                display('Epsilon: ' + str(eps) + ' Run: ' + str(run) + ' Episode: ' + str(episode))

            # append lists for plotting purpose
            test_reward_run.append(Average(test_reward_episode))
            reward_run.append(Average(reward_episode))
            Q_values_list.append(Q_values)

            # PLOTTING CODE--------------------------------------------------------------------------------------------------------------------
            # Average Reward per Episode during Training with different runs and epsilons
            plt.plot(test_reward_episode)
            plt.plot(reward_episode)
            plt.title('Average Reward per Episode, Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            plt.legend(('Testing','Training'))
            plt.savefig('Graphs/Sarsa/reward_episode/reward_episode_run_' + str(int(run)) + '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # max delta of each episode, where delta is the change in Q values
            plt.plot(delta_list)
            plt.title('Sarsa Max Delta for Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Max Delta')
            delta_frame = pd.DataFrame(delta_list)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average', color='orange')
            plt.savefig('Graphs/Sarsa/delta/delta_run_'+str(int(run))+'_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

        # append lists for plotting
        reward_run_all.append(reward_run)
        test_reward_run_all.append(test_reward_run)
        reward_epsilon.append(Average(reward_run))
        test_reward_epsilon.append(Average(test_reward_run))

        # Average Reward for each Run with different Epsilon
        plt.plot(test_reward_run)
        plt.plot(reward_run)
        plt.title('Average Reward for each Run with Epsilon: '+ str(float(eps)))
        plt.xlabel('Run')
        plt.xticks(np.arange(runs), label)
        plt.ylabel('Average Reward')
        plt.legend(('Testing','Training'))
        plt.savefig('Graphs/Sarsa/reward_run/reward_run_epsilon_' + str(float(eps)) + '.png')
        plt.clf()
        time.sleep(0.05)

        # save Q value tables to a pickle
        with open('Graphs/Sarsa/Qvalues/Sarsa_Qvalues_' + str(eps) + '.pkl', 'wb') as f:
            pickle.dump(Q_values_list, f)

    # Average Reward for each Epsilon
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, reward_epsilon)
    # plt.plot(reward_epsilon)
    plt.title('Average Reward for each Epsilon during Training')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/Sarsa/reward_epsilon/reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for Each Epsilon
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, test_reward_epsilon)
    # plt.plot(test_reward_epsilon)
    plt.title('Average Reward for Each Epsilon during Testing')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/Sarsa/test_reward_epsilon/test_reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Training
    for r in range(3):
        plt.plot(reward_run_all[r])
    plt.title('Average Reward for each Run during Training')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01','0.1','0.25'))
    plt.savefig('Graphs/Sarsa/reward_run/reward_run_all.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Testing
    for r in range(3):
        plt.plot(test_reward_run_all[r])
    plt.title('Average Reward for each Run during Testing')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01','0.1','0.25'))
    plt.savefig('Graphs/Sarsa/test_reward_run/test_reward_run_all.png')
    plt.clf()
    time.sleep(0.05)
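Assuming the module-level helpers this function relies on (state_count, action_count, actions, choose_action, generate_episode, Average) and the Graphs/Sarsa/... output directories exist, a call that mirrors the hyperparameters set up in Example #13 might look like:

# Illustrative invocation only; the values match the setup shown in Example #13.
Sarsa(gamma=0.99, lr=0.1, epsilon=[0.01, 0.1, 0.25],
      runs=20, step_number=200, episode_length=500)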
Example #15
def TdLambda(gamma, lr, epsilon, runs, step_number, episode_length, lamda):
    """
        A function that performs the TdLambda algorithm.
        Input: gamma, learning rate, epsilon (as a list), runs, total number of steps, number of episodes, lamda
        Output: a variety of graphs that illustrate the algorithm's performance
    """
    # create a grid object
    grid = Gridworld(5)
    window_length = int(episode_length / 20)

    # define variables for plotting purposes
    reward_epsilon = []
    reward_run_all = []
    test_reward_epsilon = []
    test_reward_run_all = []
    label = []
    for r in range(1, runs + 1):
        label.append(str(r))

    # begin iterating over every epsilon
    for eps in epsilon:

        # reset some lists
        Q_values_list = []
        reward_run = []
        test_reward_run = []

        # begin iterating over a set amount of runs (20)
        for run in range(1, runs + 1):

            # initialize q values for all state action pairs
            global Q_values
            Q_values = np.zeros((state_count, action_count))

            # define lists
            reward_episode = []
            test_reward_episode = []
            delta_list = []

            # TdLambda BEGINS ---------------------------------------------------------------------------------------------------------------------------
            # iterate over episodes
            for episode in range(episode_length):

                # initialize delta for eligibility trace
                delta_ = 0

                # delta for change in Q values
                delta = 0

                # initialize S,A (? should i choose an Action using epsilon-greedy here or just select an Action?)
                state_vector = grid.initial_state()
                state_index = grid.states.index(state_vector)

                # initialize  eligibility traces for all state action pairs of all states to 0
                z_values = np.zeros((state_count, action_count))

                action_index = choose_action(state_index, eps)
                action_vector = actions[action_index]

                reward_list = []

                # iteration 200 steps of the episode
                for i in range(step_number):

                    # Take action A, observe R, S'
                    next_state_vector, reward = grid.transition_reward(
                        state_vector, action_vector)
                    next_state_index = grid.states.index(
                        list(next_state_vector))

                    reward_list.append(reward)

                    # Choose A' from S' using policy derived from Q (eg. epsilon-greedy)
                    next_action_index = choose_action(next_state_index, eps)
                    next_action_vector = actions[next_action_index]

                    # update the action-value form of the TD error
                    delta_ = reward + gamma * Q_values[next_state_index][
                        next_action_index] - Q_values[state_index][action_index]

                    # accumulate traces (? big S and big A?)
                    z_values[state_index][action_index] += 1

                    # compute the updated Q value once; track the max change for plotting
                    Q_value = Q_values[state_index][action_index] + lr * delta_ * z_values[state_index][action_index]
                    delta = max(
                        delta,
                        np.abs(Q_value - Q_values[state_index][action_index]))

                    # update Q value
                    Q_values[state_index][action_index] = Q_value

                    # update z value
                    z_values[state_index][
                        action_index] = gamma * lamda * z_values[state_index][
                            action_index]

                    # update state and action vector
                    state_vector = list(next_state_vector)
                    state_index = grid.states.index(state_vector)
                    action_vector = list(next_action_vector)
                    action_index = next_action_index

                # append lists for plotting purpose
                delta_list.append(delta)
                reward_episode.append(sum(reward_list))

                # TESTING AFTER EACH EPISODE ------------------------------------------------------------
                # initialize policy
                policy = np.zeros((state_count, action_count))
                # Generate Greedy policy based on Q_values after each episode
                for state in range(len(Q_values)):
                    # find the best action at each state
                    best_action = np.argmax(Q_values[state])
                    # write deterministic policy based on Q_values
                    policy[state][best_action] = 1
                # Generate test trajectory with the greedy policy
                state_list, action_list, test_reward_list = generate_episode(
                    step_number, grid, policy)
                test_reward_episode.append(sum(test_reward_list))
                #----------------------------------------------------------------------------------------

                # print current episode
                clear_output(wait=True)
                display('Epsilon: ' + str(eps) + ' Run: ' + str(run) +
                        ' Episode: ' + str(episode))

            test_reward_run.append(Average(test_reward_episode))

            # append lists for plotting purpose
            reward_run.append(Average(reward_episode))
            Q_values_list.append(Q_values)

            # PLOTTING CODE--------------------------------------------------------------------------------------------------------------------
            # Average Reward per Episode during Training with different runs and epsilons
            plt.plot(test_reward_episode)
            plt.plot(reward_episode)
            plt.title('Average Reward per Episode, Run: ' + str(int(run)) +
                      ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            plt.legend(('Testing', 'Training'))
            plt.savefig('Graphs/TdLambda/reward_episode/reward_episode_run_' +
                        str(int(run)) + '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # Average Reward per Episode during Training with different runs and epsilons
            plt.title('Average Reward per Episode (Smoothed), Run: ' +
                      str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            delta_frame = pd.DataFrame(test_reward_episode)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average')
            delta_frame = pd.DataFrame(reward_episode)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average')
            plt.legend(('Testing', 'Training'))
            plt.savefig(
                'Graphs/TdLambda/reward_episode/reward_episode_smoothed_run_' +
                str(int(run)) + '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # max delta of each episode, where delta is the change in Q values
            plt.plot(delta_list)
            plt.title('TdLambda Max Delta for Run: ' + str(int(run)) +
                      ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Max Delta')
            delta_frame = pd.DataFrame(delta_list)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average', color='orange')
            plt.savefig('Graphs/TdLambda/delta/delta_run_' + str(int(run)) +
                        '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

        # append lists for plotting
        reward_run_all.append(reward_run)
        test_reward_run_all.append(test_reward_run)
        reward_epsilon.append(Average(reward_run))
        test_reward_epsilon.append(Average(test_reward_run))

        # Average Reward for each Run with different Epsilon
        plt.plot(test_reward_run)
        plt.plot(reward_run)
        plt.title('Average Reward for each Run with Epsilon: ' +
                  str(float(eps)))
        plt.xlabel('Run')
        plt.xticks(np.arange(runs), label)
        plt.ylabel('Average Reward')
        plt.legend(('Testing', 'Training'))
        plt.savefig('Graphs/TdLambda/reward_run/reward_run_epsilon_' +
                    str(float(eps)) + '.png')
        plt.clf()
        time.sleep(0.05)

        # save Q value tables to a pickle
        with open(
                'Graphs/TdLambda/Qvalues/TdLambda_Qvalues_' + str(eps) +
                '.pkl', 'wb') as f:
            pickle.dump(Q_values_list, f)

    # Average Reward for each Epsilon
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, reward_epsilon)
    plt.title('Average Reward for each Epsilon during Training')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/TdLambda/reward_epsilon/reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for Each Epsilon
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, test_reward_epsilon)
    plt.title('Average Reward for Each Epsilon during Testing')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/TdLambda/test_reward_epsilon/test_reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Training
    for r in range(3):
        plt.plot(reward_run_all[r])
    plt.title('Average Reward for each Run during Training')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01', '0.1', '0.25'))
    plt.savefig('Graphs/TdLambda/reward_run/reward_run_all.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Testing
    for r in range(3):
        plt.plot(test_reward_run_all[r])
    plt.title('Average Reward for each Run during Testing')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01', '0.1', '0.25'))
    plt.savefig('Graphs/TdLambda/test_reward_run/test_reward_run_all.png')
    plt.clf()
    time.sleep(0.05)
Example #16
theta = 0.000001
discount_factor = 0.8
delta_list = []

# UNCOMMENT THE FOLLOWING FOR EVEN POLICY
# # initialize a policy: create an array of dimension (number of states by number of actions)
# # for equal probability amongst all actions, divide everything by the number of actions
# policy = np.ones([state_count, action_count]) / action_count

# create a random policy
random_policy = np.random.randint(1000, size=(state_count, action_count))
random_policy = random_policy/random_policy.sum(axis=1)[:,None]
policy = random_policy

# create a grid object
grid = Gridworld(5)

def calculate_action_value(state, value):
    A = np.zeros(action_count)
    
    # perform 4 actions per state and add the rewards (value)
    for action_number, action in enumerate(actions):
            
        # get next position and reward
        new_position = grid.p_transition(state, action)
        reward = grid.reward(state, action)
        
    total_accumulated_rewards = [None] * 10000
    for i in range(10000):
        agent = Agent()
        agent.current_state = world.grid[0][0]
        while agent.current_state != knowledge.goal:
            run_optimal()

        total_steps[i] = agent.steps
        total_accumulated_rewards[i] = agent.accumulated_rewards

    print("===Optimal Results===")
    print("Mean: " + str(np.mean(total_accumulated_rewards)))
    print("Std Dev.: " + str(np.std(total_accumulated_rewards)))
    print("Max: " + str(np.max(total_accumulated_rewards)))
    print("Min: " + str(np.min(total_accumulated_rewards)))

    print("Steps at Max: " +
          str(total_steps[np.argmax(total_accumulated_rewards)]))
    print("Steps at Min: " +
          str(total_steps[np.argmin(total_accumulated_rewards)]))


if __name__ == "__main__":
    world = Gridworld(True)

    knowledge = Knowledge(world, True)

    gather_random_stats()

    gather_optimal_stats()
class QLearner():
    def __init__(self, dyna=False, plus=False, experiment=False):
        self.randomizeAction = 0.1
        self.agent = Agent(Actions)
        self.world = Gridworld(self.agent, self)
        self.Q = np.zeros((self.world.Width, self.world.Height,
                           len(Actions)))  # initialize q table to zeros

        self.goalreward = 1
        self.rewards = []
        self.cumulativeReward = 0
        self.completedEpisodes = 0
        self.stepsPerEpisode = []

        self.updatePolicy = self.basicQPolicy
        self.PLUS = plus
        self.EXPERIMENT = experiment
        if (dyna):
            self.updatePolicy = self.DynaQPolicy

        # For Dyna-Q
        self.numModelUpdates = 50
        self.model = self.BuildModel()

        # For Dyna-Q+, a table of how long it's been since a state-action was visited, and an incrementer for easy addition
        # if (self.PLUS):
        #     self.randomizeAction = 1.0
        self.timestep = 0
        self.history = dict()  #self.BuildHistory()

        self.lookingForNextWin = False
        self.timeSinceLooking = 0

        self.visitCount = np.zeros((self.world.Width, self.world.Height))

        # standardized random number generator for action selection
        self.random = random.Random()
        self.random.seed(12)

    def Step(self):
        self.timestep = self.timestep + 1
        # if (self.PLUS and (self.randomizeAction > 0.1)):
        #     self.randomizeAction -= 0.001
        self.updatePolicy(self.world, self.Q)

    def SelectModelStateActionDynaQPlus(self, fromState=None):
        s = random.choice(list(self.history.keys()))
        a = self.random.randint(0, len(Actions) - 1)

        # Hack to promote exploration (over favoring states that have tiny q-values because they've been visited before)
        # if (0 in self.Q[s[0]][s[1]][:]):
        #     print("Choosing unexplored option")
        #     a = self.random.choice( np.argwhere(self.Q[s[0]][s[1]] == 0) )[0]

        r = self.RecencyBonus(s[0], s[1], a)
        if math.isnan(r):
            r = 0
        return s, a, r

    def SelectModelStateActionDynaQ(self):
        #random for placeholder
        s = random.choice(list(self.history.keys()))
        actionCounts = self.history[s]
        a = self.random.choice(np.argwhere(actionCounts != 0))[0]
        # a = self.random.randint(0, len(Actions) - 1)
        return s, a

    # The model is structurally the same as the Q table, but Model[S,A] -> S' rather than a value
    def BuildModel(self):
        model = np.zeros(
            (self.world.Width, self.world.Height, len(Actions), 3)
        )  # Every S,A pair should give rise to an S', we could add R here, but instead we'll use the real Q table (that _should_ be the same thing)

        # now initialize the model so that every transition leads to S (which we'll update as we actually explore)
        # doin it the slooooow, but clear way
        w = self.world.Width
        h = self.world.Height
        A = len(Actions)
        r = 0
        for x in range(w):
            for y in range(h):
                for a in range(A):
                    model[x][y][a] = [x, y, r]  # S' starts out as S

        return model.astype(int)

    def BuildHistory(self):
        return -1 * np.ones(
            (self.world.Width, self.world.Height, len(Actions), 1))

    # Our model can be deterministic, which greatly simplifies things
    def ModelStep(self):
        # For Dyna-Q, we simulate a randomly previously observed state and action and update their reward in the Q table
        r = 0
        if (self.PLUS) and not (self.EXPERIMENT):  #and (self.timestep > 1000):
            s, a, r = self.SelectModelStateActionDynaQPlus()
            # print("plus update reward ", r)
        else:
            s, a = self.SelectModelStateActionDynaQ()

        [x, y, r_transition] = self.model[s[0]][s[1]][a]
        s_prime = (x, y)
        r = r + r_transition

        before = self.Q[s[0]][s[1]][a]
        self.UpdateQ(s, a, s_prime, r)
        after = self.Q[s[0]][s[1]][a]

        # if (self.PLUS):
        #     print("Model updated Q from {:6.2f} to {:6.2f}".format(before, after))

    def RecencyBonus(self, x, y, a):
        k = 0.001

        # if you've never tried the queried state-action (For experiment) give a large number
        timeSince = 0
        if ((x, y) in list(self.history.keys())):
            timeSince = self.history[(x, y)][a]

        # if (self.timestep > 1000):
        # embed()

        dt = self.timestep - timeSince
        return k * np.sqrt(dt)

    def UpdateQ(self, s, a, s_prime, reward, PLUS_MODEL=False):
        x, y = s
        xNew, yNew = s_prime
        # Don't do an update if you're entering the terminal state (special case handled by policy)
        if (self.world.IsGoalState(xNew, yNew)):
            return

        alpha = 0.1
        gamma = 0.95

        newExpectedValue = gamma * np.max(self.Q[xNew][yNew][:])
        currentExpectedValue = self.Q[x][y][a]
        error = newExpectedValue - currentExpectedValue
        self.Q[x][y][a] = currentExpectedValue + alpha * (reward + error)

    def PrintQ(self, Q=None):
        # allow for manual entry of a Q table
        if Q is None:
            Q = self.Q
        # For debugging go through the Q and pix the max state action
        debuggingView = []
        for i in range(len(Q)):
            row = []
            for j in range(len(Q[0])):
                row.append("{:6.2f}".format(np.max(Q[i][j])))
            debuggingView.append(row)

        for entry in debuggingView:
            print(entry)

        print("--------------------")

    def RestartEpisode(self):
        self.world.Reset()
        self.agent.startNewEpisode()

    def takeActionFn(self, actionKey):
        # for dyna-q+, update history
        # self.history = self.history + (1 * (self.history != -1)) # only increment valid values (visited values)

        # Update the history
        x, y = self.agent.position
        if ((x, y) in self.history.keys()):
            self.history[(x, y)][actionKey] = self.timestep
        else:
            actionHistory = np.zeros(len(Actions))
            actionHistory[actionKey] = self.timestep
            self.history[(x, y)] = actionHistory

        (dx, dy) = Actions[actionKey]
        self.world.agent.updateHistory(
            actionKey)  # log the action state before actually taking it
        self.world.moveAgentBy(dx, dy)

        # print("{} -> {} by {}".format((x,y), self.agent.position, (dx,dy)))

        self.visitCount[self.agent.position[0]][
            self.agent.position[1]] = self.visitCount[self.agent.position[0]][
                self.agent.position[1]] + 1

    def manualActionSelection(self, gridworld, actionKey):
        self.basicQPolicy(gridworld, self.Q, actionKey)

    def DynaQPolicy(self, gridworld, Q, manualActionKey=None):
        agent = gridworld.agent
        x, y = agent.position

        actionKey = None
        if (manualActionKey is not None):
            actionKey = manualActionKey
            print("Took action key ", actionKey)
        else:
            # whats the best action we could take from here?
            if (self.random.random() <= self.randomizeAction):
                actionKey = self.random.randint(0, len(Actions) - 1)
            elif (self.EXPERIMENT):  #and (self.timestep > 1000)):
                actionVals = Q[x][y][:]
                bestAction = 0
                bestValue = -1
                for i in range(len(actionVals)):
                    v = actionVals[i] + self.RecencyBonus(x, y, i)
                    if (v > bestValue):
                        bestValue = v
                        bestAction = i
                actionKey = bestAction
            else:
                actionKey = np.random.choice(
                    np.argwhere(Q[x][y][:] == np.max(Q[x][y][:])).flatten())
                # actionKey = np.argmax(Q[x][y][:])
                # print("Selected {} val {} from {}".format(actionKey, Q[x][y][actionKey],Q[x][y][:]))

        # Take that action
        self.takeActionFn(actionKey)
        xNew, yNew = agent.position

        # Update your previous state with the new reward TD(0)
        reward = agent.GetAndResetReward()

        # update our model of S,A,S'
        self.model[x][y][actionKey] = [xNew, yNew, reward]

        # Agent only gets direct reward on episode completion
        # Hack to recognize end of episodes by reward
        if (reward > 0):
            # print ("End of episode!")
            self.stepsPerEpisode.append(len(agent.history))
            self.cumulativeReward += reward
            self.Q[x][y][
                actionKey] = reward  # action key here actually doesn't matter (and really shouldn't be included) but since we do a max over the next state, filling out the action values for the terminal state "shouldn't" have side-effects
            self.RestartEpisode()
            self.completedEpisodes = self.completedEpisodes + 1
            if (self.lookingForNextWin):
                self.lookingForNextWin = False
        else:
            self.UpdateQ([x, y], actionKey, [xNew, yNew], reward)

        if (self.lookingForNextWin):
            self.timeSinceLooking = self.timeSinceLooking + 1

        self.rewards.append(self.cumulativeReward)

        for i in range(self.numModelUpdates):
            self.ModelStep()

        # print("ModelSteps ", modelSteps)

        # self.PrintQ()

    def basicQPolicy(self, gridworld, Q, manualActionKey=None):
        agent = gridworld.agent
        x, y = agent.position
        # a0 = agent.history[-1][1] # action part of history

        actionKey = None
        if (manualActionKey is not None):
            actionKey = manualActionKey
            print("Took action key ", actionKey)
        else:
            # whats the best action we could take from here?
            if (self.random.random() <= 0.1):
                actionKey = self.random.randint(0, len(Actions) - 1)
            else:

                actionKey = np.argmax(Q[x][y][:])

        # Take that action
        self.takeActionFn(actionKey)
        xNew, yNew = agent.position

        # Update your previous state with the new reward TD(0)
        reward = agent.GetAndResetReward()

        # Agent only gets direct reward on episode completion
        # Hack to recognize end of episodes by reward
        if (reward > 0):
            # print ("End of episode!")
            self.stepsPerEpisode.append(len(agent.history))
            self.cumulativeReward += reward
            self.Q[x][y][
                actionKey] = reward  # action key here actually doesn't matter (and really shouldn't be included) but since we do a max over the next state, filling out the action values for the terminal state "shouldn't" have side-effects
            self.RestartEpisode()
            self.completedEpisodes = self.completedEpisodes + 1
        else:
            self.UpdateQ([x, y], actionKey, [xNew, yNew], reward)

        self.rewards.append(self.cumulativeReward)
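A hypothetical driver for the class above; it depends on the repo's Agent, Gridworld, and Actions, so this is illustrative rather than standalone. Each Step() takes one real action and, for Dyna-Q, follows it with numModelUpdates simulated planning updates.

# Illustrative only; Agent, Gridworld and Actions come from the surrounding repo.
learner = QLearner(dyna=True, plus=False)
for _ in range(20000):
    learner.Step()
print("completed episodes:", learner.completedEpisodes)
learner.PrintQ()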
def train(mem_size, batch_size, sync_freq, epochs=500, print_epoch=False):

    model = torch.nn.Sequential(torch.nn.Linear(l1, l2), torch.nn.ReLU(),
                                torch.nn.Linear(l2, l3), torch.nn.ReLU(),
                                torch.nn.Linear(l3, l4), torch.nn.ReLU(),
                                torch.nn.Linear(l4, l5))
    model2 = copy.deepcopy(model)
    model2.load_state_dict(model.state_dict())
    loss_fn = torch.nn.MSELoss()
    learning_rate = 1e-3
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    replay = deque(maxlen=mem_size)
    gamma = 0.9
    epsilon = 1.0
    j = 0
    losses = []
    for i in range(epochs):
        # Start new game
        game = Gridworld(size=5, mode='random')

        state1 = get_state(game)

        status = 1
        if print_epoch:
            print(i)
        while status:
            j += 1
            Q_val_ = model(state1)
            Q_val = Q_val_.data.numpy()

            if (random.random() < epsilon):
                choice = np.random.randint(0, 4)  # Exploration
            else:
                choice = np.argmax(Q_val)  # Exploitation

            action = action_set[choice]

            game.makeMove(action)

            state2 = get_state(game)
            reward = game.reward()

            with torch.no_grad():  # no gradient tracking is needed here, since we won't backpropagate through this pass
                newQ = model(state2)

            maxQ = torch.max(newQ)
            done = True if reward > 0 else False
            exp = (state1, choice, reward, state2, done)
            replay.append(exp)

            if (len(replay) > batch_size):

                X, Y = experience_replay(replay, batch_size, gamma, model,
                                         model2)
                loss = loss_fn(X, Y.detach())
                optimizer.zero_grad()
                loss.backward()
                losses.append(loss.item())
                optimizer.step()
                # Target update
                if j % sync_freq == 0:
                    model2.load_state_dict(model.state_dict())
            state1 = state2

            # End game
            if reward != -1:
                status = 0

        if epsilon > 0.1:
            epsilon -= 1 / epochs

    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    print("Games played: {0}, # of wins: {1}".format(max_games, wins))
    print("Win percentage: {}".format(win_perc))

    if print_epoch:
        plt.figure(figsize=(10, 7))
        plt.plot(losses)
        plt.xlabel("Iterations", fontsize=22)
        plt.ylabel("Loss", fontsize=22)

        plt.show()
    return win_perc
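Assuming the module-level pieces this function uses (l1 through l5, action_set, get_state, experience_replay, test_model) are in scope, it might be invoked with the replay settings shown in Example #12:

# Illustrative call; hyperparameters mirror Example #12 (mem_size=1000, batch_size=200, sync_freq=500).
win_perc = train(mem_size=1000, batch_size=200, sync_freq=500,
                 epochs=5000, print_epoch=False)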
from Gridworld import Gridworld  # assumed import path, matching the other examples here
from QLearningAgent import QLearningAgent
from QTable import QTable
import numpy as np

alpha = 0.1
gamma = 1
epsilon = 0.05

n_episodes = 500

reward_array = np.empty(n_episodes)

q = QTable(25, 4)

for i in range(n_episodes):
    total_reward = 0
    env = Gridworld()
    # agent = RandomAgent()
    agent = QLearningAgent(alpha, gamma, epsilon)

    while not env.is_terminal(env.agent_position):
        state = env.agent_position
        available_actions = env.get_available_actions()
        chosen_action = agent.choose_action(available_actions,
                                            env.agent_position, q)
        reward = env.make_step(chosen_action)
        new_state = env.agent_position  # now updated
        q.q_table[state][available_actions.index(
            chosen_action)] = (1 - alpha) * q.q_table[state][
                available_actions.index(chosen_action)] + alpha * (
                    reward + gamma * max(q.q_table[new_state, :]))
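The heavily nested indexing in that update can obscure the rule being applied; the following equivalent rewrite (same variables, same arithmetic, shown only for readability) spells out the standard Q-learning backup:

# Equivalent, more readable form of the update inside the loop above.
a_idx = available_actions.index(chosen_action)
old_q = q.q_table[state][a_idx]
td_target = reward + gamma * max(q.q_table[new_state, :])
q.q_table[state][a_idx] = (1 - alpha) * old_q + alpha * td_target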
from Gridworld import Gridworld
import torch
import numpy as np
import random
import matplotlib.pyplot as plt

### GridWorld with Catastrophic Forgetting
### Trained only on the static version of the game, which means that the positions of the objects do not change

game = Gridworld(size=4, mode='static')

l1 = 64  # input
l2 = 150
l3 = 100
l4 = 4  # output

model = torch.nn.Sequential(torch.nn.Linear(l1, l2), torch.nn.ReLU(),
                            torch.nn.Linear(l2, l3), torch.nn.ReLU(),
                            torch.nn.Linear(l3, l4))

loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


def get_state(game):
    # Adding noise, since most of the input is zeros | can also help with overfitting
    state = game.board.render_np().reshape(1,
                                           64) + np.random.rand(1, 64) / 10.0
    # To torch tensor
    return torch.from_numpy(state).float()
Example #22
        # save state, action chosen and reward to list
        state_list.append(state_vector)
        action_list.append(action_vector)
        reward_list.append(reward)

    return state_list, action_list, reward_list


# define average function
def Average(lst):
    return sum(lst) / len(lst)


# create a grid object
grid = Gridworld(5)

# initialize parameters
gamma = 0.99
epsilon = 0.1
epsilon = [0.01, 0.1, 0.25]
runs = 20
episode_length = 500
window_length = int(episode_length / 20)
max_steps = 200

# define variables for keeping track of time steps
Terminal = max_steps
t_list = []
for i in range(1, max_steps + 1):
    t = Terminal - i
# The following np.array 'v' has the correct format (but is just a random
# collection of floats).
#v = np.random.rand(25)
#print(v)

# Please write your code for Exercise 1 here. We will mark your coursework by checking
# the values of the variables policy and v in this cell. Your code should compute the
# values of policy and v from scratch when this cell is executed, using the value
# iteration algorithm.

theta = 1e-10
gamma = 1
epsilon = 0
alpha = 0

env = Gridworld()

v = np.zeros(25)
#print(v)

actions = env.get_available_actions()
#print(actions)

lookup_table = np.zeros((25,4), dtype=np.ndarray)
for state in range(25):
    for action in actions:
        list = []
        for a in actions:
            if a == action:
                prob = (1-alpha) + alpha / len(actions)
            else:
from Gridworld import Gridworld
import torch
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import deque
import copy
### GridWorld with Experience Replay (no catastrophic forgetting) and stabilization with a target network
### Trained only on the static version of the game, which means that the positions of the objects do not change
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
print(dev)
device = torch.device(dev)
game = Gridworld(size=5, mode='static')

input_size = 100

l1 = input_size  # input
l2 = 300
l3 = 200
l4 = 80
l5 = 4  # output


def get_state(game):
    # Adding noise, since most of the input is zeros | can also help with overfitting
    state = game.board.render_np().reshape(
        1, input_size) + np.random.rand(1, input_size) / 10.0
    # To torch tensor
Example #25
                            torch.nn.Linear(l2, l3), torch.nn.ReLU(),
                            torch.nn.Linear(l3, l4))

model2 = copy.deepcopy(
    model)  # Create a copy of the neural network to create the target network
model2.load_state_dict(
    model.state_dict())  # copy the parameters from the original model
# copy the params of model into model2
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 0.3

game = Gridworld(size=4, mode='static')
game.display()

action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r',
}
'''
Setting up the main training loop
'''
epochs = 5000
sync_freq = 500  # sync the target network with the main network every sync_freq steps
losses = []  # Create a list to store loss values so we can plot the trend later
def MonteCarlo(gamma, lr, epsilon, runs, step_number, episode_length):

    # create a grid object
    grid = Gridworld(5)
    window_length = int(episode_length/20)
    max_steps = step_number

    # define variables for plotting purposes
    reward_epsilon = []
    reward_run_all = []
    test_reward_epsilon = []
    test_reward_run_all = []

    # define variables for keeping track of time steps
    Terminal = max_steps
    t_list=[]
    for i in range(max_steps):
        t = Terminal - i - 1
        t_list.append(t)
    label = []
    for r in range(1, runs+1):
        label.append(str(r))

    # Monte Carlo BEGINS ---------------------------------------------------------------------------------------------------------------------------
    # begin iterating over every epsilon
    for eps in epsilon:
        
        # reset some lists
        Q_values_list = []
        reward_run = []
        test_reward_run =[]
        
        # begin iterating over a set amount of runs (20)
        for run in range(1, runs+1):
            
            # random e soft policy
            policy = np.zeros((state_count, action_count))
            for state in range(len(policy)):
                random_action = random.randint(0,3)
                for action in range(action_count):
                    if action == random_action:
                        policy[state][action] = 1 - eps + eps/action_count 
                    else: # if choose_action is not the same as the current action 
                        policy[state][action] = eps/action_count
            
            # initialize q values for all state action pairs
            global Q_values
            Q_values = np.zeros((state_count, action_count))
            oldQ = np.zeros((state_count, action_count))
            
            # define lists
            reward_episode = []
            test_reward_episode = []
            delta_list = []
            
            # added a dictionary of state and list of returns received
            returns_list = {}
            for s in range(state_count):
                for a in range(action_count):
                    returns_list[(s,a)] = []
            
            # iterate over episode_length episodes
            for episode in range(episode_length):
                
                # generate an episode of specified step count
                state_list, action_list, reward_list = generate_episode(max_steps, grid, policy)
                # sum reward for episode
                reward_episode.append(sum(reward_list))
                
                # initialize variables
                G = 0
                delta = 0
                visited_list = []
                state_action_pair = list(np.zeros(len(t_list)))
                
                # loop for each step of episode: T-1, T-2, T-3 ... 0 = 199, 198, 197 ... 0
                for t in t_list:
                    
                    # calculate G: starting with the last reward at index t (naturally accounts for pseudocode's "t-1")
                    G = gamma*G + reward_list[t]
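                    # Illustration: with gamma = 0.9 and the last three rewards
                    # [-1, -1, 10], scanning backwards gives G = 10, then
                    # 0.9*10 - 1 = 8, then 0.9*8 - 1 = 6.2.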
                    
                    # combine state and action pair together to check if it has been visited before
                    state_action_pair[t] = state_list[t]+action_list[t]
                    
                    # update this state-action pair only if it has not been visited yet in this episode; otherwise move on to the next time step
                    if state_action_pair[t] not in visited_list:
                        
                        # add this state-action pair (not the whole list) to the visited list
                        visited_list.append(state_action_pair[t])
                        
                        # find state and action index, for example, converting action [-1, 0] to 0, and same for state #
                        state_index = grid.states.index(state_list[t])
                        action_index = actions.index(action_list[t])
                        
                        # append G to returns
                        returns_list[(state_index,action_index)].append(G)
                        
                        # make a copy of the q values to calculate the delta
                        oldQ[state_index][action_index] = Q_values[state_index][action_index]
                        
                        # write Q_values to the state-action pair
                        Q_values[state_index][action_index] = float(np.mean(returns_list[(state_index,action_index)]))
                        
                        # calculate max delta change for plotting max q value change
                        delta = max(delta, np.abs(Q_values[state_index][action_index] - oldQ[state_index][action_index]))      
                
                # Update policy 
                for s in range(state_count):
                    if np.count_nonzero(Q_values[s]) == 0:  # if Q_values is all zero, randomly pick an action
                        choose_action = random.randint(0,3)
                    else:
                        choose_action = np.argmax(Q_values[s]) # choose best action at given state
                    # overwrite policy
                    for a in range(action_count): # for action in actions [0, 1, 2, 3]
                        if choose_action == a: # if the choose_action is the same as the current action
                            policy[s][a] = 1 - eps 
                        else: # if choose_action is not the same as the current action 
                            policy[s][a] = eps/(action_count-1)
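                # Note: the greedy action now gets probability 1 - eps and the other
                # action_count - 1 actions share eps equally, so each row still sums
                # to 1; this differs slightly from the 1 - eps + eps/action_count
                # form used for the initial random policy above.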
                
                # append delta to list
                delta_list.append(delta)
                
                # TESTING AFTER EACH EPISODE ------------------------------------------------------------
                # Generate test trajectory with the greedy policy
                state_list, action_list, test_reward_list = generate_episode(max_steps, grid, policy)
                test_reward_episode.append(sum(test_reward_list))
                #----------------------------------------------------------------------------------------

                # print current episode
                clear_output(wait=True)
                display('Epsilon: ' + str(eps) + ' Run: ' + str(run) + ' Episode: ' + str(episode))
            
            # append lists for plotting purpose
            test_reward_run.append(Average(test_reward_episode))
            reward_run.append(Average(reward_episode))
            Q_values_list.append(Q_values)

            # PLOTTING CODE--------------------------------------------------------------------------------------------------------------------
            # Average Reward per Episode during Training with different runs and epsilons
            plt.title('Average Reward per Episode (Smoothed), Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            delta_frame = pd.DataFrame(test_reward_episode)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average')
            delta_frame = pd.DataFrame(reward_episode)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average')
            plt.legend(('Testing','Training'))
            plt.savefig('Graphs/MonteCarlo/reward_episode/reward_episode_smoothed_run_' + str(int(run)) + '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # Average Reward per Episode during Training with different runs and epsilons
            plt.plot(test_reward_episode)
            plt.plot(reward_episode)
            plt.title('Average Reward per Episode, Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            plt.legend(('Testing','Training'))
            plt.savefig('Graphs/MonteCarlo/reward_episode/reward_episode_run_' + str(int(run)) + '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # max delta of each episode, where delta is the change in Q values
            plt.plot(delta_list)
            plt.title('Monte Carlo Max Delta for Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Max Delta')
            delta_frame = pd.DataFrame(delta_list)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average', color='orange')
            plt.savefig('Graphs/MonteCarlo/delta/delta_run_'+str(int(run))+'_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

        # append lists for plotting
        reward_run_all.append(reward_run)
        test_reward_run_all.append(test_reward_run)
        reward_epsilon.append(Average(reward_run))
        test_reward_epsilon.append(Average(test_reward_run))

        # Average Reward for each Run with different Epsilon
        plt.plot(test_reward_run)
        plt.plot(reward_run)
        plt.title('Average Reward for each Run with Epsilon: '+ str(float(eps)))
        plt.xlabel('Run')
        plt.xticks(np.arange(runs), label)
        plt.ylabel('Average Reward')
        plt.legend(('Testing','Training'))
        plt.savefig('Graphs/MonteCarlo/reward_run/reward_run_epsilon_' + str(float(eps)) + '.png')
        plt.clf()
        time.sleep(0.05)

        # save Q value tables to a pickle
        with open('Graphs/MonteCarlo/Qvalues/MonteCarlo_Qvalues_' + str(eps) + '.pkl', 'wb') as f:
            pickle.dump(Q_values_list, f)

    # Average Reward for each Epsilon
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, reward_epsilon)
    plt.title('Average Reward for each Epsilon during Training')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/MonteCarlo/reward_epsilon/reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for Each Epsilon
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, test_reward_epsilon)
    plt.title('Average Reward for Each Epsilon during Testing')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/MonteCarlo/test_reward_epsilon/test_reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Training
    for r in range(3):
        plt.plot(reward_run_all[r])
    plt.title('Average Reward for each Run during Training')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01','0.1','0.25'))
    plt.savefig('Graphs/MonteCarlo/reward_run/reward_run_all.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Testing
    for r in range(3):
        plt.plot(test_reward_run_all[r])
    plt.title('Average Reward for each Run during Testing')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01','0.1','0.25'))
    plt.savefig('Graphs/MonteCarlo/test_reward_run/test_reward_run_all.png')
    plt.clf()
    time.sleep(0.05)
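
### generate_episode and Average are called above but are not part of this
### snippet; the sketches below are only assumptions about their shape. They
### assume grid.states is a list of [row, col] positions, actions is the global
### list [[-1, 0], [1, 0], [0, -1], [0, 1]], and that the hypothetical
### grid.step(state, action) returns (next_state, reward); the real Gridworld
### API used in the original code may differ.
import random
import numpy as np

def generate_episode(max_steps, grid, policy):
    state_list, action_list, reward_list = [], [], []
    state = random.choice(grid.states)  # hypothetical: start from a random state
    for _ in range(max_steps):
        state_index = grid.states.index(state)
        action_index = np.random.choice(len(actions), p=policy[state_index])
        action = actions[action_index]
        next_state, reward = grid.step(state, action)  # hypothetical transition call
        state_list.append(state)
        action_list.append(action)
        reward_list.append(reward)
        state = next_state
    return state_list, action_list, reward_list

def Average(values):
    # plain arithmetic mean, matching how the reward lists are summarised above
    return sum(values) / len(values)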
Exemple #27
0
    args = parser.parse_args()
    device = torch.device('cuda' if args.use_cuda else 'cpu')

    simulator_s = SimulatorState().to(device)
    simulator_r = SimulatorReward().to(device)
    opt_s = torch.optim.Adam(simulator_s.parameters(), lr=args.lr)
    opt_r = torch.optim.Adam(simulator_r.parameters(), lr=args.lr)

    loss_fn_state = torch.nn.CrossEntropyLoss()
    loss_fn_reward = torch.nn.CrossEntropyLoss(weight=torch.Tensor([1, 50, 50]))
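    # the class weights [1, 50, 50] up-weight the two rare reward classes (presumably
    # the terminal win/lose outcomes) so the reward head does not just predict the
    # common per-step class every time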
    losses = []
    buffer = ExperienceReplay()

    progress = tqdm(range(args.epochs))
    for epoch_num in progress:
        game = Gridworld(mode=args.mode)
        z = 0
        for step_num in range(args.max_steps):  # iterate over the steps, not over the int itself
            # get starting state
            state = torch.from_numpy(game.board.render_np()).float().reshape(64, )
            # take random action
            action_ = np.random.randint(0, 4)
            action = action_set[action_]
            action_vec = torch.zeros(4, )
            action_vec[action_] = 1

            game.makeMove(action)
            next_state = torch.from_numpy(game.board.render_np()).float()
            reward_ = encode_game_progress(game.reward())
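            # encode_game_progress (defined elsewhere) presumably maps the raw reward
            # to one of the three classes weighted in loss_fn_reward above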
            buffer.add([(state, action_vec, next_state[0].argmax(), reward_, next_state)])