def run_trial(name, agent, rounds, epochs, given_params, save=True, animation=True):
    save_path = f"/Models/{name}/"
    if save:
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # p.dump(given_params, open(f"/Models/{name}/env.txt", "wb"))
        np.savetxt(f"/Models/{name}/env.txt", np.asarray(given_params[0]))
        with open(f"/Models/{name}/env_penalty.txt", "w") as f:
            f.write(f"Collision penalty is: {given_params[1]}")

    for round_num in range(rounds):
        world = Gridworld(WORLD_SIZE, ACTION_INFO, given_params)
        agent.set_world(world)
        world.place_agent(0, 0)
        run_epoch(agent, world, round_num, epochs, f"/Models/{name}",
                  save=save, animate=animation)
        if save:
            agent.end_round(name, round_num)
def main(): print(f"Starting Cliff Walk, initializing world and agent") # Create args to use for instantiating a world and agent object cliff_world_args = { "H": H, "W": W, "wind": WIND, "start_pos": START_POS, "end_pos": END_POS, "stochastic": False, "variance": 0, "hazard": HAZARD } cliff_agent_args = { "alpha": ALPHA, "eps": EPS, "gamma": GAMMA, "alpha_ramp": ALPHA_RAMP, "actions": ACTIONS } # Create the world and agent cliff_world = Gridworld(**cliff_world_args) cliff_agent = GridAgent(START_POS, cliff_world.H, cliff_world.W, NUM_A, **cliff_agent_args) # Train the agent for specified eps, occasionally printing to console print(f"Training agent for {TRAIN_EPS} episodes") cliff_agent.train_agent(cliff_world, print_moves=1000, move_timeout=1000, episodes=5000, ramp_alpha=True, method=GridAgent.Q_LEARNING) # Check the final policy (no training, acting 100% greedily) print(f"\nDone training agent for {TRAIN_EPS} episodes") print(f"Checking path and returns for trained agent") G, path = cliff_agent.run_episode(cliff_world, train=False) print(f"GridAgent received reward {-G} (smaller is better)\n") # Visualize the final policy policy = cliff_agent.get_policy(world=cliff_world, visual=True) for a in policy: print(a) print() # Plot the final path that agent took, and its rewards over course of training y, x = zip(*path) fig, ax = plt.subplots(2, 1) ax[0].imshow(cliff_world.get_image(), origin="upper") # ax[0].set_ylim(ax[0].get_ylim()[::-1]) ax[0].plot(x, y) ax[1].plot(cliff_agent.G_list) plt.show()
def main(): print(f"Starting Windy Gridworld, initializing world and agent") windy_world_args = { "H": H, "W": W, "wind": WIND, "start_pos": START_POS, "end_pos": END_POS, "stochastic": True, "variance": 1 } windy_agent_args = { "alpha": ALPHA, "eps": EPS, "gamma": GAMMA, "alpha_ramp": ALPHA_RAMP, "actions": ACTIONS } windy_world = Gridworld(**windy_world_args) windy_agent = GridAgent(START_POS, windy_world.H, windy_world.W, NUM_A, **windy_agent_args) # windy_agent.set_Q_to_default() print(f"Training agent for {TRAIN_EPS} episodes") windy_agent.train_agent(windy_world, print_moves=PRINT_AFTER_MOVES, move_timeout=MOVE_TIMEOUT, episodes=TRAIN_EPS) print(f"\nDone training agent for {TRAIN_EPS} episodes") print(f"Checking path and returns for trained agent") G, path = windy_agent.run_episode(windy_world, train=False) print(f"GridAgent completed task in {-G} moves\n") policy = windy_agent.get_policy(world=windy_world, visual=True) for a in policy: print(a) print() print(windy_world.get_image()) y, x = zip(*path) fig, ax = plt.subplots(2, 1) ax[0].imshow(windy_world.get_image(), origin="upper") ax[0].plot(x, y) ax[1].plot(windy_agent.G_list) plt.show()
def main():
    # define variables
    theta = 0.000001
    discount_factor = 0.8

    # create a grid object
    grid = Gridworld(5)

    # initialize a policy: create an array of dimension (number of states by number of actions)
    # for equal probability amongst all actions, divide everything by the number of actions
    policy = np.ones([state_count, action_count]) / action_count

    # run policy evaluation
    final_value_map, max_iter, delta, policy = policy_evaluation(
        grid.valueMap, grid.states, discount_factor, theta, grid.reward,
        grid.p_transition, grid.transition_prob, policy)

    # print the final value function
    print("Total Iterations: ")
    print(max_iter)
    print("Value Function: ")
    np.set_printoptions(precision=4)
    print(final_value_map)

    # plot iteration vs delta
    import matplotlib.pyplot as plt
    plt.plot(range(max_iter), delta)
    plt.title('Policy Evaluation with Discount Factor ' + str(discount_factor))
    plt.xlabel('Iterations')
    plt.ylabel('Max Delta')
    plt.savefig('graphs/policy_evaluation_' + str(int(discount_factor * 100)) + '.png')
    plt.show()
def show_game(args):
    env = Gridworld(rows=5, cols=5, greens=3, reds=2)
    tf.reset_default_graph()
    qnet = get_qnet(args)
    saver = tf.train.Saver()
    tf.get_default_graph().finalize()

    with tf.Session(config=tf.ConfigProto(operation_timeout_in_ms=10000)) as sess:
        saver.restore(sess, args.restore_ckpt)
        done = False
        state = preprocess_img(env.reset())
        _ = env.render()
        reward, turns = 0, 0

        while not done:
            t1 = time.time()
            action = qnet.predict(sess, normalize(np.array([state])))[0]
            img, r, done, _ = env.step(action)
            _ = env.render()
            state = preprocess_img(img)
            reward += r
            turns += 1
            # pace rendering to roughly one step every 0.2 seconds
            time.sleep(max(0, .2 - (time.time() - t1)))

    print('turns =', turns, ' reward =', reward, ' reward/turn =', reward / turns)
def train(args):
    """
    This function trains a neural network to play the basic Gridworld game. It is meant to
    follow the approach of DeepMind's paper "Playing Atari with Deep Reinforcement Learning".

    :param args: parser.parse_args
    :return:
    """
    with open(os.path.join(args.ckpt_dir, args.train_record_fname), 'a') as f:
        f.write("BasicGridworld -- begin training --\n")

    tf.reset_default_graph()
    env = Gridworld(rows=5, cols=5, greens=3, reds=2)
    qnet = get_qnet(args)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    # Don't want to change the graph once we begin playing the game.
    tf.get_default_graph().finalize()

    with tf.Session(config=tf.ConfigProto(
            operation_timeout_in_ms=10000)) as sess:
        sess.run(init)
        e = args.e_i
        last_output_ep = 0
        rewards = []
        transitions = 0  # number of transitions updated against
        next_output = args.output_period

        while transitions < args.train_steps:
            r, e, t = play_episode(args, sess, env, qnet, e)
            if transitions == 0 and t > 0:
                # Output status from before training starts.
                write_output(args, sess, saver, last_output_ep, e, rewards, transitions)
                last_output_ep = len(rewards)
            transitions += t
            rewards.append(r)

            if transitions > next_output:
                # Regular output during training.
                write_output(args, sess, saver, last_output_ep, e, rewards, transitions)
                next_output += args.output_period
                last_output_ep = len(rewards)

    with open(os.path.join(args.ckpt_dir, args.train_record_fname), 'a') as f:
        f.write('\n\n')
def main():
    goals = [(7, 0)]
    anti_goals = [(1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)]
    env = Gridworld(8, 4, goals, anti_goals)

    # get baseline random performance
    q = init_state_action_map(env)
    estimate_performance(env, q, 1)

    # learn q
    print("running double q-learning...")
    q1, q2 = double_q_learning(env)
    print("double q-learning complete")

    # determine post-training performance
    estimate_performance(env, q2, 0.01)
    visualize_performance(env, q2)
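# For reference, a minimal self-contained sketch of the tabular double Q-learning update that
# double_q_learning() above performs. This is not this repo's implementation: integer-indexed
# states/actions, the gym-style env.step() signature, and the hyperparameters are illustrative
# assumptions only.
import random
import numpy as np

def double_q_learning_sketch(env, n_states, n_actions, num_episodes=1000,
                             alpha=0.5, gamma=1.0, eps=0.1):
    q1 = np.zeros((n_states, n_actions))
    q2 = np.zeros((n_states, n_actions))
    for _ in range(num_episodes):
        state, done = env.reset(), False
        while not done:
            # behave epsilon-greedily with respect to Q1 + Q2
            if random.random() < eps:
                action = random.randrange(n_actions)
            else:
                action = int(np.argmax(q1[state] + q2[state]))
            next_state, reward, done, _ = env.step(action)
            # flip a coin: update one table, using the other to evaluate the greedy action
            if random.random() < 0.5:
                best = int(np.argmax(q1[next_state]))
                target = reward + gamma * q2[next_state, best] * (not done)
                q1[state, action] += alpha * (target - q1[state, action])
            else:
                best = int(np.argmax(q2[next_state]))
                target = reward + gamma * q1[next_state, best] * (not done)
                q2[state, action] += alpha * (target - q2[state, action])
            state = next_state
    return q1, q2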
def main():
    goals = [(7, 0)]
    anti_goals = [(1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)]
    env = Gridworld(8, 4, goals, anti_goals)

    # init q and get baseline random performance
    q = init_state_action_map(env)
    estimate_performance(env, q, 1)

    # learn q
    print("running sarsa...")
    q = sarsa(env, q)
    print("sarsa complete")

    # determine post-training performance
    estimate_performance(env, q, 0.01)
    visualize_performance(env, q)
def test_model(model, mode='static', display=True):
    i = 0
    test_game = Gridworld(size=5, mode=mode)
    state_ = test_game.board.render_np().reshape(
        1, input_size) + np.random.rand(1, input_size) / 10.0
    state = torch.from_numpy(state_).float()
    if display:
        print("Initial State:")
        print(test_game.display())
    status = 1
    while (status == 1):  #A
        qval = model(state)
        qval_ = qval.data.numpy()
        action_ = np.argmax(qval_)  #B
        action = action_set[action_]
        if display:
            print('Move #: %s; Taking action: %s' % (i, action))
        test_game.makeMove(action)
        state_ = test_game.board.render_np().reshape(
            1, input_size) + np.random.rand(1, input_size) / 10.0
        state = torch.from_numpy(state_).float()
        if display:
            print(test_game.display())
        reward = test_game.reward()
        if reward != -1:
            if reward > 0:
                status = 2
                if display:
                    print("Game won! Reward: %s" % (reward,))
            else:
                status = 0
                if display:
                    print("Game LOST. Reward: %s" % (reward,))
        i += 1
        if (i > 15):
            if display:
                print("Game lost; too many moves.")
            break
    win = True if status == 2 else False
    return win
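# A small usage sketch (assumption: `model` is the trained network defined elsewhere in this
# file): estimate the win rate of the greedy policy over a batch of fresh games.
n_eval_games = 100
n_wins = sum(test_model(model, mode='static', display=False) for _ in range(n_eval_games))
print("win rate over %d games: %.2f" % (n_eval_games, n_wins / n_eval_games))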
def main():
    x_limit = 8
    y_limit = 4
    goals = [(7, 3)]
    anti_goals = []
    env = Gridworld(x_limit, y_limit, goals, anti_goals, kings_moves=False)
    num_episodes = 100

    # determine the baseline performance that results from taking random moves
    avg = sum([len(generate_random_episode(env))
               for _ in range(num_episodes)]) / float(num_episodes)
    print("baseline random performance: " + str(avg))

    # learn q
    print("running n-step sarsa...")
    q = n_step_sarsa(env)
    print("n-step sarsa complete")

    # determine post-training performance
    avg = sum([len(generate_epsilon_greedy_episode(env, q))
               for _ in range(num_episodes)]) / float(num_episodes)
    print("post learning performance: " + str(avg))

    # visualize a post-training episode
    state = env.reset()
    while True:
        env.render()
        time.sleep(0.25)
        action = choose_epsilon_greedy_action(q, state, 0.1)
        state, _, done, _ = env.step(action)  # take the epsilon-greedy action
        if done:
            env.render(close=True)
            break
optimizer = tf.keras.optimizers.Adam(learning_rate)
gamma = 0.9
epsilon = 0.3
epochs = 5000
losses = []
mem_size = 1000
batch_size = 200
replay = deque(maxlen=mem_size)
max_moves = 50
h = 0
sync_freq = 500  #A
j = 0

for i in range(epochs):
    game = Gridworld(size=4, mode='random')
    state1_ = game.board.render_np().reshape(
        1, 64) + np.random.rand(1, 64) / 100.0
    state1 = state1_  # torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while (status == 1):
        j += 1
        mov += 1
        qval = model.predict(state1)
        if (random.random() < epsilon):
            action_ = np.random.randint(0, 4)
        else:
            action_ = np.argmax(qval)
        action = action_set[action_]
        # get new state and reward after taking action from current state
        new_state_vector, reward = grid.transition_reward(
            state_vector, action_vector)
        state_vector = list(new_state_vector)

        # save state, action chosen and reward to list
        state_list.append(state_vector)
        action_list.append(action_vector)
        reward_list.append(reward)

    return state_list, action_list, reward_list


# create a grid object
grid = Gridworld(5)

# initialize other parameters
gamma = 0.99
lr = 0.1
epsilon = [0.01, 0.1, 0.25]
runs = 20
episode_length = 500
window_length = int(episode_length / 20)

reward_epsilon = []
reward_run_all = []
test_reward_epsilon = []
test_reward_run_all = []

# plot
def Sarsa(gamma, lr, epsilon, runs, step_number, episode_length):
    # create a grid object
    grid = Gridworld(5)
    window_length = int(episode_length / 20)

    # define variables for plotting purposes
    reward_epsilon = []
    reward_run_all = []
    test_reward_epsilon = []
    test_reward_run_all = []

    label = []
    for r in range(1, runs + 1):
        label.append(str(r))

    # begin iterating over every epsilon
    for eps in epsilon:
        # reset some lists
        Q_values_list = []
        reward_run = []
        test_reward_run = []

        # begin iterating over a set amount of runs (20)
        for run in range(1, runs + 1):
            # initialize q values for all state action pairs
            global Q_values
            Q_values = np.zeros((state_count, action_count))

            # define lists for plots
            reward_episode = []
            test_reward_episode = []
            delta_list = []

            # SARSA BEGINS ------------------------------------------------------------------------------------------
            # iterate over episodes
            for episode in range(episode_length):
                # initialize/reset parameters
                reward_list = []
                delta = 0

                # initialize state (output: [4, 4])
                state_vector = grid.initial_state()
                state_index = grid.states.index(state_vector)

                # choose an action based on epsilon-greedy (output: action index ie. 0)
                action_index = choose_action(state_index, eps)
                action_vector = actions[action_index]

                # iterate over the steps within each episode
                for step in range(step_number):
                    # get the next state and reward after taking the chosen action in the current state
                    next_state_vector, reward = grid.transition_reward(state_vector, action_vector)
                    next_state_index = grid.states.index(list(next_state_vector))

                    # add reward to list
                    reward_list.append(reward)

                    # choose an action based on epsilon-greedy (output: action index ie. 0)
                    next_action_index = choose_action(next_state_index, eps)
                    next_action_vector = actions[next_action_index]

                    # calculate max delta change for plotting max q value change
                    Q_value = Q_values[state_index][action_index] + lr * (
                        reward + gamma * Q_values[next_state_index][next_action_index]
                        - Q_values[state_index][action_index])
                    delta = max(delta, np.abs(Q_value - Q_values[state_index][action_index]))

                    # update Q value
                    Q_values[state_index][action_index] = Q_values[state_index][action_index] + lr * (
                        reward + gamma * Q_values[next_state_index][next_action_index]
                        - Q_values[state_index][action_index])

                    # update state and action vector
                    state_vector = list(next_state_vector)
                    state_index = grid.states.index(state_vector)
                    action_vector = list(next_action_vector)
                    action_index = next_action_index

                # append lists for plotting purposes
                delta_list.append(delta)
                reward_episode.append(sum(reward_list))

                # TESTING AFTER EACH EPISODE ------------------------------------------------------------
                # initialize policy
                policy = np.zeros((state_count, action_count))

                # Generate greedy policy based on Q_values after each episode
                for state in range(len(Q_values)):
                    # find the best action at each state
                    best_action = np.argmax(Q_values[state])
                    # write deterministic policy based on Q_values
                    policy[state][best_action] = 1

                # Generate test trajectory with the greedy policy
                state_list, action_list, test_reward_list = generate_episode(step_number, grid, policy)
                test_reward_episode.append(sum(test_reward_list))
                # ----------------------------------------------------------------------------------------

                # print current episode
                clear_output(wait=True)
                display('Epsilon: ' + str(eps) + ' Run: ' + str(run) + ' Episode: ' + str(episode))

            # append lists for plotting purpose
            test_reward_run.append(Average(test_reward_episode))
            reward_run.append(Average(reward_episode))
            Q_values_list.append(Q_values)

            # PLOTTING CODE--------------------------------------------------------------------------------------------------------------------
            # Average Reward per Episode during Training with different runs and epsilons
            plt.plot(test_reward_episode)
            plt.plot(reward_episode)
            plt.title('Average Reward per Episode, Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            plt.legend(('Testing', 'Training'))
            plt.savefig('Graphs/Sarsa/reward_episode/reward_episode_run_' + str(int(run)) +
                        '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # max delta of each episode, where delta is the change in Q values
            plt.plot(delta_list)
            plt.title('Sarsa Max Delta for Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Max Delta')
            delta_frame = pd.DataFrame(delta_list)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average', color='orange')
            plt.savefig('Graphs/Sarsa/delta/delta_run_' + str(int(run)) +
                        '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

        # append lists for plotting
        reward_run_all.append(reward_run)
        test_reward_run_all.append(test_reward_run)
        reward_epsilon.append(Average(reward_run))
        test_reward_epsilon.append(Average(test_reward_run))

        # Average Reward for each Run with different Epsilon
        plt.plot(test_reward_run)
        plt.plot(reward_run)
        plt.title('Average Reward for each Run with Epsilon: ' + str(float(eps)))
        plt.xlabel('Run')
        plt.xticks(np.arange(runs), label)
        plt.ylabel('Average Reward')
        plt.legend(('Testing', 'Training'))
        plt.savefig('Graphs/Sarsa/reward_run/reward_run_epsilon_' + str(float(eps)) + '.png')
        plt.clf()
        time.sleep(0.05)

        # save Q value tables to a pickle
        with open('Graphs/Sarsa/Qvalues/Sarsa_Qvalues_' + str(eps) + '.pkl', 'wb') as f:
            pickle.dump(Q_values_list, f)

    # Average Reward for each Epsilon during Training
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, reward_epsilon)
    # plt.plot(reward_epsilon)
    plt.title('Average Reward for each Epsilon during Training')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/Sarsa/reward_epsilon/reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Epsilon during Testing
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, test_reward_epsilon)
    # plt.plot(test_reward_epsilon)
    plt.title('Average Reward for Each Epsilon during Testing')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/Sarsa/test_reward_epsilon/test_reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Training
    for r in range(3):
        plt.plot(reward_run_all[r])
    plt.title('Average Reward for each Run during Training')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01', '0.1', '0.25'))
    plt.savefig('Graphs/Sarsa/reward_run/reward_run_all.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Testing
    for r in range(3):
        plt.plot(test_reward_run_all[r])
    plt.title('Average Reward for each Run during Testing')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01', '0.1', '0.25'))
    plt.savefig('Graphs/Sarsa/test_reward_run/test_reward_run_all.png')
    plt.clf()
    time.sleep(0.05)
def TdLambda(gamma, lr, epsilon, runs, step_number, episode_length, lamda):
    """
    A function that performs the TD(lambda) algorithm.
    Input: gamma, learning rate, epsilon (as a list), runs, total number of steps, number of episodes, lamda
    Output: a variety of graphs that illustrate the algorithm's performance
    """
    # create a grid object
    grid = Gridworld(5)
    window_length = int(episode_length / 20)

    # define variables for plotting purposes
    reward_epsilon = []
    reward_run_all = []
    test_reward_epsilon = []
    test_reward_run_all = []

    label = []
    for r in range(1, runs + 1):
        label.append(str(r))

    # begin iterating over every epsilon
    for eps in epsilon:
        # reset some lists
        Q_values_list = []
        reward_run = []
        test_reward_run = []

        # begin iterating over a set amount of runs (20)
        for run in range(1, runs + 1):
            # initialize q values for all state action pairs
            global Q_values
            Q_values = np.zeros((state_count, action_count))

            # define lists
            reward_episode = []
            test_reward_episode = []
            delta_list = []

            # TdLambda BEGINS ---------------------------------------------------------------------------------------------------------------------------
            # iterate over episodes
            for episode in range(episode_length):
                # initialize delta for eligibility trace
                delta_ = 0
                # delta for change in Q values
                delta = 0

                # initialize S, A (? should i choose an Action using epsilon-greedy here or just select an Action?)
                state_vector = grid.initial_state()
                state_index = grid.states.index(state_vector)

                # initialize eligibility traces for all state action pairs of all states to 0
                z_values = np.zeros((state_count, action_count))

                action_index = choose_action(state_index, eps)
                action_vector = actions[action_index]
                reward_list = []

                # iterate over the steps of the episode
                for i in range(step_number):
                    # Take action A, observe R, S'
                    next_state_vector, reward = grid.transition_reward(state_vector, action_vector)
                    next_state_index = grid.states.index(list(next_state_vector))
                    reward_list.append(reward)

                    # Choose A' from S' using policy derived from Q (eg. epsilon-greedy)
                    next_action_index = choose_action(next_state_index, eps)
                    next_action_vector = actions[next_action_index]

                    # update the action-value form of the TD error
                    delta_ = reward + gamma * Q_values[next_state_index][next_action_index] \
                        - Q_values[state_index][action_index]

                    # accumulate traces (? big S and big A?)
                    z_values[state_index][action_index] += 1

                    # calculate max Q_value change for plotting max delta
                    Q_value = Q_values[state_index][action_index] \
                        + lr * delta_ * z_values[state_index][action_index]
                    delta = max(delta, np.abs(Q_value - Q_values[state_index][action_index]))

                    # update Q value
                    Q_values[state_index][action_index] = Q_values[state_index][action_index] \
                        + lr * delta_ * z_values[state_index][action_index]

                    # update z value
                    z_values[state_index][action_index] = gamma * lamda * z_values[state_index][action_index]

                    # update state and action vector
                    state_vector = list(next_state_vector)
                    state_index = grid.states.index(state_vector)
                    action_vector = list(next_action_vector)
                    action_index = next_action_index

                # append lists for plotting purpose
                delta_list.append(delta)
                reward_episode.append(sum(reward_list))

                # TESTING AFTER EACH EPISODE ------------------------------------------------------------
                # initialize policy
                policy = np.zeros((state_count, action_count))

                # Generate greedy policy based on Q_values after each episode
                for state in range(len(Q_values)):
                    # find the best action at each state
                    best_action = np.argmax(Q_values[state])
                    # write deterministic policy based on Q_values
                    policy[state][best_action] = 1

                # Generate test trajectory with the greedy policy
                state_list, action_list, test_reward_list = generate_episode(step_number, grid, policy)
                test_reward_episode.append(sum(test_reward_list))
                # ----------------------------------------------------------------------------------------

                # print current episode
                clear_output(wait=True)
                display('Epsilon: ' + str(eps) + ' Run: ' + str(run) + ' Episode: ' + str(episode))

            # append lists for plotting purpose
            test_reward_run.append(Average(test_reward_episode))
            reward_run.append(Average(reward_episode))
            Q_values_list.append(Q_values)

            # PLOTTING CODE--------------------------------------------------------------------------------------------------------------------
            # Average Reward per Episode during Training with different runs and epsilons
            plt.plot(test_reward_episode)
            plt.plot(reward_episode)
            plt.title('Average Reward per Episode, Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            plt.legend(('Testing', 'Training'))
            plt.savefig('Graphs/TdLambda/reward_episode/reward_episode_run_' + str(int(run)) +
                        '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # Average Reward per Episode (Smoothed) during Training with different runs and epsilons
            plt.title('Average Reward per Episode (Smoothed), Run: ' + str(int(run)) +
                      ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            delta_frame = pd.DataFrame(test_reward_episode)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average')
            delta_frame = pd.DataFrame(reward_episode)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average')
            plt.legend(('Testing', 'Training'))
            plt.savefig('Graphs/TdLambda/reward_episode/reward_episode_smoothed_run_' +
                        str(int(run)) + '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # max delta of each episode, where delta is the change in Q values
            plt.plot(delta_list)
            plt.title('TdLambda Max Delta for Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Max Delta')
            delta_frame = pd.DataFrame(delta_list)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average', color='orange')
            plt.savefig('Graphs/TdLambda/delta/delta_run_' + str(int(run)) +
                        '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

        # append lists for plotting
        reward_run_all.append(reward_run)
        test_reward_run_all.append(test_reward_run)
        reward_epsilon.append(Average(reward_run))
        test_reward_epsilon.append(Average(test_reward_run))

        # Average Reward for each Run with different Epsilon
        plt.plot(test_reward_run)
        plt.plot(reward_run)
        plt.title('Average Reward for each Run with Epsilon: ' + str(float(eps)))
        plt.xlabel('Run')
        plt.xticks(np.arange(runs), label)
        plt.ylabel('Average Reward')
        plt.legend(('Testing', 'Training'))
        plt.savefig('Graphs/TdLambda/reward_run/reward_run_epsilon_' + str(float(eps)) + '.png')
        plt.clf()
        time.sleep(0.05)

        # save Q value tables to a pickle
        with open('Graphs/TdLambda/Qvalues/TdLambda_Qvalues_' + str(eps) + '.pkl', 'wb') as f:
            pickle.dump(Q_values_list, f)

    # Average Reward for each Epsilon during Training
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, reward_epsilon)
    plt.title('Average Reward for each Epsilon during Training')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/TdLambda/reward_epsilon/reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Epsilon during Testing
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, test_reward_epsilon)
    plt.title('Average Reward for Each Epsilon during Testing')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/TdLambda/test_reward_epsilon/test_reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Training
    for r in range(3):
        plt.plot(reward_run_all[r])
    plt.title('Average Reward for each Run during Training')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01', '0.1', '0.25'))
    plt.savefig('Graphs/TdLambda/reward_run/reward_run_all.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Testing
    for r in range(3):
        plt.plot(test_reward_run_all[r])
    plt.title('Average Reward for each Run during Testing')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01', '0.1', '0.25'))
    plt.savefig('Graphs/TdLambda/test_reward_run/test_reward_run_all.png')
    plt.clf()
    time.sleep(0.05)
theta = 0.000001
discount_factor = 0.8
delta_list = []

# UNCOMMENT THE FOLLOWING FOR EVEN POLICY
# # initialize a policy: create an array of dimension (number of states by number of actions)
# # for equal probability amongst all actions, divide everything by the number of actions
# policy = np.ones([state_count, action_count]) / action_count

# create a random policy
random_policy = np.random.randint(1000, size=(state_count, action_count))
random_policy = random_policy / random_policy.sum(axis=1)[:, None]
policy = random_policy

# create a grid object
grid = Gridworld(5)


def calculate_action_value(state, value):
    A = np.zeros(action_count)
    # perform 4 actions per state and add the rewards (value)
    for action_number, action in enumerate(actions):
        # get next position and reward
        new_position = grid.p_transition(state, action)
        reward = grid.reward(state, action)
    total_accumulated_rewards = [None] * 10000
    for i in range(10000):
        agent = Agent()
        agent.current_state = world.grid[0][0]
        while agent.current_state != knowledge.goal:
            run_optimal()
        total_steps[i] = agent.steps
        total_accumulated_rewards[i] = agent.accumulated_rewards

    print("===Optimal Results===")
    print("Mean: " + str(np.mean(total_accumulated_rewards)))
    print("Std Dev.: " + str(np.std(total_accumulated_rewards)))
    print("Max: " + str(np.max(total_accumulated_rewards)))
    print("Min: " + str(np.min(total_accumulated_rewards)))
    print("Steps at Max: " + str(total_steps[np.argmax(total_accumulated_rewards)]))
    print("Steps at Min: " + str(total_steps[np.argmin(total_accumulated_rewards)]))


if __name__ == "__main__":
    world = Gridworld(True)
    knowledge = Knowledge(world, True)
    gather_random_stats()
    gather_optimal_stats()
class QLearner():
    def __init__(self, dyna=False, plus=False, experiment=False):
        self.randomizeAction = 0.1
        self.agent = Agent(Actions)
        self.world = Gridworld(self.agent, self)
        # initialize q table to zeros
        self.Q = np.zeros((self.world.Width, self.world.Height, len(Actions)))
        self.goalreward = 1
        self.rewards = []
        self.cumulativeReward = 0
        self.completedEpisodes = 0
        self.stepsPerEpisode = []
        self.updatePolicy = self.basicQPolicy
        self.PLUS = plus
        self.EXPERIMENT = experiment
        if (dyna):
            self.updatePolicy = self.DynaQPolicy

        # For Dyna-Q
        self.numModelUpdates = 50
        self.model = self.BuildModel()

        # For Dyna-Q+, a table of how long it's been since a state-action was visited,
        # and an incrementer for easy addition
        # if (self.PLUS):
        #     self.randomizeAction = 1.0
        self.timestep = 0
        self.history = dict()  # self.BuildHistory()
        self.lookingForNextWin = False
        self.timeSinceLooking = 0
        self.visitCount = np.zeros((self.world.Width, self.world.Height))

        # standardized random number generator for action selection
        self.random = random.Random()
        self.random.seed(12)

    def Step(self):
        self.timestep = self.timestep + 1
        # if (self.PLUS and (self.randomizeAction > 0.1)):
        #     self.randomizeAction -= 0.001
        self.updatePolicy(self.world, self.Q)

    def SelectModelStateActionDynaQPlus(self, fromState=None):
        s = random.choice(list(self.history.keys()))
        a = self.random.randint(0, len(Actions) - 1)
        # Hack to promote exploration (over favoring states that have tiny q-values
        # because they've been visited before)
        # if (0 in self.Q[s[0]][s[1]][:]):
        #     print("Choosing unexplored option")
        #     a = self.random.choice( np.argwhere(self.Q[s[0]][s[1]] == 0) )[0]
        r = self.RecencyBonus(s[0], s[1], a)
        if math.isnan(r):
            r = 0
        return s, a, r

    def SelectModelStateActionDynaQ(self):
        # random for placeholder
        s = random.choice(list(self.history.keys()))
        actionCounts = self.history[s]
        a = self.random.choice(np.argwhere(actionCounts != 0))[0]
        # a = self.random.randint(0, len(Actions) - 1)
        return s, a

    # The model is structurally the same as the Q table, but Model[S,A] -> S' rather than a value
    def BuildModel(self):
        # Every S,A pair should give rise to an S'; we could add R here, but instead we'll use
        # the real Q table (that _should_ be the same thing)
        model = np.zeros((self.world.Width, self.world.Height, len(Actions), 3))
        # now initialize the model so that every transition leads to S
        # (which we'll update as we actually explore)
        # doing it the slow, but clear way
        w = self.world.Width
        h = self.world.Height
        A = len(Actions)
        r = 0
        for x in range(w):
            for y in range(h):
                for a in range(A):
                    model[x][y][a] = [x, y, r]  # S' starts out as S
        return model.astype(int)

    def BuildHistory(self):
        return -1 * np.ones((self.world.Width, self.world.Height, len(Actions), 1))

    # Our model can be deterministic, which greatly simplifies things
    def ModelStep(self):
        # For Dyna-Q, we simulate a randomly previously observed state and action
        # and update their reward in the Q table
        r = 0
        if (self.PLUS) and not (self.EXPERIMENT):  # and (self.timestep > 1000):
            s, a, r = self.SelectModelStateActionDynaQPlus()
            # print("plus update reward ", r)
        else:
            s, a = self.SelectModelStateActionDynaQ()
        [x, y, r_transition] = self.model[s[0]][s[1]][a]
        s_prime = (x, y)
        r = r + r_transition
        before = self.Q[s[0]][s[1]][a]
        self.UpdateQ(s, a, s_prime, r)
        after = self.Q[s[0]][s[1]][a]
        # if (self.PLUS):
        #     print("Model updated Q from {:6.2f} to {:6.2f}".format(before, after))

    def RecencyBonus(self, x, y, a):
        k = 0.001
        # if you've never tried the queried state-action (for the experiment), give a large number
        timeSince = 0
        if ((x, y) in list(self.history.keys())):
            timeSince = self.history[(x, y)][a]
        # if (self.timestep > 1000):
        #     embed()
        dt = self.timestep - timeSince
        return k * np.sqrt(dt)

    def UpdateQ(self, s, a, s_prime, reward, PLUS_MODEL=False):
        x, y = s
        xNew, yNew = s_prime
        # Don't do an update if you're entering the terminal state (special case handled by policy)
        if (self.world.IsGoalState(xNew, yNew)):
            return
        alpha = 0.1
        gamma = 0.95
        newExpectedValue = gamma * np.max(self.Q[xNew][yNew][:])
        currentExpectedValue = self.Q[x][y][a]
        error = newExpectedValue - currentExpectedValue
        self.Q[x][y][a] = currentExpectedValue + alpha * (reward + error)

    def PrintQ(self, Q=None):
        # allow for manual entry of a Q table
        if Q is None:
            Q = self.Q
        # For debugging, go through the Q table and pick the max state-action value
        debuggingView = []
        for i in range(len(Q)):
            row = []
            for j in range(len(Q[0])):
                row.append("{:6.2f}".format(np.max(Q[i][j])))
            debuggingView.append(row)
        for entry in debuggingView:
            print(entry)
        print("--------------------")

    def RestartEpisode(self):
        self.world.Reset()
        self.agent.startNewEpisode()

    def takeActionFn(self, actionKey):
        # for dyna-q+, update history
        # self.history = self.history + (1 * (self.history != -1))  # only increment valid (visited) values
        # Update the history
        x, y = self.agent.position
        if ((x, y) in self.history.keys()):
            self.history[(x, y)][actionKey] = self.timestep
        else:
            actionHistory = np.zeros(len(Actions))
            actionHistory[actionKey] = self.timestep
            self.history[(x, y)] = actionHistory

        (dx, dy) = Actions[actionKey]
        # log the action state before actually taking it
        self.world.agent.updateHistory(actionKey)
        self.world.moveAgentBy(dx, dy)
        # print("{} -> {} by {}".format((x,y), self.agent.position, (dx,dy)))
        self.visitCount[self.agent.position[0]][self.agent.position[1]] = \
            self.visitCount[self.agent.position[0]][self.agent.position[1]] + 1

    def manualActionSelection(self, gridworld, actionKey):
        self.basicQPolicy(gridworld, self.Q, actionKey)

    def DynaQPolicy(self, gridworld, Q, manualActionKey=None):
        agent = gridworld.agent
        x, y = agent.position
        actionKey = None
        if (manualActionKey):
            actionKey = manualActionKey
            print("Took action key ", actionKey)
        else:
            # what's the best action we could take from here?
            if (self.random.random() <= self.randomizeAction):
                actionKey = self.random.randint(0, len(Actions) - 1)
            elif (self.EXPERIMENT):  # and (self.timestep > 1000)):
                actionVals = Q[x][y][:]
                bestAction = 0
                bestValue = -1
                for i in range(len(actionVals)):
                    v = actionVals[i] + self.RecencyBonus(x, y, i)
                    if (v > bestValue):
                        bestValue = v
                        bestAction = i
                actionKey = bestAction
            else:
                actionKey = np.random.choice(
                    np.argwhere(Q[x][y][:] == np.max(Q[x][y][:])).flatten())
                # actionKey = np.argmax(Q[x][y][:])
                # print("Selected {} val {} from {}".format(actionKey, Q[x][y][actionKey], Q[x][y][:]))

        # Take that action
        self.takeActionFn(actionKey)
        xNew, yNew = agent.position

        # Update your previous state with the new reward TD(0)
        reward = agent.GetAndResetReward()

        # update our model of S,A,S'
        self.model[x][y][actionKey] = [xNew, yNew, reward]

        # Agent only gets direct reward on episode completion
        # Hack to recognize end of episodes by reward
        if (reward > 0):
            # print("End of episode!")
            self.stepsPerEpisode.append(len(agent.history))
            self.cumulativeReward += reward
            # action key here actually doesn't matter (and really shouldn't be included) but since
            # we do a max over the next state, filling out the action values for the terminal state
            # "shouldn't" have side-effects
            self.Q[x][y][actionKey] = reward
            self.RestartEpisode()
            self.completedEpisodes = self.completedEpisodes + 1
            if (self.lookingForNextWin):
                self.lookingForNextWin = False
        else:
            self.UpdateQ([x, y], actionKey, [xNew, yNew], reward)

        if (self.lookingForNextWin):
            self.timeSinceLooking = self.timeSinceLooking + 1

        self.rewards.append(self.cumulativeReward)

        for i in range(self.numModelUpdates):
            self.ModelStep()
        # print("ModelSteps ", modelSteps)
        # self.PrintQ()

    def basicQPolicy(self, gridworld, Q, manualActionKey=None):
        agent = gridworld.agent
        x, y = agent.position
        # a0 = agent.history[-1][1]  # action part of history
        actionKey = None
        if (manualActionKey):
            actionKey = manualActionKey
            print("Took action key ", actionKey)
        else:
            # what's the best action we could take from here?
            if (self.random.random() <= 0.1):
                actionKey = self.random.randint(0, len(Actions) - 1)
            else:
                actionKey = np.argmax(Q[x][y][:])

        # Take that action
        self.takeActionFn(actionKey)
        xNew, yNew = agent.position

        # Update your previous state with the new reward TD(0)
        reward = agent.GetAndResetReward()

        # Agent only gets direct reward on episode completion
        # Hack to recognize end of episodes by reward
        if (reward > 0):
            # print("End of episode!")
            self.stepsPerEpisode.append(len(agent.history))
            self.cumulativeReward += reward
            # action key here actually doesn't matter (and really shouldn't be included) but since
            # we do a max over the next state, filling out the action values for the terminal state
            # "shouldn't" have side-effects
            self.Q[x][y][actionKey] = reward
            self.RestartEpisode()
            self.completedEpisodes = self.completedEpisodes + 1
        else:
            self.UpdateQ([x, y], actionKey, [xNew, yNew], reward)

        self.rewards.append(self.cumulativeReward)
def train(mem_size, batch_size, sync_freq, epochs=500, print_epoch=False):
    model = torch.nn.Sequential(torch.nn.Linear(l1, l2), torch.nn.ReLU(),
                                torch.nn.Linear(l2, l3), torch.nn.ReLU(),
                                torch.nn.Linear(l3, l4), torch.nn.ReLU(),
                                torch.nn.Linear(l4, l5))
    model2 = copy.deepcopy(model)
    model2.load_state_dict(model.state_dict())

    loss_fn = torch.nn.MSELoss()
    learning_rate = 1e-3
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    replay = deque(maxlen=mem_size)
    gamma = 0.9
    epsilon = 1.0
    j = 0
    losses = []

    for i in range(epochs):
        # Start a new game
        game = Gridworld(size=5, mode='random')
        state1 = get_state(game)
        status = 1
        if print_epoch:
            print(i)

        while status:
            j += 1
            Q_val_ = model(state1)
            Q_val = Q_val_.data.numpy()
            if (random.random() < epsilon):
                choice = np.random.randint(0, 4)  # Exploration
            else:
                choice = np.argmax(Q_val)  # Exploitation
            action = action_set[choice]
            game.makeMove(action)
            state2 = get_state(game)
            reward = game.reward()
            with torch.no_grad():  # no gradient tracking needed for this lookahead
                newQ = model(state2)
            maxQ = torch.max(newQ)
            done = True if reward > 0 else False
            exp = (state1, choice, reward, state2, done)
            replay.append(exp)

            if (len(replay) > batch_size):
                X, Y = experience_replay(replay, batch_size, gamma, model, model2)
                loss = loss_fn(X, Y.detach())
                optimizer.zero_grad()
                loss.backward()
                losses.append(loss.item())
                optimizer.step()
                # Target network update
                if j % sync_freq == 0:
                    model2.load_state_dict(model.state_dict())

            state1 = state2
            # End the game once a terminal reward is seen
            if reward != -1:
                status = 0

        # Decay epsilon over the course of training
        if epsilon > 0.1:
            epsilon -= 1 / epochs

    # Evaluate the trained model
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(model, mode='random', display=False)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    print("Games played: {0}, # of wins: {1}".format(max_games, wins))
    print("Win percentage: {}".format(win_perc))

    if print_epoch:
        plt.figure(figsize=(10, 7))
        plt.plot(losses)
        plt.xlabel("Iterations", fontsize=22)
        plt.ylabel("Loss", fontsize=22)
        plt.show()

    return win_perc
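# A small usage sketch for the routine above (the hyperparameter values are illustrative
# assumptions, not tuned settings from this repo): train with a modest replay buffer and
# target-network sync interval, then report the resulting win rate.
if __name__ == '__main__':
    win_perc = train(mem_size=1000, batch_size=200, sync_freq=500,
                     epochs=5000, print_epoch=False)
    print("final win percentage: {:.2f}".format(win_perc))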
from QLearningAgent import QLearningAgent
from QTable import QTable

alpha = 0.1
gamma = 1
epsilon = 0.05
n_episodes = 500

reward_array = np.empty(n_episodes)
q = QTable(25, 4)

for i in range(n_episodes):
    total_reward = 0
    env = Gridworld()
    # agent = RandomAgent()
    agent = QLearningAgent(alpha, gamma, epsilon)
    while not env.is_terminal(env.agent_position):
        state = env.agent_position
        available_actions = env.get_available_actions()
        chosen_action = agent.choose_action(available_actions, env.agent_position, q)
        reward = env.make_step(chosen_action)
        new_state = env.agent_position  # now updated
        # Q-learning update: Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + gamma*max_a' Q(s',a'))
        q.q_table[state][available_actions.index(chosen_action)] = \
            (1 - alpha) * q.q_table[state][available_actions.index(chosen_action)] \
            + alpha * (reward + gamma * max(q.q_table[new_state, :]))
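# A sketch of how the learned table might be rolled out greedily after training (assumptions:
# QTable.q_table is a (25, 4) numpy array whose columns line up with env.get_available_actions(),
# and every state exposes all four actions, as the update above implies).
def run_greedy_episode(q, max_steps=100):
    env = Gridworld()
    total_reward = 0
    for _ in range(max_steps):
        if env.is_terminal(env.agent_position):
            break
        available_actions = env.get_available_actions()
        greedy_action = available_actions[int(np.argmax(q.q_table[env.agent_position, :]))]
        total_reward += env.make_step(greedy_action)
    return total_reward

print("greedy-policy return after training:", run_greedy_episode(q))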
from Gridworld import Gridworld
import torch
import numpy as np
import random
import matplotlib.pyplot as plt

### GridWorld with Catastrophic Forgetting
### Trained only on the static version of the game, which means that the positions of the objects do not change

game = Gridworld(size=4, mode='static')

l1 = 64   # input
l2 = 150
l3 = 100
l4 = 4    # output

model = torch.nn.Sequential(torch.nn.Linear(l1, l2), torch.nn.ReLU(),
                            torch.nn.Linear(l2, l3), torch.nn.ReLU(),
                            torch.nn.Linear(l3, l4))
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


def get_state(game):
    # Adding noise, since most of the input is zeros | can also help with overfitting
    state = game.board.render_np().reshape(1, 64) + np.random.rand(1, 64) / 10.0
    # To torch tensor
    return torch.from_numpy(state).float()
        # save state, action chosen and reward to list
        state_list.append(state_vector)
        action_list.append(action_vector)
        reward_list.append(reward)

    return state_list, action_list, reward_list


# define average function
def Average(lst):
    return sum(lst) / len(lst)


# create a grid object
grid = Gridworld(5)

# initialize parameters
gamma = 0.99
epsilon = [0.01, 0.1, 0.25]
runs = 20
episode_length = 500
window_length = int(episode_length / 20)
max_steps = 200

# define variables for keeping track of time steps
Terminal = max_steps
t_list = []
for i in range(1, max_steps + 1):
    t = Terminal - i
    t_list.append(t)
# The following np.array 'v' has the correct format (but is just a random
# collection of floats).
# v = np.random.rand(25)
# print(v)

# Please write your code for Exercise 1 here. We will mark your coursework by checking
# the values of the variables policy and v in this cell. Your code should compute the
# values of policy and v from scratch when this cell is executed, using the value
# iteration algorithm.

theta = 1e-10
gamma = 1
epsilon = 0
alpha = 0

env = Gridworld()
v = np.zeros(25)
# print(v)
actions = env.get_available_actions()
# print(actions)

# For each (state, action) pair, tabulate the outcomes under the noisy-action model:
# the intended action occurs with probability (1 - alpha) + alpha/|A|,
# every other action with probability alpha/|A|.
lookup_table = np.zeros((25, 4), dtype=np.ndarray)
for state in range(25):
    for action in actions:
        list = []
        for a in actions:
            if a == action:
                prob = (1 - alpha) + alpha / len(actions)
            else:
                prob = alpha / len(actions)
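# For comparison, a minimal self-contained value-iteration sketch on a deterministic 5x5 grid.
# This is NOT the coursework solution above: the reward scheme (-1 per step), terminal state
# (bottom-right corner) and deterministic moves here are illustrative assumptions only.
import numpy as np

def value_iteration_sketch(n=5, gamma=0.9, theta=1e-10):
    n_states = n * n
    moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right
    goal = n_states - 1                          # assumed terminal state
    v = np.zeros(n_states)
    while True:
        delta = 0.0
        for s in range(n_states):
            if s == goal:
                continue
            r, c = divmod(s, n)
            best = -np.inf
            for dr, dc in moves:
                # moves that would leave the grid keep the agent in place
                nr, nc = min(max(r + dr, 0), n - 1), min(max(c + dc, 0), n - 1)
                best = max(best, -1 + gamma * v[nr * n + nc])
            delta = max(delta, abs(best - v[s]))
            v[s] = best
        if delta < theta:
            return v.reshape(n, n)

print(value_iteration_sketch())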
from Gridworld import Gridworld
import torch
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import deque
import copy

### GridWorld with Experience Replay (no catastrophic forgetting) and stabilization with a target network
### Trained only on the static version of the game, which means that the positions of the objects do not change

if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
print(dev)
device = torch.device(dev)

game = Gridworld(size=5, mode='static')

input_size = 100
l1 = input_size  # input
l2 = 300
l3 = 200
l4 = 80
l5 = 4           # output


def get_state(game):
    # Adding noise, since most of the input is zeros | can also help with overfitting
    state = game.board.render_np().reshape(
        1, input_size) + np.random.rand(1, input_size) / 10.0
    # To torch tensor
    return torch.from_numpy(state).float()
                            torch.nn.Linear(l2, l3), torch.nn.ReLU(),
                            torch.nn.Linear(l3, l4))
# Create a copy of the neural network to serve as the target network,
# and copy the parameters of model into model2
model2 = copy.deepcopy(model)
model2.load_state_dict(model.state_dict())

loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9
epsilon = 0.3

game = Gridworld(size=4, mode='static')
game.display()

action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r',
}

'''
Setting up the main training loop
'''
epochs = 5000
sync_freq = 500  # sync the target and the online network every sync_freq steps
losses = []      # Create a list to store loss values so we can plot the trend later
def MonteCarlo(gamma, lr, epsilon, runs, step_number, episode_length):
    # create a grid object
    grid = Gridworld(5)
    window_length = int(episode_length / 20)
    max_steps = step_number

    # define variables for plotting purposes
    reward_epsilon = []
    reward_run_all = []
    test_reward_epsilon = []
    test_reward_run_all = []

    # define variables for keeping track of time steps
    Terminal = max_steps
    t_list = []
    for i in range(max_steps):
        t = Terminal - i - 1
        t_list.append(t)

    label = []
    for r in range(1, runs + 1):
        label.append(str(r))

    # Monte Carlo BEGINS ---------------------------------------------------------------------------------------------------------------------------
    # begin iterating over every epsilon
    for eps in epsilon:
        # reset some lists
        Q_values_list = []
        reward_run = []
        test_reward_run = []

        # begin iterating over a set amount of runs (20)
        for run in range(1, runs + 1):
            # random e-soft policy
            policy = np.zeros((state_count, action_count))
            for state in range(len(policy)):
                random_action = random.randint(0, 3)
                for action in range(action_count):
                    if action == random_action:
                        policy[state][action] = 1 - eps + eps / action_count
                    else:
                        # if the chosen action is not the same as the current action
                        policy[state][action] = eps / action_count

            # initialize q values for all state action pairs
            global Q_values
            Q_values = np.zeros((state_count, action_count))
            oldQ = np.zeros((state_count, action_count))

            # define lists
            reward_episode = []
            test_reward_episode = []
            delta_list = []

            # a dictionary mapping each state-action pair to the list of returns received
            returns_list = {}
            for s in range(state_count):
                for a in range(action_count):
                    returns_list[(s, a)] = []

            # iterate over episodes
            for episode in range(episode_length):
                # generate an episode of specified step count
                state_list, action_list, reward_list = generate_episode(max_steps, grid, policy)

                # sum reward for episode
                reward_episode.append(sum(reward_list))

                # initialize variables
                G = 0
                delta = 0
                visited_list = []
                state_action_pair = list(np.zeros(len(t_list)))

                # loop for each step of episode: T-1, T-2, T-3 ... 0 = 199, 198, 197 ... 0
                for t in t_list:
                    # calculate G: starting with the last reward at index t
                    # (naturally accounts for the pseudocode's "t-1")
                    G = gamma * G + reward_list[t]

                    # combine state and action pair together to check if it has been visited before
                    state_action_pair[t] = state_list[t] + action_list[t]

                    # check if the state action pair has been visited before
                    # (if not: process it, else: move to the next time step)
                    if state_action_pair[t] not in visited_list:
                        # add state action pair to visited list
                        visited_list.append(state_action_pair[t])

                        # find state and action index, for example, converting action [-1, 0] to 0,
                        # and same for state
                        state_index = grid.states.index(state_list[t])
                        action_index = actions.index(action_list[t])

                        # append G to returns
                        returns_list[(state_index, action_index)].append(G)

                        # make a copy of the q values to calculate the delta
                        oldQ[state_index][action_index] = Q_values[state_index][action_index]

                        # write Q_values to the state-action pair
                        Q_values[state_index][action_index] = float(
                            np.mean(returns_list[(state_index, action_index)]))

                        # calculate max delta change for plotting max q value change
                        delta = max(delta, np.abs(Q_values[state_index][action_index]
                                                  - oldQ[state_index][action_index]))

                        # Update policy
                        for s in range(state_count):
                            if np.count_nonzero(Q_values[s]) == 0:
                                # if Q_values is all zero, randomly pick an action
                                choose_action = random.randint(0, 3)
                            else:
                                # choose the best action at the given state
                                choose_action = np.argmax(Q_values[s])
                            # overwrite policy
                            for a in range(action_count):  # for action in actions [0, 1, 2, 3]
                                if choose_action == a:
                                    # if the chosen action is the same as the current action
                                    policy[s][a] = 1 - eps
                                else:
                                    # if the chosen action is not the same as the current action
                                    policy[s][a] = eps / (action_count - 1)

                # append delta to list
                delta_list.append(delta)

                # TESTING AFTER EACH EPISODE ------------------------------------------------------------
                # Generate test trajectory with the greedy policy
                state_list, action_list, test_reward_list = generate_episode(max_steps, grid, policy)
                test_reward_episode.append(sum(test_reward_list))
                # ----------------------------------------------------------------------------------------

                # print current episode
                clear_output(wait=True)
                display('Epsilon: ' + str(eps) + ' Run: ' + str(run) + ' Episode: ' + str(episode))

            # append lists for plotting purpose
            test_reward_run.append(Average(test_reward_episode))
            reward_run.append(Average(reward_episode))
            Q_values_list.append(Q_values)

            # PLOTTING CODE--------------------------------------------------------------------------------------------------------------------
            # Average Reward per Episode (Smoothed) during Training with different runs and epsilons
            plt.title('Average Reward per Episode (Smoothed), Run: ' + str(int(run)) +
                      ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            delta_frame = pd.DataFrame(test_reward_episode)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average')
            delta_frame = pd.DataFrame(reward_episode)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average')
            plt.legend(('Testing', 'Training'))
            plt.savefig('Graphs/MonteCarlo/reward_episode/reward_episode_smoothed_run_' +
                        str(int(run)) + '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # Average Reward per Episode during Training with different runs and epsilons
            plt.plot(test_reward_episode)
            plt.plot(reward_episode)
            plt.title('Average Reward per Episode, Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Average Reward')
            plt.legend(('Testing', 'Training'))
            plt.savefig('Graphs/MonteCarlo/reward_episode/reward_episode_run_' + str(int(run)) +
                        '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

            # max delta of each episode, where delta is the change in Q values
            plt.plot(delta_list)
            plt.title('Monte Carlo Max Delta for Run: ' + str(int(run)) + ', Epsilon: ' + str(float(eps)))
            plt.xlabel('Episode')
            plt.ylabel('Max Delta')
            delta_frame = pd.DataFrame(delta_list)
            rolling_mean = delta_frame.rolling(window=window_length).mean()
            plt.plot(rolling_mean, label='Moving Average', color='orange')
            plt.savefig('Graphs/MonteCarlo/delta/delta_run_' + str(int(run)) +
                        '_epsilon_' + str(float(eps)) + '.png')
            plt.clf()
            time.sleep(0.05)

        # append lists for plotting
        reward_run_all.append(reward_run)
        test_reward_run_all.append(test_reward_run)
        reward_epsilon.append(Average(reward_run))
        test_reward_epsilon.append(Average(test_reward_run))

        # Average Reward for each Run with different Epsilon
        plt.plot(test_reward_run)
        plt.plot(reward_run)
        plt.title('Average Reward for each Run with Epsilon: ' + str(float(eps)))
        plt.xlabel('Run')
        plt.xticks(np.arange(runs), label)
        plt.ylabel('Average Reward')
        plt.legend(('Testing', 'Training'))
        plt.savefig('Graphs/MonteCarlo/reward_run/reward_run_epsilon_' + str(float(eps)) + '.png')
        plt.clf()
        time.sleep(0.05)

        # save Q value tables to a pickle
        with open('Graphs/MonteCarlo/Qvalues/MonteCarlo_Qvalues_' + str(eps) + '.pkl', 'wb') as f:
            pickle.dump(Q_values_list, f)

    # Average Reward for each Epsilon during Training
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, reward_epsilon)
    plt.title('Average Reward for each Epsilon during Training')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/MonteCarlo/reward_epsilon/reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Epsilon during Testing
    x_label = ('0.01', '0.1', '0.25')
    plt.bar(x_label, test_reward_epsilon)
    plt.title('Average Reward for Each Epsilon during Testing')
    plt.xlabel('Epsilon')
    plt.xticks(np.arange(3), ('0.01', '0.1', '0.25'))
    plt.ylabel('Average Reward')
    plt.savefig('Graphs/MonteCarlo/test_reward_epsilon/test_reward_epsilon.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Training
    for r in range(3):
        plt.plot(reward_run_all[r])
    plt.title('Average Reward for each Run during Training')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01', '0.1', '0.25'))
    plt.savefig('Graphs/MonteCarlo/reward_run/reward_run_all.png')
    plt.clf()
    time.sleep(0.05)

    # Average Reward for each Run during Testing
    for r in range(3):
        plt.plot(test_reward_run_all[r])
    plt.title('Average Reward for each Run during Testing')
    plt.xlabel('Run')
    plt.xticks(np.arange(runs), label)
    plt.ylabel('Average Reward')
    plt.legend(('0.01', '0.1', '0.25'))
    plt.savefig('Graphs/MonteCarlo/test_reward_run/test_reward_run_all.png')
    plt.clf()
    time.sleep(0.05)
args = parser.parse_args()
device = torch.device('cuda' if args.use_cuda else 'cpu')

simulator_s = SimulatorState().to(device)
simulator_r = SimulatorReward().to(device)
opt_s = torch.optim.Adam(simulator_s.parameters(), lr=args.lr)
opt_r = torch.optim.Adam(simulator_r.parameters(), lr=args.lr)
loss_fn_state = torch.nn.CrossEntropyLoss()
loss_fn_reward = torch.nn.CrossEntropyLoss(weight=torch.Tensor([1, 50, 50]))

losses = []
buffer = ExperienceReplay()
progress = tqdm(range(args.epochs))

for epoch_num in progress:
    game = Gridworld(mode=args.mode)
    z = 0
    for step_num in range(args.max_steps):
        # get starting state
        state = torch.from_numpy(game.board.render_np()).float().reshape(64,)

        # take random action
        action_ = np.random.randint(0, 4)
        action = action_set[action_]
        action_vec = torch.zeros(4,)
        action_vec[action_] = 1
        game.makeMove(action)

        next_state = torch.from_numpy(game.board.render_np()).float()
        reward_ = encode_game_progress(game.reward())
        buffer.add([(state, action_vec, next_state[0].argmax(), reward_, next_state)])