def main():
    np.random.seed(26)
    env = GridworldEnv(shape=[4, 4])
    agent = Agent(env)

    td_zero_res = agent.td_zero(discount_factor=0.25, alpha=0.10)
    print("Result TD(0) Value function:")
    print(td_zero_res.reshape(env.shape))

    td_lambda_res_bw = agent.td_lambda(discount_factor=0.25, alpha=0.1,
                                       _lambda=0.5, backward=True)
    print("Result backward TD(Lambda=0.5) Value function:")
    print(td_lambda_res_bw.reshape(env.shape))

    td_lambda_res_fw = agent.td_lambda(discount_factor=0.25, alpha=0.1,
                                       _lambda=0.5, backward=False)
    print("Result forward TD(Lambda=0.5) Value function:")
    print(td_lambda_res_fw.reshape(env.shape))

    sarsa_gridworld = agent.sarsa(num_iter=1000, alpha=0.25,
                                  discount_factor=None, epsilon=0.1)
    print("Result SARSA (no Lambda) for gridworld. Optimal Q function:")
    print(np.round(sarsa_gridworld, 2))

    sarsa_gridworld_lambda = agent.sarsa_lambda(num_iter=1000, alpha=0.25,
                                                lambda_=0.5,
                                                discount_factor=None,
                                                epsilon=0.1)
    print("Result SARSA (lambda=0.5) for gridworld. Optimal Q function:")
    print(np.round(sarsa_gridworld_lambda, 2))

    windy_env = WindyGridworldEnv()
    agent_2 = Agent(windy_env)

    sarsa_windy_gridworld = agent_2.sarsa(num_iter=1000, alpha=0.25,
                                          discount_factor=None, epsilon=0.1)
    print("Result SARSA (no Lambda) for windy gridworld. Optimal Q function:")
    print(np.round(sarsa_windy_gridworld, 2))

    sarsa_windy_gridworld_lambda = agent_2.sarsa_lambda(num_iter=1000, alpha=0.25,
                                                        discount_factor=None,
                                                        epsilon=0.1, lambda_=0.5)
    print("Result SARSA (lambda=0.5) for windy gridworld. Optimal Q function:")
    print(np.round(sarsa_windy_gridworld_lambda, 2))
def args_handler(parser):
    args = parser.parse_args()
    config, size = {}, DEFAULT_GRID_SIZE
    if args.size:
        size = int(args.size)
    grid = GridworldEnv((size, size))
    if args.gamma:
        config['gamma'] = float(args.gamma)
    if args.exps:
        config['exps'] = int(args.exps)
    if args.eps:
        config['num_episodes'] = int(args.eps)
    if args.epsilon:
        config['epsilon'] = float(args.epsilon)
    if args.alpha:
        config['alpha'] = float(args.alpha)
    if args.lamb:
        config['l'] = float(args.lamb)
    if args.alg not in ['q', 's']:
        parser.print_help()
        return
    elif args.alg == 'q':
        learner = QLearner(grid, **config)
    elif args.alg == 's':
        learner = SarsaLambdaLearner(grid, **config)
    print("\n")
    print(learner.learn()[0])
def __init__(self, name, learner, size=5, **params):
    self.name = name
    self.grid = GridworldEnv((size, size))
    if learner == 'q':
        self.learner = QLearner(self.grid, **params)
    elif learner == 's':
        self.learner = SarsaLambdaLearner(self.grid, **params)
def create_env(env_name):
    """Create/load the environment associated with :env_name"""
    if env_name == "SimpleGridWorld":
        return GridworldEnv()
    elif env_name == "MediumGridWorld":
        return GridworldEnv(shape=[10, 10])
    elif env_name == "LargeGridWorld":
        return GridworldEnv(shape=[20, 20])
    elif env_name == "HugeGridWorld":
        return GridworldEnv(shape=[31, 31])
    elif env_name == "SimpleRectangleWorld":
        return GridworldEnv(shape=[10, 4])
    elif env_name == "LargeRectangleWorld":
        return GridworldEnv(shape=[15, 31])
    elif env_name == "SimpleMazeWorld":
        return load_maze("SimpleMazeWorld")
    elif env_name == "MediumMazeWorld":
        return load_maze("MediumMazeWorld", (15, 15))
    elif env_name == "LargeMazeWorld":
        return load_maze("LargeMazeWorld", (25, 25))
    elif env_name == "SimpleWindyGridWorld":
        return create_windy_gridworld((7, 10),
                                      ((0, 1, 2, 9), (3, 4, 5, 8), (6, 7)),
                                      (3, 7))
    elif env_name == "MediumRectangularWindyGridWorld":
        # note (4,): a bare (4) is just the int 4, not a one-element tuple
        return create_windy_gridworld((20, 5),
                                      ((0, 1), (2, 3), (4,)),
                                      (12, 3))
    elif env_name == "LargeRectangularWindyGridWorld":
        return create_windy_gridworld((30, 15),
                                      ((5, 6, 7, 8, 12, 14), (0, 1, 2, 3, 13),
                                       (4,), (9, 10, 11)),
                                      (7, 8))
    else:
        return gym.envs.make(env_name)
def main():
    np.random.seed(26)
    env = GridworldEnv(shape=[4, 4])
    agent = Agent(env)

    ## Sample one episode
    episode = agent.generate_episode(policy=agent.env.isap)
    print("Example: Sample episode for Monte Carlo:")
    print(episode)

    ## Do First-Visit Monte-Carlo Prediction
    first_visit_MC_value_fnc = agent.monte_carlo_prediction(first_visit=True,
                                                            discount_factor=1.0,
                                                            num_iter=1000)
    first_visit_MC_value_fnc = np.round(first_visit_MC_value_fnc, 2)
    print("Result first-visit Monte Carlo:")
    print(first_visit_MC_value_fnc.reshape(env.shape))

    ## Do Every-Visit Monte-Carlo Prediction
    every_visit_MC_value_fnc = agent.monte_carlo_prediction(first_visit=False,
                                                            discount_factor=1.0,
                                                            num_iter=1000)
    every_visit_MC_value_fnc = np.round(every_visit_MC_value_fnc, 2)
    print("Result every-visit Monte Carlo:")
    print(every_visit_MC_value_fnc.reshape(env.shape))

    ## Do Every-Visit Monte-Carlo Control with Exploring Starts (no epsilon-greedy method)
    Q_control_no_epsilon, policy_control_no_epsilon = agent.monte_carlo_control(
        policy=None, num_iter=200, discount_factor=None,
        epsilon_method=False, epsilon=0.1, on_policy=True)
    Q_control_no_epsilon = np.round(Q_control_no_epsilon, 2)
    policy_control_no_epsilon = np.round(policy_control_no_epsilon, 2)
    print("Result every-visit Monte Carlo Control Q-Function:")
    print(Q_control_no_epsilon)
    print("Result every-visit Monte Carlo Control optimal policy:")
    print(policy_control_no_epsilon)

    ## Do Every-Visit Monte-Carlo Control with epsilon-greedy method
    Q_control_eps_greedy, policy_control_eps_greedy = agent.monte_carlo_control(
        policy=None, num_iter=500, discount_factor=None,
        epsilon_method=True, epsilon=0.1, on_policy=True)
    Q_control_eps_greedy = np.round(Q_control_eps_greedy, 2)
    policy_control_eps_greedy = np.round(policy_control_eps_greedy, 2)
    print("Result every-visit Monte Carlo Control Q-Function:")
    print(Q_control_eps_greedy)
    print("Result every-visit Monte Carlo Control optimal policy:")
    print(policy_control_eps_greedy)
def main(shape=[4, 4]):
    env = GridworldEnv(shape=shape)
    agent = Agent(env=env)

    ## Policy Evaluation
    print("Do Policy Evaluation...:")
    policy = env.isap
    print("Initial value function:")
    print(agent.vFnc.reshape(env.shape))
    print("")
    print("Random Policy uniformly distributed")
    print(policy)
    print("")
    optimal_value_fnc = agent.policy_evaluation(policy)
    optimal_value_fnc = np.round(optimal_value_fnc)
    print("Optimal value function:")
    print(optimal_value_fnc.reshape(env.shape))
    print("")

    ## Policy Improvement
    print("Do Policy Improvement...:")
    print("Start with Random Policy uniformly distributed")
    ## Initialize random policy for each state and action from environment
    policy = env.isap
    print(policy)
    print("")
    policy_improvement_res, value_fnc_optimal = agent.policy_improvement(policy)
    print("Optimal Policy Probability Distribution:")
    print(policy_improvement_res)
    print("")
    print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
    print(np.reshape(np.argmax(policy_improvement_res, axis=1), env.shape))
    print("")
    print("Value Function:")
    print(value_fnc_optimal)
    print("")
    print("Reshaped Grid Value Function:")
    print(value_fnc_optimal.reshape(env.shape))
    print("")
def main():
    np.random.seed(26)
    env = GridworldEnv(shape=[4, 4])
    agent = Agent(env)
    Q_learning_gridworld = agent.Q_learning(num_iter=1000, epsilon=0.10,
                                            alpha=0.20, discount_factor=0.30)
    print("Optimal Q-Function after 1000 iterations:")
    print(np.round(Q_learning_gridworld, 2))

    env2 = WindyGridworldEnv()
    agent2 = Agent(env2)
    Q_learning_windyworld = agent2.Q_learning(num_iter=1000, epsilon=0.10,
                                              alpha=0.20, discount_factor=0.30)
    print("Optimal Q-Function after 1000 iterations:")
    print(np.round(Q_learning_windyworld, 2))
def setUpModule():
    global env
    env = GridworldEnv()
def her_experiment():
    batch_size = 256
    discount_factor = 0.8
    learn_rate = 1e-3
    num_hidden = 128
    num_episodes = 2
    epochs = 200
    training_steps = 10
    memory_size = 100000
    seeds = [42, 30, 2, 19, 99]  # This is not randomly chosen
    shape = [30, 30]
    # the four corner states of the grid serve as targets
    targets = lambda x, y: [0, x * y - 1, x - 1, (y - 1) * x]
    env = GridworldEnv(shape=shape, targets=targets(*shape))

    # functions for grid world
    def sample_goal():
        return np.random.choice(env.targets, 1)

    extract_goal = lambda state: np.reshape(np.array(np.argmax(state)), -1)

    def calc_reward(state, action, goal):
        if state == goal:
            return 0.0
        else:
            return -1.0

    # # maze
    # def sample_goal():
    #     return env.maze.end_pos
    # extract_goal = lambda state: np.reshape(np.array(np.argmax(state)), -1)
    # def calc_reward(state, action, goal):
    #     if state == goal:
    #         return 0.0
    #     else:
    #         return -1.0

    means = []
    x_epochs = []
    l_stds = []
    h_stds = []
    for her in [True, False]:
        episode_durations_all = []
        for seed in seeds:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            env.seed(seed)
            print(env.reset())
            memory = ReplayMemory(memory_size)
            if her:
                # the goal is concatenated to the state, doubling the input size
                # model = QNetwork(env.observation_space.shape[0] + 2, num_hidden, env.action_space.n)
                model = QNetwork(2 * env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train, model, memory, env, num_episodes, training_steps,
                    epochs, batch_size, discount_factor, learn_rate,
                    sample_goal, extract_goal, calc_reward, use_her=True)
            else:
                model = QNetwork(env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train, model, memory, env, num_episodes, training_steps,
                    epochs, batch_size, discount_factor, learn_rate,
                    sample_goal, extract_goal, calc_reward, use_her=False)
            episode_durations_all.append(
                loop_environments.smooth(episode_durations, 10))
        mean = np.mean(episode_durations_all, axis=0)
        means.append(mean)
        std = np.std(episode_durations_all, ddof=1, axis=0)
        l_stds.append(mean - std)
        h_stds.append(mean + std)
        x_epochs.append(list(range(len(mean))))
        # print(len(mean), mean, std)

    line_plot_var(x_epochs, means, l_stds, h_stds, "Epoch", "Duration",
                  ["HindsightReplay", "RandomReplay"],
                  "Episode duration per epoch", ["orange", "blue"])
    name = "her_" + str(shape)
    file_name = os.path.join("./results", name)
    with open(file_name + ".pkl", "wb") as f:
        pickle.dump((x_epochs, means, l_stds, h_stds), f)
def main():
    print("Running DQN")
    if config.env == "GridWorldEnv":
        print("Playing: ", config.env)
        env = GridworldEnv()
    else:
        env_name = config.env
        print("Playing:", env_name)
        # not 100% sure this will work for all envs
        env = gym.make(env_name)

    obs_shape = env.observation_space.shape
    num_actions = env.action_space.n
    assert len(obs_shape) <= 1, "Not yet compatible with multi-dim observation space"
    if len(obs_shape) > 0:
        obs_size = obs_shape[0]
    else:
        obs_size = 1

    num_episodes = config.n_episodes
    batch_size = config.batch_size
    discount_factor = config.discount_factor
    learn_rate = config.learn_rate
    seed = config.seed
    num_hidden = config.num_hidden
    min_eps = config.min_eps
    max_eps = config.max_eps
    anneal_time = config.anneal_time
    clone_interval = config.clone_interval
    replay = not config.replay_off
    clipping = not config.clipping_off
    if config.memory_size is None:
        memory_size = 10 * batch_size
    else:
        memory_size = config.memory_size
    if not replay and (batch_size != 1 or memory_size != 1):
        print("Replay is turned off: adjusting memory and batch size to 1")
        batch_size = 1
        memory_size = 1
    memory = ReplayMemory(memory_size)

    # We will seed the algorithm (before initializing QNetwork!) for reproducibility
    random.seed(seed)
    torch.manual_seed(seed)
    env.seed(seed)

    Q_net = QNetwork(obs_size, num_actions, num_hidden=num_hidden)
    policy = EpsilonGreedyPolicy(Q_net, num_actions)
    episode_durations, losses, max_qs = run_episodes(
        train, Q_net, policy, memory, env, num_episodes, batch_size,
        discount_factor, learn_rate, clone_interval, min_eps, max_eps,
        anneal_time, clipping)

    plot_smooth(episode_durations, 10, show=True)

    # This is just for now to see results quickly.
    # TODO: make a nice plot function to test/compare multiple settings
    plt.plot(losses)
    plt.title(f"{config.env}, lr={learn_rate}, replay={replay}, clone_interval={clone_interval}")
    plt.ylabel("Loss")
    plt.xlabel("Episode")
    plt.show()

    plt.plot(max_qs)
    if clipping:
        plt.axhline(y=1. / (1 - discount_factor), color='r', linestyle='-')
    plt.title(f"{config.env}, lr={learn_rate}, replay={replay}, clone_interval={clone_interval}")
    plt.ylabel("max |Q|")
    plt.xlabel("Episode")
    plt.show()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

NAME = ''
if RANDOMIZE:
    NAME = 'rand'
if REGULARIZE:
    NAME = 'reg'

for se in range(N_SEEDS):
    print('seed : ' + str(se))
    first_actions = []
    env = GridworldEnv(randomized_params=INITIAL_ADDITIONAL_PARAMS,
                       randomize=RANDOMIZE,
                       regularize=REGULARIZE,
                       randomization_space=RANDOMIZATION_SPACE,
                       goal_reward=GOAL_REWARD,
                       lava_reward=LAVA_REWARD,
                       step_reward=STEP_REWARD,
                       out_of_grid=OUT_OF_GRID_REWARD,
                       max_episode_steps=10)
    nb_steps = 4000
    agent = VPG(env, MLP_Multihead, gamma=1, verbose=False,
                learning_rate=1e-3, regularize=REGULARIZE, lam=LAMBDA)
    print(agent.seed)
import numpy as np
import itertools
from collections import defaultdict

from gridworld import GridworldEnv

ENV = GridworldEnv()
EQUIPROBABLE_POLICY = random_policy = np.ones([ENV.nS, ENV.nA]) / ENV.nA

# Chris Fenton
# CNC - AI
# Winter Final Programming Problems

'''
a) For the gridworld in example 4.1, Figure 4.1 shows a synchronous iterative
policy evaluation, although the text explains asynchronous. An asynchronous
iterative policy evaluation would go through the states (in numerical order
from 1 to 14) and update after each state based on the previous updates.
Program the asynchronous version, and write the value for each state after
2000 iterations.
'''

def asyncPolicyEvaluation(gamma=0.9, iterations=2000):
    # ENV.P[s][a] : (prob, next_state, reward, terminal?) tuple
    # Actions: up=0, right=1, down=2, left=3
    # Q is the action-value function, a dictionary mapping state -> action values.
    Q = defaultdict(lambda: np.zeros(ENV.action_space.n))
    A = [0, 1, 2, 3]  # Actions
    values = [0] * ENV.nS
    for i in range(iterations):
        state = ENV.reset()  # get a random state
        action = np.random.choice(A, replace=False)  # random action
        # Q(s,a) = E[R_{t+1} + gamma * V(s')]
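        # Hedged completion sketch (assumption, not the original source): one
        # plausible way to finish the loop started above. The drawn (state,
        # action) pair is backed up with the Bellman target from the comment,
        # and V(state) is refreshed in place as the policy-weighted action
        # value, so later updates already see the new estimate.
        for prob, next_state, reward, done in ENV.P[state][action]:
            Q[state][action] = prob * (reward + gamma * values[next_state])
        values[state] = float(np.dot(EQUIPROBABLE_POLICY[state], Q[state]))
    return values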
import numpy as np
from gridworld import GridworldEnv

env = GridworldEnv([6, 6])

def policy_iteration(env, theta=0.001, discount_factor=1.0):
    """
    Policy Iteration Algorithm.

    Args:
        env: gridWorld
        theta: Stopping threshold.
        discount_factor: Gamma discount factor.

    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """

    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value for all actions in a given state.

        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, vector of length env.nS

        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        A = np.zeros(env.nA)
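        # Hedged completion (assumption, not the original source): the
        # conventional lookahead body for the env.P[s][a] ->
        # [(prob, next_state, reward, done)] interface documented in the other
        # snippets, accumulating each action's expected one-step backup.
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A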
        sys.stdout.flush()
        avg_time_steps = time_steps_per_episode / self.exps
        avg_max_q = max_q_value_per_episode / self.exps
        return self._policy_directions(self._choose_policy()), avg_time_steps, avg_max_q


def plot(v):
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    min_val, max_val = 0, 5
    for i in range(5):
        for j in range(5):
            c = v[i][j]
            ax.text(i, j, str(c), va='center', ha='center')
    ax.matshow(v, cmap=plt.cm.Blues)
    ax.set_xlim(min_val, max_val)
    ax.set_ylim(min_val, max_val)
    ax.set_xticks(np.arange(max_val))
    ax.set_yticks(np.arange(max_val))
    ax.grid()
    plt.show()


if __name__ == '__main__':
    shape = (5, 5)
    g = GridworldEnv(shape=shape)
    l = SarsaLambdaLearner(g, exps=2, l=0.8, num_episodes=400,
                           gamma=0.99, alpha=0.1, epsilon=0.3)
    print(l.learn())
    # Create a deterministic policy using the optimal value function
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        # One-step lookahead to find the best action for this state
        A = one_step_lookahead(s, V)
        best_action = np.argmax(A)
        # Always take the best action
        policy[s, best_action] = 1.0
    return policy, V


sizes = [5, 10, 20, 30, 50]
for size in sizes:
    print("Running VI Size: ", size)
    env = GridworldEnv(shape=[size, size])
    tic = time.time()
    policy, v = value_iteration(env)
    toc = time.time()
    elapsed_time = (toc - tic) * 1000
    print(f"Time to converge: {elapsed_time: 0.3} ms")
    # print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
    # print(np.reshape(np.argmax(policy, axis=1), env.shape))
    # print("")
    # print("Value Function:")
    # print(v)
    # print("")
            result.append(element.observation)
        return result

    def print_trajectory(self):
        print('Trajectory:')
        for element in self.trajectory:
            print(element)
        print('Total trajectory steps: {0}'.format(len(self.trajectory)))


if __name__ == '__main__':
    size_x = 4
    size_y = 4
    env = GridworldEnv(size_x, size_y)
    env.make_start(0, 0)
    env.make_goal(0, 3)
    env.make_goal(3, 0)
    agent = GridworldAgent(size_x, size_y)
    total_episodes = 1000
    for i in range(total_episodes):
        obs = env.reset()
        agent.reset()
        agent.append_trajectory(t_step=0,
                                prev_action=None,
                                observation=obs,
import numpy as np
import sys
import gym.spaces
import timeit

if "../" not in sys.path:
    sys.path.append("../")
from gridworld import GridworldEnv

environment = GridworldEnv()

def value_iteration(environment, discountFactor=0.9, minError=0.1):

    def lookahead(V, a, s):
        # environment.P[s][a] holds a single (prob, next_state, reward, done)
        # tuple here, so prob must be unpacked as well
        [(prob, next_state, reward, done)] = environment.P[s][a]
        # Bellman equation
        value = prob * (reward + discountFactor * V[next_state])
        return value

    # initial value function and policy
    V = np.zeros(environment.nS)
    policy = np.zeros([environment.nS, environment.nA])

    while True:
        error = 0
        # loop over states
        for s in range(environment.nS):
            actions_values = np.zeros(environment.nA)
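            # Hedged completion (assumption, not the original source): fill the
            # action values via lookahead, take the greedy backup, record a
            # greedy policy, and track the largest change for the minError
            # stopping test.
            for a in range(environment.nA):
                actions_values[a] = lookahead(V, a, s)
            best_value = np.max(actions_values)
            error = max(error, np.abs(best_value - V[s]))
            V[s] = best_value
            policy[s] = np.eye(environment.nA)[np.argmax(actions_values)]
        if error < minError:
            break
    return policy, V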
    for i in range(num_episodes):
        episodes = []
        init_state = choice(list(set(env.P.keys())))  # draw a random state to start
        # generate an episode under the random policy (draw an action randomly)
        while not env.is_terminal(init_state):
            action = choice(list(env.P[init_state].keys()))
            next_state = env.P[init_state][action][0][1]
            reward = env.P[init_state][action][0][2]
            episodes.append([init_state, action, reward])
            init_state = next_state
        G = 0
        states_seen = set()
        for S, A, R in reversed(episodes):
            G = 1.0 * G + R  # assuming the discount factor is 1.0
            if S not in states_seen:
                states_seen.add(S)
                returns[S].append(G)
                V[S] = np.mean(returns[S])
    V_sorted = sorted(V.items(), key=lambda x: x[0])  # sort by state
    return V_sorted


if __name__ == '__main__':
    env = GridworldEnv((9, 9))
    print(env.P)
    env._render(mode="human")
    V = mc_policy_evaluation_random_policy(env, 5000)
    print(V)
import numpy as np
import gym.spaces
from gridworld import GridworldEnv

env = GridworldEnv()

def policy_eval(policy, env, discount_factor=1.0, epsilon=0.00001):
    """
    Evaluate a policy given an environment and a full description of the
    environment's dynamics.

    Args:
        policy: [S, A] shaped matrix representing the policy.
        env: OpenAI env. env.P represents the transition probabilities of the
            environment. env.P[s][a] is a list of transition tuples
            (prob, next_state, reward, done). env.nS is the number of states
            in the environment. env.nA is the number of actions in the
            environment.
        epsilon: We stop evaluation once our value function change is less
            than epsilon for all states.
        discount_factor: Gamma discount factor.

    Returns:
        Vector of length env.nS representing the value function.
    """
    # Start with a random (all 0) value function
    V = np.zeros(env.nS)
    while True:
        # old value function
        V_old = np.zeros(env.nS)
        # stopping condition
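        # Hedged completion (assumption, not the original source): read V_old
        # as the array collecting this sweep's freshly backed-up values, so the
        # stopping condition named above compares it against the previous
        # sweep's V.
        for s in range(env.nS):
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    V_old[s] += action_prob * prob * (reward + discount_factor * V[next_state])
        if np.max(np.abs(V_old - V)) < epsilon:
            return V_old
        V = V_old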
import numpy as np
import pprint
import sys

if "./" not in sys.path:
    sys.path.append("./")
# from lib.envs.gridworld import GridworldEnv
from gridworld import GridworldEnv
from gridworld import print_policy

pp = pprint.PrettyPrinter(indent=2)
env = GridworldEnv(shape=(4, 4))  # 4x4 grid
print("env.nS:", env.nS, " env.nA:", env.nA, ' env.P[][]:', env.P)

def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.

    Args:
        env: OpenAI env. env.P represents the transition probabilities of the
            environment. env.P[s][a] is a list of transition tuples
            (prob, next_state, reward, done). env.nS is the number of states
            in the environment. env.nA is the number of actions in the
            environment.
        theta: We stop evaluation once our value function change is less than
            theta for all states.
        discount_factor: Gamma discount factor.

    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """

    def one_step_lookahead(state, V):
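        # Hedged completion (assumption, not the original source): the
        # conventional lookahead body for the env.P interface documented in
        # the docstring above, returning the expected value of each action.
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A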