def showExamples(self, env):
    """Roll out the greedy policy from a few start states and save the frames as a GIF."""
    init_state = np.array([0, 29, 82])
    gridworld = env.genGridWorld()
    count = 0
    frames = []
    for i in range(3):
        step = 0
        pre_states = []
        self.state = init_state[i]
        converge = False
        while True:
            pre_states.append(self.state)
            # act greedily with respect to the current policy
            action = np.argmax(self.policy[self.state])
            _, next_state, _, _ = env.P[self.state][action][0]
            ax, fig = env.showWorld(gridworld, tlt="Round {0}, Step {1}".format(i + 1, step))
            env.movingAgent(gridworld, ax, self.state, pre_states)
            image = fig_to_image(fig)
            frames.append(image)
            plt.close()
            self.state = next_state
            # render one extra frame after the terminal state is reached, then stop
            if converge:
                count += 1
                break
            if self.state == self.terminate_state:
                converge = True
            step += 1
            count += 1
    file_dir = get_dirs(os.path.join(RESULT_PATH, "DP"))
    imageio.mimsave(os.path.join(file_dir, "pi_test.gif"), frames, fps=5)
def run_policy(self, env):
    """Run the learned epsilon-greedy policy once and save the rendered frames as a GIF."""
    frames = []
    state = env.reset()
    policy = self.epsilon_greedy_policy(env.action_space.n)
    for _ in itertools.count():
        frames.append(env.render(mode='rgb_array'))
        # sample an action from the epsilon-greedy distribution
        action_prob = policy(state)
        action = np.random.choice(np.arange(len(action_prob)), p=action_prob)
        next_state, _, done, _ = env.step(action)
        if done:
            env.close()
            break
        state = next_state
    save_path = get_dirs(os.path.join(RESULT_PATH, "Qlearning"))
    imageio.mimsave(os.path.join(save_path, "vfa_car_qlearning.gif"), frames, fps=30)
def main():
    # create a Mountain Car environment
    env = MountainCarEnv()
    # create an approximator for the agent
    approximator = Approximator(env)
    # create an agent that performs Q-learning with value function approximation
    agent_qlearning_vfa = Agent_QLearning_VFA(approximator)
    # run Q-learning with function approximation
    Q_stats = agent_qlearning_vfa.q_learning_fa(env=env, num_episodes=200)
    # plot_episode_stats(stats=Q_stats)
    save_path = get_dirs(os.path.join(RESULT_PATH, "Qlearning"))
    plt.savefig(os.path.join(save_path, "vfa_rewards.png"))
    # plt.show()
    # agent_qlearning_vfa.run_policy(env)
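
# Assumed entry-point guard (not shown in the excerpt above) so the script can be run directly.
if __name__ == "__main__":
    main()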
def q_learning_fa(self, env, num_episodes, discount_factor=1.0, epsilon=0.1):
    """Q-learning with value function approximation (semi-gradient TD control)."""
    frames = []
    episode_stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                 episode_rewards=np.zeros(num_episodes))
    for i_episode in range(num_episodes):
        print("Episode {0}".format(i_episode))
        # The policy we are following
        policy = self.epsilon_greedy_policy(nA=env.action_space.n, epsilon=epsilon)
        # Reset the environment and pick the first action
        state = env.reset()
        # periodically snapshot the cost-to-go surface for the output GIF
        if i_episode % 5 == 0:
            fig = plot_cost_mountain_car(env, self.approximator, step=i_episode)
            image = fig_to_image(fig)
            frames.append(image)
            plt.close()
        for t in itertools.count():
            # Choose an action
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            # Take a step.
            next_state, reward, done, _ = env.step(action)
            # store statistics
            episode_stats.episode_lengths[i_episode] = t
            episode_stats.episode_rewards[i_episode] += reward
            # TD update.
            q_values_next = self.approximator.predict(next_state)
            # compute the TD target
            td_target = reward + discount_factor * np.max(q_values_next)
            # update the parameters of the function approximator
            self.approximator.update(state, action, td_target)
            if done:
                break
            state = next_state
    save_path = get_dirs(os.path.join(RESULT_PATH, "Qlearning"))
    imageio.mimsave(os.path.join(save_path, "vfa_values_qlearning.gif"), frames, fps=20)
    return episode_stats
#
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import os
import imageio
#
# from gym.envs.toy_text.cliffwalking import CliffWalkingEnv
from rl.env.gridWorld import GridWorldEnv
from rl.misc.utilies import ROOT_PATH, get_dirs
from rl.misc.utilies import fig_to_image

RESULT_PATH = get_dirs(os.path.join(ROOT_PATH, "results"))


class Agent:
    def __init__(self, env):
        self.V = np.zeros(env.nS)
        self.policy = np.ones([env.nS, env.nA]) / env.nA
        self.init_state = 0
        self.state = None
        self.terminate_state = 46

    def policy_evaluation(self, env, policy, theta=0.00001, discount_factor=0.9):
        """Iterative policy evaluation: sweep the state space until the value function converges."""
        V = np.zeros(env.nS)
        while True:
            Delta = 0.0
            for s in range(env.nS):
                v = 0
                # expected value of following the policy from state s
                for a, action_prob in enumerate(policy[s]):
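                    # --- The original excerpt is cut off here. The lines below are a
                    # --- reconstruction of the standard Bellman expectation backup and
                    # --- convergence check, not the original source.
                    for prob, next_state, reward, done in env.P[s][a]:
                        # expected immediate reward plus discounted value of the successor
                        v += action_prob * prob * (reward + discount_factor * V[next_state])
                Delta = max(Delta, np.abs(v - V[s]))
                V[s] = v
            # stop sweeping once the largest change falls below the threshold theta
            if Delta < theta:
                break
        return np.array(V)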
from rl.misc.utilies import get_dirs
import os

results_path = os.path.join(os.path.realpath("../../../"), 'results')
freps_path = get_dirs(os.path.join(results_path, 'freps'))
import numpy as np
import matplotlib.pyplot as plt
import itertools
from collections import defaultdict
import os
import imageio
#
from rl.env.cliff_walking import CliffWalkingEnv
from rl.misc.utilies import get_dirs, fig_to_image
from rl.algo.td.sarsa.SARSA import Agent_SARSA
#
ROOT_PATH = os.path.realpath("../../../../")
RESULT_PATH = os.path.join(ROOT_PATH, "results")
Q_SAVE_PATH = get_dirs(os.path.join(RESULT_PATH, "Qlearning"))
S_SAVE_PATH = get_dirs(os.path.join(RESULT_PATH, "SARSA"))


class Agent_QLearning:
    def __init__(self, env):
        self.Q = defaultdict(lambda: np.zeros(env.nA))
        self.policy = np.ones([env.nS, env.nA]) / env.nA

    def epsilon_greedy_policy(self, env, Q, epsilon=0.1):
        """Return a function that maps an observation to epsilon-greedy action probabilities."""
        def policy_fn(observation):
            A = np.ones(env.nA, dtype=float) * epsilon / env.nA
            best_action = np.argmax(Q[observation])
            A[best_action] += (1.0 - epsilon)
            return A
        return policy_fn
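
    # NOTE: the excerpt ends here. The method below is a reconstruction sketch of the
    # standard tabular Q-learning loop such an agent typically carries; the name
    # `q_learning`, the default hyperparameters, and the omission of the original file's
    # rendering and statistics bookkeeping are assumptions, not the original code.
    def q_learning(self, env, num_episodes, alpha=0.5, discount_factor=1.0, epsilon=0.1):
        policy = self.epsilon_greedy_policy(env, self.Q, epsilon)
        for i_episode in range(num_episodes):
            state = env.reset()
            for t in itertools.count():
                # sample an action from the epsilon-greedy behaviour policy
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = env.step(action)
                # off-policy TD(0) update: bootstrap from the greedy action in the next state
                td_target = reward + discount_factor * np.max(self.Q[next_state])
                self.Q[state][action] += alpha * (td_target - self.Q[state][action])
                if done:
                    break
                state = next_state
        return self.Q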
import tensorflow as tf
import os
import ot
import numpy as np
import pandas as pd
#
from rl.misc.utilies import get_dirs
#
env_ID = "Pendulum-v0"
#
path_all_results = os.path.join(os.path.realpath("../../../"), 'results')
path_ppo_results = get_dirs(os.path.join(path_all_results, 'ppo'))
path_env_result = get_dirs(os.path.join(path_ppo_results, env_ID))
path_csv = os.path.join(path_env_result, 'data.csv')
#
columns = ['methods', 'alphas', 'trials', 'episodes', 'rewards',
           'losses_c', 'losses_a', 'divergences', 'entropies', "beta"]
#
data = pd.DataFrame(columns=columns)
seed = 12345
#
params = {'methods': {'clip': [None],
                      'f': [1.0, 2.0, 'GAN'],
                      'w2': [None]},
          'num_trials': 5,
          'num_episodes': 100,
          'num_sample_trans': 3200,
          'epochs': 10,
          'batch_size': 32,
          'gamma': 0.99,
          'lam': 0.95,
import os
import itertools
import imageio
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from collections import namedtuple
#
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_approximation import RBFSampler
#
from rl.misc.utilies import get_dirs, fig_to_image
from rl.algo.td.util import plot_episode_stats, plot_cost_mountain_car
from gym.envs.classic_control.mountain_car import MountainCarEnv
#
# Global variables
ROOT_PATH = os.path.realpath("../../../../")
RESULT_PATH = get_dirs(os.path.join(ROOT_PATH, 'results'))
EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
#
class Approximator:
    """ Value Function Approximator """
    def __init__(self, env):
        # sample observations to fit the feature preprocessing pipeline
        self.observation_examples = np.array(
            [env.observation_space.sample() for x in range(10000)])
        self.scaler = self.standard_scaler(env)
        self.featurizer = self.sklearn_featurizer()
        # per-action models: self.models[a] approximates Q(s, a) for action a
        self.models = []
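        # --------------------------------------------------------------------------
        # The excerpt is cut off here. What follows is a reconstruction sketch of the
        # helpers referenced in __init__ (standard_scaler, sklearn_featurizer) plus the
        # featurize/predict/update methods that q_learning_fa and run_policy rely on,
        # following the common scikit-learn RBF-featurisation pattern for Mountain Car.
        # The hyperparameters (gamma values, n_components, learning rate) and method
        # bodies are assumptions, not the original source.
        # --------------------------------------------------------------------------
        for _ in range(env.action_space.n):
            # one linear model per action, trained online with SGD
            model = SGDRegressor(learning_rate="constant")
            # prime the model with a dummy target so predict() works before any update
            model.partial_fit([self.featurize(env.reset())], [0.0])
            self.models.append(model)

    def standard_scaler(self, env):
        # zero-mean / unit-variance scaling fitted on the sampled observations
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(self.observation_examples)
        return scaler

    def sklearn_featurizer(self):
        # a union of RBF kernels with different bandwidths gives a richer feature space
        featurizer = sklearn.pipeline.FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=100)),
        ])
        featurizer.fit(self.scaler.transform(self.observation_examples))
        return featurizer

    def featurize(self, state):
        # scale the raw observation, then map it to RBF features
        scaled = self.scaler.transform([state])
        return self.featurizer.transform(scaled)[0]

    def predict(self, state, action=None):
        # return Q(state, a) for all actions, or for a single action if one is given
        features = self.featurize(state)
        if action is None:
            return np.array([model.predict([features])[0] for model in self.models])
        return self.models[action].predict([features])[0]

    def update(self, state, action, target):
        # one SGD step towards the TD target for the chosen action's model
        features = self.featurize(state)
        self.models[action].partial_fit([features], [target])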
import numpy as np
import matplotlib.pyplot as plt
import itertools
import os
import imageio
from collections import defaultdict
from rl.misc.utilies import get_dirs, fig_to_image
from rl.env.windy_gridWorld import WindyGridworldEnv
#
# import constants_TD as C
#
ROOT_PATH = os.path.realpath("../../../../")
RESULT_PATH = get_dirs(os.path.join(ROOT_PATH, "results"))
SAVE_PATH = get_dirs(os.path.join(RESULT_PATH, "SARSA"))


class Agent_SARSA:
    def __init__(self, env):
        self.Q = defaultdict(lambda: np.zeros(env.nA))
        self.policy = np.ones([env.nS, env.nA]) / env.nA

    def epsilon_greedy_policy(self, Q, epsilon, nA):
        """Return a function that maps an observation to epsilon-greedy action probabilities."""
        def policy_fn(observation):
            A = np.ones(nA, dtype=float) * epsilon / nA
            best_action = np.argmax(Q[observation])
            A[best_action] += (1.0 - epsilon)
            return A
        return policy_fn
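
    # NOTE: the excerpt ends here. The method below is a reconstruction sketch of the
    # standard on-policy SARSA loop such an agent typically carries; the name `sarsa`,
    # the default hyperparameters, and the omission of the original file's rendering and
    # statistics bookkeeping are assumptions, not the original code.
    def sarsa(self, env, num_episodes, alpha=0.5, discount_factor=1.0, epsilon=0.1):
        policy = self.epsilon_greedy_policy(self.Q, epsilon, env.nA)
        for i_episode in range(num_episodes):
            state = env.reset()
            # pick the first action from the behaviour policy
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            for t in itertools.count():
                next_state, reward, done, _ = env.step(action)
                # choose the next action *before* updating: SARSA bootstraps from the
                # action actually taken, which is what makes it on-policy
                next_action_probs = policy(next_state)
                next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
                td_target = reward + discount_factor * self.Q[next_state][next_action]
                self.Q[state][action] += alpha * (td_target - self.Q[state][action])
                if done:
                    break
                state, action = next_state, next_action
        return self.Q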