def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):
    alpha = 0.0001
    basis_order = 3

    # Probe a default CartPole environment once to determine the feature/action dimensions.
    env = gym.make('CartPole-v0')
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    meta = pickle.load(open(meta_path, "rb"))

    # Build one SARSA agent per CartPole setup, each sharing the loaded meta-policy.
    agents = []
    for setup in setups:
        gym_env = gym.make('CartPole-v0')
        gym_env.env.force_mag = setup["force"]
        gym_env.env.length = setup["pole_length"]
        gym_env.env.masscart = setup["masscart"]
        gym_env.env.masspole = setup["masspole"]
        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
        agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="SARSA")
        agents.append(agent)

    # Train each agent and snapshot its learned policy.
    policies = []
    for agent in agents:
        rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                              update_meta=False, render=False)
        policies.append(copy.deepcopy(agent.learning_algorithm))

    # Re-run each trained policy fully greedily (exploit) and fully randomly (explore).
    rewards = []
    for i, agent in enumerate(agents):
        agent.learning_algorithm = policies[i]
        agent.random_action_prob = 0.0
        agent.RANDOM_ACTION_DECAY = 1.0
        exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                                      update_meta=False, render=False)
        agent.random_action_prob = 1.0
        explore_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                                      update_meta=False, render=False)
        rewards.append({"explore": explore_rewards, "exploit": exploit_rewards})

    pickle.dump(rewards, open(save_directory + "/explore_exploit.pkl", "wb"))
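# Illustrative usage sketch (not part of the original script): `setups` is a list of dicts carrying
# the CartPole physics overrides read in the loop above; the meta-policy path, episode/step counts,
# and setup values below are hypothetical placeholders.
#
#   cartpole_setups = [
#       {"force": 10.0, "pole_length": 0.5, "masscart": 1.0, "masspole": 0.1},
#       {"force": 20.0, "pole_length": 1.2, "masscart": 5.0, "masspole": 0.1},
#   ]
#   experiment_explore_vs_exploit("./meta_policy.pkl", "./CartPoleExploreExploit",
#                                 cartpole_setups, episodes=100, steps=500)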
env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
# `m` (the meta-policy file name) and `beta` are defined earlier in the enclosing script (not shown here).
agent = LinearAgent(env, meta_policy=None, alpha=alpha, beta=beta, algo="PPO")
agent.learning_algorithm.t_length = 8
agent.learning_algorithm.update_steps = 64
agent.learning_algorithm.epochs = 4
agent.learning_algorithm.batch_size = 16

dir = "./AnimatPPOEvalNoMeta/" + m.split(".")[0] + "/"
# agent.random_action_prob = 0.0
rewards = agent.train(num_episodes=500, max_steps=800, verbose=True, update_meta=False,
                      render=False, save_path=dir)
pickle.dump(rewards, open(dir + "rewards.pkl", "wb"))

# rewards = [np.mean(rewards[i*10:(i+1)*10]) for i in range(len(rewards) // 10)]
# plt.plot(range(len(rewards)), rewards)
# plt.show(block=True)

# Roll out the trained agent with rendering.
env.reset()
for _ in range(10000):
    reward, done, update_info = agent.perform_step()
    env.render()
    time.sleep(0.1)
def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):
    alpha = 0.001
    basis_order = 3

    # Probe a maze environment once to determine the feature/action dimensions.
    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    meta = pickle.load(open(meta_path, "rb"))

    # Build one REINFORCE agent per maze setup, each sharing the loaded meta-policy.
    agents = []
    for setup in setups:
        gym_env = AnimatEnv(setup)
        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
        agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="REINFORCE")
        agents.append(agent)

    # Train each agent and snapshot its learned policy.
    policies = []
    for agent in agents:
        rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                              update_meta=False, render=False)
        policies.append(copy.deepcopy(agent.learning_algorithm))

    # Re-run each trained policy fully greedily (exploit) and fully randomly (explore).
    rewards = []
    for i, agent in enumerate(agents):
        agent.learning_algorithm = policies[i]
        agent.random_action_prob = 0.0
        agent.RANDOM_ACTION_DECAY = 1.0
        exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                                      update_meta=False, render=False)
        agent.random_action_prob = 1.0
        explore_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                                      update_meta=False, render=False)
        rewards.append({"explore": explore_rewards, "exploit": exploit_rewards})

    pickle.dump(rewards, open(save_directory + "/explore_exploit.pkl", "wb"))
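# Illustrative usage sketch (hypothetical paths, not part of the original script): each entry in
# `setups` is a maze file passed straight to AnimatEnv, as with "./CustomEnvironments/maze1.txt" above.
#
#   maze_setups = ["./CustomEnvironments/maze1.txt", "./CustomEnvironments/maze2.txt"]
#   experiment_explore_vs_exploit("./meta_policy.pkl", "./AnimatExploreExploit",
#                                 maze_setups, episodes=100, steps=500)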
import numpy as np
import gym
import matplotlib.pyplot as plt
# EnvWrapper and LinearAgent come from the project's own modules (import paths not shown here).

basis_order = 1
alpha = 1e-2
beta = 1e-2

# Override the default CartPole physics with a stronger force, longer pole, and heavier cart.
setup = {"force": 20.0, "pole_length": 1.2, "masscart": 5.0, "masspole": 0.1}
gym_env = gym.make('CartPole-v0')
gym_env.env.force_mag = setup["force"]
gym_env.env.length = setup["pole_length"]
gym_env.env.masscart = setup["masscart"]
gym_env.env.masspole = setup["masspole"]

env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
agent = LinearAgent(env, meta_policy=None, alpha=alpha, beta=beta, algo="PPO")
agent.learning_algorithm.t_length = 8
agent.learning_algorithm.update_steps = 16
agent.learning_algorithm.epochs = 4
agent.learning_algorithm.batch_size = 8

rewards = agent.train(num_episodes=500, max_steps=1000, verbose=True, update_meta=False, render=False)

# Smooth the learning curve by averaging rewards over blocks of 10 episodes, then plot it.
rewards = [np.mean(rewards[i*10:(i+1)*10]) for i in range(len(rewards) // 10)]
plt.plot(range(len(rewards)), rewards)
plt.show(block=True)

# for _ in range(10000):
#     reward, done, update_info = agent.perform_step(update_meta=False)
#     env.render()
#     time.sleep(1.0)
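# A minimal sketch of a Fourier basis feature map of the kind the scripts above rely on through
# `basis_order` (the standard construction from Konidaris et al.; the project's own fourier_basis
# may differ in ordering or normalization, so this is an illustrative assumption, not its definition).
import itertools


def fourier_basis_sketch(obs, order):
    """Return cos(pi * c . obs) for every coefficient vector c in {0, ..., order}^d."""
    obs = np.asarray(obs, dtype=float)           # obs is assumed to be normalized to [0, 1]
    coeffs = np.array(list(itertools.product(range(order + 1), repeat=obs.shape[0])))
    return np.cos(np.pi * coeffs.dot(obs))       # shape: ((order + 1) ** d,)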