Example #1
import copy
import os
import pickle

import gym

# EnvWrapper, LinearAgent and fourier_basis come from this project's own
# modules and are assumed to be importable here.


def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):

    alpha = 0.0001
    basis_order = 3

    # Probe the environment once to determine the feature dimensionality.
    env = gym.make('CartPole-v0')
    env.reset()

    obs, reward, done, info = env.step(env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)

    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5

    with open(meta_path, "rb") as f:
        meta = pickle.load(f)

    # Build one SARSA agent per environment setup, all sharing the same
    # meta-policy.
    agents = []
    for setup in setups:
        gym_env = gym.make('CartPole-v0')
        gym_env.env.force_mag = setup["force"]
        gym_env.env.length = setup["pole_length"]
        gym_env.env.masscart = setup["masscart"]
        gym_env.env.masspole = setup["masspole"]

        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
        agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="SARSA")

        agents.append(agent)

    # Train each agent and snapshot its learned policy.
    policies = []
    for agent in agents:
        rewards = agent.train(num_episodes=episodes, max_steps=steps,
                              verbose=True, update_meta=False, render=False)
        policies.append(copy.deepcopy(agent.learning_algorithm))

    # Re-run each trained agent in pure-exploitation mode (never act
    # randomly) and in pure-exploration mode (always act randomly), and
    # record both reward curves.
    rewards = []
    for i, agent in enumerate(agents):
        agent.learning_algorithm = policies[i]
        agent.random_action_prob = 0.0
        agent.RANDOM_ACTION_DECAY = 1.0
        exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps,
                                      verbose=True, update_meta=False, render=False)

        agent.random_action_prob = 1.0
        explore_rewards = agent.train(num_episodes=episodes, max_steps=steps,
                                      verbose=True, update_meta=False, render=False)

        rewards.append({"explore": explore_rewards, "exploit": exploit_rewards})

    with open(os.path.join(save_directory, "explore_exploit.pkl"), "wb") as f:
        pickle.dump(rewards, f)
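For reference, a hypothetical invocation of this function could look like the sketch below. The paths and episode counts are illustrative, not taken from the original project; the first setup dictionary uses stock CartPole-v0 dynamics, the second reuses the values from Example #4.

setups = [
    {"force": 10.0, "pole_length": 0.5, "masscart": 1.0, "masspole": 0.1},
    {"force": 20.0, "pole_length": 1.2, "masscart": 5.0, "masspole": 0.1},
]
experiment_explore_vs_exploit(meta_path="./meta_policy.pkl",
                              save_directory="./ExploreExploitResults",
                              setups=setups,
                              episodes=200,
                              steps=500)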
Example #2
import pickle
import time

# This snippet assumes it runs inside a loop in which `m` is the current
# filename and `gym_env`, `alpha`, `beta` and `basis_order` are already
# defined.
env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
agent = LinearAgent(env,
                    meta_policy=None,
                    alpha=alpha,
                    beta=beta,
                    algo="PPO")
agent.learning_algorithm.t_length = 8
agent.learning_algorithm.update_steps = 64
agent.learning_algorithm.epochs = 4
agent.learning_algorithm.batch_size = 16

save_dir = "./AnimatPPOEvalNoMeta/" + m.split(".")[0] + "/"
# agent.random_action_prob = 0.0
rewards = agent.train(num_episodes=500,
                      max_steps=800,
                      verbose=True,
                      update_meta=False,
                      render=False,
                      save_path=save_dir)
with open(save_dir + "rewards.pkl", "wb") as f:
    pickle.dump(rewards, f)

# rewards = [np.mean(rewards[i*10:(i+1)*10]) for i in range(len(rewards) // 10)]
# plt.plot(range(len(rewards)), rewards)
# plt.show(block=True)
# Replay the trained agent with rendering to watch its behaviour.
env.reset()
for _ in range(10000):
    reward, done, update_info = agent.perform_step()
    env.render()
    time.sleep(0.1)
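The variable `m` is never defined in this excerpt; it presumably holds the current filename in an enclosing loop, since it is only used to name the output directory. A hypothetical reconstruction of that scaffolding follows; every path, value and variable here is an assumption, not from the source:

import os

alpha, beta = 1e-2, 1e-2   # assumed; the excerpt does not show these values
basis_order = 3            # assumed
for m in os.listdir("./MetaPolicies"):                     # hypothetical source of `m`
    gym_env = AnimatEnv("./CustomEnvironments/maze1.txt")  # as in Example #3
    # ... the snippet above would run here, once per file `m` ...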
Example #3
import copy
import os
import pickle

# AnimatEnv, EnvWrapper, LinearAgent and fourier_basis come from this
# project's own modules and are assumed to be importable here.


def experiment_explore_vs_exploit(meta_path, save_directory, setups,
                                  episodes, steps):

    alpha = 0.001
    basis_order = 3

    # Probe the environment once to determine the feature dimensionality.
    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()

    obs, reward, done, info = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)

    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5

    with open(meta_path, "rb") as f:
        meta = pickle.load(f)

    # Build one REINFORCE agent per maze setup, all sharing the same
    # meta-policy.
    agents = []
    for setup in setups:
        gym_env = AnimatEnv(setup)

        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
        agent = LinearAgent(env,
                            meta_policy=meta,
                            alpha=alpha,
                            algo="REINFORCE")

        agents.append(agent)

    # Train each agent and snapshot its learned policy.
    policies = []
    for agent in agents:
        rewards = agent.train(num_episodes=episodes,
                              max_steps=steps,
                              verbose=True,
                              update_meta=False,
                              render=False)
        policies.append(copy.deepcopy(agent.learning_algorithm))

    # Re-run each trained agent in pure-exploitation and pure-exploration
    # mode, and record both reward curves.
    rewards = []
    for i, agent in enumerate(agents):
        agent.learning_algorithm = policies[i]
        agent.random_action_prob = 0.0
        agent.RANDOM_ACTION_DECAY = 1.0
        exploit_rewards = agent.train(num_episodes=episodes,
                                      max_steps=steps,
                                      verbose=True,
                                      update_meta=False,
                                      render=False)

        agent.random_action_prob = 1.0
        explore_rewards = agent.train(num_episodes=episodes,
                                      max_steps=steps,
                                      verbose=True,
                                      update_meta=False,
                                      render=False)

        rewards.append({
            "explore": explore_rewards,
            "exploit": exploit_rewards
        })

    with open(os.path.join(save_directory, "explore_exploit.pkl"), "wb") as f:
        pickle.dump(rewards, f)
Example #4
import gym
import matplotlib.pyplot as plt
import numpy as np

# EnvWrapper and LinearAgent come from this project's own modules.

basis_order = 1
alpha = 1e-2
beta = 1e-2
setup = {"force" : 20.0, "pole_length" : 1.2, "masscart" : 5.0, "masspole" : 0.1}        
gym_env = gym.make('CartPole-v0')
gym_env.env.force_mag = setup["force"]
gym_env.env.length = setup["pole_length"]
gym_env.env.masscart = setup["masscart"]
gym_env.env.masspole = setup["masspole"]

env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)

agent = LinearAgent(env, meta_policy=None, alpha=alpha, beta=beta, algo="PPO")
agent.learning_algorithm.t_length = 8
agent.learning_algorithm.update_steps = 16
agent.learning_algorithm.epochs = 4
agent.learning_algorithm.batch_size = 8

rewards = agent.train(num_episodes=500, max_steps=1000, verbose=True, update_meta=False, render=False)

# Smooth the learning curve by averaging over blocks of 10 episodes.
rewards = [np.mean(rewards[i*10:(i+1)*10]) for i in range(len(rewards) // 10)]
plt.plot(range(len(rewards)), rewards)
plt.show(block=True)
# for _ in range(10000):
#     reward, done, update_info = agent.perform_step(update_meta=False)
#     env.render()
#     time.sleep(1.0)
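The commented-out replay loop at the end needs a `time` import before it can run; uncommented, it would read:

import time

env.reset()  # assumed: reset before replaying, as in Example #2
for _ in range(10000):
    reward, done, update_info = agent.perform_step(update_meta=False)
    env.render()
    time.sleep(1.0)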