Example 1
    def reset(self):
        obs = self.env.reset()
        if self.normalization == 0:
            obs = self.modified_sigmoid(obs)
        else:
            obs = self.normalize_range(obs, self.env.env_range)
        obs = fourier_basis(obs, order=self.order)
        return obs
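The `fourier_basis` helper used above is not included in these excerpts. A minimal sketch of a standard Fourier basis feature map (Konidaris-style), assuming observations are already normalized to [0, 1]; the repository's actual implementation may differ:

import itertools
import numpy as np

def fourier_basis(obs, order=3):
    # Hypothetical sketch: one feature cos(pi * c . obs) for every coefficient
    # vector c in {0, ..., order}^d, giving (order + 1) ** d features, which is
    # what phi.shape[0] counts in the examples below.
    obs = np.asarray(obs, dtype=float)
    coeffs = np.array(list(itertools.product(range(order + 1), repeat=obs.size)))
    return np.cos(np.pi * coeffs.dot(obs))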
Example 2
    def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):    

        alpha = 0.0001
        basis_order = 3

        env = gym.make('CartPole-v0')
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        meta = pickle.load(open(meta_path, "rb")) 

        agents = []
        for setup in setups:
            gym_env = gym.make('CartPole-v0')
            gym_env.env.force_mag = setup["force"]
            gym_env.env.length = setup["pole_length"]
            gym_env.env.masscart = setup["masscart"]
            gym_env.env.masspole = setup["masspole"]
            
            env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
            agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="SARSA")
            

            agents.append( agent )

        policies = []
        for agent in agents:
            rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True, update_meta=False, render=False)
            policies.append( copy.deepcopy(agent.learning_algorithm) )


        rewards = []        
        for i, agent in enumerate(agents):
            agent.learning_algorithm = policies[i]
            agent.random_action_prob = 0.0
            agent.RANDOM_ACTION_DECAY = 1.0
            exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True, update_meta=False, render=False)

            agent.random_action_prob = 1.0
            explore_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True, update_meta=False, render=False)


            rewards.append( {"explore" : explore_rewards, "exploit" : exploit_rewards} )

        pickle.dump(rewards, open(save_directory+"/explore_exploit.pkl", "wb"))
Example 3
    def step(self, action=None):
        if action is None:
            obs, reward, done, _ = self.env.step(
                self.env.action_space.sample())
        else:
            obs, reward, done, _ = self.env.step(action)

        if self.normalization == 0:
            obs = self.modified_sigmoid(obs)
        else:
            obs = self.normalize_range(obs, self.env.env_range)
        obs = fourier_basis(obs, order=self.order)
        return obs, reward, done, ""
Example 4
def init_env_animat(setup):
    env = AnimatEnv(setup)
    env.reset()

    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)

    num_features = phi.shape[0]
    num_actions = env.action_space.n

    gym_env = AnimatEnv(setup)

    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
    return env
Example 5
    def __init__(self, env, basis_order=2, normalization=0):
        self.env = env

        self.env.reset()
        self.normalization = normalization
        self.order = basis_order
        (obs, reward, done,
         info) = self.env.step(self.env.action_space.sample())
        if normalization == 0:
            obs = EnvWrapper.modified_sigmoid(obs)
        else:
            obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=self.order)
        self.num_features = phi.shape[0]
        self.num_actions = self.env.action_space.n

        self.action_space = self.env.action_space
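`EnvWrapper.modified_sigmoid` and `EnvWrapper.normalize_range` are likewise not shown in these excerpts. A minimal sketch of what the two normalization helpers might look like, assuming `env_range` is a sequence of (low, high) bounds per observation dimension; the actual implementations may differ:

import numpy as np

def modified_sigmoid(obs):
    # Hypothetical sketch: squash each (possibly unbounded) observation
    # component into (0, 1) before applying the Fourier basis.
    return 1.0 / (1.0 + np.exp(-np.asarray(obs, dtype=float)))

def normalize_range(obs, env_range):
    # Hypothetical sketch: rescale each dimension from [low, high] to [0, 1].
    obs = np.asarray(obs, dtype=float)
    lows = np.array([r[0] for r in env_range], dtype=float)
    highs = np.array([r[1] for r in env_range], dtype=float)
    return (obs - lows) / (highs - lows)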
Example 6
from linear_agent import LinearAgent
import time
import matplotlib.pyplot as plt
import numpy as np
import pickle

basis_order = 3
alpha = 1e-5
beta = 1e-4

env = AnimatEnv("./CustomEnvironments/maze7.txt")
env.reset()

(obs, reward, done, info) = env.step(env.action_space.sample())
obs = EnvWrapper.normalize_range(obs, env.env_range)
phi = fourier_basis(obs, order=basis_order)

num_features = phi.shape[0]  # + len( cartpole_setup[0].keys() )
num_actions = env.action_space.n

# meta = MetaPolicy(num_features=num_features, num_actions=num_actions)

mazes = ["maze5.txt", "maze6.txt", "maze7.txt"]

for m in mazes:
    gym_env = AnimatEnv("./CustomEnvironments/" + m)

    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
    agent = LinearAgent(env,
                        meta_policy=None,
                        alpha=alpha,
Example 7
    def experiment_explore_vs_exploit(meta_path, save_directory, setups,
                                      episodes, steps):

        alpha = 0.001
        basis_order = 3

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        meta = pickle.load(open(meta_path, "rb"))

        agents = []
        for setup in setups:
            gym_env = AnimatEnv(setup)

            env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
            agent = LinearAgent(env,
                                meta_policy=meta,
                                alpha=alpha,
                                algo="REINFORCE")

            agents.append(agent)

        policies = []
        for agent in agents:
            rewards = agent.train(num_episodes=episodes,
                                  max_steps=steps,
                                  verbose=True,
                                  update_meta=False,
                                  render=False)
            policies.append(copy.deepcopy(agent.learning_algorithm))

        rewards = []
        for i, agent in enumerate(agents):
            agent.learning_algorithm = policies[i]
            agent.random_action_prob = 0.0
            agent.RANDOM_ACTION_DECAY = 1.0
            exploit_rewards = agent.train(num_episodes=episodes,
                                          max_steps=steps,
                                          verbose=True,
                                          update_meta=False,
                                          render=False)

            agent.random_action_prob = 1.0
            explore_rewards = agent.train(num_episodes=episodes,
                                          max_steps=steps,
                                          verbose=True,
                                          update_meta=False,
                                          render=False)

            rewards.append({
                "explore": explore_rewards,
                "exploit": exploit_rewards
            })

        pickle.dump(rewards, open(save_directory + "/explore_exploit.pkl",
                                  "wb"))
Example 8
    def experiment_with_without_actions(meta_path, save_directory, setups,
                                        episodes, steps):

        alpha = 0.001
        basis_order = 3

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        for k in range(2):
            meta = pickle.load(open(meta_path, "rb"))

            agents = []
            for setup in setups:
                domain_agents = []

                for _ in range(num_samples):
                    gym_env = AnimatEnv(setup)

                    env = EnvWrapper(gym_env,
                                     basis_order=basis_order,
                                     normalization=1)
                    if k == 0:
                        prevent_actions = gym_env.action_space.useless_actions
                    else:
                        prevent_actions = None

                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="REINFORCE",
                                        prevent_actions=prevent_actions)
                    domain_agents.append(agent)

                agents.append(domain_agents)

            domain_rewards_by_episode = {}
            for ep in range(episodes):

                trajectories_by_domain = ExperimentsAnimat._run_episode(
                    domain_agents=agents, num_steps=steps)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            action, explore = t['action']

                            t_rewards.append(t['reward'])

                        sample_rewards.append(t_rewards)
                    domain_samples.append(sample_rewards)

                print("Episode %d" % (ep))
                domain_rewards_by_episode[ep] = domain_samples

            filename = "without_actions.pkl" if k == 0 else "with_actions.pkl"

            pickle.dump(domain_rewards_by_episode,
                        open(save_directory + "/" + filename, "wb"))
Example 9
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory,
                                  setups, episodes, steps):

        alpha = 1e-4
        beta = 1e-3
        basis_order = 3

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n
        env = EnvWrapper(env, basis_order=basis_order, normalization=1)

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 3

        for k in range(2):
            if k == 0:
                meta = MetaPolicy(num_features=num_features,
                                  num_actions=num_actions,
                                  algo="PPO",
                                  alpha=1e-3,
                                  beta=1e-3,
                                  env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for setup in setups:
                domain_agents = []

                for _ in range(num_samples):
                    gym_env = AnimatEnv(setup)

                    env = EnvWrapper(gym_env,
                                     basis_order=basis_order,
                                     normalization=1)
                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)
                    agent.learning_algorithm.t_length = 8
                    agent.learning_algorithm.update_steps = 64
                    agent.learning_algorithm.epochs = 4
                    agent.learning_algorithm.batch_size = 16
                    domain_agents.append(agent)

                agents.append(domain_agents)

            domain_rewards_by_episode = {}
            null_action_by_episode = {}
            for ep in range(episodes):
                null_actions = {}
                trajectories_by_domain = ExperimentsAnimat._run_episode(
                    domain_agents=agents, num_steps=steps)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            action, explore = t['action']
                            a = agents[i][j].env.env.action_space.actions[action]
                            effect = agents[i][j].env.env.animat._action_effect(a)
                            if math.fabs(effect[0]) < 0.1 and math.fabs(effect[1]) < 0.1:
                                if action in null_actions:
                                    null_actions[action] += 1
                                else:
                                    null_actions[action] = 1

                            t_rewards.append(t['reward'])

                        sample_rewards.append(sum(t_rewards))
                    domain_samples.append(sample_rewards)

                print("Episode %d" % (ep))
                domain_rewards_by_episode[ep] = domain_samples
                null_action_by_episode[ep] = null_actions

                if ep % 10 == 0:
                    filename = "meta_test_" + str(
                        ep) + ".pkl" if k == 0 else "no_meta_test_" + str(
                            ep) + ".pkl"
                    filename2 = "null_actions_meta_" + str(
                        ep
                    ) + ".pkl" if k == 0 else "null_actions_no_meta_" + str(
                        ep) + ".pkl"
                    pickle.dump(domain_rewards_by_episode,
                                open(save_directory + "/" + filename, "wb"))
                    pickle.dump(null_action_by_episode,
                                open(save_directory + "/" + filename2, "wb"))

                    for ai, a in enumerate(agents):
                        type_ = "meta_" if k == 0 else "no_meta_"
                        type_ += str(ai) + "_"
                        a[0].learning_algorithm.save_model(
                            save_directory + "/" + type_, ep)
Example 10
    def experiment_random_baseline(save_directory):

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()
        basis_order = ExperimentsAnimat.RECORDED_DATA[0]['order']

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 100
        num_samples = 1
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            r_maxs = []
            trial_by_domain = {}
            for i, d in enumerate(ExperimentsAnimat.RECORDED_DATA):
                print("Setup: " + str(d['setup']))
                setup = d['setup']
                max_r = d['max_r']
                # episodes = d['episodes']
                # steps = d['max_steps']
                episodes = 600
                steps = 600
                basis_order = d['order']

                domain_agents = []
                for _ in range(num_samples):
                    gym_env = AnimatEnv(setup)
                    env = EnvWrapper(gym_env,
                                     basis_order=basis_order,
                                     normalization=1)
                    agent = LinearAgent(env,
                                        meta_policy=None,
                                        algo="REINFORCE")
                    domain_agents.append(agent)

                agents.append(domain_agents)
                r_maxs.append(max_r)

                trial_by_domain[i] = [list() for _ in range(num_samples)]
            print("Done loading...")

            domain_rewards_by_episode = {}
            for ep in range(episodes):

                trajectories_by_domain = ExperimentsAnimat._run_episode(
                    domain_agents=agents, num_steps=steps)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append(t['reward'])
                            t['reward'] = t['reward'] / r_maxs[i]
                            trial_by_domain[i][j].append(t)

                        sample_rewards.append(t_rewards)
                    domain_samples.append(sample_rewards)

                print("Episode %d - Trial %d" % (ep, k))
                domain_rewards_by_episode[ep] = domain_samples

            pickle.dump(
                domain_rewards_by_episode,
                open(save_directory + "/trajectory_iter_" + str(k) + ".pkl",
                     "wb"))
Example 11
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups, episodes, steps):    

        alpha = 0.001
        basis_order = 3

        env = gym.make('CartPole-v0')
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        for k in range(2):
            if k == 0:
                meta = MetaPolicy(num_features=num_features, num_actions=num_actions, algo="PPO", alpha=1e-3, beta=1e-3, env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for setup in setups:
                domain_agents = []
                for _ in range(num_samples):
                    gym_env = gym.make('CartPole-v0')
                    gym_env.env.force_mag = setup["force"]
                    gym_env.env.length = setup["pole_length"]
                    gym_env.env.masscart = setup["masscart"]
                    gym_env.env.masspole = setup["masspole"]

                    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                    agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")

                    if meta is None:
                        agent.random_action_prob = 0.0

                    domain_agents.append( agent )

                agents.append( domain_agents )

            domain_rewards_by_episode = {}
            for ep in range(episodes):
                
                trajectories_by_domain = ExperimentsCartpole._run_episode(domain_agents=agents, num_steps=steps, optimize_meta=False)
                
                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append( t['reward'] )
                    
                        sample_rewards.append( t_rewards )
                    domain_samples.append( sample_rewards )

                print("Episode %d" %(ep))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 100 == 0:
                    filename = "meta_test"+str(ep)+".pkl" if k == 0 else "no_meta_test_"+str(ep)+".pkl"
                    pickle.dump(domain_rewards_by_episode, open(save_directory+"/"+filename, "wb"))
Example 12
    def experiment_random_baseline(save_directory):    

        env = gym.make('CartPole-v0')
        env.reset()
        basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        
        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 100
        num_samples = 5
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            r_maxs = []
            trial_by_domain = {}
            for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
                print("Setup: " + str(d['setup']))
                setup = d['setup']
                max_r = d['max_r']
                episodes = d['episodes']
                steps = d['max_steps']
                basis_order = d['order']
                # alpha = d['alpha']	        
                alpha = 0.0001
                
                domain_agents = []
                for _ in range(num_samples):
                    gym_env = gym.make('CartPole-v0')
                    gym_env.env.force_mag = setup["force"]
                    gym_env.env.length = setup["pole_length"]
                    gym_env.env.masscart = setup["masscart"]
                    gym_env.env.masspole = setup["masspole"]

                    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                    agent = LinearAgent(env, meta_policy=None, alpha=alpha, algo="SARSA")

                    domain_agents.append( agent )

                agents.append( domain_agents )
                r_maxs.append( max_r )

                trial_by_domain[i] = [ list() for _ in range(num_samples) ]
            print("Done loading...")

            
            domain_rewards_by_episode = {}
            for ep in range(episodes):
                
                trajectories_by_domain = ExperimentsCartpole._run_episode(domain_agents=agents, num_steps=steps)
                
                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append( t['reward'] )
                    
                        sample_rewards.append( t_rewards )
                    domain_samples.append( sample_rewards )

                print("Episode %d - Trial %d" %(ep, k))
                domain_rewards_by_episode[ep] = domain_samples

            pickle.dump(domain_rewards_by_episode, open(save_directory+"/trajectory_iter_"+str(k)+".pkl", "wb"))
Example 13
    def experiment_train_meta(save_directory, meta_alpha, meta_beta):    

        gym_env = gym.make('CartPole-v0')
        gym_env.reset()
        basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']

        (obs, reward, done, info) = gym_env.step(gym_env.env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = gym_env.env.action_space.n

        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)

        meta = MetaPolicy(num_features=num_features, num_actions=num_actions, algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 500
        num_samples = 3
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            r_maxs = []
            trial_by_domain = {}
            for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
                print("Setup: " + str(d['setup']))
                setup = d['setup']
                max_r = d['max_r']
                episodes = d['episodes']
                steps = d['max_steps']
                basis_order = d['order']
                # alpha = d['alpha']	        
                alpha = 0.0001
                
                domain_agents = []
                for _ in range(num_samples):
                    gym_env = gym.make('CartPole-v0')
                    gym_env.env.force_mag = setup["force"]
                    gym_env.env.length = setup["pole_length"]
                    gym_env.env.masscart = setup["masscart"]
                    gym_env.env.masspole = setup["masspole"]

                    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                    agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")

                    domain_agents.append( agent )

                agents.append( domain_agents )
                r_maxs.append( max_r )

                trial_by_domain[i] = [ list() for _ in range(num_samples) ]
            print("Done loading...")

            
            domain_rewards_by_episode = {}
            for ep in range(episodes):
                
                trajectories_by_domain = ExperimentsCartpole._run_episode(domain_agents=agents, num_steps=steps, r_maxs=r_maxs)
                
                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append( t['reward'] )
                            t['reward'] = t['reward'] / r_maxs[i] 
                            trial_by_domain[i][j].append( t )
                    
                        sample_rewards.append( t_rewards )
                    domain_samples.append( sample_rewards )

                print("Episode %d - Trial %d" %(ep, k))
                domain_rewards_by_episode[ep] = domain_samples

            trajectories = []
            for key in trial_by_domain.keys():
                for traj in trial_by_domain[key]:
                    trajectories.append( traj )

            if meta.algo == "REINFORCE":
                print("Updating meta....")
                meta.montecarlo_update(trajectories)

            meta.learning_algorithm.save_model(save_directory+"/", k)
            # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
            pickle.dump(domain_rewards_by_episode, open(save_directory+"/trajectory_iter_"+str(k)+".pkl", "wb"))